Merge branch 'main' of https://github.com/mt21625457/aicodex2api
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -126,6 +126,4 @@ backend/cmd/server/server
|
|||||||
deploy/docker-compose.override.yml
|
deploy/docker-compose.override.yml
|
||||||
.gocache/
|
.gocache/
|
||||||
vite.config.js
|
vite.config.js
|
||||||
!docs/
|
|
||||||
docs/*
|
docs/*
|
||||||
!docs/dependency-security.md
|
|
||||||
|
|||||||
2
backend/.dockerignore
Normal file
2
backend/.dockerignore
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.cache/
|
||||||
|
.DS_Store
|
||||||
@@ -18,6 +18,12 @@ linters:
|
|||||||
list-mode: original
|
list-mode: original
|
||||||
files:
|
files:
|
||||||
- "**/internal/service/**"
|
- "**/internal/service/**"
|
||||||
|
- "!**/internal/service/ops_aggregation_service.go"
|
||||||
|
- "!**/internal/service/ops_alert_evaluator_service.go"
|
||||||
|
- "!**/internal/service/ops_cleanup_service.go"
|
||||||
|
- "!**/internal/service/ops_metrics_collector.go"
|
||||||
|
- "!**/internal/service/ops_scheduled_report_service.go"
|
||||||
|
- "!**/internal/service/wire.go"
|
||||||
deny:
|
deny:
|
||||||
- pkg: github.com/Wei-Shaw/sub2api/internal/repository
|
- pkg: github.com/Wei-Shaw/sub2api/internal/repository
|
||||||
desc: "service must not import repository"
|
desc: "service must not import repository"
|
||||||
|
|||||||
@@ -62,6 +62,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
|
|||||||
func provideCleanup(
|
func provideCleanup(
|
||||||
entClient *ent.Client,
|
entClient *ent.Client,
|
||||||
rdb *redis.Client,
|
rdb *redis.Client,
|
||||||
|
opsMetricsCollector *service.OpsMetricsCollector,
|
||||||
|
opsAggregation *service.OpsAggregationService,
|
||||||
|
opsAlertEvaluator *service.OpsAlertEvaluatorService,
|
||||||
|
opsCleanup *service.OpsCleanupService,
|
||||||
|
opsScheduledReport *service.OpsScheduledReportService,
|
||||||
tokenRefresh *service.TokenRefreshService,
|
tokenRefresh *service.TokenRefreshService,
|
||||||
accountExpiry *service.AccountExpiryService,
|
accountExpiry *service.AccountExpiryService,
|
||||||
pricing *service.PricingService,
|
pricing *service.PricingService,
|
||||||
@@ -81,6 +86,36 @@ func provideCleanup(
|
|||||||
name string
|
name string
|
||||||
fn func() error
|
fn func() error
|
||||||
}{
|
}{
|
||||||
|
{"OpsScheduledReportService", func() error {
|
||||||
|
if opsScheduledReport != nil {
|
||||||
|
opsScheduledReport.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsCleanupService", func() error {
|
||||||
|
if opsCleanup != nil {
|
||||||
|
opsCleanup.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAlertEvaluatorService", func() error {
|
||||||
|
if opsAlertEvaluator != nil {
|
||||||
|
opsAlertEvaluator.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAggregationService", func() error {
|
||||||
|
if opsAggregation != nil {
|
||||||
|
opsAggregation.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsMetricsCollector", func() error {
|
||||||
|
if opsMetricsCollector != nil {
|
||||||
|
opsMetricsCollector.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
{"TokenRefreshService", func() error {
|
{"TokenRefreshService", func() error {
|
||||||
tokenRefresh.Stop()
|
tokenRefresh.Stop()
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,22 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
|
|||||||
proxyHandler := admin.NewProxyHandler(adminService)
|
proxyHandler := admin.NewProxyHandler(adminService)
|
||||||
adminRedeemHandler := admin.NewRedeemHandler(adminService)
|
adminRedeemHandler := admin.NewRedeemHandler(adminService)
|
||||||
promoHandler := admin.NewPromoHandler(promoService)
|
promoHandler := admin.NewPromoHandler(promoService)
|
||||||
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService)
|
opsRepository := repository.NewOpsRepository(db)
|
||||||
|
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
|
||||||
|
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
billingService := service.NewBillingService(configConfig, pricingService)
|
||||||
|
identityCache := repository.NewIdentityCache(redisClient)
|
||||||
|
identityService := service.NewIdentityService(identityCache)
|
||||||
|
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
|
||||||
|
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
|
||||||
|
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
|
||||||
|
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
|
||||||
|
opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService)
|
||||||
|
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService, opsService)
|
||||||
|
opsHandler := admin.NewOpsHandler(opsService)
|
||||||
updateCache := repository.NewUpdateCache(redisClient)
|
updateCache := repository.NewUpdateCache(redisClient)
|
||||||
gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig)
|
gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig)
|
||||||
serviceBuildInfo := provideServiceBuildInfo(buildInfo)
|
serviceBuildInfo := provideServiceBuildInfo(buildInfo)
|
||||||
@@ -132,31 +147,24 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
|
|||||||
userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
|
userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
|
||||||
userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
|
userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
|
||||||
userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
|
userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
|
||||||
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
|
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
|
||||||
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
|
|
||||||
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
billingService := service.NewBillingService(configConfig, pricingService)
|
|
||||||
identityCache := repository.NewIdentityCache(redisClient)
|
|
||||||
identityService := service.NewIdentityService(identityCache)
|
|
||||||
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
|
|
||||||
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
|
|
||||||
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
|
|
||||||
gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
|
gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
|
||||||
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
|
|
||||||
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
|
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
|
||||||
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
|
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
|
||||||
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
|
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
|
||||||
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
|
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
|
||||||
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
|
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
|
||||||
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
|
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
|
||||||
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, settingService, redisClient)
|
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient)
|
||||||
httpServer := server.ProvideHTTPServer(configConfig, engine)
|
httpServer := server.ProvideHTTPServer(configConfig, engine)
|
||||||
|
opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig)
|
||||||
|
opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)
|
||||||
|
opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig)
|
||||||
|
opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig)
|
||||||
|
opsScheduledReportService := service.ProvideOpsScheduledReportService(opsService, userService, emailService, redisClient, configConfig)
|
||||||
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
|
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
|
||||||
accountExpiryService := service.ProvideAccountExpiryService(accountRepository)
|
accountExpiryService := service.ProvideAccountExpiryService(accountRepository)
|
||||||
v := provideCleanup(client, redisClient, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
|
v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, opsScheduledReportService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
|
||||||
application := &Application{
|
application := &Application{
|
||||||
Server: httpServer,
|
Server: httpServer,
|
||||||
Cleanup: v,
|
Cleanup: v,
|
||||||
@@ -181,6 +189,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
|
|||||||
func provideCleanup(
|
func provideCleanup(
|
||||||
entClient *ent.Client,
|
entClient *ent.Client,
|
||||||
rdb *redis.Client,
|
rdb *redis.Client,
|
||||||
|
opsMetricsCollector *service.OpsMetricsCollector,
|
||||||
|
opsAggregation *service.OpsAggregationService,
|
||||||
|
opsAlertEvaluator *service.OpsAlertEvaluatorService,
|
||||||
|
opsCleanup *service.OpsCleanupService,
|
||||||
|
opsScheduledReport *service.OpsScheduledReportService,
|
||||||
tokenRefresh *service.TokenRefreshService,
|
tokenRefresh *service.TokenRefreshService,
|
||||||
accountExpiry *service.AccountExpiryService,
|
accountExpiry *service.AccountExpiryService,
|
||||||
pricing *service.PricingService,
|
pricing *service.PricingService,
|
||||||
@@ -199,6 +212,36 @@ func provideCleanup(
|
|||||||
name string
|
name string
|
||||||
fn func() error
|
fn func() error
|
||||||
}{
|
}{
|
||||||
|
{"OpsScheduledReportService", func() error {
|
||||||
|
if opsScheduledReport != nil {
|
||||||
|
opsScheduledReport.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsCleanupService", func() error {
|
||||||
|
if opsCleanup != nil {
|
||||||
|
opsCleanup.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAlertEvaluatorService", func() error {
|
||||||
|
if opsAlertEvaluator != nil {
|
||||||
|
opsAlertEvaluator.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAggregationService", func() error {
|
||||||
|
if opsAggregation != nil {
|
||||||
|
opsAggregation.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsMetricsCollector", func() error {
|
||||||
|
if opsMetricsCollector != nil {
|
||||||
|
opsMetricsCollector.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
{"TokenRefreshService", func() error {
|
{"TokenRefreshService", func() error {
|
||||||
tokenRefresh.Stop()
|
tokenRefresh.Stop()
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -8,9 +8,11 @@ require (
|
|||||||
github.com/golang-jwt/jwt/v5 v5.2.2
|
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||||
github.com/google/uuid v1.6.0
|
github.com/google/uuid v1.6.0
|
||||||
github.com/google/wire v0.7.0
|
github.com/google/wire v0.7.0
|
||||||
|
github.com/gorilla/websocket v1.5.3
|
||||||
github.com/imroc/req/v3 v3.57.0
|
github.com/imroc/req/v3 v3.57.0
|
||||||
github.com/lib/pq v1.10.9
|
github.com/lib/pq v1.10.9
|
||||||
github.com/redis/go-redis/v9 v9.17.2
|
github.com/redis/go-redis/v9 v9.17.2
|
||||||
|
github.com/shirou/gopsutil/v4 v4.25.6
|
||||||
github.com/spf13/viper v1.18.2
|
github.com/spf13/viper v1.18.2
|
||||||
github.com/stretchr/testify v1.11.1
|
github.com/stretchr/testify v1.11.1
|
||||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
|
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
|
||||||
@@ -106,9 +108,9 @@ require (
|
|||||||
github.com/quic-go/quic-go v0.57.1 // indirect
|
github.com/quic-go/quic-go v0.57.1 // indirect
|
||||||
github.com/refraction-networking/utls v1.8.1 // indirect
|
github.com/refraction-networking/utls v1.8.1 // indirect
|
||||||
github.com/rivo/uniseg v0.2.0 // indirect
|
github.com/rivo/uniseg v0.2.0 // indirect
|
||||||
|
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||||
github.com/sagikazarmark/locafero v0.4.0 // indirect
|
github.com/sagikazarmark/locafero v0.4.0 // indirect
|
||||||
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
|
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
|
||||||
github.com/shirou/gopsutil/v4 v4.25.6 // indirect
|
|
||||||
github.com/sirupsen/logrus v1.9.3 // indirect
|
github.com/sirupsen/logrus v1.9.3 // indirect
|
||||||
github.com/sourcegraph/conc v0.3.0 // indirect
|
github.com/sourcegraph/conc v0.3.0 // indirect
|
||||||
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
||||||
|
|||||||
@@ -117,6 +117,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
|||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
|
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
|
||||||
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
|
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
|
||||||
|
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||||
|
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
|
||||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
|
||||||
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
|
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
|
||||||
@@ -224,6 +226,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr
|
|||||||
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
|
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
|
||||||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
||||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||||
|
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||||
|
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
|
||||||
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
||||||
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
||||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ type Config struct {
|
|||||||
Turnstile TurnstileConfig `mapstructure:"turnstile"`
|
Turnstile TurnstileConfig `mapstructure:"turnstile"`
|
||||||
Database DatabaseConfig `mapstructure:"database"`
|
Database DatabaseConfig `mapstructure:"database"`
|
||||||
Redis RedisConfig `mapstructure:"redis"`
|
Redis RedisConfig `mapstructure:"redis"`
|
||||||
|
Ops OpsConfig `mapstructure:"ops"`
|
||||||
JWT JWTConfig `mapstructure:"jwt"`
|
JWT JWTConfig `mapstructure:"jwt"`
|
||||||
LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"`
|
LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"`
|
||||||
Default DefaultConfig `mapstructure:"default"`
|
Default DefaultConfig `mapstructure:"default"`
|
||||||
@@ -60,14 +61,6 @@ type Config struct {
|
|||||||
Update UpdateConfig `mapstructure:"update"`
|
Update UpdateConfig `mapstructure:"update"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateConfig 在线更新相关配置
|
|
||||||
type UpdateConfig struct {
|
|
||||||
// ProxyURL 用于访问 GitHub 的代理地址
|
|
||||||
// 支持 http/https/socks5/socks5h 协议
|
|
||||||
// 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080"
|
|
||||||
ProxyURL string `mapstructure:"proxy_url"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type GeminiConfig struct {
|
type GeminiConfig struct {
|
||||||
OAuth GeminiOAuthConfig `mapstructure:"oauth"`
|
OAuth GeminiOAuthConfig `mapstructure:"oauth"`
|
||||||
Quota GeminiQuotaConfig `mapstructure:"quota"`
|
Quota GeminiQuotaConfig `mapstructure:"quota"`
|
||||||
@@ -90,6 +83,33 @@ type GeminiTierQuotaConfig struct {
|
|||||||
CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"`
|
CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type UpdateConfig struct {
|
||||||
|
// ProxyURL 用于访问 GitHub 的代理地址
|
||||||
|
// 支持 http/https/socks5/socks5h 协议
|
||||||
|
// 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080"
|
||||||
|
ProxyURL string `mapstructure:"proxy_url"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LinuxDoConnectConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
ClientID string `mapstructure:"client_id"`
|
||||||
|
ClientSecret string `mapstructure:"client_secret"`
|
||||||
|
AuthorizeURL string `mapstructure:"authorize_url"`
|
||||||
|
TokenURL string `mapstructure:"token_url"`
|
||||||
|
UserInfoURL string `mapstructure:"userinfo_url"`
|
||||||
|
Scopes string `mapstructure:"scopes"`
|
||||||
|
RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记)
|
||||||
|
FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback)
|
||||||
|
TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none
|
||||||
|
UsePKCE bool `mapstructure:"use_pkce"`
|
||||||
|
|
||||||
|
// 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。
|
||||||
|
// 为空时,服务端会尝试一组常见字段名。
|
||||||
|
UserInfoEmailPath string `mapstructure:"userinfo_email_path"`
|
||||||
|
UserInfoIDPath string `mapstructure:"userinfo_id_path"`
|
||||||
|
UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
|
||||||
|
}
|
||||||
|
|
||||||
// TokenRefreshConfig OAuth token自动刷新配置
|
// TokenRefreshConfig OAuth token自动刷新配置
|
||||||
type TokenRefreshConfig struct {
|
type TokenRefreshConfig struct {
|
||||||
// 是否启用自动刷新
|
// 是否启用自动刷新
|
||||||
@@ -332,6 +352,47 @@ func (r *RedisConfig) Address() string {
|
|||||||
return fmt.Sprintf("%s:%d", r.Host, r.Port)
|
return fmt.Sprintf("%s:%d", r.Host, r.Port)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type OpsConfig struct {
|
||||||
|
// Enabled controls whether ops features should run.
|
||||||
|
//
|
||||||
|
// NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off.
|
||||||
|
// This config flag is the "hard switch" for deployments that want to disable ops completely.
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
|
||||||
|
// UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries.
|
||||||
|
UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"`
|
||||||
|
|
||||||
|
// Cleanup controls periodic deletion of old ops data to prevent unbounded growth.
|
||||||
|
Cleanup OpsCleanupConfig `mapstructure:"cleanup"`
|
||||||
|
|
||||||
|
// MetricsCollectorCache controls Redis caching for expensive per-window collector queries.
|
||||||
|
MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"`
|
||||||
|
|
||||||
|
// Pre-aggregation configuration.
|
||||||
|
Aggregation OpsAggregationConfig `mapstructure:"aggregation"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsCleanupConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
Schedule string `mapstructure:"schedule"`
|
||||||
|
|
||||||
|
// Retention days (0 disables that cleanup target).
|
||||||
|
//
|
||||||
|
// vNext requirement: default 30 days across ops datasets.
|
||||||
|
ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"`
|
||||||
|
MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"`
|
||||||
|
HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsAggregationConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsMetricsCollectorCacheConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
TTL time.Duration `mapstructure:"ttl"`
|
||||||
|
}
|
||||||
|
|
||||||
type JWTConfig struct {
|
type JWTConfig struct {
|
||||||
Secret string `mapstructure:"secret"`
|
Secret string `mapstructure:"secret"`
|
||||||
ExpireHour int `mapstructure:"expire_hour"`
|
ExpireHour int `mapstructure:"expire_hour"`
|
||||||
@@ -341,30 +402,6 @@ type TurnstileConfig struct {
|
|||||||
Required bool `mapstructure:"required"`
|
Required bool `mapstructure:"required"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// LinuxDoConnectConfig 用于 LinuxDo Connect OAuth 登录(终端用户 SSO)。
|
|
||||||
//
|
|
||||||
// 注意:这与上游账号的 OAuth(例如 OpenAI/Gemini 账号接入)不是一回事。
|
|
||||||
// 这里是用于登录 Sub2API 本身的用户体系。
|
|
||||||
type LinuxDoConnectConfig struct {
|
|
||||||
Enabled bool `mapstructure:"enabled"`
|
|
||||||
ClientID string `mapstructure:"client_id"`
|
|
||||||
ClientSecret string `mapstructure:"client_secret"`
|
|
||||||
AuthorizeURL string `mapstructure:"authorize_url"`
|
|
||||||
TokenURL string `mapstructure:"token_url"`
|
|
||||||
UserInfoURL string `mapstructure:"userinfo_url"`
|
|
||||||
Scopes string `mapstructure:"scopes"`
|
|
||||||
RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记)
|
|
||||||
FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback)
|
|
||||||
TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none
|
|
||||||
UsePKCE bool `mapstructure:"use_pkce"`
|
|
||||||
|
|
||||||
// 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。
|
|
||||||
// 为空时,服务端会尝试一组常见字段名。
|
|
||||||
UserInfoEmailPath string `mapstructure:"userinfo_email_path"`
|
|
||||||
UserInfoIDPath string `mapstructure:"userinfo_id_path"`
|
|
||||||
UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type DefaultConfig struct {
|
type DefaultConfig struct {
|
||||||
AdminEmail string `mapstructure:"admin_email"`
|
AdminEmail string `mapstructure:"admin_email"`
|
||||||
AdminPassword string `mapstructure:"admin_password"`
|
AdminPassword string `mapstructure:"admin_password"`
|
||||||
@@ -531,81 +568,6 @@ func Load() (*Config, error) {
|
|||||||
return &cfg, nil
|
return &cfg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ValidateAbsoluteHTTPURL 校验一个绝对 http(s) URL(禁止 fragment)。
|
|
||||||
func ValidateAbsoluteHTTPURL(raw string) error {
|
|
||||||
raw = strings.TrimSpace(raw)
|
|
||||||
if raw == "" {
|
|
||||||
return fmt.Errorf("empty url")
|
|
||||||
}
|
|
||||||
u, err := url.Parse(raw)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if !u.IsAbs() {
|
|
||||||
return fmt.Errorf("must be absolute")
|
|
||||||
}
|
|
||||||
if !isHTTPScheme(u.Scheme) {
|
|
||||||
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(u.Host) == "" {
|
|
||||||
return fmt.Errorf("missing host")
|
|
||||||
}
|
|
||||||
if u.Fragment != "" {
|
|
||||||
return fmt.Errorf("must not include fragment")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ValidateFrontendRedirectURL 校验前端回调地址:
|
|
||||||
// - 允许同源相对路径(以 / 开头)
|
|
||||||
// - 或绝对 http(s) URL(禁止 fragment)
|
|
||||||
func ValidateFrontendRedirectURL(raw string) error {
|
|
||||||
raw = strings.TrimSpace(raw)
|
|
||||||
if raw == "" {
|
|
||||||
return fmt.Errorf("empty url")
|
|
||||||
}
|
|
||||||
if strings.ContainsAny(raw, "\r\n") {
|
|
||||||
return fmt.Errorf("contains invalid characters")
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(raw, "/") {
|
|
||||||
if strings.HasPrefix(raw, "//") {
|
|
||||||
return fmt.Errorf("must not start with //")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
u, err := url.Parse(raw)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if !u.IsAbs() {
|
|
||||||
return fmt.Errorf("must be absolute http(s) url or relative path")
|
|
||||||
}
|
|
||||||
if !isHTTPScheme(u.Scheme) {
|
|
||||||
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(u.Host) == "" {
|
|
||||||
return fmt.Errorf("missing host")
|
|
||||||
}
|
|
||||||
if u.Fragment != "" {
|
|
||||||
return fmt.Errorf("must not include fragment")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func isHTTPScheme(scheme string) bool {
|
|
||||||
return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
|
|
||||||
}
|
|
||||||
|
|
||||||
func warnIfInsecureURL(field, raw string) {
|
|
||||||
u, err := url.Parse(strings.TrimSpace(raw))
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if strings.EqualFold(u.Scheme, "http") {
|
|
||||||
log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setDefaults() {
|
func setDefaults() {
|
||||||
viper.SetDefault("run_mode", RunModeStandard)
|
viper.SetDefault("run_mode", RunModeStandard)
|
||||||
|
|
||||||
@@ -655,7 +617,7 @@ func setDefaults() {
|
|||||||
// Turnstile
|
// Turnstile
|
||||||
viper.SetDefault("turnstile.required", false)
|
viper.SetDefault("turnstile.required", false)
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
viper.SetDefault("linuxdo_connect.enabled", false)
|
viper.SetDefault("linuxdo_connect.enabled", false)
|
||||||
viper.SetDefault("linuxdo_connect.client_id", "")
|
viper.SetDefault("linuxdo_connect.client_id", "")
|
||||||
viper.SetDefault("linuxdo_connect.client_secret", "")
|
viper.SetDefault("linuxdo_connect.client_secret", "")
|
||||||
@@ -694,6 +656,20 @@ func setDefaults() {
|
|||||||
viper.SetDefault("redis.pool_size", 128)
|
viper.SetDefault("redis.pool_size", 128)
|
||||||
viper.SetDefault("redis.min_idle_conns", 10)
|
viper.SetDefault("redis.min_idle_conns", 10)
|
||||||
|
|
||||||
|
// Ops (vNext)
|
||||||
|
viper.SetDefault("ops.enabled", true)
|
||||||
|
viper.SetDefault("ops.use_preaggregated_tables", false)
|
||||||
|
viper.SetDefault("ops.cleanup.enabled", true)
|
||||||
|
viper.SetDefault("ops.cleanup.schedule", "0 2 * * *")
|
||||||
|
// Retention days: vNext defaults to 30 days across ops datasets.
|
||||||
|
viper.SetDefault("ops.cleanup.error_log_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.aggregation.enabled", true)
|
||||||
|
viper.SetDefault("ops.metrics_collector_cache.enabled", true)
|
||||||
|
// TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits.
|
||||||
|
viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second)
|
||||||
|
|
||||||
// JWT
|
// JWT
|
||||||
viper.SetDefault("jwt.secret", "")
|
viper.SetDefault("jwt.secret", "")
|
||||||
viper.SetDefault("jwt.expire_hour", 24)
|
viper.SetDefault("jwt.expire_hour", 24)
|
||||||
@@ -750,7 +726,7 @@ func setDefaults() {
|
|||||||
|
|
||||||
// Gateway
|
// Gateway
|
||||||
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久
|
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久
|
||||||
viper.SetDefault("gateway.log_upstream_error_body", false)
|
viper.SetDefault("gateway.log_upstream_error_body", true)
|
||||||
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
|
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
|
||||||
viper.SetDefault("gateway.inject_beta_for_apikey", false)
|
viper.SetDefault("gateway.inject_beta_for_apikey", false)
|
||||||
viper.SetDefault("gateway.failover_on_400", false)
|
viper.SetDefault("gateway.failover_on_400", false)
|
||||||
@@ -766,7 +742,7 @@ func setDefaults() {
|
|||||||
viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求)
|
viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求)
|
||||||
viper.SetDefault("gateway.stream_data_interval_timeout", 180)
|
viper.SetDefault("gateway.stream_data_interval_timeout", 180)
|
||||||
viper.SetDefault("gateway.stream_keepalive_interval", 10)
|
viper.SetDefault("gateway.stream_keepalive_interval", 10)
|
||||||
viper.SetDefault("gateway.max_line_size", 40*1024*1024)
|
viper.SetDefault("gateway.max_line_size", 10*1024*1024)
|
||||||
viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3)
|
viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3)
|
||||||
viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second)
|
viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second)
|
||||||
viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second)
|
viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second)
|
||||||
@@ -789,10 +765,6 @@ func setDefaults() {
|
|||||||
viper.SetDefault("gemini.oauth.client_secret", "")
|
viper.SetDefault("gemini.oauth.client_secret", "")
|
||||||
viper.SetDefault("gemini.oauth.scopes", "")
|
viper.SetDefault("gemini.oauth.scopes", "")
|
||||||
viper.SetDefault("gemini.quota.policy", "")
|
viper.SetDefault("gemini.quota.policy", "")
|
||||||
|
|
||||||
// Update - 在线更新配置
|
|
||||||
// 代理地址为空表示直连 GitHub(适用于海外服务器)
|
|
||||||
viper.SetDefault("update.proxy_url", "")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Config) Validate() error {
|
func (c *Config) Validate() error {
|
||||||
@@ -833,7 +805,8 @@ func (c *Config) Validate() error {
|
|||||||
if method == "none" && !c.LinuxDo.UsePKCE {
|
if method == "none" && !c.LinuxDo.UsePKCE {
|
||||||
return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none")
|
return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none")
|
||||||
}
|
}
|
||||||
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
|
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") &&
|
||||||
|
strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
|
||||||
return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic")
|
return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic")
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" {
|
if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" {
|
||||||
@@ -1048,6 +1021,21 @@ func (c *Config) Validate() error {
|
|||||||
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
|
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
|
||||||
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
|
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
|
||||||
}
|
}
|
||||||
|
if c.Ops.MetricsCollectorCache.TTL < 0 {
|
||||||
|
return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.ErrorLogRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" {
|
||||||
|
return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true")
|
||||||
|
}
|
||||||
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
|
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
|
||||||
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
|
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
|
||||||
}
|
}
|
||||||
@@ -1124,3 +1112,77 @@ func GetServerAddress() string {
|
|||||||
port := v.GetInt("server.port")
|
port := v.GetInt("server.port")
|
||||||
return fmt.Sprintf("%s:%d", host, port)
|
return fmt.Sprintf("%s:%d", host, port)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ValidateAbsoluteHTTPURL 验证是否为有效的绝对 HTTP(S) URL
|
||||||
|
func ValidateAbsoluteHTTPURL(raw string) error {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return fmt.Errorf("empty url")
|
||||||
|
}
|
||||||
|
u, err := url.Parse(raw)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !u.IsAbs() {
|
||||||
|
return fmt.Errorf("must be absolute")
|
||||||
|
}
|
||||||
|
if !isHTTPScheme(u.Scheme) {
|
||||||
|
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(u.Host) == "" {
|
||||||
|
return fmt.Errorf("missing host")
|
||||||
|
}
|
||||||
|
if u.Fragment != "" {
|
||||||
|
return fmt.Errorf("must not include fragment")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateFrontendRedirectURL 验证前端重定向 URL(可以是绝对 URL 或相对路径)
|
||||||
|
func ValidateFrontendRedirectURL(raw string) error {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return fmt.Errorf("empty url")
|
||||||
|
}
|
||||||
|
if strings.ContainsAny(raw, "\r\n") {
|
||||||
|
return fmt.Errorf("contains invalid characters")
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(raw, "/") {
|
||||||
|
if strings.HasPrefix(raw, "//") {
|
||||||
|
return fmt.Errorf("must not start with //")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
u, err := url.Parse(raw)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !u.IsAbs() {
|
||||||
|
return fmt.Errorf("must be absolute http(s) url or relative path")
|
||||||
|
}
|
||||||
|
if !isHTTPScheme(u.Scheme) {
|
||||||
|
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(u.Host) == "" {
|
||||||
|
return fmt.Errorf("missing host")
|
||||||
|
}
|
||||||
|
if u.Fragment != "" {
|
||||||
|
return fmt.Errorf("must not include fragment")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isHTTPScheme 检查是否为 HTTP 或 HTTPS 协议
|
||||||
|
func isHTTPScheme(scheme string) bool {
|
||||||
|
return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
|
||||||
|
}
|
||||||
|
|
||||||
|
func warnIfInsecureURL(field, raw string) {
|
||||||
|
u, err := url.Parse(strings.TrimSpace(raw))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.EqualFold(u.Scheme, "http") {
|
||||||
|
log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
432
backend/internal/handler/admin/ops_alerts_handler.go
Normal file
432
backend/internal/handler/admin/ops_alerts_handler.go
Normal file
@@ -0,0 +1,432 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/gin-gonic/gin/binding"
|
||||||
|
)
|
||||||
|
|
||||||
|
var validOpsAlertMetricTypes = []string{
|
||||||
|
"success_rate",
|
||||||
|
"error_rate",
|
||||||
|
"upstream_error_rate",
|
||||||
|
"p95_latency_ms",
|
||||||
|
"p99_latency_ms",
|
||||||
|
"cpu_usage_percent",
|
||||||
|
"memory_usage_percent",
|
||||||
|
"concurrency_queue_depth",
|
||||||
|
}
|
||||||
|
|
||||||
|
var validOpsAlertMetricTypeSet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertMetricTypes))
|
||||||
|
for _, v := range validOpsAlertMetricTypes {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="}
|
||||||
|
|
||||||
|
var validOpsAlertOperatorSet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertOperators))
|
||||||
|
for _, v := range validOpsAlertOperators {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"}
|
||||||
|
|
||||||
|
var validOpsAlertSeveritySet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertSeverities))
|
||||||
|
for _, v := range validOpsAlertSeverities {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
type opsAlertRuleValidatedInput struct {
|
||||||
|
Name string
|
||||||
|
MetricType string
|
||||||
|
Operator string
|
||||||
|
Threshold float64
|
||||||
|
|
||||||
|
Severity string
|
||||||
|
|
||||||
|
WindowMinutes int
|
||||||
|
SustainedMinutes int
|
||||||
|
CooldownMinutes int
|
||||||
|
|
||||||
|
Enabled bool
|
||||||
|
NotifyEmail bool
|
||||||
|
|
||||||
|
WindowProvided bool
|
||||||
|
SustainedProvided bool
|
||||||
|
CooldownProvided bool
|
||||||
|
SeverityProvided bool
|
||||||
|
EnabledProvided bool
|
||||||
|
NotifyProvided bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func isPercentOrRateMetric(metricType string) bool {
|
||||||
|
switch metricType {
|
||||||
|
case "success_rate",
|
||||||
|
"error_rate",
|
||||||
|
"upstream_error_rate",
|
||||||
|
"cpu_usage_percent",
|
||||||
|
"memory_usage_percent":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) {
|
||||||
|
if raw == nil {
|
||||||
|
return nil, fmt.Errorf("invalid request body")
|
||||||
|
}
|
||||||
|
|
||||||
|
requiredFields := []string{"name", "metric_type", "operator", "threshold"}
|
||||||
|
for _, field := range requiredFields {
|
||||||
|
if _, ok := raw[field]; !ok {
|
||||||
|
return nil, fmt.Errorf("%s is required", field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var name string
|
||||||
|
if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" {
|
||||||
|
return nil, fmt.Errorf("name is required")
|
||||||
|
}
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
|
||||||
|
var metricType string
|
||||||
|
if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" {
|
||||||
|
return nil, fmt.Errorf("metric_type is required")
|
||||||
|
}
|
||||||
|
metricType = strings.TrimSpace(metricType)
|
||||||
|
if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok {
|
||||||
|
return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
var operator string
|
||||||
|
if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" {
|
||||||
|
return nil, fmt.Errorf("operator is required")
|
||||||
|
}
|
||||||
|
operator = strings.TrimSpace(operator)
|
||||||
|
if _, ok := validOpsAlertOperatorSet[operator]; !ok {
|
||||||
|
return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
var threshold float64
|
||||||
|
if err := json.Unmarshal(raw["threshold"], &threshold); err != nil {
|
||||||
|
return nil, fmt.Errorf("threshold must be a number")
|
||||||
|
}
|
||||||
|
if math.IsNaN(threshold) || math.IsInf(threshold, 0) {
|
||||||
|
return nil, fmt.Errorf("threshold must be a finite number")
|
||||||
|
}
|
||||||
|
if isPercentOrRateMetric(metricType) {
|
||||||
|
if threshold < 0 || threshold > 100 {
|
||||||
|
return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType)
|
||||||
|
}
|
||||||
|
} else if threshold < 0 {
|
||||||
|
return nil, fmt.Errorf("threshold must be >= 0")
|
||||||
|
}
|
||||||
|
|
||||||
|
validated := &opsAlertRuleValidatedInput{
|
||||||
|
Name: name,
|
||||||
|
MetricType: metricType,
|
||||||
|
Operator: operator,
|
||||||
|
Threshold: threshold,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["severity"]; ok {
|
||||||
|
validated.SeverityProvided = true
|
||||||
|
var sev string
|
||||||
|
if err := json.Unmarshal(v, &sev); err != nil {
|
||||||
|
return nil, fmt.Errorf("severity must be a string")
|
||||||
|
}
|
||||||
|
sev = strings.ToUpper(strings.TrimSpace(sev))
|
||||||
|
if sev != "" {
|
||||||
|
if _, ok := validOpsAlertSeveritySet[sev]; !ok {
|
||||||
|
return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", "))
|
||||||
|
}
|
||||||
|
validated.Severity = sev
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if validated.Severity == "" {
|
||||||
|
validated.Severity = "P2"
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["enabled"]; ok {
|
||||||
|
validated.EnabledProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.Enabled); err != nil {
|
||||||
|
return nil, fmt.Errorf("enabled must be a boolean")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.Enabled = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["notify_email"]; ok {
|
||||||
|
validated.NotifyProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil {
|
||||||
|
return nil, fmt.Errorf("notify_email must be a boolean")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.NotifyEmail = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["window_minutes"]; ok {
|
||||||
|
validated.WindowProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("window_minutes must be an integer")
|
||||||
|
}
|
||||||
|
switch validated.WindowMinutes {
|
||||||
|
case 1, 5, 60:
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.WindowMinutes = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["sustained_minutes"]; ok {
|
||||||
|
validated.SustainedProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("sustained_minutes must be an integer")
|
||||||
|
}
|
||||||
|
if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 {
|
||||||
|
return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.SustainedMinutes = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["cooldown_minutes"]; ok {
|
||||||
|
validated.CooldownProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("cooldown_minutes must be an integer")
|
||||||
|
}
|
||||||
|
if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 {
|
||||||
|
return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.CooldownMinutes = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
return validated, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertRules returns all ops alert rules.
|
||||||
|
// GET /api/v1/admin/ops/alert-rules
|
||||||
|
func (h *OpsHandler) ListAlertRules(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rules, err := h.opsService.ListAlertRules(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, rules)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateAlertRule creates an ops alert rule.
|
||||||
|
// POST /api/v1/admin/ops/alert-rules
|
||||||
|
func (h *OpsHandler) CreateAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw map[string]json.RawMessage
|
||||||
|
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
validated, err := validateOpsAlertRulePayload(raw)
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var rule service.OpsAlertRule
|
||||||
|
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rule.Name = validated.Name
|
||||||
|
rule.MetricType = validated.MetricType
|
||||||
|
rule.Operator = validated.Operator
|
||||||
|
rule.Threshold = validated.Threshold
|
||||||
|
rule.WindowMinutes = validated.WindowMinutes
|
||||||
|
rule.SustainedMinutes = validated.SustainedMinutes
|
||||||
|
rule.CooldownMinutes = validated.CooldownMinutes
|
||||||
|
rule.Severity = validated.Severity
|
||||||
|
rule.Enabled = validated.Enabled
|
||||||
|
rule.NotifyEmail = validated.NotifyEmail
|
||||||
|
|
||||||
|
created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, created)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAlertRule updates an existing ops alert rule.
|
||||||
|
// PUT /api/v1/admin/ops/alert-rules/:id
|
||||||
|
func (h *OpsHandler) UpdateAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid rule ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw map[string]json.RawMessage
|
||||||
|
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
validated, err := validateOpsAlertRulePayload(raw)
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var rule service.OpsAlertRule
|
||||||
|
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rule.ID = id
|
||||||
|
rule.Name = validated.Name
|
||||||
|
rule.MetricType = validated.MetricType
|
||||||
|
rule.Operator = validated.Operator
|
||||||
|
rule.Threshold = validated.Threshold
|
||||||
|
rule.WindowMinutes = validated.WindowMinutes
|
||||||
|
rule.SustainedMinutes = validated.SustainedMinutes
|
||||||
|
rule.CooldownMinutes = validated.CooldownMinutes
|
||||||
|
rule.Severity = validated.Severity
|
||||||
|
rule.Enabled = validated.Enabled
|
||||||
|
rule.NotifyEmail = validated.NotifyEmail
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteAlertRule deletes an ops alert rule.
|
||||||
|
// DELETE /api/v1/admin/ops/alert-rules/:id
|
||||||
|
func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid rule ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, gin.H{"deleted": true})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertEvents lists recent ops alert events.
|
||||||
|
// GET /api/v1/admin/ops/alert-events
|
||||||
|
func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
limit := 100
|
||||||
|
if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
|
||||||
|
n, err := strconv.Atoi(raw)
|
||||||
|
if err != nil || n <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid limit")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
limit = n
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsAlertEventFilter{
|
||||||
|
Limit: limit,
|
||||||
|
Status: strings.TrimSpace(c.Query("status")),
|
||||||
|
Severity: strings.TrimSpace(c.Query("severity")),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optional global filter support (platform/group/time range).
|
||||||
|
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||||
|
filter.Platform = platform
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil {
|
||||||
|
// Only apply when explicitly provided to avoid surprising default narrowing.
|
||||||
|
if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" {
|
||||||
|
filter.StartTime = &startTime
|
||||||
|
filter.EndTime = &endTime
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, events)
|
||||||
|
}
|
||||||
243
backend/internal/handler/admin/ops_dashboard_handler.go
Normal file
243
backend/internal/handler/admin/ops_dashboard_handler.go
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetDashboardOverview returns vNext ops dashboard overview (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/overview
|
||||||
|
func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardThroughputTrend returns throughput time series (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/throughput-trend
|
||||||
|
func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
|
||||||
|
data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/latency-histogram
|
||||||
|
func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardErrorTrend returns error counts time series (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/error-trend
|
||||||
|
func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
|
||||||
|
data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardErrorDistribution returns error distribution by status code (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/error-distribution
|
||||||
|
func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func pickThroughputBucketSeconds(window time.Duration) int {
|
||||||
|
// Keep buckets predictable and avoid huge responses.
|
||||||
|
switch {
|
||||||
|
case window <= 2*time.Hour:
|
||||||
|
return 60
|
||||||
|
case window <= 24*time.Hour:
|
||||||
|
return 300
|
||||||
|
default:
|
||||||
|
return 3600
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode {
|
||||||
|
if c == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(c.Query("mode"))
|
||||||
|
if raw == "" {
|
||||||
|
// Empty means "use server default" (DB setting ops_query_mode_default).
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return service.ParseOpsQueryMode(raw)
|
||||||
|
}
|
||||||
364
backend/internal/handler/admin/ops_handler.go
Normal file
364
backend/internal/handler/admin/ops_handler.go
Normal file
@@ -0,0 +1,364 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
type OpsHandler struct {
|
||||||
|
opsService *service.OpsService
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
|
||||||
|
return &OpsHandler{opsService: opsService}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetErrorLogs lists ops error logs.
|
||||||
|
// GET /api/v1/admin/ops/errors
|
||||||
|
func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize := response.ParsePagination(c)
|
||||||
|
// Ops list can be larger than standard admin tables.
|
||||||
|
if pageSize > 500 {
|
||||||
|
pageSize = 500
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsErrorLogFilter{
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
}
|
||||||
|
if !startTime.IsZero() {
|
||||||
|
filter.StartTime = &startTime
|
||||||
|
}
|
||||||
|
if !endTime.IsZero() {
|
||||||
|
filter.EndTime = &endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||||
|
filter.Platform = platform
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid account_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.AccountID = &id
|
||||||
|
}
|
||||||
|
if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
|
||||||
|
filter.Phase = phase
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(c.Query("q")); q != "" {
|
||||||
|
filter.Query = q
|
||||||
|
}
|
||||||
|
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
|
||||||
|
parts := strings.Split(statusCodesStr, ",")
|
||||||
|
out := make([]int, 0, len(parts))
|
||||||
|
for _, part := range parts {
|
||||||
|
p := strings.TrimSpace(part)
|
||||||
|
if p == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(p)
|
||||||
|
if err != nil || n < 0 {
|
||||||
|
response.BadRequest(c, "Invalid status_codes")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out = append(out, n)
|
||||||
|
}
|
||||||
|
filter.StatusCodes = out
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetErrorLogByID returns a single error log detail.
|
||||||
|
// GET /api/v1/admin/ops/errors/:id
|
||||||
|
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
idStr := strings.TrimSpace(c.Param("id"))
|
||||||
|
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid error id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Success(c, detail)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListRequestDetails returns a request-level list (success + error) for drill-down.
|
||||||
|
// GET /api/v1/admin/ops/requests
|
||||||
|
func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize := response.ParsePagination(c)
|
||||||
|
if pageSize > 100 {
|
||||||
|
pageSize = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsRequestDetailFilter{
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
StartTime: &startTime,
|
||||||
|
EndTime: &endTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
filter.Kind = strings.TrimSpace(c.Query("kind"))
|
||||||
|
filter.Platform = strings.TrimSpace(c.Query("platform"))
|
||||||
|
filter.Model = strings.TrimSpace(c.Query("model"))
|
||||||
|
filter.RequestID = strings.TrimSpace(c.Query("request_id"))
|
||||||
|
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||||
|
filter.Sort = strings.TrimSpace(c.Query("sort"))
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(c.Query("user_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid user_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.UserID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("api_key_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid api_key_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.APIKeyID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid account_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.AccountID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" {
|
||||||
|
parsed, err := strconv.Atoi(v)
|
||||||
|
if err != nil || parsed < 0 {
|
||||||
|
response.BadRequest(c, "Invalid min_duration_ms")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.MinDurationMs = &parsed
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" {
|
||||||
|
parsed, err := strconv.Atoi(v)
|
||||||
|
if err != nil || parsed < 0 {
|
||||||
|
response.BadRequest(c, "Invalid max_duration_ms")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.MaxDurationMs = &parsed
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
// Invalid sort/kind/platform etc should be a bad request; keep it simple.
|
||||||
|
if strings.Contains(strings.ToLower(err.Error()), "invalid") {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to list request details")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsRetryRequest struct {
|
||||||
|
Mode string `json:"mode"`
|
||||||
|
PinnedAccountID *int64 `json:"pinned_account_id"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RetryErrorRequest retries a failed request using stored request_body.
|
||||||
|
// POST /api/v1/admin/ops/errors/:id/retry
|
||||||
|
func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||||
|
if !ok || subject.UserID <= 0 {
|
||||||
|
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
idStr := strings.TrimSpace(c.Param("id"))
|
||||||
|
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid error id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
req := opsRetryRequest{Mode: service.OpsRetryModeClient}
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
response.BadRequest(c, "Invalid request: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(req.Mode) == "" {
|
||||||
|
req.Mode = service.OpsRetryModeClient
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Success(c, result)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
|
||||||
|
startStr := strings.TrimSpace(c.Query("start_time"))
|
||||||
|
endStr := strings.TrimSpace(c.Query("end_time"))
|
||||||
|
|
||||||
|
parseTS := func(s string) (time.Time, error) {
|
||||||
|
if s == "" {
|
||||||
|
return time.Time{}, nil
|
||||||
|
}
|
||||||
|
if t, err := time.Parse(time.RFC3339Nano, s); err == nil {
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return time.Parse(time.RFC3339, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
start, err := parseTS(startStr)
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, time.Time{}, err
|
||||||
|
}
|
||||||
|
end, err := parseTS(endStr)
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, time.Time{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// start/end explicitly provided (even partially)
|
||||||
|
if startStr != "" || endStr != "" {
|
||||||
|
if end.IsZero() {
|
||||||
|
end = time.Now()
|
||||||
|
}
|
||||||
|
if start.IsZero() {
|
||||||
|
dur, _ := parseOpsDuration(defaultRange)
|
||||||
|
start = end.Add(-dur)
|
||||||
|
}
|
||||||
|
if start.After(end) {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
if end.Sub(start) > 30*24*time.Hour {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
|
||||||
|
}
|
||||||
|
return start, end, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// time_range fallback
|
||||||
|
tr := strings.TrimSpace(c.Query("time_range"))
|
||||||
|
if tr == "" {
|
||||||
|
tr = defaultRange
|
||||||
|
}
|
||||||
|
dur, ok := parseOpsDuration(tr)
|
||||||
|
if !ok {
|
||||||
|
dur, _ = parseOpsDuration(defaultRange)
|
||||||
|
}
|
||||||
|
|
||||||
|
end = time.Now()
|
||||||
|
start = end.Add(-dur)
|
||||||
|
if end.Sub(start) > 30*24*time.Hour {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
|
||||||
|
}
|
||||||
|
return start, end, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsDuration(v string) (time.Duration, bool) {
|
||||||
|
switch strings.TrimSpace(v) {
|
||||||
|
case "5m":
|
||||||
|
return 5 * time.Minute, true
|
||||||
|
case "30m":
|
||||||
|
return 30 * time.Minute, true
|
||||||
|
case "1h":
|
||||||
|
return time.Hour, true
|
||||||
|
case "6h":
|
||||||
|
return 6 * time.Hour, true
|
||||||
|
case "24h":
|
||||||
|
return 24 * time.Hour, true
|
||||||
|
default:
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
}
|
||||||
120
backend/internal/handler/admin/ops_realtime_handler.go
Normal file
120
backend/internal/handler/admin/ops_realtime_handler.go
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
|
||||||
|
// GET /api/v1/admin/ops/concurrency
|
||||||
|
func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
|
||||||
|
response.Success(c, gin.H{
|
||||||
|
"enabled": false,
|
||||||
|
"platform": map[string]*service.PlatformConcurrencyInfo{},
|
||||||
|
"group": map[int64]*service.GroupConcurrencyInfo{},
|
||||||
|
"account": map[int64]*service.AccountConcurrencyInfo{},
|
||||||
|
"timestamp": time.Now().UTC(),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
platformFilter := strings.TrimSpace(c.Query("platform"))
|
||||||
|
var groupID *int64
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
groupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := gin.H{
|
||||||
|
"enabled": true,
|
||||||
|
"platform": platform,
|
||||||
|
"group": group,
|
||||||
|
"account": account,
|
||||||
|
}
|
||||||
|
if collectedAt != nil {
|
||||||
|
payload["timestamp"] = collectedAt.UTC()
|
||||||
|
}
|
||||||
|
response.Success(c, payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAccountAvailability returns account availability statistics.
|
||||||
|
// GET /api/v1/admin/ops/account-availability
|
||||||
|
//
|
||||||
|
// Query params:
|
||||||
|
// - platform: optional
|
||||||
|
// - group_id: optional
|
||||||
|
func (h *OpsHandler) GetAccountAvailability(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
|
||||||
|
response.Success(c, gin.H{
|
||||||
|
"enabled": false,
|
||||||
|
"platform": map[string]*service.PlatformAvailability{},
|
||||||
|
"group": map[int64]*service.GroupAvailability{},
|
||||||
|
"account": map[int64]*service.AccountAvailability{},
|
||||||
|
"timestamp": time.Now().UTC(),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
platform := strings.TrimSpace(c.Query("platform"))
|
||||||
|
var groupID *int64
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
groupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := gin.H{
|
||||||
|
"enabled": true,
|
||||||
|
"platform": platformStats,
|
||||||
|
"group": groupStats,
|
||||||
|
"account": accountStats,
|
||||||
|
}
|
||||||
|
if collectedAt != nil {
|
||||||
|
payload["timestamp"] = collectedAt.UTC()
|
||||||
|
}
|
||||||
|
response.Success(c, payload)
|
||||||
|
}
|
||||||
148
backend/internal/handler/admin/ops_settings_handler.go
Normal file
148
backend/internal/handler/admin/ops_settings_handler.go
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetEmailNotificationConfig returns Ops email notification config (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/email-notification/config
|
||||||
|
func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get email notification config")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/email-notification/config
|
||||||
|
func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsEmailNotificationConfigUpdateRequest
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
// Most failures here are validation errors from request payload; treat as 400.
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/runtime/alert
|
||||||
|
func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/runtime/alert
|
||||||
|
func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsAlertRuntimeSettings
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAdvancedSettings returns Ops advanced settings (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/advanced-settings
|
||||||
|
func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAdvancedSettings updates Ops advanced settings (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/advanced-settings
|
||||||
|
func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsAdvancedSettings
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
771
backend/internal/handler/admin/ops_ws_handler.go
Normal file
771
backend/internal/handler/admin/ops_ws_handler.go
Normal file
@@ -0,0 +1,771 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"net/netip"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/gorilla/websocket"
|
||||||
|
)
|
||||||
|
|
||||||
|
type OpsWSProxyConfig struct {
|
||||||
|
TrustProxy bool
|
||||||
|
TrustedProxies []netip.Prefix
|
||||||
|
OriginPolicy string
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY"
|
||||||
|
envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
|
||||||
|
envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY"
|
||||||
|
envOpsWSMaxConns = "OPS_WS_MAX_CONNS"
|
||||||
|
envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
OriginPolicyStrict = "strict"
|
||||||
|
OriginPolicyPermissive = "permissive"
|
||||||
|
)
|
||||||
|
|
||||||
|
var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()
|
||||||
|
|
||||||
|
var upgrader = websocket.Upgrader{
|
||||||
|
CheckOrigin: func(r *http.Request) bool {
|
||||||
|
return isAllowedOpsWSOrigin(r)
|
||||||
|
},
|
||||||
|
// Subprotocol negotiation:
|
||||||
|
// - The frontend passes ["sub2api-admin", "jwt.<token>"].
|
||||||
|
// - We always select "sub2api-admin" so the token is never echoed back in the handshake response.
|
||||||
|
Subprotocols: []string{"sub2api-admin"},
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
qpsWSPushInterval = 2 * time.Second
|
||||||
|
qpsWSRefreshInterval = 5 * time.Second
|
||||||
|
qpsWSRequestCountWindow = 1 * time.Minute
|
||||||
|
|
||||||
|
defaultMaxWSConns = 100
|
||||||
|
defaultMaxWSConnsPerIP = 20
|
||||||
|
)
|
||||||
|
|
||||||
|
var wsConnCount atomic.Int32
|
||||||
|
var wsConnCountByIP sync.Map // map[string]*atomic.Int32
|
||||||
|
|
||||||
|
const qpsWSIdleStopDelay = 30 * time.Second
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsWSCloseRealtimeDisabled = 4001
|
||||||
|
)
|
||||||
|
|
||||||
|
var qpsWSIdleStopMu sync.Mutex
|
||||||
|
var qpsWSIdleStopTimer *time.Timer
|
||||||
|
|
||||||
|
func cancelQPSWSIdleStop() {
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
if qpsWSIdleStopTimer != nil {
|
||||||
|
qpsWSIdleStopTimer.Stop()
|
||||||
|
qpsWSIdleStopTimer = nil
|
||||||
|
}
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func scheduleQPSWSIdleStop() {
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
if qpsWSIdleStopTimer != nil {
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() {
|
||||||
|
// Only stop if truly idle at fire time.
|
||||||
|
if wsConnCount.Load() == 0 {
|
||||||
|
qpsWSCache.Stop()
|
||||||
|
}
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
qpsWSIdleStopTimer = nil
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
})
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsWSRuntimeLimits struct {
|
||||||
|
MaxConns int32
|
||||||
|
MaxConnsPerIP int32
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv()
|
||||||
|
|
||||||
|
const (
|
||||||
|
qpsWSWriteTimeout = 10 * time.Second
|
||||||
|
qpsWSPongWait = 60 * time.Second
|
||||||
|
qpsWSPingInterval = 30 * time.Second
|
||||||
|
|
||||||
|
// We don't expect clients to send application messages; we only read to process control frames (Pong/Close).
|
||||||
|
qpsWSMaxReadBytes = 1024
|
||||||
|
)
|
||||||
|
|
||||||
|
type opsWSQPSCache struct {
|
||||||
|
refreshInterval time.Duration
|
||||||
|
requestCountWindow time.Duration
|
||||||
|
|
||||||
|
lastUpdatedUnixNano atomic.Int64
|
||||||
|
payload atomic.Value // []byte
|
||||||
|
|
||||||
|
opsService *service.OpsService
|
||||||
|
cancel context.CancelFunc
|
||||||
|
done chan struct{}
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
running bool
|
||||||
|
}
|
||||||
|
|
||||||
|
var qpsWSCache = &opsWSQPSCache{
|
||||||
|
refreshInterval: qpsWSRefreshInterval,
|
||||||
|
requestCountWindow: qpsWSRequestCountWindow,
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) start(opsService *service.OpsService) {
|
||||||
|
if c == nil || opsService == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
c.mu.Lock()
|
||||||
|
if c.running {
|
||||||
|
c.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a previous refresh loop is currently stopping, wait for it to fully exit.
|
||||||
|
done := c.done
|
||||||
|
if done != nil {
|
||||||
|
c.mu.Unlock()
|
||||||
|
<-done
|
||||||
|
|
||||||
|
c.mu.Lock()
|
||||||
|
if c.done == done && !c.running {
|
||||||
|
c.done = nil
|
||||||
|
}
|
||||||
|
c.mu.Unlock()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
c.opsService = opsService
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
c.cancel = cancel
|
||||||
|
c.done = make(chan struct{})
|
||||||
|
done = c.done
|
||||||
|
c.running = true
|
||||||
|
c.mu.Unlock()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer close(done)
|
||||||
|
c.refreshLoop(ctx)
|
||||||
|
}()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop stops the background refresh loop.
// It is safe to call multiple times and from multiple goroutines; it blocks
// until the refresh goroutine has fully exited.
func (c *opsWSQPSCache) Stop() {
	if c == nil {
		return
	}

	c.mu.Lock()
	if !c.running {
		// Not running, but a previous loop may still be draining — wait for it.
		done := c.done
		c.mu.Unlock()
		if done != nil {
			<-done
		}
		return
	}
	// Take ownership of the loop's handles under the lock, then release it
	// before blocking so concurrent start/Stop callers are not deadlocked.
	cancel := c.cancel
	c.cancel = nil
	c.running = false
	c.opsService = nil
	done := c.done
	c.mu.Unlock()

	if cancel != nil {
		cancel()
	}
	if done != nil {
		<-done
	}

	// Clear done only if it is still ours and no new loop has started since.
	c.mu.Lock()
	if c.done == done && !c.running {
		c.done = nil
	}
	c.mu.Unlock()
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) refreshLoop(ctx context.Context) {
|
||||||
|
ticker := time.NewTicker(c.refreshInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
c.refresh(ctx)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
c.refresh(ctx)
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// refresh queries the ops service for request statistics over the configured
// lookback window, derives QPS/TPS, and stores the pre-marshaled websocket
// payload into the cache. Failures are logged and leave the previous payload
// in place.
func (c *opsWSQPSCache) refresh(parentCtx context.Context) {
	if c == nil {
		return
	}

	// Snapshot the service pointer under the lock; Stop may nil it concurrently.
	c.mu.Lock()
	opsService := c.opsService
	c.mu.Unlock()
	if opsService == nil {
		return
	}

	if parentCtx == nil {
		parentCtx = context.Background()
	}
	// Bound the stats query so a slow backend cannot stall the loop.
	ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second)
	defer cancel()

	now := time.Now().UTC()
	stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now)
	if err != nil || stats == nil {
		if err != nil {
			log.Printf("[OpsWS] refresh: get window stats failed: %v", err)
		}
		return
	}

	requestCount := stats.SuccessCount + stats.ErrorCountTotal
	qps := 0.0
	tps := 0.0
	if c.requestCountWindow > 0 {
		// Rates are rounded to one decimal place for display.
		seconds := c.requestCountWindow.Seconds()
		qps = roundTo1DP(float64(requestCount) / seconds)
		tps = roundTo1DP(float64(stats.TokenConsumed) / seconds)
	}

	payload := gin.H{
		"type":      "qps_update",
		"timestamp": now.Format(time.RFC3339),
		"data": gin.H{
			"qps":           qps,
			"tps":           tps,
			"request_count": requestCount,
		},
	}

	msg, err := json.Marshal(payload)
	if err != nil {
		log.Printf("[OpsWS] refresh: marshal payload failed: %v", err)
		return
	}

	// Publish atomically; readers in the push loops never block on this.
	c.payload.Store(msg)
	c.lastUpdatedUnixNano.Store(now.UnixNano())
}
|
||||||
|
|
||||||
|
// roundTo1DP rounds v to one decimal place (halves round away from zero,
// matching math.Round).
func roundTo1DP(v float64) float64 {
	scaled := math.Round(v * 10)
	return scaled / 10
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) getPayload() []byte {
|
||||||
|
if c == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if cached, ok := c.payload.Load().([]byte); ok && cached != nil {
|
||||||
|
return cached
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func closeWS(conn *websocket.Conn, code int, reason string) {
|
||||||
|
if conn == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
msg := websocket.FormatCloseMessage(code, reason)
|
||||||
|
_ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout))
|
||||||
|
_ = conn.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// QPSWSHandler handles realtime QPS push via WebSocket.
// GET /api/v1/admin/ops/ws/qps
//
// Flow: reject if the service is unavailable; if realtime monitoring is
// disabled, upgrade and immediately close with a deterministic code; otherwise
// reserve global and per-IP connection slots BEFORE upgrading, upgrade, and
// hand the connection to handleQPSWebSocket.
func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
	clientIP := requestClientIP(c.Request)

	if h == nil || h.opsService == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"})
		return
	}

	// If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close
	// with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops.
	if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
		conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
		if err != nil {
			c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"})
			return
		}
		closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled")
		return
	}

	cancelQPSWSIdleStop()
	// Lazily start the background refresh loop so unit tests that never hit the
	// websocket route don't spawn goroutines that depend on DB/Redis stubs.
	qpsWSCache.start(h.opsService)

	// Reserve a global slot before upgrading the connection to keep the limit strict.
	if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) {
		log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns)
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
		return
	}
	defer func() {
		// Release the global slot; when the last connection goes away, arm the
		// idle shutdown of the shared refresh loop.
		if wsConnCount.Add(-1) == 0 {
			scheduleQPSWSIdleStop()
		}
	}()

	// Per-IP limiting is optional (limit 0 disables it) and requires a
	// resolvable client IP.
	if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" {
		if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) {
			log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP)
			c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
			return
		}
		defer releaseOpsWSIPSlot(clientIP)
	}

	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade writes its own HTTP error response; nothing more to send.
		log.Printf("[OpsWS] upgrade failed: %v", err)
		return
	}

	defer func() {
		_ = conn.Close()
	}()

	handleQPSWebSocket(c.Request.Context(), conn)
}
|
||||||
|
|
||||||
|
func tryAcquireOpsWSTotalSlot(limit int32) bool {
|
||||||
|
if limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
current := wsConnCount.Load()
|
||||||
|
if current >= limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if wsConnCount.CompareAndSwap(current, current+1) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool {
|
||||||
|
if strings.TrimSpace(clientIP) == "" || limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{})
|
||||||
|
counter, ok := v.(*atomic.Int32)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
current := counter.Load()
|
||||||
|
if current >= limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if counter.CompareAndSwap(current, current+1) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func releaseOpsWSIPSlot(clientIP string) {
|
||||||
|
if strings.TrimSpace(clientIP) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
v, ok := wsConnCountByIP.Load(clientIP)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
counter, ok := v.(*atomic.Int32)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
next := counter.Add(-1)
|
||||||
|
if next <= 0 {
|
||||||
|
// Best-effort cleanup; safe even if a new slot was acquired concurrently.
|
||||||
|
wsConnCountByIP.Delete(clientIP)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleQPSWebSocket runs the per-connection pump: a reader goroutine that
// only services control frames (pong/close) and enforces read deadlines, plus
// a writer loop that pushes the cached QPS payload and heartbeat pings.
// It blocks until the client disconnects, an error occurs, or parentCtx is
// cancelled, and guarantees the reader goroutine has exited before returning.
func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) {
	if conn == nil {
		return
	}

	ctx, cancel := context.WithCancel(parentCtx)
	defer cancel()

	// closeConn is safe to call from any exit path; the connection is closed
	// at most once.
	var closeOnce sync.Once
	closeConn := func() {
		closeOnce.Do(func() {
			_ = conn.Close()
		})
	}

	// The reader hands the client's close frame (if any) to the writer loop so
	// we can echo the same code/reason back. Buffered so the handler never blocks.
	closeFrameCh := make(chan []byte, 1)

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Reader exit cancels the writer loop too.
		defer cancel()

		conn.SetReadLimit(qpsWSMaxReadBytes)
		if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil {
			log.Printf("[OpsWS] set read deadline failed: %v", err)
			return
		}
		// Each pong extends the read deadline — the liveness mechanism paired
		// with the writer's periodic pings.
		conn.SetPongHandler(func(string) error {
			return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait))
		})
		conn.SetCloseHandler(func(code int, text string) error {
			select {
			case closeFrameCh <- websocket.FormatCloseMessage(code, text):
			default:
			}
			cancel()
			return nil
		})

		// We don't expect application messages; this loop exists to process
		// control frames and detect disconnects.
		for {
			_, _, err := conn.ReadMessage()
			if err != nil {
				if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) {
					log.Printf("[OpsWS] read failed: %v", err)
				}
				return
			}
		}
	}()

	// Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval).
	pushTicker := time.NewTicker(qpsWSPushInterval)
	defer pushTicker.Stop()

	// Heartbeat ping every 30 seconds.
	pingTicker := time.NewTicker(qpsWSPingInterval)
	defer pingTicker.Stop()

	writeWithTimeout := func(messageType int, data []byte) error {
		if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil {
			return err
		}
		return conn.WriteMessage(messageType, data)
	}

	// sendClose echoes the client's close frame when available, otherwise a
	// normal closure.
	sendClose := func(closeFrame []byte) {
		if closeFrame == nil {
			closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
		}
		_ = writeWithTimeout(websocket.CloseMessage, closeFrame)
	}

	for {
		select {
		case <-pushTicker.C:
			msg := qpsWSCache.getPayload()
			if msg == nil {
				// Cache not primed yet; skip this tick.
				continue
			}
			if err := writeWithTimeout(websocket.TextMessage, msg); err != nil {
				log.Printf("[OpsWS] write failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}

		case <-pingTicker.C:
			if err := writeWithTimeout(websocket.PingMessage, nil); err != nil {
				log.Printf("[OpsWS] ping failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}

		case closeFrame := <-closeFrameCh:
			sendClose(closeFrame)
			closeConn()
			wg.Wait()
			return

		case <-ctx.Done():
			// Drain a pending close frame (non-blocking) so we can mirror it.
			var closeFrame []byte
			select {
			case closeFrame = <-closeFrameCh:
			default:
			}
			sendClose(closeFrame)

			closeConn()
			wg.Wait()
			return
		}
	}
}
|
||||||
|
|
||||||
|
func isAllowedOpsWSOrigin(r *http.Request) bool {
|
||||||
|
if r == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
origin := strings.TrimSpace(r.Header.Get("Origin"))
|
||||||
|
if origin == "" {
|
||||||
|
switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) {
|
||||||
|
case OriginPolicyStrict:
|
||||||
|
return false
|
||||||
|
case OriginPolicyPermissive, "":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parsed, err := url.Parse(origin)
|
||||||
|
if err != nil || parsed.Hostname() == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
originHost := strings.ToLower(parsed.Hostname())
|
||||||
|
|
||||||
|
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
|
||||||
|
reqHost := hostWithoutPort(r.Host)
|
||||||
|
if trustProxyHeaders {
|
||||||
|
xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host"))
|
||||||
|
if xfHost != "" {
|
||||||
|
xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0])
|
||||||
|
if xfHost != "" {
|
||||||
|
reqHost = hostWithoutPort(xfHost)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reqHost = strings.ToLower(reqHost)
|
||||||
|
if reqHost == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return originHost == reqHost
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
|
||||||
|
if r == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !opsWSProxyConfig.TrustProxy {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
peerIP, ok := requestPeerIP(r)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies)
|
||||||
|
}
|
||||||
|
|
||||||
|
func requestPeerIP(r *http.Request) (netip.Addr, bool) {
|
||||||
|
if r == nil {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
|
||||||
|
if err != nil {
|
||||||
|
host = strings.TrimSpace(r.RemoteAddr)
|
||||||
|
}
|
||||||
|
host = strings.TrimPrefix(host, "[")
|
||||||
|
host = strings.TrimSuffix(host, "]")
|
||||||
|
if host == "" {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
addr, err := netip.ParseAddr(host)
|
||||||
|
if err != nil {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
return addr.Unmap(), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func requestClientIP(r *http.Request) string {
|
||||||
|
if r == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
|
||||||
|
if trustProxyHeaders {
|
||||||
|
xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For"))
|
||||||
|
if xff != "" {
|
||||||
|
// Use the left-most entry (original client). If multiple proxies add values, they are comma-separated.
|
||||||
|
xff = strings.TrimSpace(strings.Split(xff, ",")[0])
|
||||||
|
xff = strings.TrimPrefix(xff, "[")
|
||||||
|
xff = strings.TrimSuffix(xff, "]")
|
||||||
|
if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() {
|
||||||
|
return addr.Unmap().String()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if peer, ok := requestPeerIP(r); ok && peer.IsValid() {
|
||||||
|
return peer.String()
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
|
||||||
|
if !addr.IsValid() {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, p := range trusted {
|
||||||
|
if p.Contains(addr) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
|
||||||
|
cfg := OpsWSProxyConfig{
|
||||||
|
TrustProxy: true,
|
||||||
|
TrustedProxies: defaultTrustedProxies(),
|
||||||
|
OriginPolicy: OriginPolicyPermissive,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" {
|
||||||
|
if parsed, err := strconv.ParseBool(v); err == nil {
|
||||||
|
cfg.TrustProxy = parsed
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
|
||||||
|
prefixes, invalid := parseTrustedProxyList(raw)
|
||||||
|
if len(invalid) > 0 {
|
||||||
|
log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
|
||||||
|
}
|
||||||
|
cfg.TrustedProxies = prefixes
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" {
|
||||||
|
normalized := strings.ToLower(v)
|
||||||
|
switch normalized {
|
||||||
|
case OriginPolicyStrict, OriginPolicyPermissive:
|
||||||
|
cfg.OriginPolicy = normalized
|
||||||
|
default:
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits {
|
||||||
|
cfg := opsWSRuntimeLimits{
|
||||||
|
MaxConns: defaultMaxWSConns,
|
||||||
|
MaxConnsPerIP: defaultMaxWSConnsPerIP,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" {
|
||||||
|
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
|
||||||
|
cfg.MaxConns = int32(parsed)
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" {
|
||||||
|
if parsed, err := strconv.Atoi(v); err == nil && parsed >= 0 {
|
||||||
|
cfg.MaxConnsPerIP = int32(parsed)
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
func defaultTrustedProxies() []netip.Prefix {
|
||||||
|
prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
|
||||||
|
return prefixes
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseTrustedProxyList parses a comma-separated list of CIDR prefixes and/or
// bare IP addresses. Bare addresses become host prefixes (/32 for IPv4, /128
// for IPv6, after unmapping). Returned prefixes are masked; entries that fail
// to parse are collected in invalid. Empty entries are skipped.
func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
	for _, token := range strings.Split(raw, ",") {
		entry := strings.TrimSpace(token)
		if entry == "" {
			continue
		}

		var (
			prefix netip.Prefix
			err    error
		)
		if strings.Contains(entry, "/") {
			prefix, err = netip.ParsePrefix(entry)
		} else {
			// A bare address becomes a single-host prefix.
			var addr netip.Addr
			if addr, err = netip.ParseAddr(entry); err == nil {
				addr = addr.Unmap()
				bits := 128
				if addr.Is4() {
					bits = 32
				}
				prefix = netip.PrefixFrom(addr, bits)
			}
		}

		if err != nil || !prefix.IsValid() {
			invalid = append(invalid, entry)
			continue
		}
		prefixes = append(prefixes, prefix.Masked())
	}
	return prefixes, invalid
}
|
||||||
|
|
||||||
|
// hostWithoutPort strips an optional :port from a host[:port] string,
// handling bracketed IPv6 literals ("[::1]:80", "[::1]") as well as plain
// hosts. Returns "" for blank input.
func hostWithoutPort(hostport string) string {
	hostport = strings.TrimSpace(hostport)
	switch {
	case hostport == "":
		return ""
	}
	// The common case: a well-formed host:port pair.
	if host, _, err := net.SplitHostPort(hostport); err == nil {
		return host
	}
	// A bracketed IPv6 literal without a port.
	if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
		return strings.Trim(hostport, "[]")
	}
	// Fall back to everything before the first colon (plain host, or a
	// malformed pair).
	return strings.Split(hostport, ":")[0]
}
|
||||||
@@ -19,14 +19,16 @@ type SettingHandler struct {
|
|||||||
settingService *service.SettingService
|
settingService *service.SettingService
|
||||||
emailService *service.EmailService
|
emailService *service.EmailService
|
||||||
turnstileService *service.TurnstileService
|
turnstileService *service.TurnstileService
|
||||||
|
opsService *service.OpsService
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewSettingHandler 创建系统设置处理器
|
// NewSettingHandler 创建系统设置处理器
|
||||||
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService) *SettingHandler {
|
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService, opsService *service.OpsService) *SettingHandler {
|
||||||
return &SettingHandler{
|
return &SettingHandler{
|
||||||
settingService: settingService,
|
settingService: settingService,
|
||||||
emailService: emailService,
|
emailService: emailService,
|
||||||
turnstileService: turnstileService,
|
turnstileService: turnstileService,
|
||||||
|
opsService: opsService,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -39,6 +41,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if ops monitoring is enabled (respects config.ops.enabled)
|
||||||
|
opsEnabled := h.opsService != nil && h.opsService.IsMonitoringEnabled(c.Request.Context())
|
||||||
|
|
||||||
response.Success(c, dto.SystemSettings{
|
response.Success(c, dto.SystemSettings{
|
||||||
RegistrationEnabled: settings.RegistrationEnabled,
|
RegistrationEnabled: settings.RegistrationEnabled,
|
||||||
EmailVerifyEnabled: settings.EmailVerifyEnabled,
|
EmailVerifyEnabled: settings.EmailVerifyEnabled,
|
||||||
@@ -72,6 +77,10 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: settings.FallbackModelAntigravity,
|
FallbackModelAntigravity: settings.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: settings.EnableIdentityPatch,
|
EnableIdentityPatch: settings.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: settings.IdentityPatchPrompt,
|
IdentityPatchPrompt: settings.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: opsEnabled && settings.OpsMonitoringEnabled,
|
||||||
|
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
|
||||||
|
OpsQueryModeDefault: settings.OpsQueryModeDefault,
|
||||||
|
OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,7 +104,7 @@ type UpdateSettingsRequest struct {
|
|||||||
TurnstileSiteKey string `json:"turnstile_site_key"`
|
TurnstileSiteKey string `json:"turnstile_site_key"`
|
||||||
TurnstileSecretKey string `json:"turnstile_secret_key"`
|
TurnstileSecretKey string `json:"turnstile_secret_key"`
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
LinuxDoConnectEnabled bool `json:"linuxdo_connect_enabled"`
|
LinuxDoConnectEnabled bool `json:"linuxdo_connect_enabled"`
|
||||||
LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"`
|
LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"`
|
||||||
LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"`
|
LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"`
|
||||||
@@ -124,6 +133,12 @@ type UpdateSettingsRequest struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
|
||||||
|
OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
|
||||||
|
OpsQueryModeDefault *string `json:"ops_query_mode_default"`
|
||||||
|
OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateSettings 更新系统设置
|
// UpdateSettings 更新系统设置
|
||||||
@@ -208,6 +223,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ops metrics collector interval validation (seconds).
|
||||||
|
if req.OpsMetricsIntervalSeconds != nil {
|
||||||
|
v := *req.OpsMetricsIntervalSeconds
|
||||||
|
if v < 60 {
|
||||||
|
v = 60
|
||||||
|
}
|
||||||
|
if v > 3600 {
|
||||||
|
v = 3600
|
||||||
|
}
|
||||||
|
req.OpsMetricsIntervalSeconds = &v
|
||||||
|
}
|
||||||
|
|
||||||
settings := &service.SystemSettings{
|
settings := &service.SystemSettings{
|
||||||
RegistrationEnabled: req.RegistrationEnabled,
|
RegistrationEnabled: req.RegistrationEnabled,
|
||||||
EmailVerifyEnabled: req.EmailVerifyEnabled,
|
EmailVerifyEnabled: req.EmailVerifyEnabled,
|
||||||
@@ -241,6 +268,30 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: req.FallbackModelAntigravity,
|
FallbackModelAntigravity: req.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: req.EnableIdentityPatch,
|
EnableIdentityPatch: req.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: req.IdentityPatchPrompt,
|
IdentityPatchPrompt: req.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: func() bool {
|
||||||
|
if req.OpsMonitoringEnabled != nil {
|
||||||
|
return *req.OpsMonitoringEnabled
|
||||||
|
}
|
||||||
|
return previousSettings.OpsMonitoringEnabled
|
||||||
|
}(),
|
||||||
|
OpsRealtimeMonitoringEnabled: func() bool {
|
||||||
|
if req.OpsRealtimeMonitoringEnabled != nil {
|
||||||
|
return *req.OpsRealtimeMonitoringEnabled
|
||||||
|
}
|
||||||
|
return previousSettings.OpsRealtimeMonitoringEnabled
|
||||||
|
}(),
|
||||||
|
OpsQueryModeDefault: func() string {
|
||||||
|
if req.OpsQueryModeDefault != nil {
|
||||||
|
return *req.OpsQueryModeDefault
|
||||||
|
}
|
||||||
|
return previousSettings.OpsQueryModeDefault
|
||||||
|
}(),
|
||||||
|
OpsMetricsIntervalSeconds: func() int {
|
||||||
|
if req.OpsMetricsIntervalSeconds != nil {
|
||||||
|
return *req.OpsMetricsIntervalSeconds
|
||||||
|
}
|
||||||
|
return previousSettings.OpsMetricsIntervalSeconds
|
||||||
|
}(),
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
|
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
|
||||||
@@ -290,6 +341,10 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
|
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
|
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
|
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
|
||||||
|
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
|
||||||
|
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
|
||||||
|
OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -411,6 +466,18 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
|
|||||||
if before.IdentityPatchPrompt != after.IdentityPatchPrompt {
|
if before.IdentityPatchPrompt != after.IdentityPatchPrompt {
|
||||||
changed = append(changed, "identity_patch_prompt")
|
changed = append(changed, "identity_patch_prompt")
|
||||||
}
|
}
|
||||||
|
if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled {
|
||||||
|
changed = append(changed, "ops_monitoring_enabled")
|
||||||
|
}
|
||||||
|
if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled {
|
||||||
|
changed = append(changed, "ops_realtime_monitoring_enabled")
|
||||||
|
}
|
||||||
|
if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
|
||||||
|
changed = append(changed, "ops_query_mode_default")
|
||||||
|
}
|
||||||
|
if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
|
||||||
|
changed = append(changed, "ops_metrics_interval_seconds")
|
||||||
|
}
|
||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,12 @@ type SystemSettings struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
|
||||||
|
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
|
||||||
|
OpsQueryModeDefault string `json:"ops_query_mode_default"`
|
||||||
|
OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type PublicSettings struct {
|
type PublicSettings struct {
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ import (
|
|||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
|
||||||
pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
||||||
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
@@ -89,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
parsedReq, err := service.ParseGatewayRequest(body)
|
parsedReq, err := service.ParseGatewayRequest(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
||||||
@@ -97,8 +98,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
reqModel := parsedReq.Model
|
reqModel := parsedReq.Model
|
||||||
reqStream := parsedReq.Stream
|
reqStream := parsedReq.Stream
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
setOpsRequestContext(c, reqModel, reqStream, body)
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
// 验证 model 必填
|
// 验证 model 必填
|
||||||
if reqModel == "" {
|
if reqModel == "" {
|
||||||
@@ -112,15 +112,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
// 获取订阅信息(可能为nil)- 提前获取用于后续检查
|
// 获取订阅信息(可能为nil)- 提前获取用于后续检查
|
||||||
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
// 获取 User-Agent
|
|
||||||
userAgent := c.Request.UserAgent()
|
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
// 0. 检查wait队列是否已满
|
// 0. 检查wait队列是否已满
|
||||||
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
||||||
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
// On error, allow request to proceed
|
// On error, allow request to proceed
|
||||||
@@ -128,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// 确保在函数退出时减少wait计数
|
if err == nil && canWait {
|
||||||
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
waitCounted = true
|
||||||
|
}
|
||||||
|
// Ensure we decrement if we exit before acquiring the user slot.
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1. 首先获取用户并发槽位
|
// 1. 首先获取用户并发槽位
|
||||||
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
||||||
@@ -138,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
h.handleConcurrencyError(c, err, "user", streamStarted)
|
h.handleConcurrencyError(c, err, "user", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// User slot acquired: no longer waiting in the queue.
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
|
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -184,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 检查预热请求拦截(在账号选择后、转发前检查)
|
// 检查预热请求拦截(在账号选择后、转发前检查)
|
||||||
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
||||||
@@ -200,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
|
|
||||||
// 3. 获取账号并发槽位
|
// 3. 获取账号并发槽位
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -213,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
// Ensure the wait counter is decremented if we exit before acquiring the slot.
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -229,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Slot acquired: no longer waiting in queue.
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 转发请求 - 根据账号平台分流
|
// 转发请求 - 根据账号平台分流
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -254,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -277,7 +287,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 异步记录使用量(subscription已在函数开头获取)
|
// 异步记录使用量(subscription已在函数开头获取)
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -286,12 +296,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -313,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 检查预热请求拦截(在账号选择后、转发前检查)
|
// 检查预热请求拦截(在账号选择后、转发前检查)
|
||||||
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
||||||
@@ -329,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
|
|
||||||
// 3. 获取账号并发槽位
|
// 3. 获取账号并发槽位
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -342,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -358,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 转发请求 - 根据账号平台分流
|
// 转发请求 - 根据账号平台分流
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -383,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -406,7 +415,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 异步记录使用量(subscription已在函数开头获取)
|
// 异步记录使用量(subscription已在函数开头获取)
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -415,12 +424,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -686,21 +693,22 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
parsedReq, err := service.ParseGatewayRequest(body)
|
parsedReq, err := service.ParseGatewayRequest(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
// 验证 model 必填
|
// 验证 model 必填
|
||||||
if parsedReq.Model == "" {
|
if parsedReq.Model == "" {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body)
|
||||||
|
|
||||||
// 获取订阅信息(可能为nil)
|
// 获取订阅信息(可能为nil)
|
||||||
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
@@ -721,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
|
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 转发请求(不记录使用量)
|
// 转发请求(不记录使用量)
|
||||||
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {
|
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/googleapi"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/googleapi"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
|
||||||
@@ -162,28 +161,32 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, modelName, stream, body)
|
||||||
|
|
||||||
// Get subscription (may be nil)
|
// Get subscription (may be nil)
|
||||||
subscription, _ := middleware.GetSubscriptionFromContext(c)
|
subscription, _ := middleware.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
// 获取 User-Agent
|
|
||||||
userAgent := c.Request.UserAgent()
|
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
// For Gemini native API, do not send Claude-style ping frames.
|
// For Gemini native API, do not send Claude-style ping frames.
|
||||||
geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0)
|
geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0)
|
||||||
|
|
||||||
// 0) wait queue check
|
// 0) wait queue check
|
||||||
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
|
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
|
||||||
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
|
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
} else if !canWait {
|
} else if !canWait {
|
||||||
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
if err == nil && canWait {
|
||||||
|
waitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1) user concurrency slot
|
// 1) user concurrency slot
|
||||||
streamStarted := false
|
streamStarted := false
|
||||||
@@ -192,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
googleError(c, http.StatusTooManyRequests, err.Error())
|
googleError(c, http.StatusTooManyRequests, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if waitCounted {
|
||||||
|
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -207,10 +214,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
|
|
||||||
// 3) select account (sticky session based on request body)
|
// 3) select account (sticky session based on request body)
|
||||||
parsedReq, _ := service.ParseGatewayRequest(body)
|
parsedReq, _ := service.ParseGatewayRequest(body)
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)
|
sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)
|
||||||
sessionKey := sessionHash
|
sessionKey := sessionHash
|
||||||
if sessionHash != "" {
|
if sessionHash != "" {
|
||||||
@@ -232,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 4) account concurrency slot
|
// 4) account concurrency slot
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
|
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -248,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -264,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
googleError(c, http.StatusTooManyRequests, err.Error())
|
googleError(c, http.StatusTooManyRequests, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 5) forward (根据平台分流)
|
// 5) forward (根据平台分流)
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -288,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -311,7 +315,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 6) record usage async
|
// 6) record usage async
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -320,12 +324,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ type AdminHandlers struct {
|
|||||||
Redeem *admin.RedeemHandler
|
Redeem *admin.RedeemHandler
|
||||||
Promo *admin.PromoHandler
|
Promo *admin.PromoHandler
|
||||||
Setting *admin.SettingHandler
|
Setting *admin.SettingHandler
|
||||||
|
Ops *admin.OpsHandler
|
||||||
System *admin.SystemHandler
|
System *admin.SystemHandler
|
||||||
Subscription *admin.SubscriptionHandler
|
Subscription *admin.SubscriptionHandler
|
||||||
Usage *admin.UsageHandler
|
Usage *admin.UsageHandler
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
||||||
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
@@ -77,6 +76,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
// Parse request body to map for potential modification
|
// Parse request body to map for potential modification
|
||||||
var reqBody map[string]any
|
var reqBody map[string]any
|
||||||
if err := json.Unmarshal(body, &reqBody); err != nil {
|
if err := json.Unmarshal(body, &reqBody); err != nil {
|
||||||
@@ -95,10 +96,6 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
userAgent := c.GetHeader("User-Agent")
|
userAgent := c.GetHeader("User-Agent")
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
if !openai.IsCodexCLIRequest(userAgent) {
|
if !openai.IsCodexCLIRequest(userAgent) {
|
||||||
existingInstructions, _ := reqBody["instructions"].(string)
|
existingInstructions, _ := reqBody["instructions"].(string)
|
||||||
if strings.TrimSpace(existingInstructions) == "" {
|
if strings.TrimSpace(existingInstructions) == "" {
|
||||||
@@ -114,6 +111,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, reqModel, reqStream, body)
|
||||||
|
|
||||||
// Track if we've started streaming (for error handling)
|
// Track if we've started streaming (for error handling)
|
||||||
streamStarted := false
|
streamStarted := false
|
||||||
|
|
||||||
@@ -123,6 +122,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
// 0. Check if wait queue is full
|
// 0. Check if wait queue is full
|
||||||
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
||||||
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
// On error, allow request to proceed
|
// On error, allow request to proceed
|
||||||
@@ -130,8 +130,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Ensure wait count is decremented when function exits
|
if err == nil && canWait {
|
||||||
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
waitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1. First acquire user concurrency slot
|
// 1. First acquire user concurrency slot
|
||||||
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
||||||
@@ -140,6 +146,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
h.handleConcurrencyError(c, err, "user", streamStarted)
|
h.handleConcurrencyError(c, err, "user", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// User slot acquired: no longer waiting.
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -177,15 +188,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
|
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 3. Acquire account concurrency slot
|
// 3. Acquire account concurrency slot
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -193,12 +205,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -209,29 +224,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// Forward request
|
// Forward request
|
||||||
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
|
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
|
||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -252,7 +264,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Async record usage
|
// Async record usage
|
||||||
go func(result *service.OpenAIForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.OpenAIForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{
|
||||||
@@ -261,12 +273,10 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
954
backend/internal/handler/ops_error_logger.go
Normal file
954
backend/internal/handler/ops_error_logger.go
Normal file
@@ -0,0 +1,954 @@
|
|||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"runtime"
|
||||||
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
|
||||||
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsModelKey = "ops_model"
|
||||||
|
opsStreamKey = "ops_stream"
|
||||||
|
opsRequestBodyKey = "ops_request_body"
|
||||||
|
opsAccountIDKey = "ops_account_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsErrorLogTimeout = 5 * time.Second
|
||||||
|
opsErrorLogDrainTimeout = 10 * time.Second
|
||||||
|
|
||||||
|
opsErrorLogMinWorkerCount = 4
|
||||||
|
opsErrorLogMaxWorkerCount = 32
|
||||||
|
|
||||||
|
opsErrorLogQueueSizePerWorker = 128
|
||||||
|
opsErrorLogMinQueueSize = 256
|
||||||
|
opsErrorLogMaxQueueSize = 8192
|
||||||
|
)
|
||||||
|
|
||||||
|
type opsErrorLogJob struct {
|
||||||
|
ops *service.OpsService
|
||||||
|
entry *service.OpsInsertErrorLogInput
|
||||||
|
requestBody []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
opsErrorLogOnce sync.Once
|
||||||
|
opsErrorLogQueue chan opsErrorLogJob
|
||||||
|
|
||||||
|
opsErrorLogStopOnce sync.Once
|
||||||
|
opsErrorLogWorkersWg sync.WaitGroup
|
||||||
|
opsErrorLogMu sync.RWMutex
|
||||||
|
opsErrorLogStopping bool
|
||||||
|
opsErrorLogQueueLen atomic.Int64
|
||||||
|
opsErrorLogEnqueued atomic.Int64
|
||||||
|
opsErrorLogDropped atomic.Int64
|
||||||
|
opsErrorLogProcessed atomic.Int64
|
||||||
|
|
||||||
|
opsErrorLogLastDropLogAt atomic.Int64
|
||||||
|
|
||||||
|
opsErrorLogShutdownCh = make(chan struct{})
|
||||||
|
opsErrorLogShutdownOnce sync.Once
|
||||||
|
opsErrorLogDrained atomic.Bool
|
||||||
|
)
|
||||||
|
|
||||||
|
func startOpsErrorLogWorkers() {
|
||||||
|
opsErrorLogMu.Lock()
|
||||||
|
defer opsErrorLogMu.Unlock()
|
||||||
|
|
||||||
|
if opsErrorLogStopping {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
workerCount, queueSize := opsErrorLogConfig()
|
||||||
|
opsErrorLogQueue = make(chan opsErrorLogJob, queueSize)
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
|
||||||
|
opsErrorLogWorkersWg.Add(workerCount)
|
||||||
|
for i := 0; i < workerCount; i++ {
|
||||||
|
go func() {
|
||||||
|
defer opsErrorLogWorkersWg.Done()
|
||||||
|
for job := range opsErrorLogQueue {
|
||||||
|
opsErrorLogQueueLen.Add(-1)
|
||||||
|
if job.ops == nil || job.entry == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
func() {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
|
||||||
|
_ = job.ops.RecordError(ctx, job.entry, job.requestBody)
|
||||||
|
cancel()
|
||||||
|
opsErrorLogProcessed.Add(1)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) {
|
||||||
|
if ops == nil || entry == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-opsErrorLogShutdownCh:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
stopping := opsErrorLogStopping
|
||||||
|
opsErrorLogMu.RUnlock()
|
||||||
|
if stopping {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
opsErrorLogOnce.Do(startOpsErrorLogWorkers)
|
||||||
|
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
defer opsErrorLogMu.RUnlock()
|
||||||
|
if opsErrorLogStopping || opsErrorLogQueue == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}:
|
||||||
|
opsErrorLogQueueLen.Add(1)
|
||||||
|
opsErrorLogEnqueued.Add(1)
|
||||||
|
default:
|
||||||
|
// Queue is full; drop to avoid blocking request handling.
|
||||||
|
opsErrorLogDropped.Add(1)
|
||||||
|
maybeLogOpsErrorLogDrop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func StopOpsErrorLogWorkers() bool {
|
||||||
|
opsErrorLogStopOnce.Do(func() {
|
||||||
|
opsErrorLogShutdownOnce.Do(func() {
|
||||||
|
close(opsErrorLogShutdownCh)
|
||||||
|
})
|
||||||
|
opsErrorLogDrained.Store(stopOpsErrorLogWorkers())
|
||||||
|
})
|
||||||
|
return opsErrorLogDrained.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func stopOpsErrorLogWorkers() bool {
|
||||||
|
opsErrorLogMu.Lock()
|
||||||
|
opsErrorLogStopping = true
|
||||||
|
ch := opsErrorLogQueue
|
||||||
|
if ch != nil {
|
||||||
|
close(ch)
|
||||||
|
}
|
||||||
|
opsErrorLogQueue = nil
|
||||||
|
opsErrorLogMu.Unlock()
|
||||||
|
|
||||||
|
if ch == nil {
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
opsErrorLogWorkersWg.Wait()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
return true
|
||||||
|
case <-time.After(opsErrorLogDrainTimeout):
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogQueueLength() int64 {
|
||||||
|
return opsErrorLogQueueLen.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogQueueCapacity() int {
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
ch := opsErrorLogQueue
|
||||||
|
opsErrorLogMu.RUnlock()
|
||||||
|
if ch == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return cap(ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogDroppedTotal() int64 {
|
||||||
|
return opsErrorLogDropped.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogEnqueuedTotal() int64 {
|
||||||
|
return opsErrorLogEnqueued.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogProcessedTotal() int64 {
|
||||||
|
return opsErrorLogProcessed.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func maybeLogOpsErrorLogDrop() {
|
||||||
|
now := time.Now().Unix()
|
||||||
|
|
||||||
|
for {
|
||||||
|
last := opsErrorLogLastDropLogAt.Load()
|
||||||
|
if last != 0 && now-last < 60 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
queued := opsErrorLogQueueLen.Load()
|
||||||
|
queueCap := OpsErrorLogQueueCapacity()
|
||||||
|
|
||||||
|
log.Printf(
|
||||||
|
"[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)",
|
||||||
|
queued,
|
||||||
|
queueCap,
|
||||||
|
opsErrorLogEnqueued.Load(),
|
||||||
|
opsErrorLogDropped.Load(),
|
||||||
|
opsErrorLogProcessed.Load(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsErrorLogConfig() (workerCount int, queueSize int) {
|
||||||
|
workerCount = runtime.GOMAXPROCS(0) * 2
|
||||||
|
if workerCount < opsErrorLogMinWorkerCount {
|
||||||
|
workerCount = opsErrorLogMinWorkerCount
|
||||||
|
}
|
||||||
|
if workerCount > opsErrorLogMaxWorkerCount {
|
||||||
|
workerCount = opsErrorLogMaxWorkerCount
|
||||||
|
}
|
||||||
|
|
||||||
|
queueSize = workerCount * opsErrorLogQueueSizePerWorker
|
||||||
|
if queueSize < opsErrorLogMinQueueSize {
|
||||||
|
queueSize = opsErrorLogMinQueueSize
|
||||||
|
}
|
||||||
|
if queueSize > opsErrorLogMaxQueueSize {
|
||||||
|
queueSize = opsErrorLogMaxQueueSize
|
||||||
|
}
|
||||||
|
|
||||||
|
return workerCount, queueSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Set(opsModelKey, model)
|
||||||
|
c.Set(opsStreamKey, stream)
|
||||||
|
if len(requestBody) > 0 {
|
||||||
|
c.Set(opsRequestBodyKey, requestBody)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setOpsSelectedAccount(c *gin.Context, accountID int64) {
|
||||||
|
if c == nil || accountID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Set(opsAccountIDKey, accountID)
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsCaptureWriter struct {
|
||||||
|
gin.ResponseWriter
|
||||||
|
limit int
|
||||||
|
buf bytes.Buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *opsCaptureWriter) Write(b []byte) (int, error) {
|
||||||
|
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
|
||||||
|
remaining := w.limit - w.buf.Len()
|
||||||
|
if len(b) > remaining {
|
||||||
|
_, _ = w.buf.Write(b[:remaining])
|
||||||
|
} else {
|
||||||
|
_, _ = w.buf.Write(b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w.ResponseWriter.Write(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *opsCaptureWriter) WriteString(s string) (int, error) {
|
||||||
|
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
|
||||||
|
remaining := w.limit - w.buf.Len()
|
||||||
|
if len(s) > remaining {
|
||||||
|
_, _ = w.buf.WriteString(s[:remaining])
|
||||||
|
} else {
|
||||||
|
_, _ = w.buf.WriteString(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w.ResponseWriter.WriteString(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs.
|
||||||
|
//
|
||||||
|
// Notes:
|
||||||
|
// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic.
|
||||||
|
// - Streaming errors after the response has started (SSE) may still need explicit logging.
|
||||||
|
func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
||||||
|
return func(c *gin.Context) {
|
||||||
|
w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024}
|
||||||
|
c.Writer = w
|
||||||
|
c.Next()
|
||||||
|
|
||||||
|
if ops == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !ops.IsMonitoringEnabled(c.Request.Context()) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
status := c.Writer.Status()
|
||||||
|
if status < 400 {
|
||||||
|
// Even when the client request succeeds, we still want to persist upstream error attempts
|
||||||
|
// (retries/failover) so ops can observe upstream instability that gets "covered" by retries.
|
||||||
|
var events []*service.OpsUpstreamErrorEvent
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||||
|
if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 {
|
||||||
|
events = arr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Also accept single upstream fields set by gateway services (rare for successful requests).
|
||||||
|
hasUpstreamContext := len(events) > 0
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
hasUpstreamContext = t > 0
|
||||||
|
case int64:
|
||||||
|
hasUpstreamContext = t > 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
hasUpstreamContext = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
hasUpstreamContext = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||||
|
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||||
|
|
||||||
|
model, _ := c.Get(opsModelKey)
|
||||||
|
streamV, _ := c.Get(opsStreamKey)
|
||||||
|
accountIDV, _ := c.Get(opsAccountIDKey)
|
||||||
|
|
||||||
|
var modelName string
|
||||||
|
if s, ok := model.(string); ok {
|
||||||
|
modelName = s
|
||||||
|
}
|
||||||
|
stream := false
|
||||||
|
if b, ok := streamV.(bool); ok {
|
||||||
|
stream = b
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer showing the account that experienced the upstream error (if we have events),
|
||||||
|
// otherwise fall back to the final selected account (best-effort).
|
||||||
|
var accountID *int64
|
||||||
|
if len(events) > 0 {
|
||||||
|
if last := events[len(events)-1]; last != nil && last.AccountID > 0 {
|
||||||
|
v := last.AccountID
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if accountID == nil {
|
||||||
|
if v, ok := accountIDV.(int64); ok && v > 0 {
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
|
||||||
|
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
|
||||||
|
|
||||||
|
requestID := c.Writer.Header().Get("X-Request-Id")
|
||||||
|
if requestID == "" {
|
||||||
|
requestID = c.Writer.Header().Get("x-request-id")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort backfill single upstream fields from the last event (if present).
|
||||||
|
var upstreamStatusCode *int
|
||||||
|
var upstreamErrorMessage *string
|
||||||
|
var upstreamErrorDetail *string
|
||||||
|
if len(events) > 0 {
|
||||||
|
last := events[len(events)-1]
|
||||||
|
if last != nil {
|
||||||
|
if last.UpstreamStatusCode > 0 {
|
||||||
|
code := last.UpstreamStatusCode
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
if msg := strings.TrimSpace(last.Message); msg != "" {
|
||||||
|
upstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
if detail := strings.TrimSpace(last.Detail); detail != "" {
|
||||||
|
upstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if upstreamStatusCode == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
if t > 0 {
|
||||||
|
code := t
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
case int64:
|
||||||
|
if t > 0 {
|
||||||
|
code := int(t)
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if upstreamErrorMessage == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
msg := strings.TrimSpace(s)
|
||||||
|
upstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if upstreamErrorDetail == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
detail := strings.TrimSpace(s)
|
||||||
|
upstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we still have nothing meaningful, skip.
|
||||||
|
if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
effectiveUpstreamStatus := 0
|
||||||
|
if upstreamStatusCode != nil {
|
||||||
|
effectiveUpstreamStatus = *upstreamStatusCode
|
||||||
|
}
|
||||||
|
|
||||||
|
recoveredMsg := "Recovered upstream error"
|
||||||
|
if effectiveUpstreamStatus > 0 {
|
||||||
|
recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus)
|
||||||
|
}
|
||||||
|
if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" {
|
||||||
|
recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage)
|
||||||
|
}
|
||||||
|
recoveredMsg = truncateString(recoveredMsg, 2048)
|
||||||
|
|
||||||
|
entry := &service.OpsInsertErrorLogInput{
|
||||||
|
RequestID: requestID,
|
||||||
|
ClientRequestID: clientRequestID,
|
||||||
|
|
||||||
|
AccountID: accountID,
|
||||||
|
Platform: platform,
|
||||||
|
Model: modelName,
|
||||||
|
RequestPath: func() string {
|
||||||
|
if c.Request != nil && c.Request.URL != nil {
|
||||||
|
return c.Request.URL.Path
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
Stream: stream,
|
||||||
|
UserAgent: c.GetHeader("User-Agent"),
|
||||||
|
|
||||||
|
ErrorPhase: "upstream",
|
||||||
|
ErrorType: "upstream_error",
|
||||||
|
// Severity/retryability should reflect the upstream failure, not the final client status (200).
|
||||||
|
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
|
||||||
|
StatusCode: status,
|
||||||
|
IsBusinessLimited: false,
|
||||||
|
|
||||||
|
ErrorMessage: recoveredMsg,
|
||||||
|
ErrorBody: "",
|
||||||
|
|
||||||
|
ErrorSource: "upstream_http",
|
||||||
|
ErrorOwner: "provider",
|
||||||
|
|
||||||
|
UpstreamStatusCode: upstreamStatusCode,
|
||||||
|
UpstreamErrorMessage: upstreamErrorMessage,
|
||||||
|
UpstreamErrorDetail: upstreamErrorDetail,
|
||||||
|
UpstreamErrors: events,
|
||||||
|
|
||||||
|
IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus),
|
||||||
|
RetryCount: 0,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if apiKey != nil {
|
||||||
|
entry.APIKeyID = &apiKey.ID
|
||||||
|
if apiKey.User != nil {
|
||||||
|
entry.UserID = &apiKey.User.ID
|
||||||
|
}
|
||||||
|
if apiKey.GroupID != nil {
|
||||||
|
entry.GroupID = apiKey.GroupID
|
||||||
|
}
|
||||||
|
// Prefer group platform if present (more stable than inferring from path).
|
||||||
|
if apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
entry.Platform = apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var clientIP string
|
||||||
|
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
|
||||||
|
clientIP = ip
|
||||||
|
entry.ClientIP = &clientIP
|
||||||
|
}
|
||||||
|
|
||||||
|
var requestBody []byte
|
||||||
|
if v, ok := c.Get(opsRequestBodyKey); ok {
|
||||||
|
if b, ok := v.([]byte); ok && len(b) > 0 {
|
||||||
|
requestBody = b
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Store request headers/body only when an upstream error occurred to keep overhead minimal.
|
||||||
|
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
|
||||||
|
|
||||||
|
enqueueOpsErrorLog(ops, entry, requestBody)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
body := w.buf.Bytes()
|
||||||
|
parsed := parseOpsErrorResponse(body)
|
||||||
|
|
||||||
|
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||||
|
|
||||||
|
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||||
|
|
||||||
|
model, _ := c.Get(opsModelKey)
|
||||||
|
streamV, _ := c.Get(opsStreamKey)
|
||||||
|
accountIDV, _ := c.Get(opsAccountIDKey)
|
||||||
|
|
||||||
|
var modelName string
|
||||||
|
if s, ok := model.(string); ok {
|
||||||
|
modelName = s
|
||||||
|
}
|
||||||
|
stream := false
|
||||||
|
if b, ok := streamV.(bool); ok {
|
||||||
|
stream = b
|
||||||
|
}
|
||||||
|
var accountID *int64
|
||||||
|
if v, ok := accountIDV.(int64); ok && v > 0 {
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
|
||||||
|
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
|
||||||
|
|
||||||
|
requestID := c.Writer.Header().Get("X-Request-Id")
|
||||||
|
if requestID == "" {
|
||||||
|
requestID = c.Writer.Header().Get("x-request-id")
|
||||||
|
}
|
||||||
|
|
||||||
|
phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code)
|
||||||
|
isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message)
|
||||||
|
|
||||||
|
errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
|
||||||
|
errorSource := classifyOpsErrorSource(phase, parsed.Message)
|
||||||
|
|
||||||
|
entry := &service.OpsInsertErrorLogInput{
|
||||||
|
RequestID: requestID,
|
||||||
|
ClientRequestID: clientRequestID,
|
||||||
|
|
||||||
|
AccountID: accountID,
|
||||||
|
Platform: platform,
|
||||||
|
Model: modelName,
|
||||||
|
RequestPath: func() string {
|
||||||
|
if c.Request != nil && c.Request.URL != nil {
|
||||||
|
return c.Request.URL.Path
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
Stream: stream,
|
||||||
|
UserAgent: c.GetHeader("User-Agent"),
|
||||||
|
|
||||||
|
ErrorPhase: phase,
|
||||||
|
ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code),
|
||||||
|
Severity: classifyOpsSeverity(parsed.ErrorType, status),
|
||||||
|
StatusCode: status,
|
||||||
|
IsBusinessLimited: isBusinessLimited,
|
||||||
|
|
||||||
|
ErrorMessage: parsed.Message,
|
||||||
|
// Keep the full captured error body (capture is already capped at 64KB) so the
|
||||||
|
// service layer can sanitize JSON before truncating for storage.
|
||||||
|
ErrorBody: string(body),
|
||||||
|
ErrorSource: errorSource,
|
||||||
|
ErrorOwner: errorOwner,
|
||||||
|
|
||||||
|
IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status),
|
||||||
|
RetryCount: 0,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Capture upstream error context set by gateway services (if present).
|
||||||
|
// This does NOT affect the client response; it enriches Ops troubleshooting data.
|
||||||
|
{
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
if t > 0 {
|
||||||
|
code := t
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
case int64:
|
||||||
|
if t > 0 {
|
||||||
|
code := int(t)
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
if msg := strings.TrimSpace(s); msg != "" {
|
||||||
|
entry.UpstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
if detail := strings.TrimSpace(s); detail != "" {
|
||||||
|
entry.UpstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||||
|
if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 {
|
||||||
|
entry.UpstreamErrors = events
|
||||||
|
// Best-effort backfill the single upstream fields from the last event when missing.
|
||||||
|
last := events[len(events)-1]
|
||||||
|
if last != nil {
|
||||||
|
if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 {
|
||||||
|
code := last.UpstreamStatusCode
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" {
|
||||||
|
msg := strings.TrimSpace(last.Message)
|
||||||
|
entry.UpstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" {
|
||||||
|
detail := strings.TrimSpace(last.Detail)
|
||||||
|
entry.UpstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if apiKey != nil {
|
||||||
|
entry.APIKeyID = &apiKey.ID
|
||||||
|
if apiKey.User != nil {
|
||||||
|
entry.UserID = &apiKey.User.ID
|
||||||
|
}
|
||||||
|
if apiKey.GroupID != nil {
|
||||||
|
entry.GroupID = apiKey.GroupID
|
||||||
|
}
|
||||||
|
// Prefer group platform if present (more stable than inferring from path).
|
||||||
|
if apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
entry.Platform = apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var clientIP string
|
||||||
|
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
|
||||||
|
clientIP = ip
|
||||||
|
entry.ClientIP = &clientIP
|
||||||
|
}
|
||||||
|
|
||||||
|
var requestBody []byte
|
||||||
|
if v, ok := c.Get(opsRequestBodyKey); ok {
|
||||||
|
if b, ok := v.([]byte); ok && len(b) > 0 {
|
||||||
|
requestBody = b
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Persist only a minimal, whitelisted set of request headers to improve retry fidelity.
|
||||||
|
// Do NOT store Authorization/Cookie/etc.
|
||||||
|
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
|
||||||
|
|
||||||
|
enqueueOpsErrorLog(ops, entry, requestBody)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsRetryRequestHeaderAllowlist = []string{
|
||||||
|
"anthropic-beta",
|
||||||
|
"anthropic-version",
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
|
||||||
|
if c == nil || c.Request == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
headers := make(map[string]string, 4)
|
||||||
|
for _, key := range opsRetryRequestHeaderAllowlist {
|
||||||
|
v := strings.TrimSpace(c.GetHeader(key))
|
||||||
|
if v == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Keep headers small even if a client sends something unexpected.
|
||||||
|
headers[key] = truncateString(v, 512)
|
||||||
|
}
|
||||||
|
if len(headers) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := json.Marshal(headers)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s := string(raw)
|
||||||
|
return &s
|
||||||
|
}
|
||||||
|
|
||||||
|
type parsedOpsError struct {
|
||||||
|
ErrorType string
|
||||||
|
Message string
|
||||||
|
Code string
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsErrorResponse(body []byte) parsedOpsError {
|
||||||
|
if len(body) == 0 {
|
||||||
|
return parsedOpsError{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fast path: attempt to decode into a generic map.
|
||||||
|
var m map[string]any
|
||||||
|
if err := json.Unmarshal(body, &m); err != nil {
|
||||||
|
return parsedOpsError{Message: truncateString(string(body), 1024)}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } }
|
||||||
|
if errObj, ok := m["error"].(map[string]any); ok {
|
||||||
|
t, _ := errObj["type"].(string)
|
||||||
|
msg, _ := errObj["message"].(string)
|
||||||
|
// Gemini googleError also uses "error": { code, message, status }
|
||||||
|
if msg == "" {
|
||||||
|
if v, ok := errObj["message"]; ok {
|
||||||
|
msg, _ = v.(string)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if t == "" {
|
||||||
|
// Gemini error does not have "type" field.
|
||||||
|
t = "api_error"
|
||||||
|
}
|
||||||
|
// For gemini error, capture numeric code as string for business-limited mapping if needed.
|
||||||
|
var code string
|
||||||
|
if v, ok := errObj["code"]; ok {
|
||||||
|
switch n := v.(type) {
|
||||||
|
case float64:
|
||||||
|
code = strconvItoa(int(n))
|
||||||
|
case int:
|
||||||
|
code = strconvItoa(n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return parsedOpsError{ErrorType: t, Message: msg, Code: code}
|
||||||
|
}
|
||||||
|
|
||||||
|
// APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." }
|
||||||
|
code, _ := m["code"].(string)
|
||||||
|
msg, _ := m["message"].(string)
|
||||||
|
if code != "" || msg != "" {
|
||||||
|
return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code}
|
||||||
|
}
|
||||||
|
|
||||||
|
return parsedOpsError{Message: truncateString(string(body), 1024)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
|
||||||
|
if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
return apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
|
||||||
|
func guessPlatformFromPath(path string) string {
|
||||||
|
p := strings.ToLower(path)
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(p, "/antigravity/"):
|
||||||
|
return service.PlatformAntigravity
|
||||||
|
case strings.HasPrefix(p, "/v1beta/"):
|
||||||
|
return service.PlatformGemini
|
||||||
|
case strings.Contains(p, "/responses"):
|
||||||
|
return service.PlatformOpenAI
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeOpsErrorType passes through a non-empty error type unchanged and
// otherwise derives one from the business code, defaulting to "api_error".
func normalizeOpsErrorType(errType string, code string) string {
	if errType != "" {
		return errType
	}
	trimmed := strings.TrimSpace(code)
	if trimmed == "INSUFFICIENT_BALANCE" {
		return "billing_error"
	}
	for _, subscriptionCode := range []string{
		"USAGE_LIMIT_EXCEEDED",
		"SUBSCRIPTION_NOT_FOUND",
		"SUBSCRIPTION_INVALID",
	} {
		if trimmed == subscriptionCode {
			return "subscription_error"
		}
	}
	return "api_error"
}
|
||||||
|
|
||||||
|
// classifyOpsPhase maps an error (type, message, business code) to the gateway
// phase where it originated: billing, auth, concurrency, upstream, response,
// scheduling, or internal.
func classifyOpsPhase(errType, message, code string) string {
	lowered := strings.ToLower(message)

	// Business codes always indicate billing, regardless of error type.
	billingCodes := map[string]bool{
		"INSUFFICIENT_BALANCE":   true,
		"USAGE_LIMIT_EXCEEDED":   true,
		"SUBSCRIPTION_NOT_FOUND": true,
		"SUBSCRIPTION_INVALID":   true,
	}
	if billingCodes[strings.TrimSpace(code)] {
		return "billing"
	}

	switch errType {
	case "authentication_error":
		return "auth"
	case "billing_error", "subscription_error":
		return "billing"
	case "rate_limit_error":
		// Queue/pending wording marks a local concurrency limit rather than
		// an upstream rate limit.
		for _, marker := range []string{"concurrency", "pending", "queue"} {
			if strings.Contains(lowered, marker) {
				return "concurrency"
			}
		}
		return "upstream"
	case "invalid_request_error":
		return "response"
	case "upstream_error", "overloaded_error":
		return "upstream"
	case "api_error":
		if strings.Contains(lowered, "no available accounts") {
			return "scheduling"
		}
		return "internal"
	}
	return "internal"
}
|
||||||
|
|
||||||
|
// classifyOpsSeverity derives an incident priority (P1..P3) from the error
// type and HTTP status. Client-caused error types are always informational.
func classifyOpsSeverity(errType string, status int) string {
	clientCaused := map[string]struct{}{
		"invalid_request_error": {},
		"authentication_error":  {},
		"billing_error":         {},
		"subscription_error":    {},
	}
	if _, ok := clientCaused[errType]; ok {
		return "P3"
	}
	switch {
	case status >= 500, status == 429:
		return "P1"
	case status >= 400:
		return "P2"
	default:
		return "P3"
	}
}
|
||||||
|
|
||||||
|
// classifyOpsIsRetryable reports whether a retry of the original request has
// a reasonable chance of succeeding, based on error type and status code.
func classifyOpsIsRetryable(errType string, statusCode int) bool {
	switch errType {
	case "authentication_error", "invalid_request_error",
		"billing_error", "subscription_error":
		// Deterministic client/account problems: retrying cannot help.
		return false
	case "timeout_error", "rate_limit_error":
		// Both may be transient (upstream or queue); a retry can help.
		return true
	case "upstream_error", "overloaded_error":
		return statusCode >= 500 || statusCode == 429 || statusCode == 529
	}
	return statusCode >= 500
}
|
||||||
|
|
||||||
|
// classifyOpsIsBusinessLimited reports whether an error is a user-level
// business limit (balance/subscription/concurrency). SLA and error-rate
// metrics exclude business-limited requests. The status parameter is kept
// for interface stability but is not consulted.
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
	trimmedCode := strings.TrimSpace(code)
	for _, limited := range []string{
		"INSUFFICIENT_BALANCE",
		"USAGE_LIMIT_EXCEEDED",
		"SUBSCRIPTION_NOT_FOUND",
		"SUBSCRIPTION_INVALID",
	} {
		if trimmedCode == limited {
			return true
		}
	}
	if phase == "billing" || phase == "concurrency" {
		return true
	}
	// Upstream-attributed rate limits are not a user business limit.
	if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") {
		return false
	}
	return false
}
|
||||||
|
|
||||||
|
// classifyOpsErrorOwner attributes an error to the responsible party:
// "provider" (upstream vendor), "client" (caller/account), or "sub2api"
// (this gateway).
func classifyOpsErrorOwner(phase string, message string) string {
	providerPhases := map[string]bool{"upstream": true, "network": true}
	clientPhases := map[string]bool{
		"billing":     true,
		"concurrency": true,
		"auth":        true,
		"response":    true,
	}
	switch {
	case providerPhases[phase]:
		return "provider"
	case clientPhases[phase]:
		return "client"
	case strings.Contains(strings.ToLower(message), "upstream"):
		return "provider"
	default:
		return "sub2api"
	}
}
|
||||||
|
|
||||||
|
// classifyOpsErrorSource maps an error phase to a coarse source label used
// for ops dashboards; messages mentioning "upstream" fall back to
// "upstream_http", everything else to "internal".
func classifyOpsErrorSource(phase string, message string) string {
	sourceByPhase := map[string]string{
		"upstream":    "upstream_http",
		"network":     "upstream_network",
		"billing":     "billing",
		"concurrency": "concurrency",
	}
	if src, ok := sourceByPhase[phase]; ok {
		return src
	}
	if strings.Contains(strings.ToLower(message), "upstream") {
		return "upstream_http"
	}
	return "internal"
}
|
||||||
|
|
||||||
|
// truncateString returns s limited to at most max bytes without splitting a
// multi-byte UTF-8 rune at the cut point. max <= 0 yields "".
//
// Fix: the original re-validated the whole prefix with utf8.ValidString on
// every trim iteration (O(n·max) worst case) and, for strings that already
// contained invalid UTF-8 before the cut, stripped everything back past the
// invalid byte. Only a trailing partial rune (at most utf8.UTFMax-1 bytes)
// can result from the cut itself, so trim just that.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Drop trailing bytes that form an incomplete rune produced by the cut.
	for i := 0; i < utf8.UTFMax-1 && len(cut) > 0; i++ {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size > 1 {
			break // last rune is complete
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
|
||||||
|
|
||||||
|
// strconvItoa renders v in base-10 decimal form.
func strconvItoa(v int) string {
	return strconv.FormatInt(int64(v), 10)
}
|
||||||
@@ -21,6 +21,7 @@ func ProvideAdminHandlers(
|
|||||||
redeemHandler *admin.RedeemHandler,
|
redeemHandler *admin.RedeemHandler,
|
||||||
promoHandler *admin.PromoHandler,
|
promoHandler *admin.PromoHandler,
|
||||||
settingHandler *admin.SettingHandler,
|
settingHandler *admin.SettingHandler,
|
||||||
|
opsHandler *admin.OpsHandler,
|
||||||
systemHandler *admin.SystemHandler,
|
systemHandler *admin.SystemHandler,
|
||||||
subscriptionHandler *admin.SubscriptionHandler,
|
subscriptionHandler *admin.SubscriptionHandler,
|
||||||
usageHandler *admin.UsageHandler,
|
usageHandler *admin.UsageHandler,
|
||||||
@@ -39,6 +40,7 @@ func ProvideAdminHandlers(
|
|||||||
Redeem: redeemHandler,
|
Redeem: redeemHandler,
|
||||||
Promo: promoHandler,
|
Promo: promoHandler,
|
||||||
Setting: settingHandler,
|
Setting: settingHandler,
|
||||||
|
Ops: opsHandler,
|
||||||
System: systemHandler,
|
System: systemHandler,
|
||||||
Subscription: subscriptionHandler,
|
Subscription: subscriptionHandler,
|
||||||
Usage: usageHandler,
|
Usage: usageHandler,
|
||||||
@@ -109,6 +111,7 @@ var ProviderSet = wire.NewSet(
|
|||||||
admin.NewRedeemHandler,
|
admin.NewRedeemHandler,
|
||||||
admin.NewPromoHandler,
|
admin.NewPromoHandler,
|
||||||
admin.NewSettingHandler,
|
admin.NewSettingHandler,
|
||||||
|
admin.NewOpsHandler,
|
||||||
ProvideSystemHandler,
|
ProvideSystemHandler,
|
||||||
admin.NewSubscriptionHandler,
|
admin.NewSubscriptionHandler,
|
||||||
admin.NewUsageHandler,
|
admin.NewUsageHandler,
|
||||||
|
|||||||
@@ -7,7 +7,14 @@ type Key string
|
|||||||
const (
|
const (
|
||||||
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
|
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
|
||||||
ForcePlatform Key = "ctx_force_platform"
|
ForcePlatform Key = "ctx_force_platform"
|
||||||
// IsClaudeCodeClient 是否为 Claude Code 客户端,由中间件设置
|
|
||||||
|
// ClientRequestID 客户端请求的唯一标识,用于追踪请求全生命周期(用于 Ops 监控与排障)。
|
||||||
|
ClientRequestID Key = "ctx_client_request_id"
|
||||||
|
|
||||||
|
// RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。
|
||||||
|
RetryCount Key = "ctx_retry_count"
|
||||||
|
|
||||||
|
// IsClaudeCodeClient 标识当前请求是否来自 Claude Code 客户端
|
||||||
IsClaudeCodeClient Key = "ctx_is_claude_code_client"
|
IsClaudeCodeClient Key = "ctx_is_claude_code_client"
|
||||||
// Group 认证后的分组信息,由 API Key 认证中间件设置
|
// Group 认证后的分组信息,由 API Key 认证中间件设置
|
||||||
Group Key = "ctx_group"
|
Group Key = "ctx_group"
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ var (
|
|||||||
return redis.call('ZCARD', key)
|
return redis.call('ZCARD', key)
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// incrementWaitScript - only sets TTL on first creation to avoid refreshing
|
// incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate
|
||||||
// KEYS[1] = wait queue key
|
// KEYS[1] = wait queue key
|
||||||
// ARGV[1] = maxWait
|
// ARGV[1] = maxWait
|
||||||
// ARGV[2] = TTL in seconds
|
// ARGV[2] = TTL in seconds
|
||||||
@@ -111,15 +111,13 @@ var (
|
|||||||
|
|
||||||
local newVal = redis.call('INCR', KEYS[1])
|
local newVal = redis.call('INCR', KEYS[1])
|
||||||
|
|
||||||
-- Only set TTL on first creation to avoid refreshing zombie data
|
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
|
||||||
if newVal == 1 then
|
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||||
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
|
||||||
end
|
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// incrementAccountWaitScript - account-level wait queue count
|
// incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment)
|
||||||
incrementAccountWaitScript = redis.NewScript(`
|
incrementAccountWaitScript = redis.NewScript(`
|
||||||
local current = redis.call('GET', KEYS[1])
|
local current = redis.call('GET', KEYS[1])
|
||||||
if current == false then
|
if current == false then
|
||||||
@@ -134,10 +132,8 @@ var (
|
|||||||
|
|
||||||
local newVal = redis.call('INCR', KEYS[1])
|
local newVal = redis.call('INCR', KEYS[1])
|
||||||
|
|
||||||
-- Only set TTL on first creation to avoid refreshing zombie data
|
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
|
||||||
if newVal == 1 then
|
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||||
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
|
||||||
end
|
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
`)
|
`)
|
||||||
|
|||||||
707
backend/internal/repository/ops_repo.go
Normal file
707
backend/internal/repository/ops_repo.go
Normal file
@@ -0,0 +1,707 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// opsRepository is a raw-SQL implementation of service.OpsRepository backed
// by PostgreSQL (ops_error_logs / ops_retry_attempts tables).
type opsRepository struct {
	db *sql.DB // shared connection pool; every method nil-guards it defensively
}
|
||||||
|
|
||||||
|
// NewOpsRepository returns a service.OpsRepository backed by the given
// database handle.
func NewOpsRepository(db *sql.DB) service.OpsRepository {
	return &opsRepository{db: db}
}
|
||||||
|
|
||||||
|
// InsertErrorLog persists a single gateway error event into ops_error_logs
// and returns the new row's id. Optional fields are mapped to SQL NULL via
// the opsNull* helpers so empty strings / nil pointers are never stored as
// empty values.
func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) {
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}

	// NOTE: the column list, the $1..$34 placeholders, and the argument list
	// passed to QueryRowContext below must stay in exact positional sync.
	q := `
	INSERT INTO ops_error_logs (
		request_id,
		client_request_id,
		user_id,
		api_key_id,
		account_id,
		group_id,
		client_ip,
		platform,
		model,
		request_path,
		stream,
		user_agent,
		error_phase,
		error_type,
		severity,
		status_code,
		is_business_limited,
		error_message,
		error_body,
		error_source,
		error_owner,
		upstream_status_code,
		upstream_error_message,
		upstream_error_detail,
		upstream_errors,
		duration_ms,
		time_to_first_token_ms,
		request_body,
		request_body_truncated,
		request_body_bytes,
		request_headers,
		is_retryable,
		retry_count,
		created_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
	) RETURNING id`

	var id int64
	err := r.db.QueryRowContext(
		ctx,
		q,
		opsNullString(input.RequestID),
		opsNullString(input.ClientRequestID),
		opsNullInt64(input.UserID),
		opsNullInt64(input.APIKeyID),
		opsNullInt64(input.AccountID),
		opsNullInt64(input.GroupID),
		opsNullString(input.ClientIP),
		opsNullString(input.Platform),
		opsNullString(input.Model),
		opsNullString(input.RequestPath),
		input.Stream,
		opsNullString(input.UserAgent),
		input.ErrorPhase,
		input.ErrorType,
		opsNullString(input.Severity),
		opsNullInt(input.StatusCode),
		input.IsBusinessLimited,
		opsNullString(input.ErrorMessage),
		opsNullString(input.ErrorBody),
		opsNullString(input.ErrorSource),
		opsNullString(input.ErrorOwner),
		opsNullInt(input.UpstreamStatusCode),
		opsNullString(input.UpstreamErrorMessage),
		opsNullString(input.UpstreamErrorDetail),
		opsNullString(input.UpstreamErrorsJSON),
		opsNullInt(input.DurationMs),
		opsNullInt64(input.TimeToFirstTokenMs),
		opsNullString(input.RequestBodyJSON),
		input.RequestBodyTruncated,
		opsNullInt(input.RequestBodyBytes),
		opsNullString(input.RequestHeadersJSON),
		input.IsRetryable,
		input.RetryCount,
		input.CreatedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
|
||||||
|
|
||||||
|
// ListErrorLogs returns a page of error logs matching the filter, newest
// first, along with the total match count. A nil filter means "no filter";
// page defaults to 1, pageSize to 20 and is capped at 500.
func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsErrorLogFilter{}
	}

	// Normalize pagination inputs.
	page := filter.Page
	if page <= 0 {
		page = 1
	}
	pageSize := filter.PageSize
	if pageSize <= 0 {
		pageSize = 20
	}
	if pageSize > 500 {
		pageSize = 500
	}

	// The same WHERE clause (and args) is shared by the count and page queries
	// so total and rows always agree.
	where, args := buildOpsErrorLogsWhere(filter)
	countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where

	var total int
	if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil {
		return nil, err
	}

	offset := (page - 1) * pageSize
	argsWithLimit := append(args, pageSize, offset)
	// NOTE: the SELECT column order must stay in sync with the rows.Scan
	// destinations below.
	selectSQL := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream
	FROM ops_error_logs
	` + where + `
	ORDER BY created_at DESC
	LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)

	rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsErrorLog, 0, pageSize)
	for rows.Next() {
		var item service.OpsErrorLog
		// Nullable columns are scanned into sql.Null* intermediates and copied
		// to pointer fields only when present.
		var latency sql.NullInt64
		var statusCode sql.NullInt64
		var clientIP sql.NullString
		var userID sql.NullInt64
		var apiKeyID sql.NullInt64
		var accountID sql.NullInt64
		var groupID sql.NullInt64
		if err := rows.Scan(
			&item.ID,
			&item.CreatedAt,
			&item.Phase,
			&item.Type,
			&item.Severity,
			&statusCode,
			&item.Platform,
			&item.Model,
			&latency,
			&item.ClientRequestID,
			&item.RequestID,
			&item.Message,
			&userID,
			&apiKeyID,
			&accountID,
			&groupID,
			&clientIP,
			&item.RequestPath,
			&item.Stream,
		); err != nil {
			return nil, err
		}
		if latency.Valid {
			v := int(latency.Int64)
			item.LatencyMs = &v
		}
		item.StatusCode = int(statusCode.Int64)
		if clientIP.Valid {
			s := clientIP.String
			item.ClientIP = &s
		}
		if userID.Valid {
			v := userID.Int64
			item.UserID = &v
		}
		if apiKeyID.Valid {
			v := apiKeyID.Int64
			item.APIKeyID = &v
		}
		if accountID.Valid {
			v := accountID.Int64
			item.AccountID = &v
		}
		if groupID.Valid {
			v := groupID.Int64
			item.GroupID = &v
		}
		out = append(out, &item)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return &service.OpsErrorLogList{
		Errors:   out,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}
|
||||||
|
|
||||||
|
// GetErrorLogByID loads the full detail record for one error log, including
// request/response bodies, latency breakdown, and upstream error context.
// It returns sql.ErrNoRows (wrapped by database/sql) when the id is unknown.
func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if id <= 0 {
		return nil, fmt.Errorf("invalid id")
	}

	// NOTE: the SELECT column order must stay in sync with the Scan
	// destinations below.
	q := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		COALESCE(error_body, ''),
		upstream_status_code,
		COALESCE(upstream_error_message, ''),
		COALESCE(upstream_error_detail, ''),
		COALESCE(upstream_errors::text, ''),
		is_business_limited,
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream,
		COALESCE(user_agent, ''),
		auth_latency_ms,
		routing_latency_ms,
		upstream_latency_ms,
		response_latency_ms,
		time_to_first_token_ms,
		COALESCE(request_body::text, ''),
		request_body_truncated,
		request_body_bytes,
		COALESCE(request_headers::text, '')
	FROM ops_error_logs
	WHERE id = $1
	LIMIT 1`

	var out service.OpsErrorLogDetail
	// Nullable columns go through sql.Null* intermediates; pointer fields are
	// populated only when a value is present.
	var latency sql.NullInt64
	var statusCode sql.NullInt64
	var upstreamStatusCode sql.NullInt64
	var clientIP sql.NullString
	var userID sql.NullInt64
	var apiKeyID sql.NullInt64
	var accountID sql.NullInt64
	var groupID sql.NullInt64
	var authLatency sql.NullInt64
	var routingLatency sql.NullInt64
	var upstreamLatency sql.NullInt64
	var responseLatency sql.NullInt64
	var ttft sql.NullInt64
	var requestBodyBytes sql.NullInt64

	err := r.db.QueryRowContext(ctx, q, id).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.Phase,
		&out.Type,
		&out.Severity,
		&statusCode,
		&out.Platform,
		&out.Model,
		&latency,
		&out.ClientRequestID,
		&out.RequestID,
		&out.Message,
		&out.ErrorBody,
		&upstreamStatusCode,
		&out.UpstreamErrorMessage,
		&out.UpstreamErrorDetail,
		&out.UpstreamErrors,
		&out.IsBusinessLimited,
		&userID,
		&apiKeyID,
		&accountID,
		&groupID,
		&clientIP,
		&out.RequestPath,
		&out.Stream,
		&out.UserAgent,
		&authLatency,
		&routingLatency,
		&upstreamLatency,
		&responseLatency,
		&ttft,
		&out.RequestBody,
		&out.RequestBodyTruncated,
		&requestBodyBytes,
		&out.RequestHeaders,
	)
	if err != nil {
		return nil, err
	}

	out.StatusCode = int(statusCode.Int64)
	if latency.Valid {
		v := int(latency.Int64)
		out.LatencyMs = &v
	}
	if clientIP.Valid {
		s := clientIP.String
		out.ClientIP = &s
	}
	// Zero is treated as "no upstream status" even when the column is non-NULL.
	if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 {
		v := int(upstreamStatusCode.Int64)
		out.UpstreamStatusCode = &v
	}
	if userID.Valid {
		v := userID.Int64
		out.UserID = &v
	}
	if apiKeyID.Valid {
		v := apiKeyID.Int64
		out.APIKeyID = &v
	}
	if accountID.Valid {
		v := accountID.Int64
		out.AccountID = &v
	}
	if groupID.Valid {
		v := groupID.Int64
		out.GroupID = &v
	}
	if authLatency.Valid {
		v := authLatency.Int64
		out.AuthLatencyMs = &v
	}
	if routingLatency.Valid {
		v := routingLatency.Int64
		out.RoutingLatencyMs = &v
	}
	if upstreamLatency.Valid {
		v := upstreamLatency.Int64
		out.UpstreamLatencyMs = &v
	}
	if responseLatency.Valid {
		v := responseLatency.Int64
		out.ResponseLatencyMs = &v
	}
	if ttft.Valid {
		v := ttft.Int64
		out.TimeToFirstTokenMs = &v
	}
	if requestBodyBytes.Valid {
		v := int(requestBodyBytes.Int64)
		out.RequestBodyBytes = &v
	}

	// JSONB columns rendered via ::text produce the literal "null" for SQL
	// JSON null; normalize those to empty strings for API consumers.
	// Normalize request_body to empty string when stored as JSON null.
	out.RequestBody = strings.TrimSpace(out.RequestBody)
	if out.RequestBody == "null" {
		out.RequestBody = ""
	}
	// Normalize request_headers to empty string when stored as JSON null.
	out.RequestHeaders = strings.TrimSpace(out.RequestHeaders)
	if out.RequestHeaders == "null" {
		out.RequestHeaders = ""
	}
	// Normalize upstream_errors to empty string when stored as JSON null.
	out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors)
	if out.UpstreamErrors == "null" {
		out.UpstreamErrors = ""
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// InsertRetryAttempt records the start of a manual retry of a logged error
// and returns the attempt's id. SourceErrorID and Mode are mandatory; the
// completion fields are filled in later by UpdateRetryAttempt.
func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) {
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}
	if input.SourceErrorID <= 0 {
		return 0, fmt.Errorf("invalid source_error_id")
	}
	if strings.TrimSpace(input.Mode) == "" {
		return 0, fmt.Errorf("invalid mode")
	}

	q := `
	INSERT INTO ops_retry_attempts (
		requested_by_user_id,
		source_error_id,
		mode,
		pinned_account_id,
		status,
		started_at
	) VALUES (
		$1,$2,$3,$4,$5,$6
	) RETURNING id`

	var id int64
	err := r.db.QueryRowContext(
		ctx,
		q,
		// Zero user id is stored as NULL via the pointer-aware helper.
		opsNullInt64(&input.RequestedByUserID),
		input.SourceErrorID,
		strings.TrimSpace(input.Mode),
		opsNullInt64(input.PinnedAccountID),
		strings.TrimSpace(input.Status),
		input.StartedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
|
||||||
|
|
||||||
|
// UpdateRetryAttempt records the outcome of a retry attempt previously
// created by InsertRetryAttempt: final status, timing, and (when the retry
// itself failed) the resulting error linkage.
func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return fmt.Errorf("invalid id")
	}

	q := `
	UPDATE ops_retry_attempts
	SET
		status = $2,
		finished_at = $3,
		duration_ms = $4,
		result_request_id = $5,
		result_error_id = $6,
		error_message = $7
	WHERE id = $1`

	_, err := r.db.ExecContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Status),
		// A zero FinishedAt is persisted as NULL (see nullTime).
		nullTime(input.FinishedAt),
		input.DurationMs,
		opsNullString(input.ResultRequestID),
		opsNullInt64(input.ResultErrorID),
		opsNullString(input.ErrorMessage),
	)
	return err
}
|
||||||
|
|
||||||
|
// GetLatestRetryAttemptForError returns the most recent retry attempt for a
// given source error, or sql.ErrNoRows when none exists.
func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if sourceErrorID <= 0 {
		return nil, fmt.Errorf("invalid source_error_id")
	}

	// NOTE: the SELECT column order must stay in sync with the Scan
	// destinations below.
	q := `
	SELECT
		id,
		created_at,
		COALESCE(requested_by_user_id, 0),
		source_error_id,
		COALESCE(mode, ''),
		pinned_account_id,
		COALESCE(status, ''),
		started_at,
		finished_at,
		duration_ms,
		result_request_id,
		result_error_id,
		error_message
	FROM ops_retry_attempts
	WHERE source_error_id = $1
	ORDER BY created_at DESC
	LIMIT 1`

	var out service.OpsRetryAttempt
	// Nullable columns are scanned into sql.Null* intermediates and copied to
	// pointer fields only when present.
	var pinnedAccountID sql.NullInt64
	var requestedBy sql.NullInt64
	var startedAt sql.NullTime
	var finishedAt sql.NullTime
	var durationMs sql.NullInt64
	var resultRequestID sql.NullString
	var resultErrorID sql.NullInt64
	var errorMessage sql.NullString

	err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan(
		&out.ID,
		&out.CreatedAt,
		&requestedBy,
		&out.SourceErrorID,
		&out.Mode,
		&pinnedAccountID,
		&out.Status,
		&startedAt,
		&finishedAt,
		&durationMs,
		&resultRequestID,
		&resultErrorID,
		&errorMessage,
	)
	if err != nil {
		return nil, err
	}
	out.RequestedByUserID = requestedBy.Int64
	if pinnedAccountID.Valid {
		v := pinnedAccountID.Int64
		out.PinnedAccountID = &v
	}
	if startedAt.Valid {
		t := startedAt.Time
		out.StartedAt = &t
	}
	if finishedAt.Valid {
		t := finishedAt.Time
		out.FinishedAt = &t
	}
	if durationMs.Valid {
		v := durationMs.Int64
		out.DurationMs = &v
	}
	if resultRequestID.Valid {
		s := resultRequestID.String
		out.ResultRequestID = &s
	}
	if resultErrorID.Valid {
		v := resultErrorID.Int64
		out.ResultErrorID = &v
	}
	if errorMessage.Valid {
		s := errorMessage.String
		out.ErrorMessage = &s
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
func nullTime(t time.Time) sql.NullTime {
|
||||||
|
if t.IsZero() {
|
||||||
|
return sql.NullTime{}
|
||||||
|
}
|
||||||
|
return sql.NullTime{Time: t, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||||
|
clauses := make([]string, 0, 8)
|
||||||
|
args := make([]any, 0, 8)
|
||||||
|
clauses = append(clauses, "1=1")
|
||||||
|
|
||||||
|
phaseFilter := ""
|
||||||
|
if filter != nil {
|
||||||
|
phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase))
|
||||||
|
}
|
||||||
|
// ops_error_logs primarily stores client-visible error requests (status>=400),
|
||||||
|
// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
|
||||||
|
// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
|
||||||
|
if phaseFilter != "upstream" {
|
||||||
|
clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||||
|
args = append(args, filter.StartTime.UTC())
|
||||||
|
clauses = append(clauses, "created_at >= $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.EndTime != nil && !filter.EndTime.IsZero() {
|
||||||
|
args = append(args, filter.EndTime.UTC())
|
||||||
|
// Keep time-window semantics consistent with other ops queries: [start, end)
|
||||||
|
clauses = append(clauses, "created_at < $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if p := strings.TrimSpace(filter.Platform); p != "" {
|
||||||
|
args = append(args, p)
|
||||||
|
clauses = append(clauses, "platform = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
args = append(args, *filter.GroupID)
|
||||||
|
clauses = append(clauses, "group_id = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.AccountID != nil && *filter.AccountID > 0 {
|
||||||
|
args = append(args, *filter.AccountID)
|
||||||
|
clauses = append(clauses, "account_id = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if phase := phaseFilter; phase != "" {
|
||||||
|
args = append(args, phase)
|
||||||
|
clauses = append(clauses, "error_phase = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if len(filter.StatusCodes) > 0 {
|
||||||
|
args = append(args, pq.Array(filter.StatusCodes))
|
||||||
|
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||||
|
like := "%" + q + "%"
|
||||||
|
args = append(args, like)
|
||||||
|
n := itoa(len(args))
|
||||||
|
clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")")
|
||||||
|
}
|
||||||
|
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helpers for nullable args
|
||||||
|
func opsNullString(v any) any {
|
||||||
|
switch s := v.(type) {
|
||||||
|
case nil:
|
||||||
|
return sql.NullString{}
|
||||||
|
case *string:
|
||||||
|
if s == nil || strings.TrimSpace(*s) == "" {
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
return sql.NullString{String: strings.TrimSpace(*s), Valid: true}
|
||||||
|
case string:
|
||||||
|
if strings.TrimSpace(s) == "" {
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
return sql.NullString{String: strings.TrimSpace(s), Valid: true}
|
||||||
|
default:
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullInt64(v *int64) any {
|
||||||
|
if v == nil || *v == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullInt(v any) any {
|
||||||
|
switch n := v.(type) {
|
||||||
|
case nil:
|
||||||
|
return sql.NullInt64{}
|
||||||
|
case *int:
|
||||||
|
if n == nil || *n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: int64(*n), Valid: true}
|
||||||
|
case *int64:
|
||||||
|
if n == nil || *n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: *n, Valid: true}
|
||||||
|
case int:
|
||||||
|
if n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: int64(n), Valid: true}
|
||||||
|
default:
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
}
|
||||||
689
backend/internal/repository/ops_repo_alerts.go
Normal file
689
backend/internal/repository/ops_repo_alerts.go
Normal file
@@ -0,0 +1,689 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ListAlertRules returns every configured alert rule, newest first.
//
// Nullable text columns are COALESCE'd to '' so they scan into plain string
// fields; last_triggered_at and the filters JSONB blob are decoded into
// optional fields only when present.
func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}

	q := `
		SELECT
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at
		FROM ops_alert_rules
		ORDER BY id DESC`

	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := []*service.OpsAlertRule{}
	for rows.Next() {
		var rule service.OpsAlertRule
		var filtersRaw []byte
		var lastTriggeredAt sql.NullTime
		// Scan order must match the SELECT column list above.
		if err := rows.Scan(
			&rule.ID,
			&rule.Name,
			&rule.Description,
			&rule.Enabled,
			&rule.Severity,
			&rule.MetricType,
			&rule.Operator,
			&rule.Threshold,
			&rule.WindowMinutes,
			&rule.SustainedMinutes,
			&rule.CooldownMinutes,
			&rule.NotifyEmail,
			&filtersRaw,
			&lastTriggeredAt,
			&rule.CreatedAt,
			&rule.UpdatedAt,
		); err != nil {
			return nil, err
		}
		if lastTriggeredAt.Valid {
			v := lastTriggeredAt.Time
			rule.LastTriggeredAt = &v
		}
		// JSONB NULL scans back as the literal "null"; skip it along with
		// empty blobs. Decode errors are deliberately ignored so one
		// malformed filters value cannot fail the whole listing.
		if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
				rule.Filters = decoded
			}
		}
		out = append(out, &rule)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
// CreateAlertRule inserts a new alert rule and returns the persisted row,
// including the generated id and server-side created_at/updated_at.
// String inputs are trimmed before storage; Filters are serialized to JSONB
// (NULL when nil).
func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}

	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}

	q := `
		INSERT INTO ops_alert_rules (
			name,
			description,
			enabled,
			severity,
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			notify_email,
			filters,
			created_at,
			updated_at
		) VALUES (
			$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW()
		)
		RETURNING
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at`

	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime

	// Argument order must match the VALUES placeholders; scan order must
	// match the RETURNING column list.
	if err := r.db.QueryRowContext(
		ctx,
		q,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}
	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so a malformed blob cannot fail the create response.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// UpdateAlertRule replaces every mutable column of an existing rule and
// returns the updated row. A missing id surfaces as sql.ErrNoRows from the
// Scan. String inputs are trimmed; Filters are serialized to JSONB (NULL when
// nil). last_triggered_at is intentionally not modified here.
func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return nil, fmt.Errorf("invalid id")
	}

	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}

	q := `
		UPDATE ops_alert_rules
		SET
			name = $2,
			description = $3,
			enabled = $4,
			severity = $5,
			metric_type = $6,
			operator = $7,
			threshold = $8,
			window_minutes = $9,
			sustained_minutes = $10,
			cooldown_minutes = $11,
			notify_email = $12,
			filters = $13,
			updated_at = NOW()
		WHERE id = $1
		RETURNING
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at`

	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime

	// Argument order must match the SET placeholders ($1 is the id); scan
	// order must match the RETURNING column list.
	if err := r.db.QueryRowContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}

	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so a malformed blob cannot fail the update response.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if id <= 0 {
|
||||||
|
return fmt.Errorf("invalid id")
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
affected, err := res.RowsAffected()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if affected == 0 {
|
||||||
|
return sql.ErrNoRows
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertEvents returns alert events matching the filter, newest first.
// A nil filter lists everything; Limit defaults to 100 and is capped at 500.
func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsAlertEventFilter{}
	}

	limit := filter.Limit
	if limit <= 0 {
		limit = 100
	}
	if limit > 500 {
		limit = 500
	}

	where, args := buildOpsAlertEventsWhere(filter)
	// Append the limit last so its placeholder index follows the filter args.
	args = append(args, limit)
	limitArg := "$" + itoa(len(args))

	q := `
		SELECT
			id,
			COALESCE(rule_id, 0),
			COALESCE(severity, ''),
			COALESCE(status, ''),
			COALESCE(title, ''),
			COALESCE(description, ''),
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at
		FROM ops_alert_events
		` + where + `
		ORDER BY fired_at DESC
		LIMIT ` + limitArg

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := []*service.OpsAlertEvent{}
	for rows.Next() {
		var ev service.OpsAlertEvent
		var metricValue sql.NullFloat64
		var thresholdValue sql.NullFloat64
		var dimensionsRaw []byte
		var resolvedAt sql.NullTime
		// Scan order must match the SELECT column list above.
		if err := rows.Scan(
			&ev.ID,
			&ev.RuleID,
			&ev.Severity,
			&ev.Status,
			&ev.Title,
			&ev.Description,
			&metricValue,
			&thresholdValue,
			&dimensionsRaw,
			&ev.FiredAt,
			&resolvedAt,
			&ev.EmailSent,
			&ev.CreatedAt,
		); err != nil {
			return nil, err
		}
		// Promote nullable columns to optional pointers only when present.
		if metricValue.Valid {
			v := metricValue.Float64
			ev.MetricValue = &v
		}
		if thresholdValue.Valid {
			v := thresholdValue.Float64
			ev.ThresholdValue = &v
		}
		if resolvedAt.Valid {
			v := resolvedAt.Time
			ev.ResolvedAt = &v
		}
		// JSONB NULL scans as the literal "null"; decode errors are ignored
		// so one bad row cannot fail the whole listing.
		if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
				ev.Dimensions = decoded
			}
		}
		out = append(out, &ev)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, fmt.Errorf("invalid rule id")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
COALESCE(rule_id, 0),
|
||||||
|
COALESCE(severity, ''),
|
||||||
|
COALESCE(status, ''),
|
||||||
|
COALESCE(title, ''),
|
||||||
|
COALESCE(description, ''),
|
||||||
|
metric_value,
|
||||||
|
threshold_value,
|
||||||
|
dimensions,
|
||||||
|
fired_at,
|
||||||
|
resolved_at,
|
||||||
|
email_sent,
|
||||||
|
created_at
|
||||||
|
FROM ops_alert_events
|
||||||
|
WHERE rule_id = $1 AND status = $2
|
||||||
|
ORDER BY fired_at DESC
|
||||||
|
LIMIT 1`
|
||||||
|
|
||||||
|
row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring)
|
||||||
|
ev, err := scanOpsAlertEvent(row)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ev, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, fmt.Errorf("invalid rule id")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
COALESCE(rule_id, 0),
|
||||||
|
COALESCE(severity, ''),
|
||||||
|
COALESCE(status, ''),
|
||||||
|
COALESCE(title, ''),
|
||||||
|
COALESCE(description, ''),
|
||||||
|
metric_value,
|
||||||
|
threshold_value,
|
||||||
|
dimensions,
|
||||||
|
fired_at,
|
||||||
|
resolved_at,
|
||||||
|
email_sent,
|
||||||
|
created_at
|
||||||
|
FROM ops_alert_events
|
||||||
|
WHERE rule_id = $1
|
||||||
|
ORDER BY fired_at DESC
|
||||||
|
LIMIT 1`
|
||||||
|
|
||||||
|
row := r.db.QueryRowContext(ctx, q, ruleID)
|
||||||
|
ev, err := scanOpsAlertEvent(row)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ev, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateAlertEvent inserts one alert event and returns the persisted row.
// Optional columns (rule_id, text fields, metric/threshold values,
// resolved_at) are written as NULL when empty/zero via the opsNull* helpers;
// Dimensions are serialized to JSONB (NULL when nil).
func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if event == nil {
		return nil, fmt.Errorf("nil event")
	}

	dimensionsArg, err := opsNullJSONMap(event.Dimensions)
	if err != nil {
		return nil, err
	}

	q := `
		INSERT INTO ops_alert_events (
			rule_id,
			severity,
			status,
			title,
			description,
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at
		) VALUES (
			$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW()
		)
		RETURNING
			id,
			COALESCE(rule_id, 0),
			COALESCE(severity, ''),
			COALESCE(status, ''),
			COALESCE(title, ''),
			COALESCE(description, ''),
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at`

	// The RETURNING column order matches scanOpsAlertEvent's destinations;
	// argument order must match the VALUES placeholders.
	row := r.db.QueryRowContext(
		ctx,
		q,
		opsNullInt64(&event.RuleID),
		opsNullString(event.Severity),
		opsNullString(event.Status),
		opsNullString(event.Title),
		opsNullString(event.Description),
		opsNullFloat64(event.MetricValue),
		opsNullFloat64(event.ThresholdValue),
		dimensionsArg,
		event.FiredAt,
		opsNullTime(event.ResolvedAt),
		event.EmailSent,
	)
	return scanOpsAlertEvent(row)
}
|
||||||
|
|
||||||
|
func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return fmt.Errorf("invalid event id")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(status) == "" {
|
||||||
|
return fmt.Errorf("invalid status")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
UPDATE ops_alert_events
|
||||||
|
SET status = $2,
|
||||||
|
resolved_at = $3
|
||||||
|
WHERE id = $1`
|
||||||
|
|
||||||
|
_, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt))
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return fmt.Errorf("invalid event id")
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsAlertEventRow abstracts the Scan method shared by *sql.Row and
// *sql.Rows so scanOpsAlertEvent can decode an ops_alert_events record from
// either a single-row query or a rows iterator.
type opsAlertEventRow interface {
	Scan(dest ...any) error
}
|
||||||
|
|
||||||
|
// scanOpsAlertEvent decodes one ops_alert_events row into a
// service.OpsAlertEvent. The destination order matches the canonical
// 13-column event SELECT used throughout this file; callers must keep that
// column list in sync with this function.
func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) {
	var ev service.OpsAlertEvent
	var metricValue sql.NullFloat64
	var thresholdValue sql.NullFloat64
	var dimensionsRaw []byte
	var resolvedAt sql.NullTime

	if err := row.Scan(
		&ev.ID,
		&ev.RuleID,
		&ev.Severity,
		&ev.Status,
		&ev.Title,
		&ev.Description,
		&metricValue,
		&thresholdValue,
		&dimensionsRaw,
		&ev.FiredAt,
		&resolvedAt,
		&ev.EmailSent,
		&ev.CreatedAt,
	); err != nil {
		return nil, err
	}
	// Promote nullable columns to optional pointers only when present.
	if metricValue.Valid {
		v := metricValue.Float64
		ev.MetricValue = &v
	}
	if thresholdValue.Valid {
		v := thresholdValue.Float64
		ev.ThresholdValue = &v
	}
	if resolvedAt.Valid {
		v := resolvedAt.Time
		ev.ResolvedAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so one malformed blob cannot fail the query.
	if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
			ev.Dimensions = decoded
		}
	}
	return &ev, nil
}
|
||||||
|
|
||||||
|
func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) {
|
||||||
|
clauses := []string{"1=1"}
|
||||||
|
args := []any{}
|
||||||
|
|
||||||
|
if filter == nil {
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
if status := strings.TrimSpace(filter.Status); status != "" {
|
||||||
|
args = append(args, status)
|
||||||
|
clauses = append(clauses, "status = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if severity := strings.TrimSpace(filter.Severity); severity != "" {
|
||||||
|
args = append(args, severity)
|
||||||
|
clauses = append(clauses, "severity = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||||
|
args = append(args, *filter.StartTime)
|
||||||
|
clauses = append(clauses, "fired_at >= $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.EndTime != nil && !filter.EndTime.IsZero() {
|
||||||
|
args = append(args, *filter.EndTime)
|
||||||
|
clauses = append(clauses, "fired_at < $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes.
|
||||||
|
if platform := strings.TrimSpace(filter.Platform); platform != "" {
|
||||||
|
args = append(args, platform)
|
||||||
|
clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
args = append(args, fmt.Sprintf("%d", *filter.GroupID))
|
||||||
|
clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullJSONMap(v map[string]any) (any, error) {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullString{}, nil
|
||||||
|
}
|
||||||
|
b, err := json.Marshal(v)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(b) == 0 {
|
||||||
|
return sql.NullString{}, nil
|
||||||
|
}
|
||||||
|
return sql.NullString{String: string(b), Valid: true}, nil
|
||||||
|
}
|
||||||
1013
backend/internal/repository/ops_repo_dashboard.go
Normal file
1013
backend/internal/repository/ops_repo_dashboard.go
Normal file
File diff suppressed because it is too large
Load Diff
79
backend/internal/repository/ops_repo_histograms.go
Normal file
79
backend/internal/repository/ops_repo_histograms.go
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetLatencyHistogram buckets usage_logs request durations within
// [StartTime, EndTime) into the fixed latency bins and returns them in
// display order, including zero-count buckets. Both window bounds are
// required; rows without a duration are excluded.
func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		return nil, fmt.Errorf("nil filter")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, fmt.Errorf("start_time/end_time required")
	}

	start := filter.StartTime.UTC()
	end := filter.EndTime.UTC()

	// buildUsageWhere supplies the JOIN/WHERE fragments and their args;
	// the CASE expressions map duration_ms to a label and a sort key.
	join, where, args, _ := buildUsageWhere(filter, start, end, 1)
	rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms")
	orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms")

	q := `
		SELECT
			` + rangeExpr + ` AS range,
			COALESCE(COUNT(*), 0) AS count,
			` + orderExpr + ` AS ord
		FROM usage_logs ul
		` + join + `
		` + where + `
		AND ul.duration_ms IS NOT NULL
		GROUP BY 1, 3
		ORDER BY 3 ASC`

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	// Collect counts per label; ord only drives SQL-side ordering.
	counts := make(map[string]int64, len(latencyHistogramOrderedRanges))
	var total int64
	for rows.Next() {
		var label string
		var count int64
		var _ord int
		if err := rows.Scan(&label, &count, &_ord); err != nil {
			return nil, err
		}
		counts[label] = count
		total += count
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	// Emit every configured bucket in canonical order, zero-filling the
	// buckets the query returned no rows for.
	buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges))
	for _, label := range latencyHistogramOrderedRanges {
		buckets = append(buckets, &service.OpsLatencyHistogramBucket{
			Range: label,
			Count: counts[label],
		})
	}

	return &service.OpsLatencyHistogramResponse{
		StartTime:     start,
		EndTime:       end,
		Platform:      strings.TrimSpace(filter.Platform),
		GroupID:       filter.GroupID,
		TotalRequests: total,
		Buckets:       buckets,
	}, nil
}
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// latencyHistogramBucket describes one histogram bin: requests with
// duration_ms strictly below upperMs fall into the bin labelled label.
// upperMs == 0 marks the open-ended default ("everything slower") bin.
type latencyHistogramBucket struct {
	upperMs int    // exclusive upper bound in milliseconds; 0 = catch-all
	label   string // human-readable range label used in query results
}
|
||||||
|
|
||||||
|
// latencyHistogramBuckets defines the fixed latency bins, ordered fastest to
// slowest. The final entry (upperMs == 0) is the catch-all bucket and must
// stay last — the SQL CASE builders rely on that position.
var latencyHistogramBuckets = []latencyHistogramBucket{
	{upperMs: 100, label: "0-100ms"},
	{upperMs: 200, label: "100-200ms"},
	{upperMs: 500, label: "200-500ms"},
	{upperMs: 1000, label: "500-1000ms"},
	{upperMs: 2000, label: "1000-2000ms"},
	{upperMs: 0, label: "2000ms+"}, // default bucket
}
|
||||||
|
|
||||||
|
var latencyHistogramOrderedRanges = func() []string {
|
||||||
|
out := make([]string, 0, len(latencyHistogramBuckets))
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
out = append(out, b.label)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}()
|
||||||
|
|
||||||
|
func latencyHistogramRangeCaseExpr(column string) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
_, _ = sb.WriteString("CASE\n")
|
||||||
|
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
if b.upperMs <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default bucket.
|
||||||
|
last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1]
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label))
|
||||||
|
_, _ = sb.WriteString("END")
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func latencyHistogramRangeOrderCaseExpr(column string) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
_, _ = sb.WriteString("CASE\n")
|
||||||
|
|
||||||
|
order := 1
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
if b.upperMs <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order))
|
||||||
|
order++
|
||||||
|
}
|
||||||
|
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tELSE %d\n", order))
|
||||||
|
_, _ = sb.WriteString("END")
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) {
|
||||||
|
require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges))
|
||||||
|
for i, b := range latencyHistogramBuckets {
|
||||||
|
require.Equal(t, b.label, latencyHistogramOrderedRanges[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
422
backend/internal/repository/ops_repo_metrics.go
Normal file
422
backend/internal/repository/ops_repo_metrics.go
Normal file
@@ -0,0 +1,422 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InsertSystemMetrics persists one sampling window of system-level metrics
// into ops_system_metrics. Defensive defaults: a non-positive WindowMinutes is
// coerced to 1 and a zero CreatedAt is replaced with the current UTC time.
// Optional fields (pointer-typed in the input) are written as SQL NULL via the
// opsNull* helpers.
func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}

	// Guard against invalid aggregation windows.
	window := input.WindowMinutes
	if window <= 0 {
		window = 1
	}
	// Default the sample timestamp to "now" (UTC) when the caller left it unset.
	createdAt := input.CreatedAt
	if createdAt.IsZero() {
		createdAt = time.Now().UTC()
	}

	// NOTE: the column list below must stay aligned with the positional
	// $1..$39 placeholders AND the ExecContext argument order further down.
	q := `
	INSERT INTO ops_system_metrics (
		created_at,
		window_minutes,
		platform,
		group_id,

		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,

		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,

		token_consumed,
		qps,
		tps,

		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,

		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,

		cpu_usage_percent,
		memory_used_mb,
		memory_total_mb,
		memory_usage_percent,

		db_ok,
		redis_ok,

		redis_conn_total,
		redis_conn_idle,

		db_conn_active,
		db_conn_idle,
		db_conn_waiting,

		goroutine_count,
		concurrency_queue_depth
	) VALUES (
		$1,$2,$3,$4,
		$5,$6,$7,$8,
		$9,$10,$11,
		$12,$13,$14,
		$15,$16,$17,$18,$19,$20,
		$21,$22,$23,$24,$25,$26,
		$27,$28,$29,$30,
		$31,$32,
		$33,$34,
		$35,$36,$37,
		$38,$39
	)`

	// Argument order mirrors the column list above, grouped the same way.
	_, err := r.db.ExecContext(
		ctx,
		q,
		createdAt,
		window,
		opsNullString(input.Platform),
		opsNullInt64(input.GroupID),

		input.SuccessCount,
		input.ErrorCountTotal,
		input.BusinessLimitedCount,
		input.ErrorCountSLA,

		input.UpstreamErrorCountExcl429529,
		input.Upstream429Count,
		input.Upstream529Count,

		input.TokenConsumed,
		opsNullFloat64(input.QPS),
		opsNullFloat64(input.TPS),

		opsNullInt(input.DurationP50Ms),
		opsNullInt(input.DurationP90Ms),
		opsNullInt(input.DurationP95Ms),
		opsNullInt(input.DurationP99Ms),
		opsNullFloat64(input.DurationAvgMs),
		opsNullInt(input.DurationMaxMs),

		opsNullInt(input.TTFTP50Ms),
		opsNullInt(input.TTFTP90Ms),
		opsNullInt(input.TTFTP95Ms),
		opsNullInt(input.TTFTP99Ms),
		opsNullFloat64(input.TTFTAvgMs),
		opsNullInt(input.TTFTMaxMs),

		opsNullFloat64(input.CPUUsagePercent),
		opsNullInt(input.MemoryUsedMB),
		opsNullInt(input.MemoryTotalMB),
		opsNullFloat64(input.MemoryUsagePercent),

		opsNullBool(input.DBOK),
		opsNullBool(input.RedisOK),

		opsNullInt(input.RedisConnTotal),
		opsNullInt(input.RedisConnIdle),

		opsNullInt(input.DBConnActive),
		opsNullInt(input.DBConnIdle),
		opsNullInt(input.DBConnWaiting),

		opsNullInt(input.GoroutineCount),
		opsNullInt(input.ConcurrencyQueueDepth),
	)
	return err
}
|
||||||
|
|
||||||
|
// GetLatestSystemMetrics returns the most recent overall (platform IS NULL and
// group_id IS NULL) system-metrics row for the given window size. A
// non-positive windowMinutes is coerced to 1. Nullable columns are scanned
// through sql.Null* intermediaries and only copied into the snapshot's pointer
// fields when valid, so SQL NULL surfaces as a nil pointer.
// Note: when no row matches, the underlying sql.ErrNoRows is returned as-is.
func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if windowMinutes <= 0 {
		windowMinutes = 1
	}

	// Column order must match the Scan destination order below.
	q := `
	SELECT
		id,
		created_at,
		window_minutes,

		cpu_usage_percent,
		memory_used_mb,
		memory_total_mb,
		memory_usage_percent,

		db_ok,
		redis_ok,

		redis_conn_total,
		redis_conn_idle,

		db_conn_active,
		db_conn_idle,
		db_conn_waiting,

		goroutine_count,
		concurrency_queue_depth
	FROM ops_system_metrics
	WHERE window_minutes = $1
	AND platform IS NULL
	AND group_id IS NULL
	ORDER BY created_at DESC
	LIMIT 1`

	var out service.OpsSystemMetricsSnapshot
	// Intermediate holders for nullable columns.
	var cpu sql.NullFloat64
	var memUsed sql.NullInt64
	var memTotal sql.NullInt64
	var memPct sql.NullFloat64
	var dbOK sql.NullBool
	var redisOK sql.NullBool
	var redisTotal sql.NullInt64
	var redisIdle sql.NullInt64
	var dbActive sql.NullInt64
	var dbIdle sql.NullInt64
	var dbWaiting sql.NullInt64
	var goroutines sql.NullInt64
	var queueDepth sql.NullInt64

	if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.WindowMinutes,
		&cpu,
		&memUsed,
		&memTotal,
		&memPct,
		&dbOK,
		&redisOK,
		&redisTotal,
		&redisIdle,
		&dbActive,
		&dbIdle,
		&dbWaiting,
		&goroutines,
		&queueDepth,
	); err != nil {
		return nil, err
	}

	// Promote valid nullable values into the snapshot's pointer fields.
	if cpu.Valid {
		v := cpu.Float64
		out.CPUUsagePercent = &v
	}
	if memUsed.Valid {
		v := memUsed.Int64
		out.MemoryUsedMB = &v
	}
	if memTotal.Valid {
		v := memTotal.Int64
		out.MemoryTotalMB = &v
	}
	if memPct.Valid {
		v := memPct.Float64
		out.MemoryUsagePercent = &v
	}
	if dbOK.Valid {
		v := dbOK.Bool
		out.DBOK = &v
	}
	if redisOK.Valid {
		v := redisOK.Bool
		out.RedisOK = &v
	}
	if redisTotal.Valid {
		v := int(redisTotal.Int64)
		out.RedisConnTotal = &v
	}
	if redisIdle.Valid {
		v := int(redisIdle.Int64)
		out.RedisConnIdle = &v
	}
	if dbActive.Valid {
		v := int(dbActive.Int64)
		out.DBConnActive = &v
	}
	if dbIdle.Valid {
		v := int(dbIdle.Int64)
		out.DBConnIdle = &v
	}
	if dbWaiting.Valid {
		v := int(dbWaiting.Int64)
		out.DBConnWaiting = &v
	}
	if goroutines.Valid {
		v := int(goroutines.Int64)
		out.GoroutineCount = &v
	}
	if queueDepth.Valid {
		v := int(queueDepth.Int64)
		out.ConcurrencyQueueDepth = &v
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// UpsertJobHeartbeat records the latest run state for a named background job
// in ops_job_heartbeats (keyed by job_name).
//
// Merge semantics on conflict:
//   - last_run_at / last_success_at / last_duration_ms: new value wins only
//     when non-NULL (COALESCE keeps the stored value otherwise).
//   - last_error_at / last_error: a heartbeat that carries a success timestamp
//     clears the stored error state; otherwise non-NULL new values win.
func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.JobName == "" {
		return fmt.Errorf("job_name required")
	}

	q := `
	INSERT INTO ops_job_heartbeats (
		job_name,
		last_run_at,
		last_success_at,
		last_error_at,
		last_error,
		last_duration_ms,
		updated_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,NOW()
	)
	ON CONFLICT (job_name) DO UPDATE SET
		last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at),
		last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at),
		last_error_at = CASE
			WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
			ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at)
		END,
		last_error = CASE
			WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
			ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error)
		END,
		last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms),
		updated_at = NOW()`

	// Optional fields become SQL NULL via the opsNull* helpers so the
	// COALESCE/CASE logic above can distinguish "not provided" from a value.
	_, err := r.db.ExecContext(
		ctx,
		q,
		input.JobName,
		opsNullTime(input.LastRunAt),
		opsNullTime(input.LastSuccessAt),
		opsNullTime(input.LastErrorAt),
		opsNullString(input.LastError),
		opsNullInt(input.LastDurationMs),
	)
	return err
}
|
||||||
|
|
||||||
|
// ListJobHeartbeats returns all recorded background-job heartbeats ordered by
// job name. Nullable columns are scanned through sql.Null* intermediaries and
// surfaced as nil pointers on the returned items when NULL. An empty table
// yields an empty (non-nil) slice.
func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}

	// Column order must match the Scan destination order below.
	q := `
	SELECT
		job_name,
		last_run_at,
		last_success_at,
		last_error_at,
		last_error,
		last_duration_ms,
		updated_at
	FROM ops_job_heartbeats
	ORDER BY job_name ASC`

	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsJobHeartbeat, 0, 8)
	for rows.Next() {
		var item service.OpsJobHeartbeat
		// Nullable-column holders for this row.
		var lastRun sql.NullTime
		var lastSuccess sql.NullTime
		var lastErrorAt sql.NullTime
		var lastError sql.NullString
		var lastDuration sql.NullInt64

		if err := rows.Scan(
			&item.JobName,
			&lastRun,
			&lastSuccess,
			&lastErrorAt,
			&lastError,
			&lastDuration,
			&item.UpdatedAt,
		); err != nil {
			return nil, err
		}

		// Promote valid nullable values into the item's pointer fields.
		if lastRun.Valid {
			v := lastRun.Time
			item.LastRunAt = &v
		}
		if lastSuccess.Valid {
			v := lastSuccess.Time
			item.LastSuccessAt = &v
		}
		if lastErrorAt.Valid {
			v := lastErrorAt.Time
			item.LastErrorAt = &v
		}
		if lastError.Valid {
			v := lastError.String
			item.LastError = &v
		}
		if lastDuration.Valid {
			v := lastDuration.Int64
			item.LastDurationMs = &v
		}

		out = append(out, &item)
	}
	// Surface any iteration error (e.g. connection loss mid-scan).
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
func opsNullBool(v *bool) any {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullBool{}
|
||||||
|
}
|
||||||
|
return sql.NullBool{Bool: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullFloat64(v *float64) any {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullFloat64{}
|
||||||
|
}
|
||||||
|
return sql.NullFloat64{Float64: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullTime(v *time.Time) any {
|
||||||
|
if v == nil || v.IsZero() {
|
||||||
|
return sql.NullTime{}
|
||||||
|
}
|
||||||
|
return sql.NullTime{Time: *v, Valid: true}
|
||||||
|
}
|
||||||
359
backend/internal/repository/ops_repo_preagg.go
Normal file
359
backend/internal/repository/ops_repo_preagg.go
Normal file
@@ -0,0 +1,359 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpsertHourlyMetrics recomputes hourly request/error/latency aggregates for
// the half-open window [startTime, endTime) and upserts them into
// ops_metrics_hourly. Zero or inverted windows are a no-op (nil error).
// Timestamps are normalized to UTC before being bound as $1/$2.
func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	// NOTE:
	// - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly.
	// - We emit three dimension granularities via GROUPING SETS:
	//   1) overall:  (bucket_start)
	//   2) platform: (bucket_start, platform)
	//   3) group:    (bucket_start, platform, group_id)
	//
	// IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based
	// unique index; our ON CONFLICT target must match that expression set.
	q := `
	WITH usage_base AS (
		SELECT
			date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			g.platform AS platform,
			ul.group_id AS group_id,
			ul.duration_ms AS duration_ms,
			ul.first_token_ms AS first_token_ms,
			(ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens
		FROM usage_logs ul
		JOIN groups g ON g.id = ul.group_id
		WHERE ul.created_at >= $1 AND ul.created_at < $2
	),
	usage_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) AS success_count,
			COALESCE(SUM(tokens), 0) AS token_consumed,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms,
			AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms,
			MAX(duration_ms) AS duration_max_ms,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms,
			AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms,
			MAX(first_token_ms) AS ttft_max_ms
		FROM usage_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
	),
	error_base AS (
		SELECT
			date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			platform AS platform,
			group_id AS group_id,
			is_business_limited AS is_business_limited,
			error_owner AS error_owner,
			status_code AS client_status_code,
			COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
		FROM ops_error_logs
		WHERE created_at >= $1 AND created_at < $2
	),
	error_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
		FROM error_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
		HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL
	),
	combined AS (
		SELECT
			COALESCE(u.bucket_start, e.bucket_start) AS bucket_start,
			COALESCE(u.platform, e.platform) AS platform,
			COALESCE(u.group_id, e.group_id) AS group_id,

			COALESCE(u.success_count, 0) AS success_count,
			COALESCE(e.error_count_total, 0) AS error_count_total,
			COALESCE(e.business_limited_count, 0) AS business_limited_count,
			COALESCE(e.error_count_sla, 0) AS error_count_sla,
			COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529,
			COALESCE(e.upstream_429_count, 0) AS upstream_429_count,
			COALESCE(e.upstream_529_count, 0) AS upstream_529_count,

			COALESCE(u.token_consumed, 0) AS token_consumed,

			u.duration_p50_ms,
			u.duration_p90_ms,
			u.duration_p95_ms,
			u.duration_p99_ms,
			u.duration_avg_ms,
			u.duration_max_ms,

			u.ttft_p50_ms,
			u.ttft_p90_ms,
			u.ttft_p95_ms,
			u.ttft_p99_ms,
			u.ttft_avg_ms,
			u.ttft_max_ms
		FROM usage_agg u
		FULL OUTER JOIN error_agg e
			ON u.bucket_start = e.bucket_start
			AND COALESCE(u.platform, '') = COALESCE(e.platform, '')
			AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0)
	)
	INSERT INTO ops_metrics_hourly (
		bucket_start,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		bucket_start,
		NULLIF(platform, '') AS platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms::int,
		duration_p90_ms::int,
		duration_p95_ms::int,
		duration_p99_ms::int,
		duration_avg_ms,
		duration_max_ms::int,
		ttft_p50_ms::int,
		ttft_p90_ms::int,
		ttft_p95_ms::int,
		ttft_p99_ms::int,
		ttft_avg_ms,
		ttft_max_ms::int,
		NOW()
	FROM combined
	WHERE bucket_start IS NOT NULL
	AND (platform IS NULL OR platform <> '')
	ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
|
||||||
|
|
||||||
|
// UpsertDailyMetrics rolls the hourly pre-aggregates in ops_metrics_hourly up
// into ops_metrics_daily for buckets in the half-open window
// [startTime, endTime). Zero or inverted windows are a no-op (nil error).
// Counts/tokens are summed; percentile columns cannot be combined exactly, so
// the SQL uses an explicit approximation (see the comment in the query).
func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	q := `
	INSERT INTO ops_metrics_daily (
		bucket_date,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		(bucket_start AT TIME ZONE 'UTC')::date AS bucket_date,
		platform,
		group_id,

		COALESCE(SUM(success_count), 0) AS success_count,
		COALESCE(SUM(error_count_total), 0) AS error_count_total,
		COALESCE(SUM(business_limited_count), 0) AS business_limited_count,
		COALESCE(SUM(error_count_sla), 0) AS error_count_sla,
		COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529,
		COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count,
		COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count,
		COALESCE(SUM(token_consumed), 0) AS token_consumed,

		-- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail).
		ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms,
		ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms,
		MAX(duration_p95_ms) AS duration_p95_ms,
		MAX(duration_p99_ms) AS duration_p99_ms,
		SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms,
		MAX(duration_max_ms) AS duration_max_ms,

		ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms,
		ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms,
		MAX(ttft_p95_ms) AS ttft_p95_ms,
		MAX(ttft_p99_ms) AS ttft_p99_ms,
		SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms,
		MAX(ttft_max_ms) AS ttft_max_ms,

		NOW()
	FROM ops_metrics_hourly
	WHERE bucket_start >= $1 AND bucket_start < $2
	GROUP BY 1, 2, 3
	ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return time.Time{}, false, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
var value sql.NullTime
|
||||||
|
if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil {
|
||||||
|
return time.Time{}, false, err
|
||||||
|
}
|
||||||
|
if !value.Valid {
|
||||||
|
return time.Time{}, false, nil
|
||||||
|
}
|
||||||
|
return value.Time.UTC(), true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return time.Time{}, false, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
var value sql.NullTime
|
||||||
|
if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil {
|
||||||
|
return time.Time{}, false, err
|
||||||
|
}
|
||||||
|
if !value.Valid {
|
||||||
|
return time.Time{}, false, nil
|
||||||
|
}
|
||||||
|
t := value.Time.UTC()
|
||||||
|
return time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil
|
||||||
|
}
|
||||||
286
backend/internal/repository/ops_repo_request_details.go
Normal file
286
backend/internal/repository/ops_repo_request_details.go
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, 0, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize, startTime, endTime := filter.Normalize()
|
||||||
|
offset := (page - 1) * pageSize
|
||||||
|
|
||||||
|
conditions := make([]string, 0, 16)
|
||||||
|
args := make([]any, 0, 24)
|
||||||
|
|
||||||
|
// Placeholders $1/$2 reserved for time window inside the CTE.
|
||||||
|
args = append(args, startTime.UTC(), endTime.UTC())
|
||||||
|
|
||||||
|
addCondition := func(condition string, values ...any) {
|
||||||
|
conditions = append(conditions, condition)
|
||||||
|
args = append(args, values...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter != nil {
|
||||||
|
if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" {
|
||||||
|
if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) {
|
||||||
|
return nil, 0, fmt.Errorf("invalid kind")
|
||||||
|
}
|
||||||
|
addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind)
|
||||||
|
}
|
||||||
|
|
||||||
|
if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" {
|
||||||
|
addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform)
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.UserID != nil && *filter.UserID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID)
|
||||||
|
}
|
||||||
|
if filter.APIKeyID != nil && *filter.APIKeyID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID)
|
||||||
|
}
|
||||||
|
if filter.AccountID != nil && *filter.AccountID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if model := strings.TrimSpace(filter.Model); model != "" {
|
||||||
|
addCondition(fmt.Sprintf("model = $%d", len(args)+1), model)
|
||||||
|
}
|
||||||
|
if requestID := strings.TrimSpace(filter.RequestID); requestID != "" {
|
||||||
|
addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID)
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||||
|
like := "%" + strings.ToLower(q) + "%"
|
||||||
|
startIdx := len(args) + 1
|
||||||
|
addCondition(
|
||||||
|
fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)",
|
||||||
|
startIdx, startIdx+1, startIdx+2,
|
||||||
|
),
|
||||||
|
like, like, like,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.MinDurationMs != nil {
|
||||||
|
addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs)
|
||||||
|
}
|
||||||
|
if filter.MaxDurationMs != nil {
|
||||||
|
addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
where := ""
|
||||||
|
if len(conditions) > 0 {
|
||||||
|
where = "WHERE " + strings.Join(conditions, " AND ")
|
||||||
|
}
|
||||||
|
|
||||||
|
cte := `
|
||||||
|
WITH combined AS (
|
||||||
|
SELECT
|
||||||
|
'success'::TEXT AS kind,
|
||||||
|
ul.created_at AS created_at,
|
||||||
|
ul.request_id AS request_id,
|
||||||
|
COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
|
||||||
|
ul.model AS model,
|
||||||
|
ul.duration_ms AS duration_ms,
|
||||||
|
NULL::INT AS status_code,
|
||||||
|
NULL::BIGINT AS error_id,
|
||||||
|
NULL::TEXT AS phase,
|
||||||
|
NULL::TEXT AS severity,
|
||||||
|
NULL::TEXT AS message,
|
||||||
|
ul.user_id AS user_id,
|
||||||
|
ul.api_key_id AS api_key_id,
|
||||||
|
ul.account_id AS account_id,
|
||||||
|
ul.group_id AS group_id,
|
||||||
|
ul.stream AS stream
|
||||||
|
FROM usage_logs ul
|
||||||
|
LEFT JOIN groups g ON g.id = ul.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = ul.account_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'error'::TEXT AS kind,
|
||||||
|
o.created_at AS created_at,
|
||||||
|
COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id,
|
||||||
|
COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
|
||||||
|
o.model AS model,
|
||||||
|
o.duration_ms AS duration_ms,
|
||||||
|
o.status_code AS status_code,
|
||||||
|
o.id AS error_id,
|
||||||
|
o.error_phase AS phase,
|
||||||
|
o.severity AS severity,
|
||||||
|
o.error_message AS message,
|
||||||
|
o.user_id AS user_id,
|
||||||
|
o.api_key_id AS api_key_id,
|
||||||
|
o.account_id AS account_id,
|
||||||
|
o.group_id AS group_id,
|
||||||
|
o.stream AS stream
|
||||||
|
FROM ops_error_logs o
|
||||||
|
LEFT JOIN groups g ON g.id = o.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = o.account_id
|
||||||
|
WHERE o.created_at >= $1 AND o.created_at < $2
|
||||||
|
AND COALESCE(o.status_code, 0) >= 400
|
||||||
|
)
|
||||||
|
`
|
||||||
|
|
||||||
|
countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where)
|
||||||
|
var total int64
|
||||||
|
if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
total = 0
|
||||||
|
} else {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sort := "ORDER BY created_at DESC"
|
||||||
|
if filter != nil {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(filter.Sort)) {
|
||||||
|
case "", "created_at_desc":
|
||||||
|
// default
|
||||||
|
case "duration_desc":
|
||||||
|
sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC"
|
||||||
|
default:
|
||||||
|
return nil, 0, fmt.Errorf("invalid sort")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
listQuery := fmt.Sprintf(`
|
||||||
|
%s
|
||||||
|
SELECT
|
||||||
|
kind,
|
||||||
|
created_at,
|
||||||
|
request_id,
|
||||||
|
platform,
|
||||||
|
model,
|
||||||
|
duration_ms,
|
||||||
|
status_code,
|
||||||
|
error_id,
|
||||||
|
phase,
|
||||||
|
severity,
|
||||||
|
message,
|
||||||
|
user_id,
|
||||||
|
api_key_id,
|
||||||
|
account_id,
|
||||||
|
group_id,
|
||||||
|
stream
|
||||||
|
FROM combined
|
||||||
|
%s
|
||||||
|
%s
|
||||||
|
LIMIT $%d OFFSET $%d
|
||||||
|
`, cte, where, sort, len(args)+1, len(args)+2)
|
||||||
|
|
||||||
|
listArgs := append(append([]any{}, args...), pageSize, offset)
|
||||||
|
rows, err := r.db.QueryContext(ctx, listQuery, listArgs...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
toIntPtr := func(v sql.NullInt64) *int {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
i := int(v.Int64)
|
||||||
|
return &i
|
||||||
|
}
|
||||||
|
toInt64Ptr := func(v sql.NullInt64) *int64 {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
i := v.Int64
|
||||||
|
return &i
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsRequestDetail, 0, pageSize)
|
||||||
|
for rows.Next() {
|
||||||
|
var (
|
||||||
|
kind string
|
||||||
|
createdAt time.Time
|
||||||
|
requestID sql.NullString
|
||||||
|
platform sql.NullString
|
||||||
|
model sql.NullString
|
||||||
|
|
||||||
|
durationMs sql.NullInt64
|
||||||
|
statusCode sql.NullInt64
|
||||||
|
errorID sql.NullInt64
|
||||||
|
|
||||||
|
phase sql.NullString
|
||||||
|
severity sql.NullString
|
||||||
|
message sql.NullString
|
||||||
|
|
||||||
|
userID sql.NullInt64
|
||||||
|
apiKeyID sql.NullInt64
|
||||||
|
accountID sql.NullInt64
|
||||||
|
groupID sql.NullInt64
|
||||||
|
|
||||||
|
stream bool
|
||||||
|
)
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&kind,
|
||||||
|
&createdAt,
|
||||||
|
&requestID,
|
||||||
|
&platform,
|
||||||
|
&model,
|
||||||
|
&durationMs,
|
||||||
|
&statusCode,
|
||||||
|
&errorID,
|
||||||
|
&phase,
|
||||||
|
&severity,
|
||||||
|
&message,
|
||||||
|
&userID,
|
||||||
|
&apiKeyID,
|
||||||
|
&accountID,
|
||||||
|
&groupID,
|
||||||
|
&stream,
|
||||||
|
); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
item := &service.OpsRequestDetail{
|
||||||
|
Kind: service.OpsRequestKind(kind),
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
RequestID: strings.TrimSpace(requestID.String),
|
||||||
|
Platform: strings.TrimSpace(platform.String),
|
||||||
|
Model: strings.TrimSpace(model.String),
|
||||||
|
|
||||||
|
DurationMs: toIntPtr(durationMs),
|
||||||
|
StatusCode: toIntPtr(statusCode),
|
||||||
|
ErrorID: toInt64Ptr(errorID),
|
||||||
|
Phase: phase.String,
|
||||||
|
Severity: severity.String,
|
||||||
|
Message: message.String,
|
||||||
|
|
||||||
|
UserID: toInt64Ptr(userID),
|
||||||
|
APIKeyID: toInt64Ptr(apiKeyID),
|
||||||
|
AccountID: toInt64Ptr(accountID),
|
||||||
|
GroupID: toInt64Ptr(groupID),
|
||||||
|
|
||||||
|
Stream: stream,
|
||||||
|
}
|
||||||
|
|
||||||
|
if item.Platform == "" {
|
||||||
|
item.Platform = "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
out = append(out, item)
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, total, nil
|
||||||
|
}
|
||||||
571
backend/internal/repository/ops_repo_trends.go
Normal file
571
backend/internal/repository/ops_repo_trends.go
Normal file
@@ -0,0 +1,571 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
|
||||||
|
// Keep a small, predictable set of supported buckets for now.
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
|
||||||
|
usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1)
|
||||||
|
errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next)
|
||||||
|
|
||||||
|
usageBucketExpr := opsBucketExprForUsage(bucketSeconds)
|
||||||
|
errorBucketExpr := opsBucketExprForError(bucketSeconds)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
WITH usage_buckets AS (
|
||||||
|
SELECT ` + usageBucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
` + usageJoin + `
|
||||||
|
` + usageWhere + `
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
error_buckets AS (
|
||||||
|
SELECT ` + errorBucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + errorWhere + `
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.bucket, e.bucket) AS bucket,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_buckets u
|
||||||
|
FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
bucket,
|
||||||
|
(success_count + error_count) AS request_count,
|
||||||
|
token_consumed
|
||||||
|
FROM combined
|
||||||
|
ORDER BY bucket ASC`
|
||||||
|
|
||||||
|
args := append(usageArgs, errorArgs...)
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
points := make([]*service.OpsThroughputTrendPoint, 0, 256)
|
||||||
|
for rows.Next() {
|
||||||
|
var bucket time.Time
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&bucket, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
|
||||||
|
denom := float64(bucketSeconds)
|
||||||
|
if denom <= 0 {
|
||||||
|
denom = 60
|
||||||
|
}
|
||||||
|
qps := roundTo1DP(float64(requests) / denom)
|
||||||
|
tps := roundTo1DP(float64(tokenConsumed) / denom)
|
||||||
|
|
||||||
|
points = append(points, &service.OpsThroughputTrendPoint{
|
||||||
|
BucketStart: bucket.UTC(),
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
QPS: qps,
|
||||||
|
TPS: tps,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fill missing buckets with zeros so charts render continuous timelines.
|
||||||
|
points = fillOpsThroughputBuckets(start, end, bucketSeconds, points)
|
||||||
|
|
||||||
|
var byPlatform []*service.OpsThroughputPlatformBreakdownItem
|
||||||
|
var topGroups []*service.OpsThroughputGroupBreakdownItem
|
||||||
|
|
||||||
|
platform := ""
|
||||||
|
if filter != nil {
|
||||||
|
platform = strings.TrimSpace(strings.ToLower(filter.Platform))
|
||||||
|
}
|
||||||
|
groupID := (*int64)(nil)
|
||||||
|
if filter != nil {
|
||||||
|
groupID = filter.GroupID
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drilldown helpers:
|
||||||
|
// - No platform/group: totals by platform
|
||||||
|
// - Platform selected but no group: top groups in that platform
|
||||||
|
if platform == "" && (groupID == nil || *groupID <= 0) {
|
||||||
|
items, err := r.getThroughputBreakdownByPlatform(ctx, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
byPlatform = items
|
||||||
|
} else if platform != "" && (groupID == nil || *groupID <= 0) {
|
||||||
|
items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
topGroups = items
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsThroughputTrendResponse{
|
||||||
|
Bucket: opsBucketLabel(bucketSeconds),
|
||||||
|
Points: points,
|
||||||
|
|
||||||
|
ByPlatform: byPlatform,
|
||||||
|
TopGroups: topGroups,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) {
|
||||||
|
q := `
|
||||||
|
WITH usage_totals AS (
|
||||||
|
SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
LEFT JOIN groups g ON g.id = ul.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = ul.account_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
error_totals AS (
|
||||||
|
SELECT platform,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.platform, e.platform) AS platform,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_totals u
|
||||||
|
FULL OUTER JOIN error_totals e ON u.platform = e.platform
|
||||||
|
)
|
||||||
|
SELECT platform, (success_count + error_count) AS request_count, token_consumed
|
||||||
|
FROM combined
|
||||||
|
WHERE platform IS NOT NULL AND platform <> ''
|
||||||
|
ORDER BY request_count DESC`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8)
|
||||||
|
for rows.Next() {
|
||||||
|
var platform string
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&platform, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
items = append(items, &service.OpsThroughputPlatformBreakdownItem{
|
||||||
|
Platform: platform,
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return items, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) {
|
||||||
|
if strings.TrimSpace(platform) == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if limit <= 0 || limit > 100 {
|
||||||
|
limit = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
WITH usage_totals AS (
|
||||||
|
SELECT ul.group_id AS group_id,
|
||||||
|
g.name AS group_name,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
JOIN groups g ON g.id = ul.group_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
AND g.platform = $3
|
||||||
|
GROUP BY 1, 2
|
||||||
|
),
|
||||||
|
error_totals AS (
|
||||||
|
SELECT group_id,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND platform = $3
|
||||||
|
AND group_id IS NOT NULL
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.group_id, e.group_id) AS group_id,
|
||||||
|
COALESCE(u.group_name, g2.name, '') AS group_name,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_totals u
|
||||||
|
FULL OUTER JOIN error_totals e ON u.group_id = e.group_id
|
||||||
|
LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id)
|
||||||
|
)
|
||||||
|
SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed
|
||||||
|
FROM combined
|
||||||
|
WHERE group_id IS NOT NULL
|
||||||
|
ORDER BY request_count DESC
|
||||||
|
LIMIT $4`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit)
|
||||||
|
for rows.Next() {
|
||||||
|
var groupID int64
|
||||||
|
var groupName sql.NullString
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
name := ""
|
||||||
|
if groupName.Valid {
|
||||||
|
name = groupName.String
|
||||||
|
}
|
||||||
|
items = append(items, &service.OpsThroughputGroupBreakdownItem{
|
||||||
|
GroupID: groupID,
|
||||||
|
GroupName: name,
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return items, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketExprForUsage(bucketSeconds int) string {
|
||||||
|
switch bucketSeconds {
|
||||||
|
case 3600:
|
||||||
|
return "date_trunc('hour', ul.created_at)"
|
||||||
|
case 300:
|
||||||
|
// 5-minute buckets in UTC.
|
||||||
|
return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)"
|
||||||
|
default:
|
||||||
|
return "date_trunc('minute', ul.created_at)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketExprForError(bucketSeconds int) string {
|
||||||
|
switch bucketSeconds {
|
||||||
|
case 3600:
|
||||||
|
return "date_trunc('hour', created_at)"
|
||||||
|
case 300:
|
||||||
|
return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)"
|
||||||
|
default:
|
||||||
|
return "date_trunc('minute', created_at)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketLabel(bucketSeconds int) string {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
return "1m"
|
||||||
|
}
|
||||||
|
if bucketSeconds%3600 == 0 {
|
||||||
|
h := bucketSeconds / 3600
|
||||||
|
if h <= 0 {
|
||||||
|
h = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dh", h)
|
||||||
|
}
|
||||||
|
m := bucketSeconds / 60
|
||||||
|
if m <= 0 {
|
||||||
|
m = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dm", m)
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time {
|
||||||
|
t = t.UTC()
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
secs := t.Unix()
|
||||||
|
floored := secs - (secs % int64(bucketSeconds))
|
||||||
|
return time.Unix(floored, 0).UTC()
|
||||||
|
}
|
||||||
|
|
||||||
|
func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if !start.Before(end) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
endMinus := end.Add(-time.Nanosecond)
|
||||||
|
if endMinus.Before(start) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
first := opsFloorToBucketStart(start, bucketSeconds)
|
||||||
|
last := opsFloorToBucketStart(endMinus, bucketSeconds)
|
||||||
|
step := time.Duration(bucketSeconds) * time.Second
|
||||||
|
|
||||||
|
existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points))
|
||||||
|
for _, p := range points {
|
||||||
|
if p == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
existing[p.BucketStart.UTC().Unix()] = p
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1)
|
||||||
|
for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
|
||||||
|
if p, ok := existing[cursor.Unix()]; ok && p != nil {
|
||||||
|
out = append(out, p)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, &service.OpsThroughputTrendPoint{
|
||||||
|
BucketStart: cursor,
|
||||||
|
RequestCount: 0,
|
||||||
|
TokenConsumed: 0,
|
||||||
|
QPS: 0,
|
||||||
|
TPS: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
where, args, _ := buildErrorWhere(filter, start, end, 1)
|
||||||
|
bucketExpr := opsBucketExprForError(bucketSeconds)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
` + bucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + where + `
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY 1 ASC`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
points := make([]*service.OpsErrorTrendPoint, 0, 256)
|
||||||
|
for rows.Next() {
|
||||||
|
var bucket time.Time
|
||||||
|
var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64
|
||||||
|
if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
points = append(points, &service.OpsErrorTrendPoint{
|
||||||
|
BucketStart: bucket.UTC(),
|
||||||
|
|
||||||
|
ErrorCountTotal: total,
|
||||||
|
BusinessLimitedCount: businessLimited,
|
||||||
|
ErrorCountSLA: sla,
|
||||||
|
|
||||||
|
UpstreamErrorCountExcl429529: upstreamExcl,
|
||||||
|
Upstream429Count: upstream429,
|
||||||
|
Upstream529Count: upstream529,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points)
|
||||||
|
|
||||||
|
return &service.OpsErrorTrendResponse{
|
||||||
|
Bucket: opsBucketLabel(bucketSeconds),
|
||||||
|
Points: points,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if !start.Before(end) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
endMinus := end.Add(-time.Nanosecond)
|
||||||
|
if endMinus.Before(start) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
first := opsFloorToBucketStart(start, bucketSeconds)
|
||||||
|
last := opsFloorToBucketStart(endMinus, bucketSeconds)
|
||||||
|
step := time.Duration(bucketSeconds) * time.Second
|
||||||
|
|
||||||
|
existing := make(map[int64]*service.OpsErrorTrendPoint, len(points))
|
||||||
|
for _, p := range points {
|
||||||
|
if p == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
existing[p.BucketStart.UTC().Unix()] = p
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1)
|
||||||
|
for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
|
||||||
|
if p, ok := existing[cursor.Unix()]; ok && p != nil {
|
||||||
|
out = append(out, p)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, &service.OpsErrorTrendPoint{
|
||||||
|
BucketStart: cursor,
|
||||||
|
|
||||||
|
ErrorCountTotal: 0,
|
||||||
|
BusinessLimitedCount: 0,
|
||||||
|
ErrorCountSLA: 0,
|
||||||
|
|
||||||
|
UpstreamErrorCountExcl429529: 0,
|
||||||
|
Upstream429Count: 0,
|
||||||
|
Upstream529Count: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
where, args, _ := buildErrorWhere(filter, start, end, 1)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
COALESCE(upstream_status_code, status_code, 0) AS status_code,
|
||||||
|
COUNT(*) AS total,
|
||||||
|
COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
|
||||||
|
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + where + `
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY total DESC
|
||||||
|
LIMIT 20`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsErrorDistributionItem, 0, 16)
|
||||||
|
var total int64
|
||||||
|
for rows.Next() {
|
||||||
|
var statusCode int
|
||||||
|
var cntTotal, cntSLA, cntBiz int64
|
||||||
|
if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
total += cntTotal
|
||||||
|
items = append(items, &service.OpsErrorDistributionItem{
|
||||||
|
StatusCode: statusCode,
|
||||||
|
Total: cntTotal,
|
||||||
|
SLA: cntSLA,
|
||||||
|
BusinessLimited: cntBiz,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsErrorDistributionResponse{
|
||||||
|
Total: total,
|
||||||
|
Items: items,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
50
backend/internal/repository/ops_repo_window_stats.go
Normal file
50
backend/internal/repository/ops_repo_window_stats.go
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
if start.After(end) {
|
||||||
|
return nil, fmt.Errorf("start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
// Bound excessively large windows to prevent accidental heavy queries.
|
||||||
|
if end.Sub(start) > 24*time.Hour {
|
||||||
|
return nil, fmt.Errorf("window too large")
|
||||||
|
}
|
||||||
|
|
||||||
|
successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsWindowStats{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
|
||||||
|
SuccessCount: successCount,
|
||||||
|
ErrorCountTotal: errorTotal,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
@@ -204,7 +204,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
|
|||||||
|
|
||||||
userToday := mustCreateUser(s.T(), s.client, &service.User{
|
userToday := mustCreateUser(s.T(), s.client, &service.User{
|
||||||
Email: "today@example.com",
|
Email: "today@example.com",
|
||||||
CreatedAt: maxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
|
CreatedAt: testMaxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
|
||||||
UpdatedAt: now,
|
UpdatedAt: now,
|
||||||
})
|
})
|
||||||
userOld := mustCreateUser(s.T(), s.client, &service.User{
|
userOld := mustCreateUser(s.T(), s.client, &service.User{
|
||||||
@@ -237,7 +237,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
|
|||||||
TotalCost: 1.5,
|
TotalCost: 1.5,
|
||||||
ActualCost: 1.2,
|
ActualCost: 1.2,
|
||||||
DurationMs: &d1,
|
DurationMs: &d1,
|
||||||
CreatedAt: maxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
|
CreatedAt: testMaxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
|
||||||
}
|
}
|
||||||
_, err = s.repo.Create(s.ctx, logToday)
|
_, err = s.repo.Create(s.ctx, logToday)
|
||||||
s.Require().NoError(err, "Create logToday")
|
s.Require().NoError(err, "Create logToday")
|
||||||
@@ -413,9 +413,17 @@ func (s *UsageLogRepoSuite) TestGetAccountTodayStats() {
|
|||||||
|
|
||||||
func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
||||||
now := time.Now().UTC().Truncate(time.Second)
|
now := time.Now().UTC().Truncate(time.Second)
|
||||||
hour1 := now.Add(-90 * time.Minute).Truncate(time.Hour)
|
// 使用固定的时间偏移确保 hour1 和 hour2 在同一天且都在过去
|
||||||
hour2 := now.Add(-30 * time.Minute).Truncate(time.Hour)
|
// 选择当天 02:00 和 03:00 作为测试时间点(基于 now 的日期)
|
||||||
dayStart := truncateToDayUTC(now)
|
dayStart := truncateToDayUTC(now)
|
||||||
|
hour1 := dayStart.Add(2 * time.Hour) // 当天 02:00
|
||||||
|
hour2 := dayStart.Add(3 * time.Hour) // 当天 03:00
|
||||||
|
// 如果当前时间早于 hour2,则使用昨天的时间
|
||||||
|
if now.Before(hour2.Add(time.Hour)) {
|
||||||
|
dayStart = dayStart.Add(-24 * time.Hour)
|
||||||
|
hour1 = dayStart.Add(2 * time.Hour)
|
||||||
|
hour2 = dayStart.Add(3 * time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"})
|
user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"})
|
||||||
user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"})
|
user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"})
|
||||||
@@ -473,7 +481,7 @@ func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
|||||||
|
|
||||||
aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx)
|
aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx)
|
||||||
aggStart := hour1.Add(-5 * time.Minute)
|
aggStart := hour1.Add(-5 * time.Minute)
|
||||||
aggEnd := now.Add(5 * time.Minute)
|
aggEnd := hour2.Add(time.Hour) // 确保覆盖 hour2 的所有数据
|
||||||
s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd))
|
s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd))
|
||||||
|
|
||||||
type hourlyRow struct {
|
type hourlyRow struct {
|
||||||
@@ -621,7 +629,7 @@ func (s *UsageLogRepoSuite) TestGetGlobalStats() {
|
|||||||
s.Require().Equal(int64(45), stats.TotalOutputTokens)
|
s.Require().Equal(int64(45), stats.TotalOutputTokens)
|
||||||
}
|
}
|
||||||
|
|
||||||
func maxTime(a, b time.Time) time.Time {
|
func testMaxTime(a, b time.Time) time.Time {
|
||||||
if a.After(b) {
|
if a.After(b) {
|
||||||
return a
|
return a
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ var ProviderSet = wire.NewSet(
|
|||||||
NewUsageLogRepository,
|
NewUsageLogRepository,
|
||||||
NewDashboardAggregationRepository,
|
NewDashboardAggregationRepository,
|
||||||
NewSettingRepository,
|
NewSettingRepository,
|
||||||
|
NewOpsRepository,
|
||||||
NewUserSubscriptionRepository,
|
NewUserSubscriptionRepository,
|
||||||
NewUserAttributeDefinitionRepository,
|
NewUserAttributeDefinitionRepository,
|
||||||
NewUserAttributeValueRepository,
|
NewUserAttributeValueRepository,
|
||||||
|
|||||||
@@ -262,11 +262,11 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
name: "GET /api/v1/admin/settings",
|
name: "GET /api/v1/admin/settings",
|
||||||
setup: func(t *testing.T, deps *contractDeps) {
|
setup: func(t *testing.T, deps *contractDeps) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
deps.settingRepo.SetAll(map[string]string{
|
deps.settingRepo.SetAll(map[string]string{
|
||||||
service.SettingKeyRegistrationEnabled: "true",
|
service.SettingKeyRegistrationEnabled: "true",
|
||||||
service.SettingKeyEmailVerifyEnabled: "false",
|
service.SettingKeyEmailVerifyEnabled: "false",
|
||||||
|
|
||||||
service.SettingKeySMTPHost: "smtp.example.com",
|
service.SettingKeySMTPHost: "smtp.example.com",
|
||||||
service.SettingKeySMTPPort: "587",
|
service.SettingKeySMTPPort: "587",
|
||||||
service.SettingKeySMTPUsername: "user",
|
service.SettingKeySMTPUsername: "user",
|
||||||
service.SettingKeySMTPPassword: "secret",
|
service.SettingKeySMTPPassword: "secret",
|
||||||
@@ -285,10 +285,15 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
service.SettingKeyContactInfo: "support",
|
service.SettingKeyContactInfo: "support",
|
||||||
service.SettingKeyDocURL: "https://docs.example.com",
|
service.SettingKeyDocURL: "https://docs.example.com",
|
||||||
|
|
||||||
service.SettingKeyDefaultConcurrency: "5",
|
service.SettingKeyDefaultConcurrency: "5",
|
||||||
service.SettingKeyDefaultBalance: "1.25",
|
service.SettingKeyDefaultBalance: "1.25",
|
||||||
})
|
|
||||||
},
|
service.SettingKeyOpsMonitoringEnabled: "false",
|
||||||
|
service.SettingKeyOpsRealtimeMonitoringEnabled: "true",
|
||||||
|
service.SettingKeyOpsQueryModeDefault: "auto",
|
||||||
|
service.SettingKeyOpsMetricsIntervalSeconds: "60",
|
||||||
|
})
|
||||||
|
},
|
||||||
method: http.MethodGet,
|
method: http.MethodGet,
|
||||||
path: "/api/v1/admin/settings",
|
path: "/api/v1/admin/settings",
|
||||||
wantStatus: http.StatusOK,
|
wantStatus: http.StatusOK,
|
||||||
@@ -309,13 +314,17 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
"turnstile_site_key": "site-key",
|
"turnstile_site_key": "site-key",
|
||||||
"turnstile_secret_key_configured": true,
|
"turnstile_secret_key_configured": true,
|
||||||
"linuxdo_connect_enabled": false,
|
"linuxdo_connect_enabled": false,
|
||||||
"linuxdo_connect_client_id": "",
|
"linuxdo_connect_client_id": "",
|
||||||
"linuxdo_connect_client_secret_configured": false,
|
"linuxdo_connect_client_secret_configured": false,
|
||||||
"linuxdo_connect_redirect_url": "",
|
"linuxdo_connect_redirect_url": "",
|
||||||
"site_name": "Sub2API",
|
"ops_monitoring_enabled": false,
|
||||||
"site_logo": "",
|
"ops_realtime_monitoring_enabled": true,
|
||||||
"site_subtitle": "Subtitle",
|
"ops_query_mode_default": "auto",
|
||||||
"api_base_url": "https://api.example.com",
|
"ops_metrics_interval_seconds": 60,
|
||||||
|
"site_name": "Sub2API",
|
||||||
|
"site_logo": "",
|
||||||
|
"site_subtitle": "Subtitle",
|
||||||
|
"api_base_url": "https://api.example.com",
|
||||||
"contact_info": "support",
|
"contact_info": "support",
|
||||||
"doc_url": "https://docs.example.com",
|
"doc_url": "https://docs.example.com",
|
||||||
"default_concurrency": 5,
|
"default_concurrency": 5,
|
||||||
@@ -430,7 +439,7 @@ func newContractDeps(t *testing.T) *contractDeps {
|
|||||||
authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil)
|
authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil)
|
||||||
apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService)
|
apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService)
|
||||||
usageHandler := handler.NewUsageHandler(usageService, apiKeyService)
|
usageHandler := handler.NewUsageHandler(usageService, apiKeyService)
|
||||||
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil)
|
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil, nil)
|
||||||
adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
||||||
|
|
||||||
jwtAuth := func(c *gin.Context) {
|
jwtAuth := func(c *gin.Context) {
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ func ProvideRouter(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
) *gin.Engine {
|
) *gin.Engine {
|
||||||
@@ -50,7 +51,7 @@ func ProvideRouter(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, settingService, cfg, redisClient)
|
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ProvideHTTPServer 提供 HTTP 服务器
|
// ProvideHTTPServer 提供 HTTP 服务器
|
||||||
|
|||||||
@@ -30,6 +30,20 @@ func adminAuth(
|
|||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
) gin.HandlerFunc {
|
) gin.HandlerFunc {
|
||||||
return func(c *gin.Context) {
|
return func(c *gin.Context) {
|
||||||
|
// WebSocket upgrade requests cannot set Authorization headers in browsers.
|
||||||
|
// For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via
|
||||||
|
// Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item:
|
||||||
|
// Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>
|
||||||
|
if isWebSocketUpgradeRequest(c) {
|
||||||
|
if token := extractJWTFromWebSocketSubprotocol(c); token != "" {
|
||||||
|
if !validateJWTForAdmin(c, token, authService, userService) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 检查 x-api-key header(Admin API Key 认证)
|
// 检查 x-api-key header(Admin API Key 认证)
|
||||||
apiKey := c.GetHeader("x-api-key")
|
apiKey := c.GetHeader("x-api-key")
|
||||||
if apiKey != "" {
|
if apiKey != "" {
|
||||||
@@ -58,6 +72,44 @@ func adminAuth(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isWebSocketUpgradeRequest(c *gin.Context) bool {
|
||||||
|
if c == nil || c.Request == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// RFC6455 handshake uses:
|
||||||
|
// Connection: Upgrade
|
||||||
|
// Upgrade: websocket
|
||||||
|
upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade")))
|
||||||
|
if upgrade != "websocket" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
connection := strings.ToLower(c.GetHeader("Connection"))
|
||||||
|
return strings.Contains(connection, "upgrade")
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractJWTFromWebSocketSubprotocol(c *gin.Context) string {
|
||||||
|
if c == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol"))
|
||||||
|
if raw == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// The header is a comma-separated list of tokens. We reserve the prefix "jwt."
|
||||||
|
// for carrying the admin JWT.
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
p := strings.TrimSpace(part)
|
||||||
|
if strings.HasPrefix(p, "jwt.") {
|
||||||
|
token := strings.TrimSpace(strings.TrimPrefix(p, "jwt."))
|
||||||
|
if token != "" {
|
||||||
|
return token
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
// validateAdminAPIKey 验证管理员 API Key
|
// validateAdminAPIKey 验证管理员 API Key
|
||||||
func validateAdminAPIKey(
|
func validateAdminAPIKey(
|
||||||
c *gin.Context,
|
c *gin.Context,
|
||||||
|
|||||||
30
backend/internal/server/middleware/client_request_id.go
Normal file
30
backend/internal/server/middleware/client_request_id.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ClientRequestID ensures every request has a unique client_request_id in request.Context().
|
||||||
|
//
|
||||||
|
// This is used by the Ops monitoring module for end-to-end request correlation.
|
||||||
|
func ClientRequestID() gin.HandlerFunc {
|
||||||
|
return func(c *gin.Context) {
|
||||||
|
if c.Request == nil {
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil {
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id := uuid.New().String()
|
||||||
|
c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id))
|
||||||
|
c.Next()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -23,6 +23,7 @@ func SetupRouter(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
@@ -46,7 +47,7 @@ func SetupRouter(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 注册路由
|
// 注册路由
|
||||||
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg, redisClient)
|
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg, redisClient)
|
||||||
|
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
@@ -60,6 +61,7 @@ func registerRoutes(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
) {
|
) {
|
||||||
@@ -73,5 +75,5 @@ func registerRoutes(
|
|||||||
routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient)
|
routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient)
|
||||||
routes.RegisterUserRoutes(v1, h, jwtAuth)
|
routes.RegisterUserRoutes(v1, h, jwtAuth)
|
||||||
routes.RegisterAdminRoutes(v1, h, adminAuth)
|
routes.RegisterAdminRoutes(v1, h, adminAuth)
|
||||||
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg)
|
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,6 +50,9 @@ func RegisterAdminRoutes(
|
|||||||
// 系统设置
|
// 系统设置
|
||||||
registerSettingsRoutes(admin, h)
|
registerSettingsRoutes(admin, h)
|
||||||
|
|
||||||
|
// 运维监控(Ops)
|
||||||
|
registerOpsRoutes(admin, h)
|
||||||
|
|
||||||
// 系统管理
|
// 系统管理
|
||||||
registerSystemRoutes(admin, h)
|
registerSystemRoutes(admin, h)
|
||||||
|
|
||||||
@@ -64,6 +67,58 @@ func RegisterAdminRoutes(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||||
|
ops := admin.Group("/ops")
|
||||||
|
{
|
||||||
|
// Realtime ops signals
|
||||||
|
ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats)
|
||||||
|
ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability)
|
||||||
|
|
||||||
|
// Alerts (rules + events)
|
||||||
|
ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules)
|
||||||
|
ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule)
|
||||||
|
ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule)
|
||||||
|
ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule)
|
||||||
|
ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents)
|
||||||
|
|
||||||
|
// Email notification config (DB-backed)
|
||||||
|
ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig)
|
||||||
|
ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig)
|
||||||
|
|
||||||
|
// Runtime settings (DB-backed)
|
||||||
|
runtime := ops.Group("/runtime")
|
||||||
|
{
|
||||||
|
runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings)
|
||||||
|
runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advanced settings (DB-backed)
|
||||||
|
ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings)
|
||||||
|
ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings)
|
||||||
|
|
||||||
|
// WebSocket realtime (QPS/TPS)
|
||||||
|
ws := ops.Group("/ws")
|
||||||
|
{
|
||||||
|
ws.GET("/qps", h.Admin.Ops.QPSWSHandler)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error logs (MVP-1)
|
||||||
|
ops.GET("/errors", h.Admin.Ops.GetErrorLogs)
|
||||||
|
ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID)
|
||||||
|
ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest)
|
||||||
|
|
||||||
|
// Request drilldown (success + error)
|
||||||
|
ops.GET("/requests", h.Admin.Ops.ListRequestDetails)
|
||||||
|
|
||||||
|
// Dashboard (vNext - raw path for MVP)
|
||||||
|
ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview)
|
||||||
|
ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend)
|
||||||
|
ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram)
|
||||||
|
ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend)
|
||||||
|
ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||||
dashboard := admin.Group("/dashboard")
|
dashboard := admin.Group("/dashboard")
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -16,13 +16,18 @@ func RegisterGatewayRoutes(
|
|||||||
apiKeyAuth middleware.APIKeyAuthMiddleware,
|
apiKeyAuth middleware.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
) {
|
) {
|
||||||
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
|
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
|
||||||
|
clientRequestID := middleware.ClientRequestID()
|
||||||
|
opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService)
|
||||||
|
|
||||||
// API网关(Claude API兼容)
|
// API网关(Claude API兼容)
|
||||||
gateway := r.Group("/v1")
|
gateway := r.Group("/v1")
|
||||||
gateway.Use(bodyLimit)
|
gateway.Use(bodyLimit)
|
||||||
|
gateway.Use(clientRequestID)
|
||||||
|
gateway.Use(opsErrorLogger)
|
||||||
gateway.Use(gin.HandlerFunc(apiKeyAuth))
|
gateway.Use(gin.HandlerFunc(apiKeyAuth))
|
||||||
{
|
{
|
||||||
gateway.POST("/messages", h.Gateway.Messages)
|
gateway.POST("/messages", h.Gateway.Messages)
|
||||||
@@ -36,6 +41,8 @@ func RegisterGatewayRoutes(
|
|||||||
// Gemini 原生 API 兼容层(Gemini SDK/CLI 直连)
|
// Gemini 原生 API 兼容层(Gemini SDK/CLI 直连)
|
||||||
gemini := r.Group("/v1beta")
|
gemini := r.Group("/v1beta")
|
||||||
gemini.Use(bodyLimit)
|
gemini.Use(bodyLimit)
|
||||||
|
gemini.Use(clientRequestID)
|
||||||
|
gemini.Use(opsErrorLogger)
|
||||||
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
||||||
{
|
{
|
||||||
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
|
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
|
||||||
@@ -45,7 +52,7 @@ func RegisterGatewayRoutes(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// OpenAI Responses API(不带v1前缀的别名)
|
// OpenAI Responses API(不带v1前缀的别名)
|
||||||
r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
|
r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
|
||||||
|
|
||||||
// Antigravity 模型列表
|
// Antigravity 模型列表
|
||||||
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
|
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
|
||||||
@@ -53,6 +60,8 @@ func RegisterGatewayRoutes(
|
|||||||
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
|
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
|
||||||
antigravityV1 := r.Group("/antigravity/v1")
|
antigravityV1 := r.Group("/antigravity/v1")
|
||||||
antigravityV1.Use(bodyLimit)
|
antigravityV1.Use(bodyLimit)
|
||||||
|
antigravityV1.Use(clientRequestID)
|
||||||
|
antigravityV1.Use(opsErrorLogger)
|
||||||
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
||||||
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
|
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
|
||||||
{
|
{
|
||||||
@@ -64,6 +73,8 @@ func RegisterGatewayRoutes(
|
|||||||
|
|
||||||
antigravityV1Beta := r.Group("/antigravity/v1beta")
|
antigravityV1Beta := r.Group("/antigravity/v1beta")
|
||||||
antigravityV1Beta.Use(bodyLimit)
|
antigravityV1Beta.Use(bodyLimit)
|
||||||
|
antigravityV1Beta.Use(clientRequestID)
|
||||||
|
antigravityV1Beta.Use(opsErrorLogger)
|
||||||
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
||||||
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -564,6 +564,14 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
// 检查是否应触发 URL 降级
|
// 检查是否应触发 URL 降级
|
||||||
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
@@ -579,6 +587,7 @@ urlFallbackLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries")
|
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -586,6 +595,26 @@ urlFallbackLoop:
|
|||||||
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
||||||
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
||||||
continue urlFallbackLoop
|
continue urlFallbackLoop
|
||||||
@@ -596,6 +625,26 @@ urlFallbackLoop:
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if attempt < antigravityMaxRetries {
|
if attempt < antigravityMaxRetries {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
|
log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
|
||||||
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
||||||
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
||||||
@@ -628,6 +677,27 @@ urlFallbackLoop:
|
|||||||
// Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验,
|
// Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验,
|
||||||
// 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。
|
// 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。
|
||||||
if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) {
|
if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// Conservative two-stage fallback:
|
// Conservative two-stage fallback:
|
||||||
// 1) Disable top-level thinking + thinking->text
|
// 1) Disable top-level thinking + thinking->text
|
||||||
// 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text.
|
// 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text.
|
||||||
@@ -661,6 +731,13 @@ urlFallbackLoop:
|
|||||||
}
|
}
|
||||||
retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency)
|
retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if retryErr != nil {
|
if retryErr != nil {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "signature_retry_request_error",
|
||||||
|
Message: sanitizeUpstreamErrorMessage(retryErr.Error()),
|
||||||
|
})
|
||||||
log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr)
|
log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -674,6 +751,25 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
||||||
_ = retryResp.Body.Close()
|
_ = retryResp.Body.Close()
|
||||||
|
kind := "signature_retry"
|
||||||
|
if strings.TrimSpace(stage.name) != "" {
|
||||||
|
kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_")
|
||||||
|
}
|
||||||
|
retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody))
|
||||||
|
retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg)
|
||||||
|
retryUpstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
retryUpstreamDetail = truncateString(string(retryBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: retryResp.StatusCode,
|
||||||
|
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||||
|
Kind: kind,
|
||||||
|
Message: retryUpstreamMsg,
|
||||||
|
Detail: retryUpstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// If this stage fixed the signature issue, we stop; otherwise we may try the next stage.
|
// If this stage fixed the signature issue, we stop; otherwise we may try the next stage.
|
||||||
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) {
|
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) {
|
||||||
@@ -701,10 +797,30 @@ urlFallbackLoop:
|
|||||||
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody)
|
return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1108,6 +1224,14 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
// 检查是否应触发 URL 降级
|
// 检查是否应触发 URL 降级
|
||||||
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
@@ -1123,6 +1247,7 @@ urlFallbackLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries")
|
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1130,6 +1255,26 @@ urlFallbackLoop:
|
|||||||
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
||||||
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
||||||
continue urlFallbackLoop
|
continue urlFallbackLoop
|
||||||
@@ -1140,6 +1285,26 @@ urlFallbackLoop:
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if attempt < antigravityMaxRetries {
|
if attempt < antigravityMaxRetries {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries)
|
log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries)
|
||||||
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
||||||
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
||||||
@@ -1205,21 +1370,59 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 解包并返回错误
|
|
||||||
requestID := resp.Header.Get("x-request-id")
|
requestID := resp.Header.Get("x-request-id")
|
||||||
if requestID != "" {
|
if requestID != "" {
|
||||||
c.Header("x-request-id", requestID)
|
c.Header("x-request-id", requestID)
|
||||||
}
|
}
|
||||||
unwrapped, _ := s.unwrapV1InternalResponse(respBody)
|
|
||||||
|
unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody)
|
||||||
|
unwrappedForOps := unwrapped
|
||||||
|
if unwrapErr != nil || len(unwrappedForOps) == 0 {
|
||||||
|
unwrappedForOps = respBody
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(unwrappedForOps), maxBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Always record upstream context for Ops error logs, even when we will failover.
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
|
}
|
||||||
|
|
||||||
contentType := resp.Header.Get("Content-Type")
|
contentType := resp.Header.Get("Content-Type")
|
||||||
if contentType == "" {
|
if contentType == "" {
|
||||||
contentType = "application/json"
|
contentType = "application/json"
|
||||||
}
|
}
|
||||||
c.Data(resp.StatusCode, contentType, unwrapped)
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
c.Data(resp.StatusCode, contentType, unwrappedForOps)
|
||||||
return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode)
|
return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int,
|
|||||||
return fmt.Errorf("%s", message)
|
return fmt.Errorf("%s", message)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error {
|
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
|
||||||
// 记录上游错误详情便于调试
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body))
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: upstreamStatus,
|
||||||
|
UpstreamRequestID: upstreamRequestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
// 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端)
|
||||||
|
if logBody {
|
||||||
|
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes))
|
||||||
|
}
|
||||||
|
|
||||||
var statusCode int
|
var statusCode int
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
@@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr
|
|||||||
"type": "error",
|
"type": "error",
|
||||||
"error": gin.H{"type": errType, "message": errMsg},
|
"error": gin.H{"type": errType, "message": errMsg},
|
||||||
})
|
})
|
||||||
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error {
|
func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error {
|
||||||
|
|||||||
@@ -357,7 +357,7 @@ func (s *AuthService) Login(ctx context.Context, email, password string) (string
|
|||||||
// - 如果邮箱已存在:直接登录(不需要本地密码)
|
// - 如果邮箱已存在:直接登录(不需要本地密码)
|
||||||
// - 如果邮箱不存在:创建新用户并登录
|
// - 如果邮箱不存在:创建新用户并登录
|
||||||
//
|
//
|
||||||
// 注意:该函数用于“终端用户登录 Sub2API 本身”的场景(不同于上游账号的 OAuth,例如 OpenAI/Gemini)。
|
// 注意:该函数用于 LinuxDo OAuth 登录场景(不同于上游账号的 OAuth,例如 Claude/OpenAI/Gemini)。
|
||||||
// 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。
|
// 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。
|
||||||
func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) {
|
func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) {
|
||||||
email = strings.TrimSpace(email)
|
email = strings.TrimSpace(email)
|
||||||
@@ -376,8 +376,8 @@ func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username
|
|||||||
user, err := s.userRepo.GetByEmail(ctx, email)
|
user, err := s.userRepo.GetByEmail(ctx, email)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, ErrUserNotFound) {
|
if errors.Is(err, ErrUserNotFound) {
|
||||||
// OAuth 首次登录视为注册。
|
// OAuth 首次登录视为注册(fail-close:settingService 未配置时不允许注册)
|
||||||
if s.settingService != nil && !s.settingService.IsRegistrationEnabled(ctx) {
|
if s.settingService == nil || !s.settingService.IsRegistrationEnabled(ctx) {
|
||||||
return "", nil, ErrRegDisabled
|
return "", nil, ErrRegDisabled
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,6 +63,9 @@ const (
|
|||||||
SubscriptionStatusSuspended = "suspended"
|
SubscriptionStatusSuspended = "suspended"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。
|
||||||
|
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
|
||||||
|
|
||||||
// Setting keys
|
// Setting keys
|
||||||
const (
|
const (
|
||||||
// 注册设置
|
// 注册设置
|
||||||
@@ -83,6 +86,12 @@ const (
|
|||||||
SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key
|
SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key
|
||||||
SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key
|
SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key
|
||||||
|
|
||||||
|
// LinuxDo Connect OAuth 登录设置
|
||||||
|
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
|
||||||
|
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
|
||||||
|
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
|
||||||
|
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
|
||||||
|
|
||||||
// OEM设置
|
// OEM设置
|
||||||
SettingKeySiteName = "site_name" // 网站名称
|
SettingKeySiteName = "site_name" // 网站名称
|
||||||
SettingKeySiteLogo = "site_logo" // 网站Logo (base64)
|
SettingKeySiteLogo = "site_logo" // 网站Logo (base64)
|
||||||
@@ -113,16 +122,31 @@ const (
|
|||||||
SettingKeyEnableIdentityPatch = "enable_identity_patch"
|
SettingKeyEnableIdentityPatch = "enable_identity_patch"
|
||||||
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
|
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// =========================
|
||||||
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
|
// Ops Monitoring (vNext)
|
||||||
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
|
// =========================
|
||||||
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
|
|
||||||
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
|
|
||||||
)
|
|
||||||
|
|
||||||
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。
|
// SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
|
||||||
// 目的:避免第三方登录返回的用户标识与本地真实邮箱发生碰撞,进而造成账号被接管的风险。
|
SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"
|
||||||
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
|
|
||||||
|
// SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
|
||||||
|
SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"
|
||||||
|
|
||||||
|
// SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
|
||||||
|
SettingKeyOpsQueryModeDefault = "ops_query_mode_default"
|
||||||
|
|
||||||
|
// SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
|
||||||
|
SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"
|
||||||
|
|
||||||
|
// SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
|
||||||
|
SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"
|
||||||
|
|
||||||
|
// SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
|
||||||
|
SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
|
||||||
|
|
||||||
|
// SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation).
|
||||||
|
SettingKeyOpsAdvancedSettings = "ops_advanced_settings"
|
||||||
|
)
|
||||||
|
|
||||||
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
|
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
|
||||||
const AdminAPIKeyPrefix = "admin-"
|
const AdminAPIKeyPrefix = "admin-"
|
||||||
|
|||||||
@@ -1399,7 +1399,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
if resp != nil && resp.Body != nil {
|
if resp != nil && resp.Body != nil {
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
}
|
}
|
||||||
return nil, fmt.Errorf("upstream request failed: %w", err)
|
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
|
"type": "error",
|
||||||
|
"error": gin.H{
|
||||||
|
"type": "upstream_error",
|
||||||
|
"message": "Upstream request failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 优先检测thinking block签名错误(400)并重试一次
|
// 优先检测thinking block签名错误(400)并重试一次
|
||||||
@@ -1409,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if s.isThinkingBlockSignatureError(respBody) {
|
if s.isThinkingBlockSignatureError(respBody) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
|
|
||||||
looksLikeToolSignatureError := func(msg string) bool {
|
looksLikeToolSignatureError := func(msg string) bool {
|
||||||
m := strings.ToLower(msg)
|
m := strings.ToLower(msg)
|
||||||
return strings.Contains(m, "tool_use") ||
|
return strings.Contains(m, "tool_use") ||
|
||||||
@@ -1445,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
||||||
_ = retryResp.Body.Close()
|
_ = retryResp.Body.Close()
|
||||||
if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) {
|
if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: retryResp.StatusCode,
|
||||||
|
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_retry_thinking",
|
||||||
|
Message: extractUpstreamErrorMessage(retryRespBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
msg2 := extractUpstreamErrorMessage(retryRespBody)
|
msg2 := extractUpstreamErrorMessage(retryRespBody)
|
||||||
if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed {
|
if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed {
|
||||||
log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID)
|
log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID)
|
||||||
@@ -1459,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
if retryResp2 != nil && retryResp2.Body != nil {
|
if retryResp2 != nil && retryResp2.Body != nil {
|
||||||
_ = retryResp2.Body.Close()
|
_ = retryResp2.Body.Close()
|
||||||
}
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "signature_retry_tools_request_error",
|
||||||
|
Message: sanitizeUpstreamErrorMessage(retryErr2.Error()),
|
||||||
|
})
|
||||||
log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2)
|
log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2)
|
log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2)
|
||||||
@@ -1508,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)",
|
log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)",
|
||||||
account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed)
|
account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed)
|
||||||
_ = resp.Body.Close()
|
|
||||||
if err := sleepWithContext(ctx, delay); err != nil {
|
if err := sleepWithContext(ctx, delay); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -1538,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
// 处理重试耗尽的情况
|
// 处理重试耗尽的情况
|
||||||
if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) {
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry_exhausted_failover",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
return s.handleRetryExhaustedError(ctx, resp, c, account)
|
return s.handleRetryExhaustedError(ctx, resp, c, account)
|
||||||
@@ -1546,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
|
|
||||||
// 处理可切换账号的错误
|
// 处理可切换账号的错误
|
||||||
if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleFailoverSideEffects(ctx, resp, account)
|
s.handleFailoverSideEffects(ctx, resp, account)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1563,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
if s.shouldFailoverOn400(respBody) {
|
if s.shouldFailoverOn400(respBody) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover_on_400",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
if s.cfg.Gateway.LogUpstreamErrorBody {
|
if s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
log.Printf(
|
log.Printf(
|
||||||
"Account %d: 400 error, attempting failover: %s",
|
"Account %d: 400 error, attempting failover: %s",
|
||||||
@@ -1859,7 +1983,30 @@ func extractUpstreamErrorMessage(body []byte) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
// Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet.
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// 处理上游错误,标记账号状态
|
// 处理上游错误,标记账号状态
|
||||||
shouldDisable := false
|
shouldDisable := false
|
||||||
@@ -1870,24 +2017,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
|
|||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端)
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"Upstream error %d (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// 根据状态码返回适当的自定义错误响应(不透传上游详细信息)
|
// 根据状态码返回适当的自定义错误响应(不透传上游详细信息)
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
var statusCode int
|
var statusCode int
|
||||||
|
|
||||||
switch resp.StatusCode {
|
switch resp.StatusCode {
|
||||||
case 400:
|
case 400:
|
||||||
// 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开
|
|
||||||
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
|
||||||
log.Printf(
|
|
||||||
"Upstream 400 error (account=%d platform=%s type=%s): %s",
|
|
||||||
account.ID,
|
|
||||||
account.Platform,
|
|
||||||
account.Type,
|
|
||||||
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
c.Data(http.StatusBadRequest, "application/json", body)
|
c.Data(http.StatusBadRequest, "application/json", body)
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
summary := upstreamMsg
|
||||||
|
if summary == "" {
|
||||||
|
summary = truncateForLog(body, 512)
|
||||||
|
}
|
||||||
|
if summary == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary)
|
||||||
case 401:
|
case 401:
|
||||||
statusCode = http.StatusBadGateway
|
statusCode = http.StatusBadGateway
|
||||||
errType = "upstream_error"
|
errType = "upstream_error"
|
||||||
@@ -1923,11 +2079,14 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
statusCode := resp.StatusCode
|
statusCode := resp.StatusCode
|
||||||
|
|
||||||
// OAuth/Setup Token 账号的 403:标记账号异常
|
// OAuth/Setup Token 账号的 403:标记账号异常
|
||||||
@@ -1941,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1949,8 +2108,45 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht
|
|||||||
// OAuth 403:标记账号异常
|
// OAuth 403:标记账号异常
|
||||||
// API Key 未配置错误码:仅返回错误,不标记账号
|
// API Key 未配置错误码:仅返回错误,不标记账号
|
||||||
func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
||||||
|
// Capture upstream error body before side-effects consume the stream.
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry_exhausted",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// 返回统一的重试耗尽错误响应
|
// 返回统一的重试耗尽错误响应
|
||||||
c.JSON(http.StatusBadGateway, gin.H{
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
"type": "error",
|
"type": "error",
|
||||||
@@ -1960,7 +2156,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// streamingResult 流式响应结果
|
// streamingResult 流式响应结果
|
||||||
@@ -2490,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
// 发送请求
|
// 发送请求
|
||||||
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "")
|
||||||
s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed")
|
s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed")
|
||||||
return fmt.Errorf("upstream request failed: %w", err)
|
return fmt.Errorf("upstream request failed: %w", err)
|
||||||
}
|
}
|
||||||
@@ -2527,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
// 标记账号状态(429/529等)
|
// 标记账号状态(429/529等)
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
// 记录上游错误摘要便于排障(不回显请求内容)
|
// 记录上游错误摘要便于排障(不回显请求内容)
|
||||||
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
log.Printf(
|
log.Printf(
|
||||||
@@ -2548,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
errMsg = "Service overloaded"
|
errMsg = "Service overloaded"
|
||||||
}
|
}
|
||||||
s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg)
|
s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg)
|
||||||
return fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 透传成功响应
|
// 透传成功响应
|
||||||
|
|||||||
@@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special-case: signature/thought_signature validation errors are not transient, but may be fixed by
|
// Special-case: signature/thought_signature validation errors are not transient, but may be fixed by
|
||||||
@@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if isGeminiSignatureRelatedError(respBody) {
|
if isGeminiSignatureRelatedError(respBody) {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
var strippedClaudeBody []byte
|
var strippedClaudeBody []byte
|
||||||
stageName := ""
|
stageName := ""
|
||||||
switch signatureRetryStage {
|
switch signatureRetryStage {
|
||||||
@@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
}
|
}
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
@@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
}
|
}
|
||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
if tempMatched {
|
if tempMatched {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody)
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
requestID := resp.Header.Get(requestIDHeader)
|
requestID := resp.Header.Get(requestIDHeader)
|
||||||
@@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
@@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
FirstTokenMs: nil,
|
FirstTokenMs: nil,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) {
|
||||||
@@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
}
|
}
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
@@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
}
|
}
|
||||||
|
|
||||||
if tempMatched {
|
if tempMatched {
|
||||||
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(evBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(evBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
respBody = unwrapIfNeeded(isOAuth, respBody)
|
respBody = unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
contentType := resp.Header.Get("Content-Type")
|
contentType := resp.Header.Get("Content-Type")
|
||||||
if contentType == "" {
|
if contentType == "" {
|
||||||
contentType = "application/json"
|
contentType = "application/json"
|
||||||
}
|
}
|
||||||
c.Data(resp.StatusCode, contentType, respBody)
|
c.Data(resp.StatusCode, contentType, respBody)
|
||||||
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
var usage *ClaudeUsage
|
var usage *ClaudeUsage
|
||||||
@@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string {
|
|||||||
return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`)
|
return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error {
|
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: upstreamStatus,
|
||||||
|
UpstreamRequestID: upstreamRequestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
|
||||||
|
}
|
||||||
|
|
||||||
var statusCode int
|
var statusCode int
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
|
|
||||||
@@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups
|
|||||||
"type": "error",
|
"type": "error",
|
||||||
"error": gin.H{"type": errType, "message": errMsg},
|
"error": gin.H{"type": errType, "message": errMsg},
|
||||||
})
|
})
|
||||||
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
type claudeErrorMapping struct {
|
type claudeErrorMapping struct {
|
||||||
|
|||||||
@@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
|
|||||||
existingInstructions = strings.TrimSpace(existingInstructions)
|
existingInstructions = strings.TrimSpace(existingInstructions)
|
||||||
|
|
||||||
if instructions != "" {
|
if instructions != "" {
|
||||||
if existingInstructions != "" && existingInstructions != instructions {
|
|
||||||
if input, ok := reqBody["input"].([]any); ok {
|
|
||||||
reqBody["input"] = prependSystemInstruction(input, existingInstructions)
|
|
||||||
result.Modified = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if existingInstructions != instructions {
|
if existingInstructions != instructions {
|
||||||
reqBody["instructions"] = instructions
|
reqBody["instructions"] = instructions
|
||||||
result.Modified = true
|
result.Modified = true
|
||||||
@@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
|
|||||||
|
|
||||||
if input, ok := reqBody["input"].([]any); ok {
|
if input, ok := reqBody["input"].([]any); ok {
|
||||||
input = filterCodexInput(input)
|
input = filterCodexInput(input)
|
||||||
input = normalizeOrphanedToolOutputs(input)
|
|
||||||
reqBody["input"] = input
|
reqBody["input"] = input
|
||||||
result.Modified = true
|
result.Modified = true
|
||||||
}
|
}
|
||||||
@@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any {
|
|||||||
return filtered
|
return filtered
|
||||||
}
|
}
|
||||||
|
|
||||||
func prependSystemInstruction(input []any, instructions string) []any {
|
|
||||||
message := map[string]any{
|
|
||||||
"role": "system",
|
|
||||||
"content": []any{
|
|
||||||
map[string]any{
|
|
||||||
"type": "input_text",
|
|
||||||
"text": instructions,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
return append([]any{message}, input...)
|
|
||||||
}
|
|
||||||
|
|
||||||
func normalizeCodexTools(reqBody map[string]any) bool {
|
func normalizeCodexTools(reqBody map[string]any) bool {
|
||||||
rawTools, ok := reqBody["tools"]
|
rawTools, ok := reqBody["tools"]
|
||||||
if !ok || rawTools == nil {
|
if !ok || rawTools == nil {
|
||||||
@@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool {
|
|||||||
return modified
|
return modified
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeOrphanedToolOutputs(input []any) []any {
|
|
||||||
functionCallIDs := map[string]bool{}
|
|
||||||
localShellCallIDs := map[string]bool{}
|
|
||||||
customToolCallIDs := map[string]bool{}
|
|
||||||
|
|
||||||
for _, item := range input {
|
|
||||||
m, ok := item.(map[string]any)
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch m["type"] {
|
|
||||||
case "function_call":
|
|
||||||
functionCallIDs[callID] = true
|
|
||||||
case "local_shell_call":
|
|
||||||
localShellCallIDs[callID] = true
|
|
||||||
case "custom_tool_call":
|
|
||||||
customToolCallIDs[callID] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output := make([]any, 0, len(input))
|
|
||||||
for _, item := range input {
|
|
||||||
m, ok := item.(map[string]any)
|
|
||||||
if !ok {
|
|
||||||
output = append(output, item)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch m["type"] {
|
|
||||||
case "function_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
case "custom_tool_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || !customToolCallIDs[callID] {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
case "local_shell_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || !localShellCallIDs[callID] {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
output = append(output, m)
|
|
||||||
}
|
|
||||||
return output
|
|
||||||
}
|
|
||||||
|
|
||||||
func getCallID(item map[string]any) string {
|
|
||||||
raw, ok := item["call_id"]
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
callID, ok := raw.(string)
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
callID = strings.TrimSpace(callID)
|
|
||||||
if callID == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return callID
|
|
||||||
}
|
|
||||||
|
|
||||||
func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any {
|
|
||||||
toolName := "tool"
|
|
||||||
if name, ok := item["name"].(string); ok && name != "" {
|
|
||||||
toolName = name
|
|
||||||
}
|
|
||||||
labelID := callID
|
|
||||||
if labelID == "" {
|
|
||||||
labelID = "unknown"
|
|
||||||
}
|
|
||||||
text := stringifyOutput(item["output"])
|
|
||||||
if len(text) > 16000 {
|
|
||||||
text = text[:16000] + "\n...[truncated]"
|
|
||||||
}
|
|
||||||
return map[string]any{
|
|
||||||
"type": "message",
|
|
||||||
"role": "assistant",
|
|
||||||
"content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func stringifyOutput(output any) string {
|
|
||||||
switch v := output.(type) {
|
|
||||||
case string:
|
|
||||||
return v
|
|
||||||
default:
|
|
||||||
if data, err := json.Marshal(v); err == nil {
|
|
||||||
return string(data)
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%v", v)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func codexCachePath(filename string) string {
|
func codexCachePath(filename string) string {
|
||||||
home, err := os.UserHomeDir()
|
home, err := os.UserHomeDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -513,7 +512,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -594,13 +593,53 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
|
|||||||
// Send request
|
// Send request
|
||||||
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("upstream request failed: %w", err)
|
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
|
"error": gin.H{
|
||||||
|
"type": "upstream_error",
|
||||||
|
"message": "Upstream request failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
|
||||||
}
|
}
|
||||||
defer func() { _ = resp.Body.Close() }()
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
|
||||||
// Handle error response
|
// Handle error response
|
||||||
if resp.StatusCode >= 400 {
|
if resp.StatusCode >= 400 {
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
s.handleFailoverSideEffects(ctx, resp, account)
|
s.handleFailoverSideEffects(ctx, resp, account)
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
@@ -724,18 +763,52 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin.
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) {
|
func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
logUpstreamErrorBody(account.ID, resp.StatusCode, body)
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"OpenAI upstream error %d (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// Check custom error codes
|
// Check custom error codes
|
||||||
if !account.ShouldHandleErrorCode(resp.StatusCode) {
|
if !account.ShouldHandleErrorCode(resp.StatusCode) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
c.JSON(http.StatusInternalServerError, gin.H{
|
c.JSON(http.StatusInternalServerError, gin.H{
|
||||||
"error": gin.H{
|
"error": gin.H{
|
||||||
"type": "upstream_error",
|
"type": "upstream_error",
|
||||||
"message": "Upstream gateway error",
|
"message": "Upstream gateway error",
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle upstream error (mark account status)
|
// Handle upstream error (mark account status)
|
||||||
@@ -743,6 +816,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
|||||||
if s.rateLimitService != nil {
|
if s.rateLimitService != nil {
|
||||||
shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
kind := "http_error"
|
||||||
|
if shouldDisable {
|
||||||
|
kind = "failover"
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: kind,
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
if shouldDisable {
|
if shouldDisable {
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
@@ -781,25 +867,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
}
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
|
||||||
func logUpstreamErrorBody(accountID int64, statusCode int, body []byte) {
|
|
||||||
if strings.ToLower(strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY"))) != "true" {
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
maxBytes := 2048
|
|
||||||
if rawMax := strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY_MAX_BYTES")); rawMax != "" {
|
|
||||||
if parsed, err := strconv.Atoi(rawMax); err == nil && parsed > 0 {
|
|
||||||
maxBytes = parsed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(body) > maxBytes {
|
|
||||||
body = body[:maxBytes]
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("Upstream error body: account=%d status=%d body=%q", accountID, statusCode, string(body))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// openaiStreamingResult streaming response result
|
// openaiStreamingResult streaming response result
|
||||||
|
|||||||
194
backend/internal/service/ops_account_availability.go
Normal file
194
backend/internal/service/ops_account_availability.go
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetAccountAvailabilityStats returns current account availability stats.
|
||||||
|
//
|
||||||
|
// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
|
||||||
|
func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
|
||||||
|
map[string]*PlatformAvailability,
|
||||||
|
map[int64]*GroupAvailability,
|
||||||
|
map[int64]*AccountAvailability,
|
||||||
|
*time.Time,
|
||||||
|
error,
|
||||||
|
) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
filtered := make([]Account, 0, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp != nil && grp.ID == *groupIDFilter {
|
||||||
|
filtered = append(filtered, acc)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
accounts = filtered
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
collectedAt := now
|
||||||
|
|
||||||
|
platform := make(map[string]*PlatformAvailability)
|
||||||
|
group := make(map[int64]*GroupAvailability)
|
||||||
|
account := make(map[int64]*AccountAvailability)
|
||||||
|
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
isTempUnsched := false
|
||||||
|
if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
|
||||||
|
isTempUnsched = true
|
||||||
|
}
|
||||||
|
|
||||||
|
isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
|
||||||
|
isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
|
||||||
|
hasError := acc.Status == StatusError
|
||||||
|
|
||||||
|
// Normalize exclusive status flags so the UI doesn't show conflicting badges.
|
||||||
|
if hasError {
|
||||||
|
isRateLimited = false
|
||||||
|
isOverloaded = false
|
||||||
|
}
|
||||||
|
|
||||||
|
isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched
|
||||||
|
|
||||||
|
if acc.Platform != "" {
|
||||||
|
if _, ok := platform[acc.Platform]; !ok {
|
||||||
|
platform[acc.Platform] = &PlatformAvailability{
|
||||||
|
Platform: acc.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p := platform[acc.Platform]
|
||||||
|
p.TotalAccounts++
|
||||||
|
if isAvailable {
|
||||||
|
p.AvailableCount++
|
||||||
|
}
|
||||||
|
if isRateLimited {
|
||||||
|
p.RateLimitCount++
|
||||||
|
}
|
||||||
|
if hasError {
|
||||||
|
p.ErrorCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupAvailability{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
g.TotalAccounts++
|
||||||
|
if isAvailable {
|
||||||
|
g.AvailableCount++
|
||||||
|
}
|
||||||
|
if isRateLimited {
|
||||||
|
g.RateLimitCount++
|
||||||
|
}
|
||||||
|
if hasError {
|
||||||
|
g.ErrorCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
displayGroupID := int64(0)
|
||||||
|
displayGroupName := ""
|
||||||
|
if len(acc.Groups) > 0 && acc.Groups[0] != nil {
|
||||||
|
displayGroupID = acc.Groups[0].ID
|
||||||
|
displayGroupName = acc.Groups[0].Name
|
||||||
|
}
|
||||||
|
|
||||||
|
item := &AccountAvailability{
|
||||||
|
AccountID: acc.ID,
|
||||||
|
AccountName: acc.Name,
|
||||||
|
Platform: acc.Platform,
|
||||||
|
GroupID: displayGroupID,
|
||||||
|
GroupName: displayGroupName,
|
||||||
|
Status: acc.Status,
|
||||||
|
|
||||||
|
IsAvailable: isAvailable,
|
||||||
|
IsRateLimited: isRateLimited,
|
||||||
|
IsOverloaded: isOverloaded,
|
||||||
|
HasError: hasError,
|
||||||
|
|
||||||
|
ErrorMessage: acc.ErrorMessage,
|
||||||
|
}
|
||||||
|
|
||||||
|
if isRateLimited && acc.RateLimitResetAt != nil {
|
||||||
|
item.RateLimitResetAt = acc.RateLimitResetAt
|
||||||
|
remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
|
||||||
|
if remainingSec > 0 {
|
||||||
|
item.RateLimitRemainingSec = &remainingSec
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if isOverloaded && acc.OverloadUntil != nil {
|
||||||
|
item.OverloadUntil = acc.OverloadUntil
|
||||||
|
remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
|
||||||
|
if remainingSec > 0 {
|
||||||
|
item.OverloadRemainingSec = &remainingSec
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if isTempUnsched && acc.TempUnschedulableUntil != nil {
|
||||||
|
item.TempUnschedulableUntil = acc.TempUnschedulableUntil
|
||||||
|
}
|
||||||
|
|
||||||
|
account[acc.ID] = item
|
||||||
|
}
|
||||||
|
|
||||||
|
return platform, group, account, &collectedAt, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsAccountAvailability struct {
|
||||||
|
Group *GroupAvailability
|
||||||
|
Accounts map[int64]*AccountAvailability
|
||||||
|
CollectedAt *time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) {
|
||||||
|
if s == nil {
|
||||||
|
return nil, errors.New("ops service is nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.getAccountAvailability != nil {
|
||||||
|
return s.getAccountAvailability(ctx, platformFilter, groupIDFilter)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var group *GroupAvailability
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
group = groupStats[*groupIDFilter]
|
||||||
|
}
|
||||||
|
|
||||||
|
if accountStats == nil {
|
||||||
|
accountStats = map[int64]*AccountAvailability{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &OpsAccountAvailability{
|
||||||
|
Group: group,
|
||||||
|
Accounts: accountStats,
|
||||||
|
CollectedAt: collectedAt,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
46
backend/internal/service/ops_advisory_lock.go
Normal file
46
backend/internal/service/ops_advisory_lock.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"hash/fnv"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func hashAdvisoryLockID(key string) int64 {
|
||||||
|
h := fnv.New64a()
|
||||||
|
_, _ = h.Write([]byte(key))
|
||||||
|
return int64(h.Sum64())
|
||||||
|
}
|
||||||
|
|
||||||
|
func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
|
||||||
|
if db == nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
conn, err := db.Conn(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
acquired := false
|
||||||
|
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if !acquired {
|
||||||
|
_ = conn.Close()
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
release := func() {
|
||||||
|
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
|
||||||
|
_ = conn.Close()
|
||||||
|
}
|
||||||
|
return release, true
|
||||||
|
}
|
||||||
443
backend/internal/service/ops_aggregation_service.go
Normal file
443
backend/internal/service/ops_aggregation_service.go
Normal file
@@ -0,0 +1,443 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsAggHourlyJobName = "ops_preaggregation_hourly"
|
||||||
|
opsAggDailyJobName = "ops_preaggregation_daily"
|
||||||
|
|
||||||
|
opsAggHourlyInterval = 10 * time.Minute
|
||||||
|
opsAggDailyInterval = 1 * time.Hour
|
||||||
|
|
||||||
|
// Keep in sync with ops retention target (vNext default 30d).
|
||||||
|
opsAggBackfillWindow = 30 * 24 * time.Hour
|
||||||
|
|
||||||
|
// Recompute overlap to absorb late-arriving rows near boundaries.
|
||||||
|
opsAggHourlyOverlap = 2 * time.Hour
|
||||||
|
opsAggDailyOverlap = 48 * time.Hour
|
||||||
|
|
||||||
|
opsAggHourlyChunk = 24 * time.Hour
|
||||||
|
opsAggDailyChunk = 7 * 24 * time.Hour
|
||||||
|
|
||||||
|
// Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
|
||||||
|
// that may still receive late inserts.
|
||||||
|
opsAggSafeDelay = 5 * time.Minute
|
||||||
|
|
||||||
|
opsAggMaxQueryTimeout = 3 * time.Second
|
||||||
|
opsAggHourlyTimeout = 5 * time.Minute
|
||||||
|
opsAggDailyTimeout = 2 * time.Minute
|
||||||
|
|
||||||
|
opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
|
||||||
|
opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
|
||||||
|
|
||||||
|
opsAggHourlyLeaderLockTTL = 15 * time.Minute
|
||||||
|
opsAggDailyLeaderLockTTL = 10 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
|
||||||
|
// for stable long-window dashboard queries.
|
||||||
|
//
|
||||||
|
// It is safe to run in multi-replica deployments when Redis is available (leader lock).
|
||||||
|
type OpsAggregationService struct {
|
||||||
|
opsRepo OpsRepository
|
||||||
|
settingRepo SettingRepository
|
||||||
|
cfg *config.Config
|
||||||
|
|
||||||
|
db *sql.DB
|
||||||
|
redisClient *redis.Client
|
||||||
|
instanceID string
|
||||||
|
|
||||||
|
stopCh chan struct{}
|
||||||
|
startOnce sync.Once
|
||||||
|
stopOnce sync.Once
|
||||||
|
|
||||||
|
hourlyMu sync.Mutex
|
||||||
|
dailyMu sync.Mutex
|
||||||
|
|
||||||
|
skipLogMu sync.Mutex
|
||||||
|
skipLogAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewOpsAggregationService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAggregationService {
|
||||||
|
return &OpsAggregationService{
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
settingRepo: settingRepo,
|
||||||
|
cfg: cfg,
|
||||||
|
db: db,
|
||||||
|
redisClient: redisClient,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) Start() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.startOnce.Do(func() {
|
||||||
|
if s.stopCh == nil {
|
||||||
|
s.stopCh = make(chan struct{})
|
||||||
|
}
|
||||||
|
go s.hourlyLoop()
|
||||||
|
go s.dailyLoop()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.stopCh != nil {
|
||||||
|
close(s.stopCh)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) hourlyLoop() {
|
||||||
|
// First run immediately.
|
||||||
|
s.aggregateHourly()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(opsAggHourlyInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
s.aggregateHourly()
|
||||||
|
case <-s.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) dailyLoop() {
|
||||||
|
// First run immediately.
|
||||||
|
s.aggregateDaily()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(opsAggDailyInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
s.aggregateDaily()
|
||||||
|
case <-s.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) aggregateHourly() {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.cfg != nil {
|
||||||
|
if !s.cfg.Ops.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !s.cfg.Ops.Aggregation.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if !s.isMonitoringEnabled(ctx) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if release != nil {
|
||||||
|
defer release()
|
||||||
|
}
|
||||||
|
|
||||||
|
s.hourlyMu.Lock()
|
||||||
|
defer s.hourlyMu.Unlock()
|
||||||
|
|
||||||
|
startedAt := time.Now().UTC()
|
||||||
|
runAt := startedAt
|
||||||
|
|
||||||
|
// Aggregate stable full hours only.
|
||||||
|
end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
|
||||||
|
start := end.Add(-opsAggBackfillWindow)
|
||||||
|
|
||||||
|
// Resume from the latest bucket with overlap.
|
||||||
|
{
|
||||||
|
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||||
|
latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
|
||||||
|
cancelMax()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
|
||||||
|
} else if ok {
|
||||||
|
candidate := latest.Add(-opsAggHourlyOverlap)
|
||||||
|
if candidate.After(start) {
|
||||||
|
start = candidate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start = utcFloorToHour(start)
|
||||||
|
if !start.Before(end) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var aggErr error
|
||||||
|
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
|
||||||
|
chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
|
||||||
|
if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||||
|
aggErr = err
|
||||||
|
log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
finishedAt := time.Now().UTC()
|
||||||
|
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||||
|
dur := durationMs
|
||||||
|
|
||||||
|
if aggErr != nil {
|
||||||
|
msg := truncateString(aggErr.Error(), 2048)
|
||||||
|
errAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggHourlyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &errAt,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
successAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggHourlyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &successAt,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) aggregateDaily() {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.cfg != nil {
|
||||||
|
if !s.cfg.Ops.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !s.cfg.Ops.Aggregation.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if !s.isMonitoringEnabled(ctx) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if release != nil {
|
||||||
|
defer release()
|
||||||
|
}
|
||||||
|
|
||||||
|
s.dailyMu.Lock()
|
||||||
|
defer s.dailyMu.Unlock()
|
||||||
|
|
||||||
|
startedAt := time.Now().UTC()
|
||||||
|
runAt := startedAt
|
||||||
|
|
||||||
|
end := utcFloorToDay(time.Now().UTC())
|
||||||
|
start := end.Add(-opsAggBackfillWindow)
|
||||||
|
|
||||||
|
{
|
||||||
|
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||||
|
latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
|
||||||
|
cancelMax()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
|
||||||
|
} else if ok {
|
||||||
|
candidate := latest.Add(-opsAggDailyOverlap)
|
||||||
|
if candidate.After(start) {
|
||||||
|
start = candidate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start = utcFloorToDay(start)
|
||||||
|
if !start.Before(end) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var aggErr error
|
||||||
|
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
|
||||||
|
chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
|
||||||
|
if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||||
|
aggErr = err
|
||||||
|
log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
finishedAt := time.Now().UTC()
|
||||||
|
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||||
|
dur := durationMs
|
||||||
|
|
||||||
|
if aggErr != nil {
|
||||||
|
msg := truncateString(aggErr.Error(), 2048)
|
||||||
|
errAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggDailyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &errAt,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
successAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggDailyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &successAt,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
if s == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsAggReleaseScript = redis.NewScript(`
|
||||||
|
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||||
|
return redis.call("DEL", KEYS[1])
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
`)
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to become the leader for a scheduled
// aggregation job. It prefers a Redis SetNX lock (multi-instance safe); on a
// Redis error it falls back to a DB advisory lock so a flaky Redis does not
// halt aggregation. It returns a release function (nil when no cleanup is
// needed) and whether the caller now holds the lock.
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
	if s == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Prefer Redis leader lock when available (multi-instance), but avoid stampeding
	// the DB when Redis is flaky by falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				// Another instance holds the lock; log at most once a minute.
				s.maybeLogSkip(logPrefix)
				return nil, false
			}
			release := func() {
				// Fresh short-lived context: release typically runs in a defer,
				// after the caller's ctx may already be cancelled or expired.
				ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
				defer cancel()
				// Compare-and-delete via Lua so we never remove a lock that was
				// re-acquired by another instance after our TTL lapsed.
				_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
			}
			return release, true
		}
		// Redis error: fall through to DB advisory lock.
	}

	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		s.maybeLogSkip(logPrefix)
		return nil, false
	}
	return release, true
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) maybeLogSkip(prefix string) {
|
||||||
|
s.skipLogMu.Lock()
|
||||||
|
defer s.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.skipLogAt = now
|
||||||
|
if prefix == "" {
|
||||||
|
prefix = "[OpsAggregation]"
|
||||||
|
}
|
||||||
|
log.Printf("%s leader lock held by another instance; skipping", prefix)
|
||||||
|
}
|
||||||
|
|
||||||
|
func utcFloorToHour(t time.Time) time.Time {
|
||||||
|
return t.UTC().Truncate(time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
|
func utcFloorToDay(t time.Time) time.Time {
|
||||||
|
u := t.UTC()
|
||||||
|
y, m, d := u.Date()
|
||||||
|
return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
|
||||||
|
}
|
||||||
|
|
||||||
|
func minTime(a, b time.Time) time.Time {
|
||||||
|
if a.Before(b) {
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
913
backend/internal/service/ops_alert_evaluator_service.go
Normal file
913
backend/internal/service/ops_alert_evaluator_service.go
Normal file
@@ -0,0 +1,913 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsAlertEvaluatorJobName identifies this job in heartbeat records.
	opsAlertEvaluatorJobName = "ops_alert_evaluator"

	// opsAlertEvaluatorTimeout bounds a single evaluation pass.
	opsAlertEvaluatorTimeout = 45 * time.Second
	// opsAlertEvaluatorLeaderLockKey is the default Redis leader-lock key,
	// used when runtime settings do not supply one.
	opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
	// opsAlertEvaluatorLeaderLockTTL is the default lock TTL; it exceeds the
	// evaluation timeout so the lock outlives a full pass.
	opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
	// opsAlertEvaluatorSkipLogInterval rate-limits "lock held elsewhere" logs.
	opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
)
|
||||||
|
|
||||||
|
// opsAlertEvaluatorReleaseScript atomically deletes the leader-lock key only
// while it still stores this instance's ID (compare-and-delete), so one
// instance never releases a lock another instance has since acquired.
var opsAlertEvaluatorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// OpsAlertEvaluatorService periodically evaluates configured alert rules
// against live metrics, opens and resolves alert events, and optionally
// sends notification emails. In multi-instance deployments a Redis leader
// lock keeps at most one instance evaluating at a time.
type OpsAlertEvaluatorService struct {
	opsService   *OpsService   // runtime settings, monitoring switch, availability lookups
	opsRepo      OpsRepository // rules, events and heartbeat persistence
	emailService *EmailService // outbound notification channel (may be nil)

	redisClient *redis.Client  // optional; enables the distributed leader lock
	cfg         *config.Config // static config; Ops.Enabled gates the whole loop
	instanceID  string         // unique per process; stored as the lock value

	stopCh    chan struct{} // closed by Stop to end the run loop
	startOnce sync.Once
	stopOnce  sync.Once
	wg        sync.WaitGroup // tracks the background run goroutine

	mu         sync.Mutex                   // guards ruleStates
	ruleStates map[int64]*opsAlertRuleState // per-rule consecutive-breach tracking

	emailLimiter *slidingWindowLimiter // caps alert emails per rolling hour

	skipLogMu sync.Mutex // guards skipLogAt
	skipLogAt time.Time  // last "lock held elsewhere" log, for rate limiting

	warnNoRedisOnce sync.Once // one-shot warning when the lock cannot be used
}
|
||||||
|
|
||||||
|
// opsAlertRuleState tracks per-rule evaluation history between cycles.
type opsAlertRuleState struct {
	LastEvaluatedAt     time.Time // when the rule was last evaluated
	ConsecutiveBreaches int       // breaches in a row; reset on a pass or a gap
}
|
||||||
|
|
||||||
|
func NewOpsAlertEvaluatorService(
|
||||||
|
opsService *OpsService,
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAlertEvaluatorService {
|
||||||
|
return &OpsAlertEvaluatorService{
|
||||||
|
opsService: opsService,
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
emailService: emailService,
|
||||||
|
redisClient: redisClient,
|
||||||
|
cfg: cfg,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
ruleStates: map[int64]*opsAlertRuleState{},
|
||||||
|
emailLimiter: newSlidingWindowLimiter(0, time.Hour),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) Start() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.startOnce.Do(func() {
|
||||||
|
if s.stopCh == nil {
|
||||||
|
s.stopCh = make(chan struct{})
|
||||||
|
}
|
||||||
|
go s.run()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.stopCh != nil {
|
||||||
|
close(s.stopCh)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
s.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// run is the evaluator's background loop. It fires immediately on startup,
// then re-arms the timer with the freshly resolved interval after every pass
// (so runtime-setting changes take effect without a restart), and exits when
// stopCh closes.
func (s *OpsAlertEvaluatorService) run() {
	// NOTE(review): wg.Add runs inside the goroutine; a Stop() racing a
	// just-called Start() could observe wg.Wait() before this Add executes.
	// Confirm Start/Stop are not invoked concurrently, or move the Add into
	// Start before the goroutine launch.
	s.wg.Add(1)
	defer s.wg.Done()

	// Start immediately to produce early feedback in ops dashboard.
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-timer.C:
			// Re-read the interval each cycle; it is a runtime setting.
			interval := s.getInterval()
			s.evaluateOnce(interval)
			timer.Reset(interval)
		case <-s.stopCh:
			return
		}
	}
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
|
||||||
|
// Default.
|
||||||
|
interval := 60 * time.Second
|
||||||
|
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
|
||||||
|
if err != nil || cfg == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds <= 0 {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds < 1 {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluateOnce performs a single alert-evaluation pass: it loads all rules,
// computes each enabled rule's metric over its window, tracks consecutive
// breaches, fires new alert events (respecting per-rule cooldown), resolves
// active events whose condition has cleared, and records a job heartbeat.
// interval is the loop period and is used to translate a rule's
// SustainedMinutes into a required consecutive-breach count.
func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
	if s == nil || s.opsRepo == nil {
		return
	}
	// Static kill switch: ops monitoring disabled in config.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
	defer cancel()

	// Dynamic kill switch: monitoring toggled off at runtime.
	if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}

	// Load runtime settings, falling back to defaults on any failure.
	runtimeCfg := defaultOpsAlertRuntimeSettings()
	if s.opsService != nil {
		if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
			runtimeCfg = loaded
		}
	}

	// Leader election: only one instance evaluates per cycle.
	release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	rules, err := s.opsRepo.ListAlertRules(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
		return
	}

	// Align the window end on a minute boundary so windowed queries are stable.
	now := time.Now().UTC()
	safeEnd := now.Truncate(time.Minute)
	if safeEnd.IsZero() {
		safeEnd = now
	}

	// Latest system snapshot (CPU/memory/queue); errors are tolerated since
	// only some metric types depend on it.
	systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)

	// Cleanup stale state for removed rules.
	s.pruneRuleStates(rules)

	for _, rule := range rules {
		if rule == nil || !rule.Enabled || rule.ID <= 0 {
			continue
		}

		scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)

		windowMinutes := rule.WindowMinutes
		if windowMinutes <= 0 {
			windowMinutes = 1
		}
		windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
		windowEnd := safeEnd

		// A metric that cannot be computed resets the streak rather than
		// counting as a breach or a pass.
		metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
		if !ok {
			s.resetRuleState(rule.ID, now)
			continue
		}

		breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
		required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
		consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)

		activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
		if err != nil {
			log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
			continue
		}

		if breachedNow && consecutive >= required {
			// Already firing: nothing to do.
			if activeEvent != nil {
				continue
			}

			// Cooldown: suppress re-firing too soon after the previous event.
			latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
				continue
			}
			if latestEvent != nil && rule.CooldownMinutes > 0 {
				cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
				if now.Sub(latestEvent.FiredAt) < cooldown {
					continue
				}
			}

			firedEvent := &OpsAlertEvent{
				RuleID:         rule.ID,
				Severity:       strings.TrimSpace(rule.Severity),
				Status:         OpsAlertStatusFiring,
				Title:          fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
				Description:    buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
				MetricValue:    float64Ptr(metricValue),
				ThresholdValue: float64Ptr(rule.Threshold),
				Dimensions:     buildOpsAlertDimensions(scopePlatform, scopeGroupID),
				FiredAt:        now,
				CreatedAt:      now,
			}

			created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
				continue
			}

			if created != nil && created.ID > 0 {
				s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
			}
			continue
		}

		// Not breached: resolve active event if present.
		if activeEvent != nil {
			resolvedAt := now
			if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
				log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
			}
		}
	}

	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
live := map[int64]struct{}{}
|
||||||
|
for _, r := range rules {
|
||||||
|
if r != nil && r.ID > 0 {
|
||||||
|
live[r.ID] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for id := range s.ruleStates {
|
||||||
|
if _, ok := live[id]; !ok {
|
||||||
|
delete(s.ruleStates, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
state, ok := s.ruleStates[ruleID]
|
||||||
|
if !ok {
|
||||||
|
state = &opsAlertRuleState{}
|
||||||
|
s.ruleStates[ruleID] = state
|
||||||
|
}
|
||||||
|
state.LastEvaluatedAt = now
|
||||||
|
state.ConsecutiveBreaches = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateRuleBreaches records the outcome of one evaluation for a rule and
// returns the updated consecutive-breach count. If more than two intervals
// elapsed since the previous evaluation (downtime, lost leadership), the
// streak is reset first so stale history cannot trigger an alert.
func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
	if ruleID <= 0 {
		return 0
	}
	s.mu.Lock()
	defer s.mu.Unlock()

	state, ok := s.ruleStates[ruleID]
	if !ok {
		state = &opsAlertRuleState{}
		s.ruleStates[ruleID] = state
	}

	// Evaluations far apart are not "consecutive": reset the streak.
	if !state.LastEvaluatedAt.IsZero() && interval > 0 {
		if now.Sub(state.LastEvaluatedAt) > interval*2 {
			state.ConsecutiveBreaches = 0
		}
	}

	state.LastEvaluatedAt = now
	if breached {
		state.ConsecutiveBreaches++
	} else {
		state.ConsecutiveBreaches = 0
	}
	return state.ConsecutiveBreaches
}
|
||||||
|
|
||||||
|
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
|
||||||
|
if sustainedMinutes <= 0 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if interval <= 0 {
|
||||||
|
return sustainedMinutes
|
||||||
|
}
|
||||||
|
required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
|
||||||
|
if required < 1 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return required
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseOpsAlertRuleScope extracts the optional "platform" (trimmed string)
// and "group_id" scoping keys from a rule's filter map. group_id may arrive
// as float64 (JSON), int64, int, or a numeric string; only strictly positive
// values produce a non-nil pointer.
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
	if filters == nil {
		return "", nil
	}

	if raw, present := filters["platform"]; present {
		if str, isString := raw.(string); isString {
			platform = strings.TrimSpace(str)
		}
	}

	raw, present := filters["group_id"]
	if !present {
		return platform, nil
	}

	var candidate int64
	switch typed := raw.(type) {
	case float64:
		candidate = int64(typed)
	case int64:
		candidate = typed
	case int:
		candidate = int64(typed)
	case string:
		parsed, parseErr := strconv.ParseInt(strings.TrimSpace(typed), 10, 64)
		if parseErr != nil {
			return platform, nil
		}
		candidate = parsed
	default:
		return platform, nil
	}
	if candidate > 0 {
		groupID = &candidate
	}
	return platform, groupID
}
|
||||||
|
|
||||||
|
// computeRuleMetric resolves the current value of rule.MetricType.
// Point-in-time metrics (CPU/memory/queue depth, account availability) come
// from the latest system snapshot or the live availability view; windowed
// traffic metrics (success/error rates, latency percentiles) come from a raw
// dashboard-overview query over [start, end). The bool result is false when
// the metric is unknown or its inputs are unavailable, which the caller
// treats as "skip this rule" rather than a breach.
func (s *OpsAlertEvaluatorService) computeRuleMetric(
	ctx context.Context,
	rule *OpsAlertRule,
	systemMetrics *OpsSystemMetricsSnapshot,
	start time.Time,
	end time.Time,
	platform string,
	groupID *int64,
) (float64, bool) {
	if rule == nil {
		return 0, false
	}
	switch strings.TrimSpace(rule.MetricType) {
	case "cpu_usage_percent":
		if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
			return *systemMetrics.CPUUsagePercent, true
		}
		return 0, false
	case "memory_usage_percent":
		if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
			return *systemMetrics.MemoryUsagePercent, true
		}
		return 0, false
	case "concurrency_queue_depth":
		if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
			return float64(*systemMetrics.ConcurrencyQueueDepth), true
		}
		return 0, false
	case "group_available_accounts":
		// Group-scoped metrics require an explicit positive group_id filter.
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// A missing group record counts as zero available accounts.
		if availability.Group == nil {
			return 0, true
		}
		return float64(availability.Group.AvailableCount), true
	case "group_available_ratio":
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return computeGroupAvailableRatio(availability.Group), true
	case "account_rate_limited_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})), true
	case "account_error_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// Count accounts in error, excluding those already temporarily
		// unschedulable (they are handled by the scheduler, not alerts).
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})), true
	}

	// Windowed traffic metrics: fetch one raw overview for the window.
	overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
		StartTime: start,
		EndTime:   end,
		Platform:  platform,
		GroupID:   groupID,
		QueryMode: OpsQueryModeRaw,
	})
	if err != nil {
		return 0, false
	}
	if overview == nil {
		return 0, false
	}

	switch strings.TrimSpace(rule.MetricType) {
	case "success_rate":
		// Rates are undefined with no SLA-eligible traffic in the window.
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.SLA * 100, true
	case "error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.ErrorRate * 100, true
	case "upstream_error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.UpstreamErrorRate * 100, true
	case "p95_latency_ms":
		if overview.Duration.P95 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P95), true
	case "p99_latency_ms":
		if overview.Duration.P99 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P99), true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
|
// compareMetric applies a rule's comparison operator (one of >, >=, <, <=,
// ==, !=, whitespace-tolerant) to value and threshold. Unknown operators
// never match.
func compareMetric(value float64, operator string, threshold float64) bool {
	op := strings.TrimSpace(operator)
	switch op {
	case ">":
		return value > threshold
	case "<":
		return value < threshold
	case ">=":
		return value >= threshold
	case "<=":
		return value <= threshold
	case "==":
		return value == threshold
	case "!=":
		return value != threshold
	}
	return false
}
|
||||||
|
|
||||||
|
// buildOpsAlertDimensions assembles the event dimension map from the rule's
// scope. It returns nil (not an empty map) when neither a platform nor a
// positive group ID is present.
func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
	var dims map[string]any
	put := func(key string, value any) {
		if dims == nil {
			dims = map[string]any{}
		}
		dims[key] = value
	}
	if trimmed := strings.TrimSpace(platform); trimmed != "" {
		put("platform", trimmed)
	}
	if groupID != nil && *groupID > 0 {
		put("group_id", *groupID)
	}
	return dims
}
|
||||||
|
|
||||||
|
func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
|
||||||
|
if rule == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
scope := "overall"
|
||||||
|
if strings.TrimSpace(platform) != "" {
|
||||||
|
scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform))
|
||||||
|
}
|
||||||
|
if groupID != nil && *groupID > 0 {
|
||||||
|
scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
|
||||||
|
}
|
||||||
|
if windowMinutes <= 0 {
|
||||||
|
windowMinutes = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
|
||||||
|
strings.TrimSpace(rule.MetricType),
|
||||||
|
strings.TrimSpace(rule.Operator),
|
||||||
|
rule.Threshold,
|
||||||
|
value,
|
||||||
|
windowMinutes,
|
||||||
|
strings.TrimSpace(scope),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// maybeSendAlertEmail delivers a notification email for a freshly created
// alert event, subject to gates evaluated in order: the event not already
// notified, the rule opting in, the global email-notification config
// (enabled, recipients, minimum severity), any active silencing window, and
// a rolling per-hour rate limit. Per-recipient failures are ignored; the
// event is marked email-sent when at least one recipient succeeded.
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
	if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
		return
	}
	if event.EmailSent {
		return
	}
	if !rule.NotifyEmail {
		return
	}

	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
		return
	}

	if len(emailCfg.Alert.Recipients) == 0 {
		return
	}
	if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
		return
	}

	// Silencing windows suppress email delivery but not event creation.
	if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
		if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
			return
		}
	}

	// Apply/update rate limiter.
	s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)

	subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
	body := buildOpsAlertEmailBody(rule, event)

	anySent := false
	for _, to := range emailCfg.Alert.Recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		// One rate-limit slot is consumed per recipient, not per event.
		if !s.emailLimiter.Allow(time.Now().UTC()) {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
		anySent = true
	}

	if anySent {
		// Background context so the mark-sent write survives ctx expiry.
		_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
	}
}
|
||||||
|
|
||||||
|
// buildOpsAlertEmailBody renders the HTML body for an alert notification
// email. Dynamic text passes through htmlEscape; the metric line shows the
// observed value and threshold, preferring the event's snapshot values and
// falling back to the rule's threshold ("-" when no value was captured).
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
	if rule == nil || event == nil {
		return ""
	}
	metric := strings.TrimSpace(rule.MetricType)
	value := "-"
	threshold := fmt.Sprintf("%.2f", rule.Threshold)
	if event.MetricValue != nil {
		value = fmt.Sprintf("%.2f", *event.MetricValue)
	}
	if event.ThresholdValue != nil {
		threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
	}
	return fmt.Sprintf(`
<h2>Ops Alert</h2>
<p><b>Rule</b>: %s</p>
<p><b>Severity</b>: %s</p>
<p><b>Status</b>: %s</p>
<p><b>Metric</b>: %s %s %s</p>
<p><b>Fired at</b>: %s</p>
<p><b>Description</b>: %s</p>
`,
		htmlEscape(rule.Name),
		htmlEscape(rule.Severity),
		htmlEscape(event.Status),
		htmlEscape(metric),
		htmlEscape(rule.Operator),
		htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
		event.FiredAt.Format(time.RFC3339),
		htmlEscape(event.Description),
	)
}
|
||||||
|
|
||||||
|
func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
|
||||||
|
minSeverity = strings.ToLower(strings.TrimSpace(minSeverity))
|
||||||
|
if minSeverity == "" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
eventLevel := opsEmailSeverityForOps(ruleSeverity)
|
||||||
|
minLevel := strings.ToLower(minSeverity)
|
||||||
|
|
||||||
|
rank := func(level string) int {
|
||||||
|
switch level {
|
||||||
|
case "critical":
|
||||||
|
return 3
|
||||||
|
case "warning":
|
||||||
|
return 2
|
||||||
|
case "info":
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rank(eventLevel) >= rank(minLevel)
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsEmailSeverityForOps maps an ops rule severity label onto the email
// severity scale: P0 -> critical, P1 -> warning, anything else -> info.
// Matching is case-insensitive and whitespace-tolerant.
func opsEmailSeverityForOps(severity string) string {
	normalized := strings.ToUpper(strings.TrimSpace(severity))
	if normalized == "P0" {
		return "critical"
	}
	if normalized == "P1" {
		return "warning"
	}
	return "info"
}
|
||||||
|
|
||||||
|
func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
|
||||||
|
if !silencing.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now().UTC()
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
|
||||||
|
if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
|
||||||
|
if now.Before(t) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range silencing.Entries {
|
||||||
|
untilRaw := strings.TrimSpace(entry.UntilRFC3339)
|
||||||
|
if untilRaw == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
until, err := time.Parse(time.RFC3339, untilRaw)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.After(until) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(entry.Severities) > 0 {
|
||||||
|
match := false
|
||||||
|
for _, s := range entry.Severities {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
|
||||||
|
match = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
|
||||||
|
if !lock.Enabled {
|
||||||
|
return nil, true
|
||||||
|
}
|
||||||
|
if s.redisClient == nil {
|
||||||
|
s.warnNoRedisOnce.Do(func() {
|
||||||
|
log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
|
||||||
|
})
|
||||||
|
return nil, true
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(lock.Key)
|
||||||
|
if key == "" {
|
||||||
|
key = opsAlertEvaluatorLeaderLockKey
|
||||||
|
}
|
||||||
|
ttl := time.Duration(lock.TTLSeconds) * time.Second
|
||||||
|
if ttl <= 0 {
|
||||||
|
ttl = opsAlertEvaluatorLeaderLockTTL
|
||||||
|
}
|
||||||
|
|
||||||
|
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
|
||||||
|
if err != nil {
|
||||||
|
// Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
|
||||||
|
// Single-node deployments can disable the distributed lock via runtime settings.
|
||||||
|
s.warnNoRedisOnce.Do(func() {
|
||||||
|
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
|
||||||
|
})
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
s.maybeLogSkip(key)
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
return func() {
|
||||||
|
_, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
|
||||||
|
}, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
|
||||||
|
s.skipLogMu.Lock()
|
||||||
|
defer s.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.skipLogAt = now
|
||||||
|
log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAlertEvaluatorJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &now,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||||
|
if s == nil || s.opsRepo == nil || err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
msg := truncateString(err.Error(), 2048)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAlertEvaluatorJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &now,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// htmlEscape replaces the five HTML-significant characters with their entity
// references so user-controlled text can be embedded safely in the alert
// email body. The previous version mapped each character to itself (a no-op
// replacer, apparently the result of entity-unescaping corruption), which
// left the generated HTML open to markup injection.
func htmlEscape(s string) string {
	replacer := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
		"'", "&#39;",
	)
	return replacer.Replace(s)
}
|
||||||
|
|
||||||
|
type slidingWindowLimiter struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
limit int
|
||||||
|
window time.Duration
|
||||||
|
sent []time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
|
||||||
|
if window <= 0 {
|
||||||
|
window = time.Hour
|
||||||
|
}
|
||||||
|
return &slidingWindowLimiter{
|
||||||
|
limit: limit,
|
||||||
|
window: window,
|
||||||
|
sent: []time.Time{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *slidingWindowLimiter) SetLimit(limit int) {
|
||||||
|
l.mu.Lock()
|
||||||
|
defer l.mu.Unlock()
|
||||||
|
l.limit = limit
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
|
||||||
|
l.mu.Lock()
|
||||||
|
defer l.mu.Unlock()
|
||||||
|
|
||||||
|
if l.limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
cutoff := now.Add(-l.window)
|
||||||
|
keep := l.sent[:0]
|
||||||
|
for _, t := range l.sent {
|
||||||
|
if t.After(cutoff) {
|
||||||
|
keep = append(keep, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.sent = keep
|
||||||
|
if len(l.sent) >= l.limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
l.sent = append(l.sent, now)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeGroupAvailableRatio returns the available percentage for a group.
|
||||||
|
// Formula: (AvailableCount / TotalAccounts) * 100.
|
||||||
|
// Returns 0 when TotalAccounts is 0.
|
||||||
|
func computeGroupAvailableRatio(group *GroupAvailability) float64 {
|
||||||
|
if group == nil || group.TotalAccounts <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// countAccountsByCondition counts accounts that satisfy the given condition.
|
||||||
|
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
|
||||||
|
if len(accounts) == 0 || condition == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
var count int64
|
||||||
|
for _, account := range accounts {
|
||||||
|
if account != nil && condition(account) {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count
|
||||||
|
}
|
||||||
210
backend/internal/service/ops_alert_evaluator_service_test.go
Normal file
210
backend/internal/service/ops_alert_evaluator_service_test.go
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
//go:build unit
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
// stubOpsRepo is a test double for OpsRepository. It embeds the interface so
// only GetDashboardOverview needs overriding (calling any other method on the
// nil embedded interface would panic, which is fine for these tests).
type stubOpsRepo struct {
	OpsRepository
	overview *OpsDashboardOverview // returned when non-nil
	err      error                 // returned when non-nil; takes priority over overview
}
||||||
|
func (s *stubOpsRepo) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
|
||||||
|
if s.err != nil {
|
||||||
|
return nil, s.err
|
||||||
|
}
|
||||||
|
if s.overview != nil {
|
||||||
|
return s.overview, nil
|
||||||
|
}
|
||||||
|
return &OpsDashboardOverview{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestComputeGroupAvailableRatio covers the ratio formula plus its
// zero-total and zero-available edge cases.
func TestComputeGroupAvailableRatio(t *testing.T) {
	t.Parallel()

	// 10 accounts with 8 available => 80%.
	t.Run("正常情况: 10个账号, 8个可用 = 80%", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 8,
		})
		require.InDelta(t, 80.0, got, 0.0001)
	})

	// Zero total must not divide by zero; the result is defined as 0.
	t.Run("边界情况: TotalAccounts = 0 应返回 0", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  0,
			AvailableCount: 8,
		})
		require.Equal(t, 0.0, got)
	})

	// No available accounts yields 0%.
	t.Run("边界情况: AvailableCount = 0 应返回 0%", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 0,
		})
		require.Equal(t, 0.0, got)
	})
}
|
||||||
|
// TestCountAccountsByCondition verifies predicate-based counting over the
// account availability map, including the empty-map edge case.
func TestCountAccountsByCondition(t *testing.T) {
	t.Parallel()

	// Count rate-limited accounts.
	t.Run("测试限流账号统计: acc.IsRateLimited", func(t *testing.T) {
		t.Parallel()

		accounts := map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: false},
			3: {IsRateLimited: true},
		}

		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(2), got)
	})

	// Count errored accounts, excluding temporarily unschedulable ones.
	t.Run("测试错误账号统计(排除临时不可调度): acc.HasError && acc.TempUnschedulableUntil == nil", func(t *testing.T) {
		t.Parallel()

		// Account 2 has an error but is temporarily unschedulable, so the
		// predicate below must exclude it.
		until := time.Now().UTC().Add(5 * time.Minute)
		accounts := map[int64]*AccountAvailability{
			1: {HasError: true},
			2: {HasError: true, TempUnschedulableUntil: &until},
			3: {HasError: false},
		}

		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})
		require.Equal(t, int64(1), got)
	})

	// Empty input: nothing to count.
	t.Run("边界情况: 空 map 应返回 0", func(t *testing.T) {
		t.Parallel()

		got := countAccountsByCondition(map[int64]*AccountAvailability{}, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(0), got)
	})
}
|
||||||
|
// TestComputeRuleMetricNewIndicators exercises computeRuleMetric for the
// availability-derived metric types, including the requirement that the
// group_* metrics are only computable when a group_id filter is present.
func TestComputeRuleMetricNewIndicators(t *testing.T) {
	t.Parallel()

	groupID := int64(101)
	platform := "openai"

	// Fixture: 8/10 accounts available in the group. In the account map two
	// are rate limited and two have errors, one of which is temporarily
	// unschedulable and so excluded from account_error_count.
	availability := &OpsAccountAvailability{
		Group: &GroupAvailability{
			GroupID:        groupID,
			TotalAccounts:  10,
			AvailableCount: 8,
		},
		Accounts: map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: true},
			3: {HasError: true},
			4: {HasError: true, TempUnschedulableUntil: timePtr(time.Now().UTC().Add(2 * time.Minute))},
			5: {HasError: false, IsRateLimited: false},
		},
	}

	// Inject a stubbed availability lookup so no repository is needed.
	opsService := &OpsService{
		getAccountAvailability: func(_ context.Context, _ string, _ *int64) (*OpsAccountAvailability, error) {
			return availability, nil
		},
	}

	svc := &OpsAlertEvaluatorService{
		opsService: opsService,
		opsRepo:    &stubOpsRepo{overview: &OpsDashboardOverview{}},
	}

	start := time.Now().UTC().Add(-5 * time.Minute)
	end := time.Now().UTC()
	ctx := context.Background()

	tests := []struct {
		name       string
		metricType string
		groupID    *int64
		wantValue  float64
		wantOK     bool
	}{
		{
			name:       "group_available_accounts",
			metricType: "group_available_accounts",
			groupID:    &groupID,
			wantValue:  8,
			wantOK:     true,
		},
		{
			name:       "group_available_ratio",
			metricType: "group_available_ratio",
			groupID:    &groupID,
			wantValue:  80.0,
			wantOK:     true,
		},
		{
			name:       "account_rate_limited_count",
			metricType: "account_rate_limited_count",
			groupID:    nil,
			wantValue:  2,
			wantOK:     true,
		},
		{
			name:       "account_error_count",
			metricType: "account_error_count",
			groupID:    nil,
			wantValue:  1,
			wantOK:     true,
		},
		{
			name:       "group_available_accounts without group_id returns false",
			metricType: "group_available_accounts",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
		{
			name:       "group_available_ratio without group_id returns false",
			metricType: "group_available_ratio",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
	}

	for _, tt := range tests {
		tt := tt // capture range variable for parallel subtests (pre-Go 1.22 semantics)
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			rule := &OpsAlertRule{
				MetricType: tt.metricType,
			}
			gotValue, gotOK := svc.computeRuleMetric(ctx, rule, nil, start, end, platform, tt.groupID)
			require.Equal(t, tt.wantOK, gotOK)
			if !tt.wantOK {
				// Metric not computable (e.g. missing group filter): value is unspecified.
				return
			}
			require.InDelta(t, tt.wantValue, gotValue, 0.0001)
		})
	}
}
||||||
74
backend/internal/service/ops_alert_models.go
Normal file
74
backend/internal/service/ops_alert_models.go
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Ops alert rule/event models.
|
||||||
|
//
|
||||||
|
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
|
||||||
|
// with the existing ops dashboard frontend (backup style).
|
||||||
|
|
||||||
|
// Lifecycle states for an OpsAlertEvent.
const (
	OpsAlertStatusFiring   = "firing"
	OpsAlertStatusResolved = "resolved"
)
|
|
||||||
|
// OpsAlertRule is the admin-facing DTO for a configurable alert rule
// (metric <operator> threshold, evaluated over a time window).
type OpsAlertRule struct {
	ID          int64  `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`

	Enabled  bool   `json:"enabled"`
	Severity string `json:"severity"`

	// Condition: MetricType compared against Threshold using Operator.
	MetricType string  `json:"metric_type"`
	Operator   string  `json:"operator"`
	Threshold  float64 `json:"threshold"`

	WindowMinutes    int `json:"window_minutes"`
	SustainedMinutes int `json:"sustained_minutes"`
	CooldownMinutes  int `json:"cooldown_minutes"`

	NotifyEmail bool `json:"notify_email"`

	// Filters holds optional dimension filters (free-form key/value).
	Filters map[string]any `json:"filters,omitempty"`

	LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
	CreatedAt       time.Time  `json:"created_at"`
	UpdatedAt       time.Time  `json:"updated_at"`
}
|
|
||||||
|
// OpsAlertEvent records one firing (and possibly resolution) of an alert rule.
type OpsAlertEvent struct {
	ID       int64  `json:"id"`
	RuleID   int64  `json:"rule_id"`
	Severity string `json:"severity"`
	// Status uses the OpsAlertStatus* constants (firing / resolved).
	Status string `json:"status"`

	Title       string `json:"title"`
	Description string `json:"description"`

	MetricValue    *float64 `json:"metric_value,omitempty"`
	ThresholdValue *float64 `json:"threshold_value,omitempty"`

	// Dimensions holds free-form context (e.g. platform / group) — schema is
	// producer-defined.
	Dimensions map[string]any `json:"dimensions,omitempty"`

	FiredAt    time.Time  `json:"fired_at"`
	ResolvedAt *time.Time `json:"resolved_at,omitempty"`

	EmailSent bool      `json:"email_sent"`
	CreatedAt time.Time `json:"created_at"`
}
|
|
||||||
|
// OpsAlertEventFilter narrows alert-event listings. Zero values mean
// "no filtering" for the corresponding field.
type OpsAlertEventFilter struct {
	Limit int // max rows to return

	// Optional filters.
	Status   string
	Severity string

	StartTime *time.Time
	EndTime   *time.Time

	// Dimensions filters (best-effort).
	Platform string
	GroupID  *int64
}
||||||
162
backend/internal/service/ops_alerts.go
Normal file
162
backend/internal/service/ops_alerts.go
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return []*OpsAlertRule{}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListAlertRules(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if rule == nil {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
|
||||||
|
}
|
||||||
|
|
||||||
|
created, err := s.opsRepo.CreateAlertRule(ctx, rule)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return created, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if rule == nil || rule.ID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return updated, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if id <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return []*OpsAlertEvent{}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListAlertEvents(ctx, filter)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if event == nil {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
|
||||||
|
}
|
||||||
|
|
||||||
|
created, err := s.opsRepo.CreateAlertEvent(ctx, event)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return created, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(status) == "" {
|
||||||
|
return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
|
||||||
|
}
|
||||||
|
return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
|
||||||
|
}
|
||||||
365
backend/internal/service/ops_cleanup_service.go
Normal file
365
backend/internal/service/ops_cleanup_service.go
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/robfig/cron/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsCleanupJobName is the heartbeat job name recorded for cleanup runs.
	opsCleanupJobName = "ops_cleanup"

	// Redis leader-lock key and TTL used so only one node runs cleanup.
	opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
	opsCleanupLeaderLockTTLDefault = 30 * time.Minute
)
|
|
||||||
|
// opsCleanupCronParser accepts standard 5-field cron specs
// (minute hour dom month dow).
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsCleanupReleaseScript atomically deletes the leader-lock key only when it
// is still owned by this instance (compare-and-delete).
var opsCleanupReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
|
||||||
|
// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
//
// - Scheduling: 5-field cron spec (minute hour dom month dow).
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
// - Safety: deletes in batches to avoid long transactions.
type OpsCleanupService struct {
	opsRepo     OpsRepository  // used for job heartbeat recording
	db          *sql.DB        // raw handle used for batched DELETEs
	redisClient *redis.Client  // optional; enables the cross-instance leader lock
	cfg         *config.Config // retention / schedule / run-mode settings

	// instanceID is the unique owner token written into the Redis leader lock.
	instanceID string

	cron *cron.Cron

	// Guard against repeated Start/Stop calls.
	startOnce sync.Once
	stopOnce  sync.Once

	// Emit the Redis-fallback warning only once per process.
	warnNoRedisOnce sync.Once
}
||||||
|
|
||||||
|
func NewOpsCleanupService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsCleanupService {
|
||||||
|
return &OpsCleanupService{
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
db: db,
|
||||||
|
redisClient: redisClient,
|
||||||
|
cfg: cfg,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start schedules the cleanup job according to the configured cron spec.
// It is a no-op when ops monitoring or cleanup is disabled, or when required
// dependencies are missing. Safe to call repeatedly (guarded by sync.Once).
func (s *OpsCleanupService) Start() {
	if s == nil {
		return
	}
	// Ops monitoring disabled entirely: nothing to schedule (and no log —
	// the whole subsystem is off).
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
		log.Printf("[OpsCleanup] not started (disabled)")
		return
	}
	if s.opsRepo == nil || s.db == nil {
		log.Printf("[OpsCleanup] not started (missing deps)")
		return
	}

	s.startOnce.Do(func() {
		// Default: daily at 02:00.
		schedule := "0 2 * * *"
		if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
			schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
		}

		// Resolve the cron timezone; fall back to time.Local when the
		// configured zone is empty or fails to load.
		loc := time.Local
		if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
			if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
				loc = parsed
			}
		}

		c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
		_, err := c.AddFunc(schedule, func() { s.runScheduled() })
		if err != nil {
			// Invalid spec: log and leave the service un-started.
			log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
			return
		}
		s.cron = c
		s.cron.Start()
		log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
	})
}
||||||
|
|
||||||
|
func (s *OpsCleanupService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.cron != nil {
|
||||||
|
ctx := s.cron.Stop()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(3 * time.Second):
|
||||||
|
log.Printf("[OpsCleanup] cron stop timed out")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// runScheduled is the cron entry point: it acquires the cross-instance leader
// lock, runs one cleanup pass with a 30-minute budget, and records a job
// heartbeat with the outcome.
func (s *OpsCleanupService) runScheduled() {
	if s == nil || s.db == nil || s.opsRepo == nil {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()

	// Only one node should run cleanup; non-leaders silently skip this tick.
	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	// release is nil in simple run mode (no lock taken).
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	counts, err := s.runCleanupOnce(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		log.Printf("[OpsCleanup] cleanup failed: %v", err)
		return
	}
	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
	log.Printf("[OpsCleanup] cleanup complete: %s", counts)
}
||||||
|
|
||||||
|
// opsCleanupDeletedCounts tallies rows deleted per ops table in one cleanup run.
type opsCleanupDeletedCounts struct {
	errorLogs     int64
	retryAttempts int64
	alertEvents   int64
	systemMetrics int64
	hourlyPreagg  int64
	dailyPreagg   int64
}

// String renders the tallies as a single space-separated key=value line,
// suitable for the cleanup completion log.
func (c opsCleanupDeletedCounts) String() string {
	return fmt.Sprintf(
		"error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
		c.errorLogs, c.retryAttempts, c.alertEvents, c.systemMetrics, c.hourlyPreagg, c.dailyPreagg,
	)
}
||||||
|
|
||||||
|
// runCleanupOnce performs a single retention pass over all ops tables, driven
// by the configured per-category retention windows. A retention of <= 0 days
// disables that category. Rows are deleted in batches of 5000 to keep each
// transaction short; processing stops at the first error, returning the
// counts accumulated so far.
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
	out := opsCleanupDeletedCounts{}
	if s == nil || s.db == nil || s.cfg == nil {
		return out, nil
	}

	batchSize := 5000

	now := time.Now().UTC()

	// Error-like tables: error logs / retry attempts / alert events.
	if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.errorLogs = n

		n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.retryAttempts = n

		n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.alertEvents = n
	}

	// Minute-level metrics snapshots.
	if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.systemMetrics = n
	}

	// Pre-aggregation tables (hourly/daily).
	// NOTE(review): the daily table shares HourlyMetricsRetentionDays — confirm
	// there is intentionally no separate daily retention setting.
	if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.hourlyPreagg = n

		// bucket_date is compared as a DATE, hence the cast flag.
		n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
		if err != nil {
			return out, err
		}
		out.dailyPreagg = n
	}

	return out, nil
}
||||||
|
|
||||||
|
// deleteOldRowsByID deletes rows from table whose timeColumn is older than
// cutoff, in id-ordered batches of batchSize (defaulting to 5000) to avoid
// long-running transactions. When castCutoffToDate is true the cutoff bind is
// compared as a Postgres DATE (for date-typed bucket columns). Missing tables
// (partial deployments) are treated as a no-op. Returns the total number of
// rows deleted, which is best-effort accurate even on error.
func deleteOldRowsByID(
	ctx context.Context,
	db *sql.DB,
	table string,
	timeColumn string,
	cutoff time.Time,
	batchSize int,
	castCutoffToDate bool,
) (int64, error) {
	if db == nil {
		return 0, nil
	}
	if batchSize <= 0 {
		batchSize = 5000
	}

	where := fmt.Sprintf("%s < $1", timeColumn)
	if castCutoffToDate {
		where = fmt.Sprintf("%s < $1::date", timeColumn)
	}

	// table/timeColumn are compile-time constants supplied by runCleanupOnce,
	// never user input, so building SQL with Sprintf is safe here.
	q := fmt.Sprintf(`
WITH batch AS (
SELECT id FROM %s
WHERE %s
ORDER BY id
LIMIT $2
)
DELETE FROM %s
WHERE id IN (SELECT id FROM batch)
`, table, where, table)

	var total int64
	for {
		res, err := db.ExecContext(ctx, q, cutoff, batchSize)
		if err != nil {
			// If ops tables aren't present yet (partial deployments), treat as no-op.
			// Lowercase once instead of per Contains call.
			msg := strings.ToLower(err.Error())
			if strings.Contains(msg, "does not exist") && strings.Contains(msg, "relation") {
				return total, nil
			}
			return total, err
		}
		affected, err := res.RowsAffected()
		if err != nil {
			return total, err
		}
		total += affected
		// An empty batch means nothing older than cutoff remains.
		if affected == 0 {
			break
		}
	}
	return total, nil
}
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to make this instance the single cleanup
// leader. It returns (release, true) on success; release is nil in simple run
// mode where no lock is taken. Preference order: Redis SETNX keyed by
// instanceID, falling back to a DB advisory lock when Redis is absent or errors.
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil {
		return nil, false
	}
	// In simple run mode, assume single instance.
	if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
		return nil, true
	}

	key := opsCleanupLeaderLockKeyDefault
	ttl := opsCleanupLeaderLockTTLDefault

	// Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
	// falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				// Another instance already holds the lock.
				return nil, false
			}
			// Release via compare-and-delete so we never drop a lock a later
			// leader has re-acquired.
			return func() {
				_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
			}, true
		}
		// Redis error: fall back to DB advisory lock.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
		})
	} else {
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
		})
	}

	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		return nil, false
	}
	return release, true
}
||||||
|
|
||||||
|
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsCleanupJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &now,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||||
|
if s == nil || s.opsRepo == nil || err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
msg := truncateString(err.Error(), 2048)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsCleanupJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &now,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
257
backend/internal/service/ops_concurrency.go
Normal file
257
backend/internal/service/ops_concurrency.go
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsAccountsPageSize is the repository page size used when listing all accounts.
	opsAccountsPageSize = 100
	// opsConcurrencyBatchChunkSize bounds each load-batch call to the concurrency service.
	opsConcurrencyBatchChunkSize = 200
)
||||||
|
|
||||||
|
func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
|
||||||
|
if s == nil || s.accountRepo == nil {
|
||||||
|
return []Account{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]Account, 0, 128)
|
||||||
|
page := 1
|
||||||
|
for {
|
||||||
|
accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
|
||||||
|
Page: page,
|
||||||
|
PageSize: opsAccountsPageSize,
|
||||||
|
}, platformFilter, "", "", "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
out = append(out, accounts...)
|
||||||
|
if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(accounts) < opsAccountsPageSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
page++
|
||||||
|
if page > 10_000 {
|
||||||
|
log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
|
||||||
|
if s == nil || s.concurrencyService == nil {
|
||||||
|
return map[int64]*AccountLoadInfo{}
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
return map[int64]*AccountLoadInfo{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
|
||||||
|
unique := make(map[int64]int, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev {
|
||||||
|
unique[acc.ID] = acc.Concurrency
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := make([]AccountWithConcurrency, 0, len(unique))
|
||||||
|
for id, maxConc := range unique {
|
||||||
|
batch = append(batch, AccountWithConcurrency{
|
||||||
|
ID: id,
|
||||||
|
MaxConcurrency: maxConc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make(map[int64]*AccountLoadInfo, len(batch))
|
||||||
|
for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize {
|
||||||
|
end := i + opsConcurrencyBatchChunkSize
|
||||||
|
if end > len(batch) {
|
||||||
|
end = len(batch)
|
||||||
|
}
|
||||||
|
part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end])
|
||||||
|
if err != nil {
|
||||||
|
// Best-effort: return zeros rather than failing the ops UI.
|
||||||
|
log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for k, v := range part {
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
|
||||||
|
//
|
||||||
|
// Optional filters:
|
||||||
|
// - platformFilter: only include accounts in that platform (best-effort reduces DB load)
|
||||||
|
// - groupIDFilter: only include accounts that belong to that group
|
||||||
|
func (s *OpsService) GetConcurrencyStats(
|
||||||
|
ctx context.Context,
|
||||||
|
platformFilter string,
|
||||||
|
groupIDFilter *int64,
|
||||||
|
) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
collectedAt := time.Now()
|
||||||
|
loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
|
||||||
|
|
||||||
|
platform := make(map[string]*PlatformConcurrencyInfo)
|
||||||
|
group := make(map[int64]*GroupConcurrencyInfo)
|
||||||
|
account := make(map[int64]*AccountConcurrencyInfo)
|
||||||
|
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
var matchedGroup *Group
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if grp.ID == *groupIDFilter {
|
||||||
|
matchedGroup = grp
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Group filter provided: skip accounts not in that group.
|
||||||
|
if matchedGroup == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
load := loadMap[acc.ID]
|
||||||
|
currentInUse := int64(0)
|
||||||
|
waiting := int64(0)
|
||||||
|
if load != nil {
|
||||||
|
currentInUse = int64(load.CurrentConcurrency)
|
||||||
|
waiting = int64(load.WaitingCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Account-level view picks one display group (the first group).
|
||||||
|
displayGroupID := int64(0)
|
||||||
|
displayGroupName := ""
|
||||||
|
if matchedGroup != nil {
|
||||||
|
displayGroupID = matchedGroup.ID
|
||||||
|
displayGroupName = matchedGroup.Name
|
||||||
|
} else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
|
||||||
|
displayGroupID = acc.Groups[0].ID
|
||||||
|
displayGroupName = acc.Groups[0].Name
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := account[acc.ID]; !ok {
|
||||||
|
info := &AccountConcurrencyInfo{
|
||||||
|
AccountID: acc.ID,
|
||||||
|
AccountName: acc.Name,
|
||||||
|
Platform: acc.Platform,
|
||||||
|
GroupID: displayGroupID,
|
||||||
|
GroupName: displayGroupName,
|
||||||
|
CurrentInUse: currentInUse,
|
||||||
|
MaxCapacity: int64(acc.Concurrency),
|
||||||
|
WaitingInQueue: waiting,
|
||||||
|
}
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
account[acc.ID] = info
|
||||||
|
}
|
||||||
|
|
||||||
|
// Platform aggregation.
|
||||||
|
if acc.Platform != "" {
|
||||||
|
if _, ok := platform[acc.Platform]; !ok {
|
||||||
|
platform[acc.Platform] = &PlatformConcurrencyInfo{
|
||||||
|
Platform: acc.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p := platform[acc.Platform]
|
||||||
|
p.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
p.CurrentInUse += currentInUse
|
||||||
|
p.WaitingInQueue += waiting
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group aggregation (one account may contribute to multiple groups).
|
||||||
|
if matchedGroup != nil {
|
||||||
|
grp := matchedGroup
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupConcurrencyInfo{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
if g.GroupName == "" && grp.Name != "" {
|
||||||
|
g.GroupName = grp.Name
|
||||||
|
}
|
||||||
|
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||||
|
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||||
|
g.Platform = ""
|
||||||
|
}
|
||||||
|
g.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
g.CurrentInUse += currentInUse
|
||||||
|
g.WaitingInQueue += waiting
|
||||||
|
} else {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupConcurrencyInfo{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
if g.GroupName == "" && grp.Name != "" {
|
||||||
|
g.GroupName = grp.Name
|
||||||
|
}
|
||||||
|
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||||
|
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||||
|
g.Platform = ""
|
||||||
|
}
|
||||||
|
g.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
g.CurrentInUse += currentInUse
|
||||||
|
g.WaitingInQueue += waiting
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, info := range platform {
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, info := range group {
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return platform, group, account, &collectedAt, nil
|
||||||
|
}
|
||||||
90
backend/internal/service/ops_dashboard.go
Normal file
90
backend/internal/service/ops_dashboard.go
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve query mode (requested via query param, or DB default).
|
||||||
|
filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
|
||||||
|
|
||||||
|
overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
|
||||||
|
return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
|
||||||
|
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
|
||||||
|
// Attach config-derived limits so the UI can show "current / max" for connection pools.
|
||||||
|
// These are best-effort and should never block the dashboard rendering.
|
||||||
|
if s != nil && s.cfg != nil {
|
||||||
|
if s.cfg.Database.MaxOpenConns > 0 {
|
||||||
|
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
|
||||||
|
}
|
||||||
|
if s.cfg.Redis.PoolSize > 0 {
|
||||||
|
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
overview.SystemMetrics = metrics
|
||||||
|
} else if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||||
|
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
|
||||||
|
overview.JobHeartbeats = heartbeats
|
||||||
|
} else {
|
||||||
|
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
|
||||||
|
|
||||||
|
return overview, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
|
||||||
|
if requested.IsValid() {
|
||||||
|
// Allow "auto" to be disabled via config until preagg is proven stable in production.
|
||||||
|
// Forced `preagg` via query param still works.
|
||||||
|
if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
|
||||||
|
return OpsQueryModeRaw
|
||||||
|
}
|
||||||
|
return requested
|
||||||
|
}
|
||||||
|
|
||||||
|
mode := OpsQueryModeAuto
|
||||||
|
if s != nil && s.settingRepo != nil {
|
||||||
|
if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
|
||||||
|
mode = ParseOpsQueryMode(raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
|
||||||
|
return OpsQueryModeRaw
|
||||||
|
}
|
||||||
|
return mode
|
||||||
|
}
|
||||||
87
backend/internal/service/ops_dashboard_models.go
Normal file
87
backend/internal/service/ops_dashboard_models.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
type OpsDashboardFilter struct {
|
||||||
|
StartTime time.Time
|
||||||
|
EndTime time.Time
|
||||||
|
|
||||||
|
Platform string
|
||||||
|
GroupID *int64
|
||||||
|
|
||||||
|
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
|
||||||
|
// Expected values: auto/raw/preagg (see OpsQueryMode).
|
||||||
|
QueryMode OpsQueryMode
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsRateSummary struct {
|
||||||
|
Current float64 `json:"current"`
|
||||||
|
Peak float64 `json:"peak"`
|
||||||
|
Avg float64 `json:"avg"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsPercentiles struct {
|
||||||
|
P50 *int `json:"p50_ms"`
|
||||||
|
P90 *int `json:"p90_ms"`
|
||||||
|
P95 *int `json:"p95_ms"`
|
||||||
|
P99 *int `json:"p99_ms"`
|
||||||
|
Avg *int `json:"avg_ms"`
|
||||||
|
Max *int `json:"max_ms"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsDashboardOverview struct {
|
||||||
|
StartTime time.Time `json:"start_time"`
|
||||||
|
EndTime time.Time `json:"end_time"`
|
||||||
|
Platform string `json:"platform"`
|
||||||
|
GroupID *int64 `json:"group_id"`
|
||||||
|
|
||||||
|
// HealthScore is a backend-computed overall health score (0-100).
|
||||||
|
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
|
||||||
|
HealthScore int `json:"health_score"`
|
||||||
|
|
||||||
|
// Latest system-level snapshot (window=1m, global).
|
||||||
|
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
|
||||||
|
|
||||||
|
// Background jobs health (heartbeats).
|
||||||
|
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
|
||||||
|
|
||||||
|
SuccessCount int64 `json:"success_count"`
|
||||||
|
ErrorCountTotal int64 `json:"error_count_total"`
|
||||||
|
BusinessLimitedCount int64 `json:"business_limited_count"`
|
||||||
|
|
||||||
|
ErrorCountSLA int64 `json:"error_count_sla"`
|
||||||
|
RequestCountTotal int64 `json:"request_count_total"`
|
||||||
|
RequestCountSLA int64 `json:"request_count_sla"`
|
||||||
|
|
||||||
|
TokenConsumed int64 `json:"token_consumed"`
|
||||||
|
|
||||||
|
SLA float64 `json:"sla"`
|
||||||
|
ErrorRate float64 `json:"error_rate"`
|
||||||
|
UpstreamErrorRate float64 `json:"upstream_error_rate"`
|
||||||
|
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
|
||||||
|
Upstream429Count int64 `json:"upstream_429_count"`
|
||||||
|
Upstream529Count int64 `json:"upstream_529_count"`
|
||||||
|
|
||||||
|
QPS OpsRateSummary `json:"qps"`
|
||||||
|
TPS OpsRateSummary `json:"tps"`
|
||||||
|
|
||||||
|
Duration OpsPercentiles `json:"duration"`
|
||||||
|
TTFT OpsPercentiles `json:"ttft"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsLatencyHistogramBucket is one labeled bucket of the latency histogram.
type OpsLatencyHistogramBucket struct {
	Range string `json:"range"`
	Count int64  `json:"count"`
}

// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
// It is used by the Ops dashboard to quickly identify tail latency regressions.
type OpsLatencyHistogramResponse struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`
	Platform  string    `json:"platform"`
	GroupID   *int64    `json:"group_id"`

	TotalRequests int64                        `json:"total_requests"`
	Buckets       []*OpsLatencyHistogramBucket `json:"buckets"`
}
|
||||||
45
backend/internal/service/ops_errors.go
Normal file
45
backend/internal/service/ops_errors.go
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetErrorDistribution(ctx, filter)
|
||||||
|
}
|
||||||
154
backend/internal/service/ops_health_score.go
Normal file
154
backend/internal/service/ops_health_score.go
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
|
||||||
|
//
|
||||||
|
// Design goals:
|
||||||
|
// - Backend-owned scoring (UI only displays).
|
||||||
|
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
|
||||||
|
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
|
||||||
|
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
|
||||||
|
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
|
||||||
|
if overview == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
|
||||||
|
// UI can still render a gray/idle state based on QPS + error rate.
|
||||||
|
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
|
||||||
|
return 100
|
||||||
|
}
|
||||||
|
|
||||||
|
businessHealth := computeBusinessHealth(overview)
|
||||||
|
infraHealth := computeInfraHealth(now, overview)
|
||||||
|
|
||||||
|
// Weighted combination: 70% business + 30% infrastructure
|
||||||
|
score := businessHealth*0.7 + infraHealth*0.3
|
||||||
|
return int(math.Round(clampFloat64(score, 0, 100)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeBusinessHealth calculates business health score (0-100)
|
||||||
|
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
|
||||||
|
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||||
|
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
||||||
|
slaScore := 100.0
|
||||||
|
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
||||||
|
if slaPct < 99.5 {
|
||||||
|
if slaPct >= 95 {
|
||||||
|
slaScore = (slaPct - 95) / 4.5 * 100
|
||||||
|
} else {
|
||||||
|
slaScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error rate score: 0.5% → 100, 5% → 0 (linear)
|
||||||
|
// Combines request errors and upstream errors
|
||||||
|
errorScore := 100.0
|
||||||
|
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
||||||
|
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
||||||
|
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
||||||
|
if combinedErrorPct > 0.5 {
|
||||||
|
if combinedErrorPct <= 5 {
|
||||||
|
errorScore = (5 - combinedErrorPct) / 4.5 * 100
|
||||||
|
} else {
|
||||||
|
errorScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Latency score: 1s → 100, 10s → 0 (linear)
|
||||||
|
// Uses P99 of duration (TTFT is less critical for overall health)
|
||||||
|
latencyScore := 100.0
|
||||||
|
if overview.Duration.P99 != nil {
|
||||||
|
p99 := float64(*overview.Duration.P99)
|
||||||
|
if p99 > 1000 {
|
||||||
|
if p99 <= 10000 {
|
||||||
|
latencyScore = (10000 - p99) / 9000 * 100
|
||||||
|
} else {
|
||||||
|
latencyScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weighted combination
|
||||||
|
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||||
|
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
|
||||||
|
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
|
||||||
|
// Storage score: DB critical, Redis less critical
|
||||||
|
storageScore := 100.0
|
||||||
|
if overview.SystemMetrics != nil {
|
||||||
|
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
|
||||||
|
storageScore = 0 // DB failure is critical
|
||||||
|
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
|
||||||
|
storageScore = 50 // Redis failure is degraded but not critical
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute resources score: CPU + Memory
|
||||||
|
computeScore := 100.0
|
||||||
|
if overview.SystemMetrics != nil {
|
||||||
|
cpuScore := 100.0
|
||||||
|
if overview.SystemMetrics.CPUUsagePercent != nil {
|
||||||
|
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
|
||||||
|
if cpuPct > 80 {
|
||||||
|
if cpuPct <= 100 {
|
||||||
|
cpuScore = (100 - cpuPct) / 20 * 100
|
||||||
|
} else {
|
||||||
|
cpuScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memScore := 100.0
|
||||||
|
if overview.SystemMetrics.MemoryUsagePercent != nil {
|
||||||
|
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
|
||||||
|
if memPct > 85 {
|
||||||
|
if memPct <= 100 {
|
||||||
|
memScore = (100 - memPct) / 15 * 100
|
||||||
|
} else {
|
||||||
|
memScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
computeScore = (cpuScore + memScore) / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// Background jobs score
|
||||||
|
jobScore := 100.0
|
||||||
|
failedJobs := 0
|
||||||
|
totalJobs := 0
|
||||||
|
for _, hb := range overview.JobHeartbeats {
|
||||||
|
if hb == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
totalJobs++
|
||||||
|
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
|
||||||
|
failedJobs++
|
||||||
|
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
|
||||||
|
failedJobs++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if totalJobs > 0 && failedJobs > 0 {
|
||||||
|
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weighted combination
|
||||||
|
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
|
||||||
|
}
|
||||||
|
|
||||||
|
// clampFloat64 restricts v to the inclusive range [lo, hi].
// NaN passes through unchanged (both comparisons are false for NaN).
// Parameters renamed from min/max to avoid shadowing the Go 1.21
// predeclared min/max builtins.
func clampFloat64(v float64, lo float64, hi float64) float64 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}
|
||||||
431
backend/internal/service/ops_health_score_test.go
Normal file
431
backend/internal/service/ops_health_score_test.go
Normal file
@@ -0,0 +1,431 @@
|
|||||||
|
//go:build unit
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
|
||||||
|
require.Equal(t, 100, score)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ov := &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 100,
|
||||||
|
RequestCountSLA: 100,
|
||||||
|
SuccessCount: 90,
|
||||||
|
ErrorCountTotal: 10,
|
||||||
|
ErrorCountSLA: 10,
|
||||||
|
|
||||||
|
SLA: 0.90,
|
||||||
|
ErrorRate: 0.10,
|
||||||
|
UpstreamErrorRate: 0.08,
|
||||||
|
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(20_000)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(2_000)},
|
||||||
|
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(98.0),
|
||||||
|
MemoryUsagePercent: float64Ptr(97.0),
|
||||||
|
DBConnWaiting: intPtr(3),
|
||||||
|
ConcurrencyQueueDepth: intPtr(10),
|
||||||
|
},
|
||||||
|
JobHeartbeats: []*OpsJobHeartbeat{
|
||||||
|
{
|
||||||
|
JobName: "job-a",
|
||||||
|
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
|
||||||
|
LastError: stringPtr("boom"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), ov)
|
||||||
|
require.Less(t, score, 80)
|
||||||
|
require.GreaterOrEqual(t, score, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
overview *OpsDashboardOverview
|
||||||
|
wantMin int
|
||||||
|
wantMax int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nil overview returns 0",
|
||||||
|
overview: nil,
|
||||||
|
wantMin: 0,
|
||||||
|
wantMax: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "perfect health",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 1.0,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(100)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "good health - SLA 99.8%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.998,
|
||||||
|
ErrorRate: 0.003,
|
||||||
|
UpstreamErrorRate: 0.001,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(800)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(200)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(50),
|
||||||
|
MemoryUsagePercent: float64Ptr(60),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "medium health - SLA 96%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.96,
|
||||||
|
ErrorRate: 0.02,
|
||||||
|
UpstreamErrorRate: 0.01,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(3000)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(600)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(70),
|
||||||
|
MemoryUsagePercent: float64Ptr(75),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 60,
|
||||||
|
wantMax: 85,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DB failure",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 70,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Redis failure",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 85,
|
||||||
|
wantMax: 95,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "high CPU usage",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(95),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 85,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "combined failures - business degraded + infra healthy",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.90,
|
||||||
|
ErrorRate: 0.05,
|
||||||
|
UpstreamErrorRate: 0.02,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(10000)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(20),
|
||||||
|
MemoryUsagePercent: float64Ptr(30),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 25,
|
||||||
|
wantMax: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "combined failures - business healthy + infra degraded",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.998,
|
||||||
|
ErrorRate: 0.001,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(600)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(95),
|
||||||
|
MemoryUsagePercent: float64Ptr(95),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 70,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
|
||||||
|
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
|
||||||
|
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
|
||||||
|
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
|
||||||
|
require.LessOrEqual(t, score, 100, "score must be <= 100")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeBusinessHealth(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
overview *OpsDashboardOverview
|
||||||
|
wantMin float64
|
||||||
|
wantMax float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "perfect metrics",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 1.0,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SLA boundary 99.5%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SLA boundary 95%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.95,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 50,
|
||||||
|
wantMax: 60,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error rate boundary 0.5%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0.005,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "latency boundary 1000ms",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(1000)},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "upstream error dominates",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0.001,
|
||||||
|
UpstreamErrorRate: 0.03,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 75,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
score := computeBusinessHealth(tt.overview)
|
||||||
|
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
|
||||||
|
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
|
||||||
|
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
|
||||||
|
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestComputeInfraHealth verifies that computeInfraHealth scores a set of
// representative infrastructure states inside the expected band, and that
// every score stays within [0, 100].
//
// The wantMin/wantMax bands are intentionally loose: they pin the scoring
// direction (e.g. a DB outage is penalized more than a Redis outage) without
// coupling the test to exact weighting constants.
func TestComputeInfraHealth(t *testing.T) {
	t.Parallel()

	now := time.Now().UTC()

	tests := []struct {
		name     string
		overview *OpsDashboardOverview
		wantMin  float64
		wantMax  float64
	}{
		{
			// Baseline: both stores up, low resource usage => perfect score.
			name: "all infrastructure healthy",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			// A database outage is the heaviest single penalty (50-70 band).
			name: "DB down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(false),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 50,
			wantMax: 70,
		},
		{
			// A Redis outage is penalized less than a DB outage (80-95 band).
			name: "Redis down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(false),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 80,
			wantMax: 95,
		},
		{
			// High CPU applies a mild penalty while services remain up.
			name: "CPU at 90%",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(90),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 85,
			wantMax: 95,
		},
		{
			// A background job with a recent LastErrorAt drags the score down
			// even when the core services are healthy.
			name: "failed background job",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
				JobHeartbeats: []*OpsJobHeartbeat{
					{
						JobName:     "test-job",
						LastErrorAt: &now,
					},
				},
			},
			wantMin: 70,
			wantMax: 90,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := computeInfraHealth(now, tt.overview)
			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
			// Invariant independent of the per-case bands.
			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
		})
	}
}
|
||||||
|
|
||||||
|
func timePtr(v time.Time) *time.Time { return &v }
|
||||||
|
|
||||||
|
// stringPtr returns a pointer to a copy of v.
func stringPtr(v string) *string {
	p := v
	return &p
}
|
||||||
26
backend/internal/service/ops_histograms.go
Normal file
26
backend/internal/service/ops_histograms.go
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetLatencyHistogram returns the request-latency histogram for the given
// time range. It checks that monitoring is enabled, the ops repository is
// wired, and the filter carries a well-formed [StartTime, EndTime] window
// before delegating to the repository. Validation order matters to callers:
// the monitoring gate is checked first, then repository availability, then
// filter shape.
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
	// Feature gate: ops monitoring can be disabled globally.
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetLatencyHistogram(ctx, filter)
}
|
||||||
920
backend/internal/service/ops_metrics_collector.go
Normal file
920
backend/internal/service/ops_metrics_collector.go
Normal file
@@ -0,0 +1,920 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/shirou/gopsutil/v4/cpu"
|
||||||
|
"github.com/shirou/gopsutil/v4/mem"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsMetricsCollectorJobName identifies this job in heartbeat records.
	opsMetricsCollectorJobName = "ops_metrics_collector"
	// The collection interval is clamped to [1m, 1h] regardless of settings.
	opsMetricsCollectorMinInterval = 60 * time.Second
	opsMetricsCollectorMaxInterval = 1 * time.Hour

	// opsMetricsCollectorTimeout bounds a single collect-and-persist pass.
	opsMetricsCollectorTimeout = 10 * time.Second

	// Cross-instance leader election: only one replica collects at a time.
	// The lock TTL (90s) exceeds the minimum interval (60s).
	opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
	opsMetricsCollectorLeaderLockTTL = 90 * time.Second

	// opsMetricsCollectorHeartbeatTimeout bounds the best-effort heartbeat upsert.
	opsMetricsCollectorHeartbeatTimeout = 2 * time.Second

	// bytesPerMB converts raw byte counts into MB for memory gauges.
	bytesPerMB = 1024 * 1024
)
|
||||||
|
|
||||||
|
// opsMetricsCollectorAdvisoryLockID is a stable advisory-lock ID derived from
// the Redis leader-lock key. NOTE(review): presumably used as a DB-level
// fallback for leader election — confirm against tryAcquireLeaderLock.
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
|
||||||
|
|
||||||
|
// OpsMetricsCollector periodically samples traffic, error, and system metrics
// and persists one-minute snapshots through OpsRepository. Replicas compete
// for a Redis leader lock so that only one instance writes per window.
type OpsMetricsCollector struct {
	opsRepo     OpsRepository     // sink for metric snapshots and job heartbeats
	settingRepo SettingRepository // runtime settings (monitoring flag, interval)
	cfg         *config.Config    // static config; Ops.Enabled gates the collector

	accountRepo        AccountRepository   // enumerates schedulable accounts for queue sampling
	concurrencyService *ConcurrencyService // per-account concurrency load lookup

	db          *sql.DB       // raw handle for aggregate SQL and the DB health check
	redisClient *redis.Client // leader lock, health check, pool stats
	instanceID  string        // random per-process ID stored as the leader-lock value

	// Baseline for cgroup CPU usage deltas. Mutated without locking;
	// assumes access only from the single collection goroutine.
	lastCgroupCPUUsageNanos uint64
	lastCgroupCPUSampleAt   time.Time

	stopCh    chan struct{} // closed by Stop to end the run loop
	startOnce sync.Once
	stopOnce  sync.Once

	// NOTE(review): skipLogMu/skipLogAt look like a rate limiter for
	// "collection skipped" logging, but are not used in this file chunk —
	// confirm usage elsewhere.
	skipLogMu sync.Mutex
	skipLogAt time.Time
}
|
||||||
|
|
||||||
|
// NewOpsMetricsCollector wires the collector's dependencies. It does no
// background work until Start is called. The instance ID is randomized per
// process and becomes the leader-lock value, letting a replica recognize and
// safely release its own lock.
func NewOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	return &OpsMetricsCollector{
		opsRepo:            opsRepo,
		settingRepo:        settingRepo,
		cfg:                cfg,
		accountRepo:        accountRepo,
		concurrencyService: concurrencyService,
		db:                 db,
		redisClient:        redisClient,
		instanceID:         uuid.NewString(),
	}
}
|
||||||
|
|
||||||
|
// Start launches the background collection goroutine. Only the first call
// has any effect (sync.Once); calling Start on a nil collector is a no-op.
func (c *OpsMetricsCollector) Start() {
	if c == nil {
		return
	}
	c.startOnce.Do(func() {
		// Lazily create the stop channel so Stop has something to close.
		if c.stopCh == nil {
			c.stopCh = make(chan struct{})
		}
		go c.run()
	})
}
|
||||||
|
|
||||||
|
// Stop signals the collection goroutine to exit. Idempotent (sync.Once) and
// safe on a nil collector; it does not wait for an in-flight collection.
func (c *OpsMetricsCollector) Stop() {
	if c == nil {
		return
	}
	c.stopOnce.Do(func() {
		// stopCh is nil when Start was never called; nothing to close then.
		if c.stopCh != nil {
			close(c.stopCh)
		}
	})
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) run() {
|
||||||
|
// First run immediately so the dashboard has data soon after startup.
|
||||||
|
c.collectOnce()
|
||||||
|
|
||||||
|
for {
|
||||||
|
interval := c.getInterval()
|
||||||
|
timer := time.NewTimer(interval)
|
||||||
|
select {
|
||||||
|
case <-timer.C:
|
||||||
|
c.collectOnce()
|
||||||
|
case <-c.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) getInterval() time.Duration {
|
||||||
|
interval := opsMetricsCollectorMinInterval
|
||||||
|
|
||||||
|
if c.settingRepo == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
|
||||||
|
if err != nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
|
||||||
|
seconds, err := strconv.Atoi(raw)
|
||||||
|
if err != nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
|
||||||
|
seconds = int(opsMetricsCollectorMinInterval.Seconds())
|
||||||
|
}
|
||||||
|
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
|
||||||
|
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
|
||||||
|
}
|
||||||
|
return time.Duration(seconds) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectOnce performs one guarded collection pass:
//  1. bail out when ops is disabled (config or runtime setting) or
//     dependencies are missing,
//  2. acquire the cross-instance leader lock so only one replica collects,
//  3. collect and persist one window of metrics,
//  4. record a job heartbeat (success or failure) with the run duration.
//
// Heartbeat writes deliberately use a fresh short-lived context (not the
// collection ctx) so they can still be recorded after the main context has
// expired; their errors are ignored because heartbeats are best-effort.
func (c *OpsMetricsCollector) collectOnce() {
	if c == nil {
		return
	}
	if c.cfg != nil && !c.cfg.Ops.Enabled {
		return
	}
	if c.opsRepo == nil {
		return
	}
	if c.db == nil {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
	defer cancel()

	if !c.isMonitoringEnabled(ctx) {
		return
	}

	release, ok := c.tryAcquireLeaderLock(ctx)
	if !ok {
		// Another instance holds the leader lock; nothing to do.
		return
	}
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	err := c.collectAndPersist(ctx)
	finishedAt := time.Now().UTC()

	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	dur := durationMs
	runAt := startedAt

	if err != nil {
		// Record the failure in the heartbeat (message truncated to 2048
		// chars) and log it.
		msg := truncateString(err.Error(), 2048)
		errAt := finishedAt
		hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
		defer hbCancel()
		_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
			JobName:        opsMetricsCollectorJobName,
			LastRunAt:      &runAt,
			LastErrorAt:    &errAt,
			LastError:      &msg,
			LastDurationMs: &dur,
		})
		log.Printf("[OpsMetricsCollector] collect failed: %v", err)
		return
	}

	successAt := finishedAt
	hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
	defer hbCancel()
	_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsMetricsCollectorJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &dur,
	})
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
if c == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if c.cfg != nil && !c.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if c.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
// Fail-open: collector should not become a hard dependency.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectAndPersist gathers one minute of metrics and writes a single
// snapshot row through the ops repository.
//
// The window is the previous full minute [end-1m, end), aligned to minute
// boundaries so buckets are stable and never include a partial, still-filling
// minute. System stats are best-effort (logged and skipped on failure); the
// SQL aggregations are required and abort the pass on error.
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
	if ctx == nil {
		ctx = context.Background()
	}

	// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
	now := time.Now().UTC()
	windowEnd := now.Truncate(time.Minute)
	windowStart := windowEnd.Add(-1 * time.Minute)

	sys, err := c.collectSystemStats(ctx)
	if err != nil {
		// Continue; system stats are best-effort.
		log.Printf("[OpsMetricsCollector] system stats error: %v", err)
	}

	// Infrastructure health and connection-pool gauges.
	dbOK := c.checkDB(ctx)
	redisOK := c.checkRedis(ctx)
	active, idle := c.dbPoolStats()
	redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()

	// Traffic aggregates for the window.
	successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage counts: %w", err)
	}

	duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage latency: %w", err)
	}

	errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query error counts: %w", err)
	}

	// Defensive: the window is normally exactly 60 seconds.
	windowSeconds := windowEnd.Sub(windowStart).Seconds()
	if windowSeconds <= 0 {
		windowSeconds = 60
	}
	requestTotal := successCount + errorTotal
	qps := float64(requestTotal) / windowSeconds
	tps := float64(tokenConsumed) / windowSeconds

	goroutines := runtime.NumGoroutine()
	concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)

	input := &OpsInsertSystemMetricsInput{
		CreatedAt:     windowEnd,
		WindowMinutes: 1,

		SuccessCount:         successCount,
		ErrorCountTotal:      errorTotal,
		BusinessLimitedCount: businessLimited,
		ErrorCountSLA:        errorSLA,

		UpstreamErrorCountExcl429529: upstreamExcl,
		Upstream429Count:             upstream429,
		Upstream529Count:             upstream529,

		TokenConsumed: tokenConsumed,
		QPS:           float64Ptr(roundTo1DP(qps)),
		TPS:           float64Ptr(roundTo1DP(tps)),

		DurationP50Ms: duration.p50,
		DurationP90Ms: duration.p90,
		DurationP95Ms: duration.p95,
		DurationP99Ms: duration.p99,
		DurationAvgMs: duration.avg,
		DurationMaxMs: duration.max,

		TTFTP50Ms: ttft.p50,
		TTFTP90Ms: ttft.p90,
		TTFTP95Ms: ttft.p95,
		TTFTP99Ms: ttft.p99,
		TTFTAvgMs: ttft.avg,
		TTFTMaxMs: ttft.max,

		CPUUsagePercent:    sys.cpuUsagePercent,
		MemoryUsedMB:       sys.memoryUsedMB,
		MemoryTotalMB:      sys.memoryTotalMB,
		MemoryUsagePercent: sys.memoryUsagePercent,

		DBOK:    boolPtr(dbOK),
		RedisOK: boolPtr(redisOK),

		// Redis pool gauges are only meaningful when pool stats were readable.
		RedisConnTotal: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisTotal)
		}(),
		RedisConnIdle: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisIdle)
		}(),

		DBConnActive:          intPtr(active),
		DBConnIdle:            intPtr(idle),
		GoroutineCount:        intPtr(goroutines),
		ConcurrencyQueueDepth: concurrencyQueueDepth,
	}

	return c.opsRepo.InsertSystemMetrics(ctx, input)
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
|
||||||
|
if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if parentCtx == nil {
|
||||||
|
parentCtx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort: never let concurrency sampling break the metrics collector.
|
||||||
|
ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
accounts, err := c.accountRepo.ListSchedulable(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := make([]AccountWithConcurrency, 0, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxConc := acc.Concurrency
|
||||||
|
if maxConc < 0 {
|
||||||
|
maxConc = 0
|
||||||
|
}
|
||||||
|
batch = append(batch, AccountWithConcurrency{
|
||||||
|
ID: acc.ID,
|
||||||
|
MaxConcurrency: maxConc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if len(batch) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var total int64
|
||||||
|
for _, info := range loadMap {
|
||||||
|
if info == nil || info.WaitingCount <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += int64(info.WaitingCount)
|
||||||
|
}
|
||||||
|
if total < 0 {
|
||||||
|
total = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
maxInt := int64(^uint(0) >> 1)
|
||||||
|
if total > maxInt {
|
||||||
|
total = maxInt
|
||||||
|
}
|
||||||
|
v := int(total)
|
||||||
|
return &v
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsCollectedPercentiles carries one latency-distribution snapshot.
// Pointer fields stay nil when the source window had no rows (SQL NULLs).
type opsCollectedPercentiles struct {
	p50 *int
	p90 *int
	p95 *int
	p99 *int
	avg *float64 // mean, rounded to one decimal place
	max *int
}
|
||||||
|
|
||||||
|
// queryUsageCounts returns the request count recorded in usage_logs and the
// total tokens consumed (input + output + cache creation + cache read) for
// the window [start, end). The caller treats this count as the success
// count; failures are aggregated separately from ops_error_logs (see
// queryErrorCounts).
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
	q := `
        SELECT
            COALESCE(COUNT(*), 0) AS success_count,
            COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
        FROM usage_logs
        WHERE created_at >= $1 AND created_at < $2`

	// Scan the sum through NullInt64 defensively in case the driver still
	// reports NULL for an empty window.
	var tokens sql.NullInt64
	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
		return 0, 0, err
	}
	if tokens.Valid {
		tokenConsumed = tokens.Int64
	}
	return successCount, tokenConsumed, nil
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
|
||||||
|
{
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
|
||||||
|
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
|
||||||
|
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
|
||||||
|
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
|
||||||
|
AVG(duration_ms) AS avg_ms,
|
||||||
|
MAX(duration_ms) AS max_ms
|
||||||
|
FROM usage_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND duration_ms IS NOT NULL`
|
||||||
|
|
||||||
|
var p50, p90, p95, p99 sql.NullFloat64
|
||||||
|
var avg sql.NullFloat64
|
||||||
|
var max sql.NullInt64
|
||||||
|
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||||
|
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||||
|
}
|
||||||
|
duration.p50 = floatToIntPtr(p50)
|
||||||
|
duration.p90 = floatToIntPtr(p90)
|
||||||
|
duration.p95 = floatToIntPtr(p95)
|
||||||
|
duration.p99 = floatToIntPtr(p99)
|
||||||
|
if avg.Valid {
|
||||||
|
v := roundTo1DP(avg.Float64)
|
||||||
|
duration.avg = &v
|
||||||
|
}
|
||||||
|
if max.Valid {
|
||||||
|
v := int(max.Int64)
|
||||||
|
duration.max = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
|
||||||
|
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
|
||||||
|
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
|
||||||
|
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
|
||||||
|
AVG(first_token_ms) AS avg_ms,
|
||||||
|
MAX(first_token_ms) AS max_ms
|
||||||
|
FROM usage_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND first_token_ms IS NOT NULL`
|
||||||
|
|
||||||
|
var p50, p90, p95, p99 sql.NullFloat64
|
||||||
|
var avg sql.NullFloat64
|
||||||
|
var max sql.NullInt64
|
||||||
|
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||||
|
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||||
|
}
|
||||||
|
ttft.p50 = floatToIntPtr(p50)
|
||||||
|
ttft.p90 = floatToIntPtr(p90)
|
||||||
|
ttft.p95 = floatToIntPtr(p95)
|
||||||
|
ttft.p99 = floatToIntPtr(p99)
|
||||||
|
if avg.Valid {
|
||||||
|
v := roundTo1DP(avg.Float64)
|
||||||
|
ttft.avg = &v
|
||||||
|
}
|
||||||
|
if max.Valid {
|
||||||
|
v := int(max.Int64)
|
||||||
|
ttft.max = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return duration, ttft, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// queryErrorCounts aggregates ops_error_logs rows in [start, end) into the
// dashboard's error buckets:
//   - errorTotal:         all responses with status >= 400
//   - businessLimited:    >=400 responses flagged is_business_limited
//   - errorSLA:           >=400 responses NOT business-limited (count against SLA)
//   - upstreamExcl429529: provider-owned, non-business-limited errors other than 429/529
//   - upstream429/529:    provider-owned 429 (rate limit) / 529 responses
//
// Upstream buckets prefer upstream_status_code and fall back to status_code.
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
	errorTotal int64,
	businessLimited int64,
	errorSLA int64,
	upstreamExcl429529 int64,
	upstream429 int64,
	upstream529 int64,
	err error,
) {
	q := `
        SELECT
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
        FROM ops_error_logs
        WHERE created_at >= $1 AND created_at < $2`

	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
		&errorTotal,
		&businessLimited,
		&errorSLA,
		&upstreamExcl429529,
		&upstream429,
		&upstream529,
	); err != nil {
		return 0, 0, 0, 0, 0, 0, err
	}
	return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
|
||||||
|
|
||||||
|
// opsCollectedSystemStats holds best-effort resource gauges. Nil fields mean
// the value could not be determined from either cgroup or host sources.
type opsCollectedSystemStats struct {
	cpuUsagePercent    *float64 // 0-100; normalized to the cgroup CPU limit when known
	memoryUsedMB       *int64
	memoryTotalMB      *int64
	memoryUsagePercent *float64 // 0-100
}
|
||||||
|
|
||||||
|
// collectSystemStats gathers CPU and memory gauges, preferring cgroup
// (container) readings and falling back to host-level gopsutil readings for
// any value the cgroup could not provide. It never fails hard: values that
// cannot be determined are left nil. (The error return is currently always
// nil; it is kept for interface stability.)
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
	out := &opsCollectedSystemStats{}
	if ctx == nil {
		ctx = context.Background()
	}

	sampleAt := time.Now().UTC()

	// Prefer cgroup (container) metrics when available.
	if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
		out.cpuUsagePercent = cpuPct
	}

	cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
	if cgroupOK {
		usedMB := int64(cgroupUsed / bytesPerMB)
		out.memoryUsedMB = &usedMB
		// cgroupTotal == 0 means "no finite limit" (e.g. memory.max = "max");
		// total/percent stay nil so the host fallback below can fill them.
		if cgroupTotal > 0 {
			totalMB := int64(cgroupTotal / bytesPerMB)
			out.memoryTotalMB = &totalMB
			pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
			out.memoryUsagePercent = &pct
		}
	}

	// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
	if out.cpuUsagePercent == nil {
		if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
			v := roundTo1DP(cpuPercents[0])
			out.cpuUsagePercent = &v
		}
	}

	// If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host.
	if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
		if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
			if out.memoryUsedMB == nil {
				usedMB := int64(vm.Used / bytesPerMB)
				out.memoryUsedMB = &usedMB
			}
			if out.memoryTotalMB == nil {
				totalMB := int64(vm.Total / bytesPerMB)
				out.memoryTotalMB = &totalMB
			}
			if out.memoryUsagePercent == nil {
				// Prefer a ratio consistent with the (possibly mixed
				// cgroup/host) used+total pair; otherwise use the host's own.
				if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
					pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
					out.memoryUsagePercent = &pct
				} else {
					pct := roundTo1DP(vm.UsedPercent)
					out.memoryUsagePercent = &pct
				}
			}
		}
	}

	return out, nil
}
|
||||||
|
|
||||||
|
// tryCgroupCPUPercent derives CPU usage (0-100, relative to the container's
// CPU limit) from the cumulative cgroup usage counter. Two samples are needed
// to form a delta, so it returns nil on the first call, after a counter reset,
// or when the limit is unknown — callers then fall back to host metrics.
//
// It mutates the collector's baseline fields without locking; it appears to
// be called only from the single collection goroutine — confirm before adding
// concurrent callers.
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
	usageNanos, ok := readCgroupCPUUsageNanos()
	if !ok {
		return nil
	}

	// Initialize baseline sample.
	if c.lastCgroupCPUSampleAt.IsZero() {
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}

	elapsed := now.Sub(c.lastCgroupCPUSampleAt)
	if elapsed <= 0 {
		// Clock went backwards or duplicate sample; reset the baseline.
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}

	// Advance the baseline before computing so the next call gets a fresh window.
	prev := c.lastCgroupCPUUsageNanos
	c.lastCgroupCPUUsageNanos = usageNanos
	c.lastCgroupCPUSampleAt = now

	if usageNanos < prev {
		// Counter reset (container restarted).
		return nil
	}

	deltaUsageSec := float64(usageNanos-prev) / 1e9
	elapsedSec := elapsed.Seconds()
	if elapsedSec <= 0 {
		return nil
	}

	cores := readCgroupCPULimitCores()
	if cores <= 0 {
		// Can't reliably normalize; skip and fall back to gopsutil.
		return nil
	}

	pct := (deltaUsageSec / (elapsedSec * cores)) * 100
	if pct < 0 {
		pct = 0
	}
	// Clamp to avoid noise/jitter showing impossible values.
	if pct > 100 {
		pct = 100
	}
	v := roundTo1DP(pct)
	return &v
}
|
||||||
|
|
||||||
|
// readCgroupMemoryBytes reads container memory usage and limit from the
// cgroup filesystem, preferring cgroup v2 and falling back to v1. totalBytes
// is left 0 when the cgroup reports no finite limit; ok is false when neither
// hierarchy is readable.
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
	// cgroup v2 (most common in modern containers)
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
		usedBytes = used
		rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
		if err == nil {
			s := strings.TrimSpace(string(rawMax))
			// "max" means unlimited; leave totalBytes at 0 in that case.
			if s != "" && s != "max" {
				if v, err := strconv.ParseUint(s, 10, 64); err == nil {
					totalBytes = v
				}
			}
		}
		return usedBytes, totalBytes, true
	}

	// cgroup v1 fallback
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
		usedBytes = used
		if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
			// Some environments report a very large number when unlimited.
			if limit > 0 && limit < (1<<60) {
				totalBytes = limit
			}
		}
		return usedBytes, totalBytes, true
	}

	return 0, 0, false
}
|
||||||
|
|
||||||
|
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
|
||||||
|
// cgroup v2: cpu.stat has usage_usec
|
||||||
|
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
|
||||||
|
lines := strings.Split(string(raw), "\n")
|
||||||
|
for _, line := range lines {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fields[0] != "usage_usec" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
v, err := strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return v * 1000, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cgroup v1: cpuacct.usage is in nanoseconds
|
||||||
|
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
|
||||||
|
return v, true
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// readCgroupCPULimitCores returns the container's CPU limit in cores
// (quota/period), trying cgroup v2 then v1. It returns 0 when no finite
// limit can be determined, which callers treat as "unknown".
func readCgroupCPULimitCores() float64 {
	// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
	if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
		fields := strings.Fields(string(raw))
		if len(fields) >= 2 && fields[0] != "max" {
			quota, err1 := strconv.ParseFloat(fields[0], 64)
			period, err2 := strconv.ParseFloat(fields[1], 64)
			if err1 == nil && err2 == nil && quota > 0 && period > 0 {
				return quota / period
			}
		}
	}

	// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us (quota <= 0 when unlimited)
	quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
	period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
	if okQuota && okPeriod && quota > 0 && period > 0 {
		return float64(quota) / float64(period)
	}

	return 0
}
|
||||||
|
|
||||||
|
// readUintFile reads path and parses its whitespace-trimmed contents as an
// unsigned base-10 integer. It returns ok=false on read failure, empty
// content, or a parse error.
func readUintFile(path string) (uint64, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}
	trimmed := strings.TrimSpace(string(raw))
	if trimmed == "" {
		return 0, false
	}
	value, perr := strconv.ParseUint(trimmed, 10, 64)
	if perr != nil {
		return 0, false
	}
	return value, true
}
|
||||||
|
|
||||||
|
// readIntFile reads path and parses its whitespace-trimmed contents as a
// signed base-10 integer. It returns ok=false on read failure, empty
// content, or a parse error.
func readIntFile(path string) (int64, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}
	trimmed := strings.TrimSpace(string(raw))
	if trimmed == "" {
		return 0, false
	}
	value, perr := strconv.ParseInt(trimmed, 10, 64)
	if perr != nil {
		return 0, false
	}
	return value, true
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
|
||||||
|
if c == nil || c.db == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
var one int
|
||||||
|
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return one == 1
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
|
||||||
|
if c == nil || c.redisClient == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
return c.redisClient.Ping(ctx).Err() == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
|
||||||
|
if c == nil || c.redisClient == nil {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
stats := c.redisClient.PoolStats()
|
||||||
|
if stats == nil {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
return int(stats.TotalConns), int(stats.IdleConns), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
|
||||||
|
if c == nil || c.db == nil {
|
||||||
|
return 0, 0
|
||||||
|
}
|
||||||
|
stats := c.db.Stats()
|
||||||
|
return stats.InUse, stats.Idle
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsMetricsCollectorReleaseScript atomically deletes the leader-lock key only
// when it is still owned by the caller (its value matches ARGV[1]). Running
// this as a Lua script makes the compare-and-delete atomic, so an instance can
// never release a lock that another instance has since acquired.
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to make this instance the single leader for a
// collection cycle. It returns (release, true) when acquired — release may be
// nil when no coordination backend exists — and (nil, false) when another
// instance holds the lock and this cycle should be skipped.
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	// No Redis configured: assume a single-instance deployment and proceed.
	if c == nil || c.redisClient == nil {
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// SETNX with the instance ID as value marks ownership; the TTL guards
	// against a crashed leader holding the lock forever.
	ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
	if err != nil {
		// Prefer fail-closed to avoid stampeding the database when Redis is flaky.
		// Fallback to a DB advisory lock when Redis is present but unavailable.
		release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
		if !ok {
			c.maybeLogSkip()
			return nil, false
		}
		return release, true
	}
	if !ok {
		// Another instance owns the lock; log (rate-limited) and skip.
		c.maybeLogSkip()
		return nil, false
	}

	// release compare-and-deletes the key via the Lua script, using a fresh
	// short-lived context so release still works after the caller's ctx ends.
	release := func() {
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
	}
	return release, true
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) maybeLogSkip() {
|
||||||
|
c.skipLogMu.Lock()
|
||||||
|
defer c.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.skipLogAt = now
|
||||||
|
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
|
||||||
|
}
|
||||||
|
|
||||||
|
func floatToIntPtr(v sql.NullFloat64) *int {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := int(math.Round(v.Float64))
|
||||||
|
return &n
|
||||||
|
}
|
||||||
|
|
||||||
|
// roundTo1DP rounds v to one decimal place (half away from zero, per math.Round).
func roundTo1DP(v float64) float64 {
	const scale = 10
	return math.Round(v*scale) / scale
}
|
||||||
|
|
||||||
|
// truncateString returns at most max bytes of s without splitting a UTF-8
// rune at the cut point. max <= 0 yields "".
//
// Fix vs. the previous version: instead of re-validating the entire prefix
// with utf8.ValidString after each trimmed byte (O(n²) in the worst case, and
// over-trimming to "" whenever s contains invalid UTF-8 anywhere before the
// cut), we only strip the trailing partial rune produced by the byte-level
// cut. For valid input the result is identical.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Drop trailing bytes that form an incomplete rune. DecodeLastRuneInString
	// reports (RuneError, 1) for a dangling lead/continuation byte; a genuine
	// U+FFFD in the input decodes with size 3 and is kept.
	for len(cut) > 0 {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size > 1 {
			break
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
|
||||||
|
|
||||||
|
// boolPtr returns a pointer to a fresh copy of v.
func boolPtr(v bool) *bool {
	return &v
}
|
||||||
|
|
||||||
|
// intPtr returns a pointer to a fresh copy of v.
func intPtr(v int) *int {
	return &v
}
|
||||||
|
|
||||||
|
// float64Ptr returns a pointer to a fresh copy of v.
func float64Ptr(v float64) *float64 {
	return &v
}
|
||||||
124
backend/internal/service/ops_models.go
Normal file
124
backend/internal/service/ops_models.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// OpsErrorLog is the list-view projection of a recorded request error.
// Heavier fields (bodies, headers, upstream detail) live on OpsErrorLogDetail.
type OpsErrorLog struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	// Classification of the failure.
	Phase    string `json:"phase"`
	Type     string `json:"type"`
	Severity string `json:"severity"`

	StatusCode int    `json:"status_code"`
	Platform   string `json:"platform"`
	Model      string `json:"model"`

	LatencyMs *int `json:"latency_ms"`

	// Correlation identifiers and the short error message.
	ClientRequestID string `json:"client_request_id"`
	RequestID       string `json:"request_id"`
	Message         string `json:"message"`

	// Attribution; nil when unknown at record time.
	UserID    *int64 `json:"user_id"`
	APIKeyID  *int64 `json:"api_key_id"`
	AccountID *int64 `json:"account_id"`
	GroupID   *int64 `json:"group_id"`

	ClientIP    *string `json:"client_ip"`
	RequestPath string  `json:"request_path"`
	Stream      bool    `json:"stream"`
}
|
||||||
|
|
||||||
|
// OpsErrorLogDetail extends OpsErrorLog with the full captured context of a
// single failed request, as returned by GetErrorLogByID.
type OpsErrorLogDetail struct {
	OpsErrorLog

	ErrorBody string `json:"error_body"`
	UserAgent string `json:"user_agent"`

	// Upstream context (optional)
	UpstreamStatusCode   *int   `json:"upstream_status_code,omitempty"`
	UpstreamErrorMessage string `json:"upstream_error_message,omitempty"`
	UpstreamErrorDetail  string `json:"upstream_error_detail,omitempty"`
	UpstreamErrors       string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing

	// Timings (optional)
	AuthLatencyMs      *int64 `json:"auth_latency_ms"`
	RoutingLatencyMs   *int64 `json:"routing_latency_ms"`
	UpstreamLatencyMs  *int64 `json:"upstream_latency_ms"`
	ResponseLatencyMs  *int64 `json:"response_latency_ms"`
	TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`

	// Retry context: the captured request body/headers used to replay the call.
	RequestBody          string `json:"request_body"`
	RequestBodyTruncated bool   `json:"request_body_truncated"`
	RequestBodyBytes     *int   `json:"request_body_bytes"`
	RequestHeaders       string `json:"request_headers,omitempty"`

	// vNext metric semantics
	IsBusinessLimited bool `json:"is_business_limited"`
}
|
||||||
|
|
||||||
|
// OpsErrorLogFilter narrows ListErrorLogs queries.
// Nil pointers and zero-value strings/slices mean "do not filter on this field".
type OpsErrorLogFilter struct {
	StartTime *time.Time
	EndTime   *time.Time

	Platform  string
	GroupID   *int64
	AccountID *int64

	StatusCodes []int
	Phase       string
	Query       string // free-text search term

	// Pagination (1-based page).
	Page     int
	PageSize int
}
|
||||||
|
|
||||||
|
// OpsErrorLogList is one page of error logs plus pagination metadata.
type OpsErrorLogList struct {
	Errors   []*OpsErrorLog `json:"errors"`
	Total    int            `json:"total"`
	Page     int            `json:"page"`
	PageSize int            `json:"page_size"`
}
|
||||||
|
|
||||||
|
// OpsRetryAttempt is a persisted record of one admin-triggered retry of a
// previously failed request.
type OpsRetryAttempt struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	RequestedByUserID int64  `json:"requested_by_user_id"`
	SourceErrorID     int64  `json:"source_error_id"`
	Mode              string `json:"mode"` // client|upstream (see OpsRetryMode* consts)
	PinnedAccountID   *int64 `json:"pinned_account_id"`

	// Execution lifecycle (see opsRetryStatus* consts).
	Status     string     `json:"status"`
	StartedAt  *time.Time `json:"started_at"`
	FinishedAt *time.Time `json:"finished_at"`
	DurationMs *int64     `json:"duration_ms"`

	// Correlation with the request issued by the retry, when available.
	ResultRequestID *string `json:"result_request_id"`
	ResultErrorID   *int64  `json:"result_error_id"`

	ErrorMessage *string `json:"error_message"`
}
|
||||||
|
|
||||||
|
// OpsRetryResult is the synchronous outcome returned to the caller that
// triggered a retry.
type OpsRetryResult struct {
	AttemptID int64  `json:"attempt_id"`
	Mode      string `json:"mode"`
	Status    string `json:"status"`

	PinnedAccountID *int64 `json:"pinned_account_id"`
	UsedAccountID   *int64 `json:"used_account_id"`

	HTTPStatusCode    int    `json:"http_status_code"`
	UpstreamRequestID string `json:"upstream_request_id"`

	// Truncated preview of the response body captured during the retry.
	ResponsePreview   string `json:"response_preview"`
	ResponseTruncated bool   `json:"response_truncated"`

	ErrorMessage string `json:"error_message"`

	StartedAt  time.Time `json:"started_at"`
	FinishedAt time.Time `json:"finished_at"`
	DurationMs int64     `json:"duration_ms"`
}
|
||||||
242
backend/internal/service/ops_port.go
Normal file
242
backend/internal/service/ops_port.go
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsRepository is the persistence port consumed by the ops service layer and
// its background jobs. Implementations live outside this package.
type OpsRepository interface {
	// Error logs and request-level drilldown.
	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)

	// Retry attempt bookkeeping.
	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)

	// Lightweight window stats (for realtime WS / quick sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)

	// Dashboard aggregations.
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)

	// System metrics snapshots written by the metrics collector.
	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)

	// Background-job liveness.
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)

	// Alerts (rules + events)
	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error

	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

	// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
	UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
|
||||||
|
|
||||||
|
// OpsInsertErrorLogInput carries everything needed to persist one error log
// row. It is assembled during request handling and finalized by OpsService
// before hitting the repository.
type OpsInsertErrorLogInput struct {
	// Correlation identifiers.
	RequestID       string
	ClientRequestID string

	// Attribution; nil when unknown.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string

	// Request shape.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string

	// Failure classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool

	ErrorMessage string
	ErrorBody    string

	ErrorSource string
	ErrorOwner  string

	// Upstream context (optional).
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string
	// UpstreamErrors captures all upstream error attempts observed during handling this request.
	// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent
	// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
	// It is set by OpsService.RecordError before persisting.
	UpstreamErrorsJSON *string

	// Timings (optional).
	DurationMs         *int
	TimeToFirstTokenMs *int64

	// Captured request payload for later replay/inspection.
	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string

	IsRetryable bool
	RetryCount  int

	CreatedAt time.Time
}
|
||||||
|
|
||||||
|
// OpsInsertRetryAttemptInput creates the initial row for a retry attempt,
// before its outcome is known.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	PinnedAccountID   *int64

	// running|queued etc.
	Status    string
	StartedAt time.Time
}
|
||||||
|
|
||||||
|
// OpsUpdateRetryAttemptInput records the final outcome of a retry attempt
// identified by ID.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// succeeded|failed
	Status     string
	FinishedAt time.Time
	DurationMs int64

	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64

	ErrorMessage *string
}
|
||||||
|
|
||||||
|
// OpsInsertSystemMetricsInput is one aggregated metrics sample covering
// WindowMinutes ending at CreatedAt. Pointer fields are optional and omitted
// when the collector could not obtain them.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Optional scoping; nil means an instance-wide sample.
	Platform *string
	GroupID  *int64

	// Request counters for the window.
	SuccessCount         int64
	ErrorCountTotal      int64
	BusinessLimitedCount int64
	ErrorCountSLA        int64

	// Upstream error breakdown (429/529 tracked separately).
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64

	TokenConsumed int64

	// Derived rates.
	QPS *float64
	TPS *float64

	// End-to-end duration distribution (milliseconds).
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// Time-to-first-token distribution (milliseconds).
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// Host resource usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency health probes.
	DBOK    *bool
	RedisOK *bool

	// Connection pool gauges.
	RedisConnTotal *int
	RedisConnIdle  *int

	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	// Runtime gauges.
	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
|
||||||
|
|
||||||
|
// OpsSystemMetricsSnapshot is the most recent stored system-metrics sample,
// enriched with config-derived limits for UI display.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// Host resource usage at sample time.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency health probes.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`

	DBConnActive  *int `json:"db_conn_active"`
	DBConnIdle    *int `json:"db_conn_idle"`
	DBConnWaiting *int `json:"db_conn_waiting"`

	// Runtime gauges.
	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
|
||||||
|
|
||||||
|
// OpsUpsertJobHeartbeatInput updates the liveness record for a named
// background job. Nil fields leave the corresponding columns untouched.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt      *time.Time
	LastSuccessAt  *time.Time
	LastErrorAt    *time.Time
	LastError      *string
	LastDurationMs *int64
}
|
||||||
|
|
||||||
|
// OpsJobHeartbeat is the stored liveness record for one background job.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
// OpsWindowStats is a lightweight request-count summary for the
// [StartTime, EndTime] window, used by realtime sampling.
type OpsWindowStats struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}
|
||||||
40
backend/internal/service/ops_query_mode.go
Normal file
40
backend/internal/service/ops_query_mode.go
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsQueryMode selects which backing data source serves an ops dashboard query.
type OpsQueryMode string

const (
	OpsQueryModeAuto   OpsQueryMode = "auto"
	OpsQueryModeRaw    OpsQueryMode = "raw"
	OpsQueryModePreagg OpsQueryMode = "preagg"
)

// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")

// ParseOpsQueryMode maps a raw, user-supplied string onto a known query mode,
// trimming whitespace and ignoring case. Unrecognized input falls back to
// OpsQueryModeAuto.
func ParseOpsQueryMode(raw string) OpsQueryMode {
	normalized := OpsQueryMode(strings.ToLower(strings.TrimSpace(raw)))
	switch normalized {
	case OpsQueryModeRaw, OpsQueryModePreagg:
		return normalized
	}
	return OpsQueryModeAuto
}

// IsValid reports whether m is one of the declared query modes.
func (m OpsQueryMode) IsValid() bool {
	return m == OpsQueryModeAuto || m == OpsQueryModeRaw || m == OpsQueryModePreagg
}
|
||||||
36
backend/internal/service/ops_realtime.go
Normal file
36
backend/internal/service/ops_realtime.go
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is also gated by the hard switch/soft switch of overall ops monitoring.
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
	if !s.IsMonitoringEnabled(ctx) {
		return false
	}
	// No settings repository wired: treat the feature as enabled.
	if s.settingRepo == nil {
		return true
	}

	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
	if err != nil {
		// Default enabled when key is missing; fail-open on transient errors.
		// NOTE(review): both branches below return true, so the errors.Is check
		// is currently redundant — presumably kept so the two cases can diverge
		// later; confirm intent before simplifying.
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		return true
	}

	// Only an explicit "off"-style value disables the feature.
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}
|
||||||
81
backend/internal/service/ops_realtime_models.go
Normal file
81
backend/internal/service/ops_realtime_models.go
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
	AccountID      int64   `json:"account_id"`
	AccountName    string  `json:"account_name"`
	Platform       string  `json:"platform"`
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||||
|
|
||||||
|
// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
	GroupID        int64  `json:"group_id"`
	GroupName      string `json:"group_name"`
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||||
|
|
||||||
|
// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
	AccountID   int64  `json:"account_id"`
	AccountName string `json:"account_name"`
	Platform    string `json:"platform"`
	GroupID     int64  `json:"group_id"`
	GroupName   string `json:"group_name"`

	Status string `json:"status"`

	// Derived availability flags.
	IsAvailable   bool `json:"is_available"`
	IsRateLimited bool `json:"is_rate_limited"`
	IsOverloaded  bool `json:"is_overloaded"`
	HasError      bool `json:"has_error"`

	// Recovery timing details; nil when the corresponding state is not active.
	RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
	RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
	OverloadUntil          *time.Time `json:"overload_until"`
	OverloadRemainingSec   *int64     `json:"overload_remaining_sec"`
	ErrorMessage           string     `json:"error_message"`
	TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
|
||||||
151
backend/internal/service/ops_request_details.go
Normal file
151
backend/internal/service/ops_request_details.go
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsRequestKind distinguishes which underlying table an OpsRequestDetail row
// came from: success (usage logs) or error (ops error logs).
type OpsRequestKind string

const (
	OpsRequestKindSuccess OpsRequestKind = "success"
	OpsRequestKindError   OpsRequestKind = "error"
)
|
||||||
|
|
||||||
|
// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
	Kind      OpsRequestKind `json:"kind"`
	CreatedAt time.Time      `json:"created_at"`
	RequestID string         `json:"request_id"`

	Platform string `json:"platform,omitempty"`
	Model    string `json:"model,omitempty"`

	DurationMs *int `json:"duration_ms,omitempty"`
	StatusCode *int `json:"status_code,omitempty"`

	// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
	ErrorID *int64 `json:"error_id,omitempty"`

	// Error-only classification fields; empty for success rows.
	Phase    string `json:"phase,omitempty"`
	Severity string `json:"severity,omitempty"`
	Message  string `json:"message,omitempty"`

	// Attribution; nil when unknown.
	UserID    *int64 `json:"user_id,omitempty"`
	APIKeyID  *int64 `json:"api_key_id,omitempty"`
	AccountID *int64 `json:"account_id,omitempty"`
	GroupID   *int64 `json:"group_id,omitempty"`

	Stream bool `json:"stream"`
}
|
||||||
|
|
||||||
|
type OpsRequestDetailFilter struct {
|
||||||
|
StartTime *time.Time
|
||||||
|
EndTime *time.Time
|
||||||
|
|
||||||
|
// kind: success|error|all
|
||||||
|
Kind string
|
||||||
|
|
||||||
|
Platform string
|
||||||
|
GroupID *int64
|
||||||
|
|
||||||
|
UserID *int64
|
||||||
|
APIKeyID *int64
|
||||||
|
AccountID *int64
|
||||||
|
|
||||||
|
Model string
|
||||||
|
RequestID string
|
||||||
|
Query string
|
||||||
|
|
||||||
|
MinDurationMs *int
|
||||||
|
MaxDurationMs *int
|
||||||
|
|
||||||
|
// Sort: created_at_desc (default) or duration_desc.
|
||||||
|
Sort string
|
||||||
|
|
||||||
|
Page int
|
||||||
|
PageSize int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
|
||||||
|
page = 1
|
||||||
|
pageSize = 50
|
||||||
|
endTime = time.Now()
|
||||||
|
startTime = endTime.Add(-1 * time.Hour)
|
||||||
|
|
||||||
|
if f == nil {
|
||||||
|
return page, pageSize, startTime, endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
if f.Page > 0 {
|
||||||
|
page = f.Page
|
||||||
|
}
|
||||||
|
if f.PageSize > 0 {
|
||||||
|
pageSize = f.PageSize
|
||||||
|
}
|
||||||
|
if pageSize > 100 {
|
||||||
|
pageSize = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
if f.EndTime != nil {
|
||||||
|
endTime = *f.EndTime
|
||||||
|
}
|
||||||
|
if f.StartTime != nil {
|
||||||
|
startTime = *f.StartTime
|
||||||
|
} else if f.EndTime != nil {
|
||||||
|
startTime = endTime.Add(-1 * time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
|
if startTime.After(endTime) {
|
||||||
|
startTime, endTime = endTime, startTime
|
||||||
|
}
|
||||||
|
|
||||||
|
return page, pageSize, startTime, endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsRequestDetailList is one page of request details plus pagination metadata.
type OpsRequestDetailList struct {
	Items    []*OpsRequestDetail `json:"items"`
	Total    int64               `json:"total"`
	Page     int                 `json:"page"`
	PageSize int                 `json:"page_size"`
}
|
||||||
|
|
||||||
|
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return &OpsRequestDetailList{
|
||||||
|
Items: []*OpsRequestDetail{},
|
||||||
|
Total: 0,
|
||||||
|
Page: 1,
|
||||||
|
PageSize: 50,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize, startTime, endTime := filter.Normalize()
|
||||||
|
filterCopy := &OpsRequestDetailFilter{}
|
||||||
|
if filter != nil {
|
||||||
|
*filterCopy = *filter
|
||||||
|
}
|
||||||
|
filterCopy.Page = page
|
||||||
|
filterCopy.PageSize = pageSize
|
||||||
|
filterCopy.StartTime = &startTime
|
||||||
|
filterCopy.EndTime = &endTime
|
||||||
|
|
||||||
|
items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if items == nil {
|
||||||
|
items = []*OpsRequestDetail{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &OpsRequestDetailList{
|
||||||
|
Items: items,
|
||||||
|
Total: total,
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
632
backend/internal/service/ops_retry.go
Normal file
632
backend/internal/service/ops_retry.go
Normal file
@@ -0,0 +1,632 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Retry modes accepted by RetryError.
const (
	OpsRetryModeClient   = "client"
	OpsRetryModeUpstream = "upstream"
)

// Lifecycle states stored on a retry attempt.
const (
	opsRetryStatusRunning   = "running"
	opsRetryStatusSucceeded = "succeeded"
	opsRetryStatusFailed    = "failed"
)

// Operational limits for a single retry execution.
const (
	opsRetryTimeout            = 60 * time.Second
	opsRetryCaptureBytesLimit  = 64 * 1024
	opsRetryResponsePreviewMax = 8 * 1024
	opsRetryMinIntervalPerError = 10 * time.Second
	opsRetryMaxAccountSwitches  = 3
)

// opsRetryRequestHeaderAllowlist lists the only request headers carried over
// when rebuilding a retried request (presumably to avoid replaying
// credentials or hop-by-hop headers — confirm against usage in RetryError).
var opsRetryRequestHeaderAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}

// opsRetryRequestType identifies which upstream API shape the original
// request used.
type opsRetryRequestType string

const (
	opsRetryTypeMessages  opsRetryRequestType = "messages"
	opsRetryTypeOpenAI    opsRetryRequestType = "openai_responses"
	opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
|
||||||
|
|
||||||
|
// limitedResponseWriter is an in-memory http.ResponseWriter that captures at
// most `limit` bytes of the response body while pretending to accept all of
// it, so gateway code that writes the response never sees a short write.
type limitedResponseWriter struct {
	header      http.Header
	wroteHeader bool

	limit        int   // max bytes kept in buf
	totalWritten int64 // total bytes offered by Write, including dropped ones
	buf          bytes.Buffer
}

// newLimitedResponseWriter returns a writer that buffers up to limit bytes.
// A non-positive limit is clamped to 1 so the buffer logic stays valid.
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
	if limit <= 0 {
		limit = 1
	}
	return &limitedResponseWriter{
		header: make(http.Header),
		limit:  limit,
	}
}

func (w *limitedResponseWriter) Header() http.Header {
	return w.header
}

// WriteHeader records that the header was written. NOTE(review): the status
// code itself is not stored here; callers read the status from gin's wrapping
// writer instead — confirm if this type is ever used outside gin.
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
	if w.wroteHeader {
		return
	}
	w.wroteHeader = true
}

// Write buffers up to the configured limit and silently discards the rest.
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.WriteHeader(http.StatusOK)
	}
	w.totalWritten += int64(len(p))

	if w.buf.Len() < w.limit {
		remaining := w.limit - w.buf.Len()
		if len(p) > remaining {
			_, _ = w.buf.Write(p[:remaining])
		} else {
			_, _ = w.buf.Write(p)
		}
	}

	// Pretend we wrote everything to avoid upstream/client code treating it as an error.
	return len(p), nil
}

// Flush is a no-op; it exists so streaming handlers that assert http.Flusher
// keep working.
func (w *limitedResponseWriter) Flush() {}

// bodyBytes returns the captured (possibly truncated) response body.
func (w *limitedResponseWriter) bodyBytes() []byte {
	return w.buf.Bytes()
}

// truncated reports whether more bytes were offered than were kept.
func (w *limitedResponseWriter) truncated() bool {
	return w.totalWritten > int64(w.limit)
}
|
||||||
|
|
||||||
|
// RetryError replays the original request behind an ops error log entry.
//
// mode is OpsRetryModeClient (re-select an account like normal traffic) or
// OpsRetryModeUpstream (replay on a pinned account). For upstream mode the
// pinned account defaults to the account recorded on the error log when the
// caller does not supply one. The attempt is persisted before execution and
// updated best-effort afterwards; a failure to persist the final state does
// not fail the API response.
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}

	mode = strings.ToLower(strings.TrimSpace(mode))
	switch mode {
	case OpsRetryModeClient, OpsRetryModeUpstream:
	default:
		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
	}

	// Guard against concurrent or too-frequent retries of the same error.
	latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
	}
	if latest != nil {
		if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}

		// Prefer the most recent known timestamp for throttling purposes.
		lastAttemptAt := latest.CreatedAt
		if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
			lastAttemptAt = *latest.FinishedAt
		} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
			lastAttemptAt = *latest.StartedAt
		}

		if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
			return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
		}
	}

	errorLog, err := s.GetErrorLogByID(ctx, errorID)
	if err != nil {
		return nil, err
	}
	if strings.TrimSpace(errorLog.RequestBody) == "" {
		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
	}

	// Resolve the pinned account for upstream mode (explicit > recorded).
	var pinned *int64
	if mode == OpsRetryModeUpstream {
		if pinnedAccountID != nil && *pinnedAccountID > 0 {
			pinned = pinnedAccountID
		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
			pinned = errorLog.AccountID
		} else {
			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
		}
	}

	// Persist the attempt first so concurrent callers collide on the insert.
	startedAt := time.Now()
	attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
		RequestedByUserID: requestedByUserID,
		SourceErrorID:     errorID,
		Mode:              mode,
		PinnedAccountID:   pinned,
		Status:            opsRetryStatusRunning,
		StartedAt:         startedAt,
	})
	if err != nil {
		// 23505 = Postgres unique_violation: another attempt won the race.
		var pqErr *pq.Error
		if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}
		return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
	}

	result := &OpsRetryResult{
		AttemptID:         attemptID,
		Mode:              mode,
		Status:            opsRetryStatusFailed,
		PinnedAccountID:   pinned,
		HTTPStatusCode:    0,
		UpstreamRequestID: "",
		ResponsePreview:   "",
		ResponseTruncated: false,
		ErrorMessage:      "",
		StartedAt:         startedAt,
	}

	execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
	defer cancel()

	execRes := s.executeRetry(execCtx, errorLog, mode, pinned)

	finishedAt := time.Now()
	result.FinishedAt = finishedAt
	result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()

	if execRes != nil {
		result.Status = execRes.status
		result.UsedAccountID = execRes.usedAccountID
		result.HTTPStatusCode = execRes.httpStatusCode
		result.UpstreamRequestID = execRes.upstreamRequestID
		result.ResponsePreview = execRes.responsePreview
		result.ResponseTruncated = execRes.responseTruncated
		result.ErrorMessage = execRes.errorMessage
	}

	// Use a fresh short-lived context so the final update survives a
	// cancelled/expired request context.
	updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer updateCancel()

	var updateErrMsg *string
	if strings.TrimSpace(result.ErrorMessage) != "" {
		msg := result.ErrorMessage
		updateErrMsg = &msg
	}
	var resultRequestID *string
	if strings.TrimSpace(result.UpstreamRequestID) != "" {
		v := result.UpstreamRequestID
		resultRequestID = &v
	}

	finalStatus := result.Status
	if strings.TrimSpace(finalStatus) == "" {
		finalStatus = opsRetryStatusFailed
	}

	if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
		ID:              attemptID,
		Status:          finalStatus,
		FinishedAt:      finishedAt,
		DurationMs:      result.DurationMs,
		ResultRequestID: resultRequestID,
		ErrorMessage:    updateErrMsg,
	}); err != nil {
		// Best-effort: retry itself already executed; do not fail the API response.
		log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
	}

	return result, nil
}
|
||||||
|
|
||||||
|
// opsRetryExecution is the internal outcome of a single replay against one
// account, later copied into the public OpsRetryResult.
type opsRetryExecution struct {
	status string // one of the opsRetryStatus* constants

	usedAccountID     *int64
	httpStatusCode    int
	upstreamRequestID string

	responsePreview   string
	responseTruncated bool

	errorMessage string
}
|
||||||
|
|
||||||
|
// executeRetry prepares the replay body for the detected request family and
// dispatches to pinned (upstream) or account-reselecting (client) execution.
// It never returns nil.
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
	if errorLog == nil {
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "missing error log",
		}
	}

	reqType := detectOpsRetryType(errorLog.RequestPath)
	bodyBytes := []byte(errorLog.RequestBody)

	// Messages-family bodies get thinking blocks stripped before replay;
	// other families are replayed verbatim.
	switch reqType {
	case opsRetryTypeMessages:
		bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
	case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
		// No-op
	}

	switch strings.ToLower(strings.TrimSpace(mode)) {
	case OpsRetryModeUpstream:
		if pinnedAccountID == nil || *pinnedAccountID <= 0 {
			return &opsRetryExecution{
				status:       opsRetryStatusFailed,
				errorMessage: "pinned_account_id required for upstream retry",
			}
		}
		return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
	case OpsRetryModeClient:
		return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
	default:
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "invalid retry mode",
		}
	}
}
|
||||||
|
|
||||||
|
func detectOpsRetryType(path string) opsRetryRequestType {
|
||||||
|
p := strings.ToLower(strings.TrimSpace(path))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(p, "/responses"):
|
||||||
|
return opsRetryTypeOpenAI
|
||||||
|
case strings.Contains(p, "/v1beta/"):
|
||||||
|
return opsRetryTypeGeminiV1B
|
||||||
|
default:
|
||||||
|
return opsRetryTypeMessages
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// executePinnedRetry replays the request on one explicitly chosen account.
// The account must exist, be schedulable, and belong to the original
// request's group; a concurrency slot is acquired (and released) when a
// concurrency service is configured.
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
	if s.accountRepo == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
	}

	account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
	if err != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
	}
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
	}
	if !account.IsSchedulable() {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
	}
	// Group check: never replay through an account the original group could
	// not have used.
	if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
		if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
		}
	}

	var release func()
	if s.concurrencyService != nil {
		acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
		if err != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
		}
		if acq == nil || !acq.Acquired {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
		}
		release = acq.ReleaseFunc
	}
	if release != nil {
		defer release()
	}

	usedID := account.ID
	exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
	exec.usedAccountID = &usedID
	if exec.status == "" {
		exec.status = opsRetryStatusFailed
	}
	return exec
}
|
||||||
|
|
||||||
|
// executeClientRetry replays the request the way normal client traffic is
// routed: it re-selects an account from the original group and fails over to
// another account (up to opsRetryMaxAccountSwitches) when the gateway signals
// a failover-worthy upstream error.
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
	groupID := errorLog.GroupID
	if groupID == nil || *groupID <= 0 {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
	}

	model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
	if parsedErr != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
	}
	// stream is parsed for validation only; the replay path does not use it here.
	_ = stream

	excluded := make(map[int64]struct{})
	switches := 0

	for {
		if switches >= opsRetryMaxAccountSwitches {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
		}

		selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
		if selErr != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
		}
		if selection == nil || selection.Account == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
		}

		account := selection.Account
		// Could not get a concurrency slot on this account: exclude and retry.
		if !selection.Acquired || selection.ReleaseFunc == nil {
			excluded[account.ID] = struct{}{}
			switches++
			continue
		}

		// Closure scopes the slot release to this single attempt (defer in a
		// loop would otherwise pile up until function return).
		exec := func() *opsRetryExecution {
			defer selection.ReleaseFunc()
			return s.executeWithAccount(ctx, reqType, errorLog, body, account)
		}()

		if exec != nil {
			if exec.status == opsRetryStatusSucceeded {
				usedID := account.ID
				exec.usedAccountID = &usedID
				return exec
			}
			// If the gateway services ask for failover, try another account.
			if s.isFailoverError(exec.errorMessage) {
				excluded[account.ID] = struct{}{}
				switches++
				continue
			}
			usedID := account.ID
			exec.usedAccountID = &usedID
			return exec
		}

		// nil execution result: treat as a soft failure and move on.
		excluded[account.ID] = struct{}{}
		switches++
	}
}
|
||||||
|
|
||||||
|
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
|
||||||
|
switch reqType {
|
||||||
|
case opsRetryTypeOpenAI:
|
||||||
|
if s.openAIGatewayService == nil {
|
||||||
|
return nil, fmt.Errorf("openai gateway service not available")
|
||||||
|
}
|
||||||
|
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||||
|
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
|
||||||
|
if s.gatewayService == nil {
|
||||||
|
return nil, fmt.Errorf("gateway service not available")
|
||||||
|
}
|
||||||
|
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractRetryModelAndStream pulls the model name and streaming flag needed
// for account selection out of the stored request. Messages and OpenAI
// bodies are parsed; Gemini v1beta requests carry the model in the URL, so
// the values recorded on the error log are used instead.
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
	switch reqType {
	case opsRetryTypeMessages:
		parsed, parseErr := ParseGatewayRequest(body)
		if parseErr != nil {
			return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
		}
		return parsed.Model, parsed.Stream, nil
	case opsRetryTypeOpenAI:
		var v struct {
			Model  string `json:"model"`
			Stream bool   `json:"stream"`
		}
		if err := json.Unmarshal(body, &v); err != nil {
			return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
		}
		return strings.TrimSpace(v.Model), v.Stream, nil
	case opsRetryTypeGeminiV1B:
		if strings.TrimSpace(errorLog.Model) == "" {
			return "", false, fmt.Errorf("missing model for gemini v1beta retry")
		}
		return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
	default:
		return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
	}
}
|
||||||
|
|
||||||
|
// executeWithAccount performs one replay against a concrete account through
// the platform-appropriate gateway, capturing the response into an in-memory
// writer. Success requires both a nil forward error and an HTTP status < 400.
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
	}

	// Synthetic gin context backed by the capped capture writer.
	c, w := newOpsRetryContext(ctx, errorLog)

	var err error
	switch reqType {
	case opsRetryTypeOpenAI:
		if s.openAIGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
		}
		_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
	case opsRetryTypeGeminiV1B:
		// NOTE(review): this requires BOTH gemini services to be non-nil even
		// though only one is used per account platform — confirm whether a
		// gemini-only deployment should be allowed to retry here.
		if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
		}
		modelName := strings.TrimSpace(errorLog.Model)
		action := "generateContent"
		if errorLog.Stream {
			action = "streamGenerateContent"
		}
		if account.Platform == PlatformAntigravity {
			_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
		} else {
			_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
		}
	case opsRetryTypeMessages:
		// Messages traffic dispatches by account platform.
		switch account.Platform {
		case PlatformAntigravity:
			if s.antigravityGatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
			}
			_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
		case PlatformGemini:
			if s.geminiCompatService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
			}
			_, err = s.geminiCompatService.Forward(ctx, c, account, body)
		default:
			if s.gatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
			}
			parsedReq, parseErr := ParseGatewayRequest(body)
			if parseErr != nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
			}
			_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
		}
	default:
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
	}

	// Status comes from gin's writer wrapper, not our capture writer.
	statusCode := http.StatusOK
	if c != nil && c.Writer != nil {
		statusCode = c.Writer.Status()
	}

	upstreamReqID := extractUpstreamRequestID(c)
	preview, truncated := extractResponsePreview(w)

	exec := &opsRetryExecution{
		status:            opsRetryStatusFailed,
		httpStatusCode:    statusCode,
		upstreamRequestID: upstreamReqID,
		responsePreview:   preview,
		responseTruncated: truncated,
		errorMessage:      "",
	}

	if err == nil && statusCode < 400 {
		exec.status = opsRetryStatusSucceeded
		return exec
	}

	if err != nil {
		exec.errorMessage = err.Error()
	} else {
		exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
	}

	return exec
}
|
||||||
|
|
||||||
|
// newOpsRetryContext builds a synthetic gin context for a replay: a POST to
// the original path with a JSON content type, the original user-agent, and a
// small allowlisted subset of the original headers. The response is captured
// by the returned limitedResponseWriter.
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
	w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
	c, _ := gin.CreateTestContext(w)

	path := "/"
	if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
		path = errorLog.RequestPath
	}

	// Body is nil here; gateways receive the replay body as an argument.
	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
	req.Header.Set("content-type", "application/json")
	if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
		req.Header.Set("user-agent", errorLog.UserAgent)
	}
	// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
	// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
	if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
		var stored map[string]string
		if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
			for k, v := range stored {
				key := strings.TrimSpace(k)
				if key == "" {
					continue
				}
				if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
					continue
				}
				val := strings.TrimSpace(v)
				if val == "" {
					continue
				}
				req.Header.Set(key, val)
			}
		}
	}

	c.Request = req
	return c, w
}
|
||||||
|
|
||||||
|
func extractUpstreamRequestID(c *gin.Context) string {
|
||||||
|
if c == nil || c.Writer == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
h := c.Writer.Header()
|
||||||
|
if h == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
|
||||||
|
if v := strings.TrimSpace(h.Get(key)); v != "" {
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
|
||||||
|
if w == nil {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
b := bytes.TrimSpace(w.bodyBytes())
|
||||||
|
if len(b) == 0 {
|
||||||
|
return "", w.truncated()
|
||||||
|
}
|
||||||
|
if len(b) > opsRetryResponsePreviewMax {
|
||||||
|
return string(b[:opsRetryResponsePreviewMax]), true
|
||||||
|
}
|
||||||
|
return string(b), w.truncated()
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsInt64 reports whether needle occurs in items.
func containsInt64(items []int64, needle int64) bool {
	found := false
	for i := 0; i < len(items) && !found; i++ {
		found = items[i] == needle
	}
	return found
}
|
||||||
|
|
||||||
|
func (s *OpsService) isFailoverError(message string) bool {
|
||||||
|
msg := strings.ToLower(strings.TrimSpace(message))
|
||||||
|
if msg == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
|
||||||
|
}
|
||||||
705
backend/internal/service/ops_scheduled_report_service.go
Normal file
705
backend/internal/service/ops_scheduled_report_service.go
Normal file
@@ -0,0 +1,705 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/robfig/cron/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsScheduledReportJobName identifies this background job (heartbeats etc.).
	opsScheduledReportJobName = "ops_scheduled_reports"

	// Leader-election lock key/TTL used in multi-instance deployments.
	opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader"
	opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute

	// Per-report-type key prefix recording when a report last ran.
	opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:"

	// The scheduler wakes up once a minute to evaluate cron schedules.
	opsScheduledReportTickInterval = 1 * time.Minute
)

// opsScheduledReportCronParser accepts standard 5-field cron specs
// (minute hour dom month dow).
var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsScheduledReportReleaseScript atomically releases the leader lock only if
// this instance still owns it (value matches our instance ID).
var opsScheduledReportReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
	return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// OpsScheduledReportService periodically generates and emails operational
// reports (daily/weekly summaries, error digests, account health) based on
// cron schedules stored in the email notification config. In multi-instance
// deployments a Redis leader lock ensures only one instance sends.
type OpsScheduledReportService struct {
	opsService   *OpsService
	userService  *UserService
	emailService *EmailService
	redisClient  *redis.Client
	cfg          *config.Config

	instanceID string         // unique per process; used as the leader-lock value
	loc        *time.Location // timezone used to evaluate cron schedules

	distributedLockOn bool      // false in "simple" run mode (single instance)
	warnNoRedisOnce   sync.Once // log the missing-Redis warning only once

	// Lifecycle management for the background goroutine.
	startOnce sync.Once
	stopOnce  sync.Once
	stopCtx   context.Context
	stop      context.CancelFunc
	wg        sync.WaitGroup
}
|
||||||
|
|
||||||
|
// NewOpsScheduledReportService wires up the scheduled-report service.
// Distributed locking is enabled unless the config selects simple run mode,
// and cron schedules are evaluated in the configured timezone (falling back
// to the process-local zone).
func NewOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple

	loc := time.Local
	if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" {
		if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil {
			loc = parsed
		}
	}
	// NOTE(review): zero-value fields are listed explicitly — presumably to
	// satisfy an exhaustive-struct linter; confirm before removing them.
	return &OpsScheduledReportService{
		opsService:   opsService,
		userService:  userService,
		emailService: emailService,
		redisClient:  redisClient,
		cfg:          cfg,

		instanceID:        uuid.NewString(),
		loc:               loc,
		distributedLockOn: lockOn,
		warnNoRedisOnce:   sync.Once{},
		startOnce:         sync.Once{},
		stopOnce:          sync.Once{},
		stopCtx:           nil,
		stop:              nil,
		wg:                sync.WaitGroup{},
	}
}
|
||||||
|
|
||||||
|
// Start launches the scheduler loop with a background context.
func (s *OpsScheduledReportService) Start() {
	s.StartWithContext(context.Background())
}

// StartWithContext launches the scheduler goroutine once. It is a no-op when
// ops monitoring is disabled in config or required dependencies are missing.
// Subsequent calls are ignored (startOnce).
func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) {
	if s == nil {
		return
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.opsService == nil || s.emailService == nil {
		return
	}

	s.startOnce.Do(func() {
		s.stopCtx, s.stop = context.WithCancel(ctx)
		s.wg.Add(1)
		go s.run()
	})
}
|
||||||
|
|
||||||
|
// Stop cancels the scheduler goroutine and blocks until it exits. Safe to
// call multiple times and safe to call when Start was never invoked (the
// zero WaitGroup returns immediately).
func (s *OpsScheduledReportService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.stop != nil {
			s.stop()
		}
	})
	s.wg.Wait()
}
|
||||||
|
|
||||||
|
// run is the scheduler loop: one immediate evaluation at startup, then one
// per tick until the stop context is cancelled.
func (s *OpsScheduledReportService) run() {
	defer s.wg.Done()

	ticker := time.NewTicker(opsScheduledReportTickInterval)
	defer ticker.Stop()

	s.runOnce()
	for {
		select {
		case <-ticker.C:
			s.runOnce()
		case <-s.stopCtx.Done():
			return
		}
	}
}
|
||||||
|
|
||||||
|
// runOnce performs a single scheduler pass: checks the monitoring switch,
// takes the leader lock, and runs every enabled report whose next run time
// has arrived. Heartbeats record the pass outcome.
//
// NOTE(review): a failing report aborts the pass and skips the remaining
// reports until the next tick — confirm that is the intended behavior.
func (s *OpsScheduledReportService) runOnce() {
	if s == nil || s.opsService == nil || s.emailService == nil {
		return
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	// Bound the whole pass; derived from stopCtx so shutdown cancels it.
	ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second)
	defer cancel()

	// Respect ops monitoring enabled switch.
	if !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}

	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}

	// Evaluate cron schedules in the configured timezone.
	now := time.Now()
	if s.loc != nil {
		now = now.In(s.loc)
	}

	reports := s.listScheduledReports(ctx, now)
	if len(reports) == 0 {
		return
	}

	for _, report := range reports {
		if report == nil || !report.Enabled {
			continue
		}
		if report.NextRunAt.After(now) {
			continue
		}

		if err := s.runReport(ctx, report, now); err != nil {
			s.recordHeartbeatError(runAt, time.Since(startedAt), err)
			return
		}
	}

	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
|
||||||
|
|
||||||
|
// opsScheduledReport is one resolved report definition for a scheduler pass:
// its cron schedule, lookback window, recipients, thresholds, and computed
// last/next run times.
type opsScheduledReport struct {
	Name       string // display name used in the email subject/body
	ReportType string // stable kind key, e.g. "daily_summary"
	Schedule   string // 5-field cron spec
	Enabled    bool

	// TimeRange is the lookback window the report covers.
	TimeRange time.Duration

	Recipients []string

	// Thresholds applied by specific report kinds.
	ErrorDigestMinCount             int
	AccountHealthErrorRateThreshold float64

	LastRunAt *time.Time // nil when the report has never run
	NextRunAt time.Time  // next scheduled run computed from the cron spec
}
|
||||||
|
|
||||||
|
// listScheduledReports resolves the email notification config into the set
// of enabled report definitions with valid cron schedules, computing each
// report's next run time from its last recorded run (or from one minute ago
// when it never ran, so a schedule matching the current minute fires right
// after startup). Invalid cron specs are logged and skipped.
func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport {
	if s == nil || s.opsService == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil {
		return nil
	}
	if !emailCfg.Report.Enabled {
		return nil
	}

	recipients := normalizeEmails(emailCfg.Report.Recipients)

	// Local helper type describing one configurable report kind.
	type reportDef struct {
		enabled   bool
		name      string
		kind      string
		timeRange time.Duration
		schedule  string
	}

	defs := []reportDef{
		{enabled: emailCfg.Report.DailySummaryEnabled, name: "日报", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule},
		{enabled: emailCfg.Report.WeeklySummaryEnabled, name: "周报", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule},
		{enabled: emailCfg.Report.ErrorDigestEnabled, name: "错误摘要", kind: "error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule},
		{enabled: emailCfg.Report.AccountHealthEnabled, name: "账号健康", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule},
	}

	out := make([]*opsScheduledReport, 0, len(defs))
	for _, d := range defs {
		if !d.enabled {
			continue
		}
		spec := strings.TrimSpace(d.schedule)
		if spec == "" {
			continue
		}
		sched, err := opsScheduledReportCronParser.Parse(spec)
		if err != nil {
			log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err)
			continue
		}

		lastRun := s.getLastRunAt(ctx, d.kind)
		base := lastRun
		if base.IsZero() {
			// Allow a schedule matching the current minute to trigger right after startup.
			base = now.Add(-1 * time.Minute)
		}
		next := sched.Next(base)
		if next.IsZero() {
			continue
		}

		// Copy before taking the address so the pointer does not alias the
		// loop-local value on the next iteration.
		var lastRunPtr *time.Time
		if !lastRun.IsZero() {
			lastCopy := lastRun
			lastRunPtr = &lastCopy
		}

		out = append(out, &opsScheduledReport{
			Name:       d.name,
			ReportType: d.kind,
			Schedule:   spec,
			Enabled:    true,

			TimeRange: d.timeRange,

			Recipients: recipients,

			ErrorDigestMinCount:             emailCfg.Report.ErrorDigestMinCount,
			AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold,

			LastRunAt: lastRunPtr,
			NextRunAt: next,
		})
	}

	return out
}
|
||||||
|
|
||||||
|
// runReport executes a single scheduled report: it renders the HTML body and
// sends it to the configured recipients (falling back to the first admin's
// email when none are configured). Per-recipient send failures are ignored
// so one bad address cannot block the rest.
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
	if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute.
	s.setLastRunAt(ctx, report.ReportType, now)

	content, err := s.generateReportHTML(ctx, report, now)
	if err != nil {
		return err
	}
	if strings.TrimSpace(content) == "" {
		// Skip sending when the report decides not to emit content (e.g., digest below min count).
		return nil
	}

	// Recipient resolution: explicit config wins; otherwise best-effort
	// fall back to the first admin account's email.
	recipients := report.Recipients
	if len(recipients) == 0 && s.userService != nil {
		admin, err := s.userService.GetFirstAdmin(ctx)
		if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" {
			recipients = []string{strings.TrimSpace(admin.Email)}
		}
	}
	if len(recipients) == 0 {
		return nil
	}

	subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))

	for _, to := range recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
	}
	return nil
}
|
||||||
|
|
||||||
|
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
|
||||||
|
if s == nil || s.opsService == nil || report == nil {
|
||||||
|
return "", fmt.Errorf("service not initialized")
|
||||||
|
}
|
||||||
|
if report.TimeRange <= 0 {
|
||||||
|
return "", fmt.Errorf("invalid time range")
|
||||||
|
}
|
||||||
|
|
||||||
|
end := now.UTC()
|
||||||
|
start := end.Add(-report.TimeRange)
|
||||||
|
|
||||||
|
switch strings.TrimSpace(report.ReportType) {
|
||||||
|
case "daily_summary", "weekly_summary":
|
||||||
|
overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
Platform: "",
|
||||||
|
GroupID: nil,
|
||||||
|
QueryMode: OpsQueryModeAuto,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
// If pre-aggregation isn't ready but the report is requested, fall back to raw.
|
||||||
|
if strings.TrimSpace(report.ReportType) == "daily_summary" || strings.TrimSpace(report.ReportType) == "weekly_summary" {
|
||||||
|
overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
Platform: "",
|
||||||
|
GroupID: nil,
|
||||||
|
QueryMode: OpsQueryModeRaw,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil
|
||||||
|
case "error_digest":
|
||||||
|
// Lightweight digest: list recent errors (status>=400) and breakdown by type.
|
||||||
|
startTime := start
|
||||||
|
endTime := end
|
||||||
|
filter := &OpsErrorLogFilter{
|
||||||
|
StartTime: &startTime,
|
||||||
|
EndTime: &endTime,
|
||||||
|
Page: 1,
|
||||||
|
PageSize: 100,
|
||||||
|
}
|
||||||
|
out, err := s.opsService.GetErrorLogs(ctx, filter)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil
|
||||||
|
case "account_health":
|
||||||
|
// Best-effort: use account availability (not error rate yet).
|
||||||
|
avail, err := s.opsService.GetAccountAvailability(ctx, "", nil)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
_ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report
|
||||||
|
return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("unknown report type: %s", report.ReportType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string {
|
||||||
|
if overview == nil {
|
||||||
|
return fmt.Sprintf("<h2>%s</h2><p>No data.</p>", htmlEscape(title))
|
||||||
|
}
|
||||||
|
|
||||||
|
latP50 := "-"
|
||||||
|
latP99 := "-"
|
||||||
|
if overview.Duration.P50 != nil {
|
||||||
|
latP50 = fmt.Sprintf("%dms", *overview.Duration.P50)
|
||||||
|
}
|
||||||
|
if overview.Duration.P99 != nil {
|
||||||
|
latP99 = fmt.Sprintf("%dms", *overview.Duration.P99)
|
||||||
|
}
|
||||||
|
|
||||||
|
ttftP50 := "-"
|
||||||
|
ttftP99 := "-"
|
||||||
|
if overview.TTFT.P50 != nil {
|
||||||
|
ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50)
|
||||||
|
}
|
||||||
|
if overview.TTFT.P99 != nil {
|
||||||
|
ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99)
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<ul>
|
||||||
|
<li><b>Total Requests</b>: %d</li>
|
||||||
|
<li><b>Success</b>: %d</li>
|
||||||
|
<li><b>Errors (SLA)</b>: %d</li>
|
||||||
|
<li><b>Business Limited</b>: %d</li>
|
||||||
|
<li><b>SLA</b>: %.2f%%</li>
|
||||||
|
<li><b>Error Rate</b>: %.2f%%</li>
|
||||||
|
<li><b>Upstream Error Rate (excl 429/529)</b>: %.2f%%</li>
|
||||||
|
<li><b>Upstream Errors</b>: excl429/529=%d, 429=%d, 529=%d</li>
|
||||||
|
<li><b>Latency</b>: p50=%s, p99=%s</li>
|
||||||
|
<li><b>TTFT</b>: p50=%s, p99=%s</li>
|
||||||
|
<li><b>Tokens</b>: %d</li>
|
||||||
|
<li><b>QPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
|
||||||
|
<li><b>TPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
|
||||||
|
</ul>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
overview.RequestCountTotal,
|
||||||
|
overview.SuccessCount,
|
||||||
|
overview.ErrorCountSLA,
|
||||||
|
overview.BusinessLimitedCount,
|
||||||
|
overview.SLA*100,
|
||||||
|
overview.ErrorRate*100,
|
||||||
|
overview.UpstreamErrorRate*100,
|
||||||
|
overview.UpstreamErrorCountExcl429529,
|
||||||
|
overview.Upstream429Count,
|
||||||
|
overview.Upstream529Count,
|
||||||
|
htmlEscape(latP50),
|
||||||
|
htmlEscape(latP99),
|
||||||
|
htmlEscape(ttftP50),
|
||||||
|
htmlEscape(ttftP99),
|
||||||
|
overview.TokenConsumed,
|
||||||
|
overview.QPS.Current,
|
||||||
|
overview.QPS.Peak,
|
||||||
|
overview.QPS.Avg,
|
||||||
|
overview.TPS.Current,
|
||||||
|
overview.TPS.Peak,
|
||||||
|
overview.TPS.Avg,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string {
|
||||||
|
total := 0
|
||||||
|
recent := []*OpsErrorLog{}
|
||||||
|
if list != nil {
|
||||||
|
total = list.Total
|
||||||
|
recent = list.Errors
|
||||||
|
}
|
||||||
|
if len(recent) > 10 {
|
||||||
|
recent = recent[:10]
|
||||||
|
}
|
||||||
|
|
||||||
|
rows := ""
|
||||||
|
for _, item := range recent {
|
||||||
|
if item == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rows += fmt.Sprintf(
|
||||||
|
"<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>",
|
||||||
|
htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(item.Platform),
|
||||||
|
item.StatusCode,
|
||||||
|
htmlEscape(truncateString(item.Message, 180)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if rows == "" {
|
||||||
|
rows = "<tr><td colspan=\"4\">No recent errors.</td></tr>"
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<p><b>Total Errors</b>: %d</p>
|
||||||
|
<h3>Recent</h3>
|
||||||
|
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;">
|
||||||
|
<thead><tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr></thead>
|
||||||
|
<tbody>%s</tbody>
|
||||||
|
</table>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
total,
|
||||||
|
rows,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string {
|
||||||
|
total := 0
|
||||||
|
available := 0
|
||||||
|
rateLimited := 0
|
||||||
|
hasError := 0
|
||||||
|
|
||||||
|
if avail != nil && avail.Accounts != nil {
|
||||||
|
for _, a := range avail.Accounts {
|
||||||
|
if a == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total++
|
||||||
|
if a.IsAvailable {
|
||||||
|
available++
|
||||||
|
}
|
||||||
|
if a.IsRateLimited {
|
||||||
|
rateLimited++
|
||||||
|
}
|
||||||
|
if a.HasError {
|
||||||
|
hasError++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<ul>
|
||||||
|
<li><b>Total Accounts</b>: %d</li>
|
||||||
|
<li><b>Available</b>: %d</li>
|
||||||
|
<li><b>Rate Limited</b>: %d</li>
|
||||||
|
<li><b>Error</b>: %d</li>
|
||||||
|
</ul>
|
||||||
|
<p>Note: This report currently reflects account availability status only.</p>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
total,
|
||||||
|
available,
|
||||||
|
rateLimited,
|
||||||
|
hasError,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to become the single scheduler instance via a
// Redis SetNX lease. It returns (release, ok): release is non-nil only when a
// Redis lock was actually taken and must be called to free it; ok=false means
// another instance holds the lock (or Redis failed) and this cycle should be
// skipped. When distributed locking is off or Redis is absent, it returns
// (nil, true) so a single-node deployment keeps working.
func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil || !s.distributedLockOn {
		return nil, true
	}
	if s.redisClient == nil {
		// Warn only once per process to avoid log spam.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock")
		})
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Defensive defaults in case the package-level constants are misconfigured.
	key := opsScheduledReportLeaderLockKeyDefault
	ttl := opsScheduledReportLeaderLockTTLDefault
	if strings.TrimSpace(key) == "" {
		key = "ops:scheduled_reports:leader"
	}
	if ttl <= 0 {
		ttl = 5 * time.Minute
	}

	ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
	if err != nil {
		// Prefer fail-closed to avoid duplicate report sends when Redis is flaky.
		log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err)
		return nil, false
	}
	if !ok {
		return nil, false
	}
	// Release via Lua script so we only delete the key if we still own it
	// (instanceID match), preventing deletion of a successor's lock.
	return func() {
		_, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
	}, true
}
|
||||||
|
|
||||||
|
// getLastRunAt loads the persisted last-run timestamp (unix seconds in Redis)
// for the given report type. It returns the zero time when Redis is absent,
// the key is missing, or the stored value is unparsable.
func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time {
	if s == nil || s.redisClient == nil {
		return time.Time{}
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return time.Time{}
	}
	key := opsScheduledReportLastRunKeyPrefix + kind

	raw, err := s.redisClient.Get(ctx, key).Result()
	if err != nil || strings.TrimSpace(raw) == "" {
		return time.Time{}
	}
	sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
	if err != nil || sec <= 0 {
		return time.Time{}
	}
	last := time.Unix(sec, 0)
	// Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time
	// passed into cron.Next() uses the same location; otherwise the job will drift by timezone
	// offset (e.g. Asia/Shanghai default would run 8h later after the first execution).
	if s.loc != nil {
		return last.In(s.loc)
	}
	return last.UTC()
}
|
||||||
|
|
||||||
|
// setLastRunAt persists the last-run timestamp for a report type as unix
// seconds with a 14-day TTL. Best-effort: errors are ignored and a zero t is
// replaced with the current time. No-op without a Redis client.
func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) {
	if s == nil || s.redisClient == nil {
		return
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return
	}
	if t.IsZero() {
		t = time.Now().UTC()
	}
	key := opsScheduledReportLastRunKeyPrefix + kind
	// TTL comfortably exceeds the longest schedule gap (weekly) so the marker
	// survives between runs but eventually self-cleans.
	_ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err()
}
|
||||||
|
|
||||||
|
// recordHeartbeatSuccess best-effort upserts a success heartbeat for the
// scheduled-report job. Uses a fresh 2s-timeout context so heartbeat writes
// can never block or inherit cancellation from the caller.
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
	if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// Errors are deliberately ignored: heartbeats are observability only.
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &now,
		LastDurationMs: &durMs,
	})
}
|
||||||
|
|
||||||
|
// recordHeartbeatError best-effort upserts a failure heartbeat (with a
// truncated error message) for the scheduled-report job. No-op when err is
// nil or the ops repository is unavailable.
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	// Cap stored error text to keep the heartbeat row small.
	msg := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// Errors are deliberately ignored: heartbeats are observability only.
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &now,
		LastError:      &msg,
		LastDurationMs: &durMs,
	})
}
|
||||||
|
|
||||||
|
// normalizeEmails lowercases and trims each address, drops empties, and
// removes duplicates while preserving first-seen order. Returns nil for an
// empty input.
func normalizeEmails(in []string) []string {
	if len(in) == 0 {
		return nil
	}
	dedup := make(map[string]struct{}, len(in))
	result := make([]string, 0, len(in))
	for _, candidate := range in {
		addr := strings.ToLower(strings.TrimSpace(candidate))
		if addr == "" {
			continue
		}
		if _, dup := dedup[addr]; !dup {
			dedup[addr] = struct{}{}
			result = append(result, addr)
		}
	}
	return result
}
|
||||||
537
backend/internal/service/ops_service.go
Normal file
537
backend/internal/service/ops_service.go
Normal file
@@ -0,0 +1,537 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrOpsDisabled is returned by ops query APIs when monitoring is switched
// off (via the config hard switch or the runtime setting).
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")

const (
	// Size caps for sanitized payloads persisted alongside error logs.
	opsMaxStoredRequestBodyBytes = 10 * 1024
	opsMaxStoredErrorBodyBytes   = 20 * 1024
)
|
||||||
|
|
||||||
|
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
	opsRepo     OpsRepository     // persistence for error logs, metrics, heartbeats
	settingRepo SettingRepository // runtime on/off switch lookup
	cfg         *config.Config    // hard enable/disable switch

	accountRepo AccountRepository

	// getAccountAvailability is a unit-test hook for overriding account availability lookup.
	getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)

	// Gateway/concurrency services consulted for live runtime state.
	concurrencyService        *ConcurrencyService
	gatewayService            *GatewayService
	openAIGatewayService      *OpenAIGatewayService
	geminiCompatService       *GeminiMessagesCompatService
	antigravityGatewayService *AntigravityGatewayService
}
|
||||||
|
|
||||||
|
// NewOpsService wires an OpsService from its repositories, config, and the
// gateway-side services it inspects. Nil dependencies are tolerated; the
// service degrades to best-effort no-ops where a dependency is missing.
func NewOpsService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	cfg *config.Config,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	gatewayService *GatewayService,
	openAIGatewayService *OpenAIGatewayService,
	geminiCompatService *GeminiMessagesCompatService,
	antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
	return &OpsService{
		opsRepo:     opsRepo,
		settingRepo: settingRepo,
		cfg:         cfg,

		accountRepo: accountRepo,

		concurrencyService:        concurrencyService,
		gatewayService:            gatewayService,
		openAIGatewayService:      openAIGatewayService,
		geminiCompatService:       geminiCompatService,
		antigravityGatewayService: antigravityGatewayService,
	}
}
|
||||||
|
|
||||||
|
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
|
||||||
|
if s.IsMonitoringEnabled(ctx) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return ErrOpsDisabled
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
// Hard switch: disable ops entirely.
|
||||||
|
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
// Default enabled when key is missing, and fail-open on transient errors
|
||||||
|
// (ops should never block gateway traffic).
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordError sanitizes and persists one gateway error-log entry. It is
// best-effort by design: it silently no-ops when monitoring is disabled or
// the repository is missing, and a failed insert is logged but still returned
// so callers can decide whether to care. rawRequestBody is only stored after
// redaction and size trimming; all upstream error text is sanitized too.
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
	if entry == nil {
		return nil
	}
	if !s.IsMonitoringEnabled(ctx) {
		return nil
	}
	if s.opsRepo == nil {
		return nil
	}

	// Ensure timestamps are always populated.
	if entry.CreatedAt.IsZero() {
		entry.CreatedAt = time.Now()
	}

	// Ensure required fields exist (DB has NOT NULL constraints).
	entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
	entry.ErrorType = strings.TrimSpace(entry.ErrorType)
	if entry.ErrorPhase == "" {
		entry.ErrorPhase = "internal"
	}
	if entry.ErrorType == "" {
		entry.ErrorType = "api_error"
	}

	// Sanitize + trim request body (errors only).
	if len(rawRequestBody) > 0 {
		sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
		if sanitized != "" {
			entry.RequestBodyJSON = &sanitized
		}
		entry.RequestBodyTruncated = truncated
		entry.RequestBodyBytes = &bytesLen
	}

	// Sanitize + truncate error_body to avoid storing sensitive data.
	if strings.TrimSpace(entry.ErrorBody) != "" {
		sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
		entry.ErrorBody = sanitized
	}

	// Sanitize upstream error context if provided by gateway services.
	// Non-positive status codes are treated as "unknown" and dropped.
	if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 {
		entry.UpstreamStatusCode = nil
	}
	if entry.UpstreamErrorMessage != nil {
		msg := strings.TrimSpace(*entry.UpstreamErrorMessage)
		msg = sanitizeUpstreamErrorMessage(msg)
		msg = truncateString(msg, 2048)
		if strings.TrimSpace(msg) == "" {
			entry.UpstreamErrorMessage = nil
		} else {
			entry.UpstreamErrorMessage = &msg
		}
	}
	if entry.UpstreamErrorDetail != nil {
		detail := strings.TrimSpace(*entry.UpstreamErrorDetail)
		if detail == "" {
			entry.UpstreamErrorDetail = nil
		} else {
			sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
			if strings.TrimSpace(sanitized) == "" {
				entry.UpstreamErrorDetail = nil
			} else {
				entry.UpstreamErrorDetail = &sanitized
			}
		}
	}

	// Sanitize + serialize upstream error events list.
	if len(entry.UpstreamErrors) > 0 {
		// Keep only the newest events to bound stored size.
		const maxEvents = 32
		events := entry.UpstreamErrors
		if len(events) > maxEvents {
			events = events[len(events)-maxEvents:]
		}

		sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events))
		for _, ev := range events {
			if ev == nil {
				continue
			}
			// Work on a copy so the caller's event is never mutated.
			out := *ev

			out.Platform = strings.TrimSpace(out.Platform)
			out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128)
			out.Kind = truncateString(strings.TrimSpace(out.Kind), 64)

			// Clamp negative numeric fields to zero ("unknown").
			if out.AccountID < 0 {
				out.AccountID = 0
			}
			if out.UpstreamStatusCode < 0 {
				out.UpstreamStatusCode = 0
			}
			if out.AtUnixMs < 0 {
				out.AtUnixMs = 0
			}

			msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message))
			msg = truncateString(msg, 2048)
			out.Message = msg

			detail := strings.TrimSpace(out.Detail)
			if detail != "" {
				// Keep upstream detail small; request bodies are not stored here, only upstream error payloads.
				sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
				out.Detail = sanitizedDetail
			} else {
				out.Detail = ""
			}

			// Drop fully-empty events (can happen if only status code was known).
			if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
				continue
			}

			evCopy := out
			sanitized = append(sanitized, &evCopy)
		}

		// Store the serialized form; the in-memory slice is not persisted.
		entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized)
		entry.UpstreamErrors = nil
	}

	if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Never bubble up to gateway; best-effort logging.
		log.Printf("[Ops] RecordError failed: %v", err)
		return err
	}
	return nil
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListErrorLogs(ctx, filter)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
|
||||||
|
}
|
||||||
|
return detail, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeAndTrimRequestBody redacts sensitive keys from a JSON request body
// and shrinks it to at most maxBytes using progressively lossier strategies:
// (1) full redacted body, (2) drop oldest conversation entries,
// (3) keep only essential fields plus the last message, (4) a minimal
// placeholder. Returns the stored JSON string, whether it was truncated, and
// the ORIGINAL byte length. Non-JSON input is not stored at all.
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
	bytesLen = len(raw)
	if len(raw) == 0 {
		return "", false, 0
	}

	var decoded any
	if err := json.Unmarshal(raw, &decoded); err != nil {
		// If it's not valid JSON, don't store (retry would not be reliable anyway).
		return "", false, bytesLen
	}

	decoded = redactSensitiveJSON(decoded)

	encoded, err := json.Marshal(decoded)
	if err != nil {
		return "", false, bytesLen
	}
	if len(encoded) <= maxBytes {
		// Fits without truncation after redaction.
		return string(encoded), false, bytesLen
	}

	// Trim conversation history to keep the most recent context.
	if root, ok := decoded.(map[string]any); ok {
		if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
			encoded2, err2 := json.Marshal(trimmed)
			if err2 == nil && len(encoded2) <= maxBytes {
				return string(encoded2), true, bytesLen
			}
			// Fallthrough: keep shrinking.
			decoded = trimmed
		}

		// Stage 3: essentials only (model/params + last conversation entry).
		essential := shrinkToEssentials(root)
		encoded3, err3 := json.Marshal(essential)
		if err3 == nil && len(encoded3) <= maxBytes {
			return string(encoded3), true, bytesLen
		}
	}

	// Last resort: store a minimal placeholder (still valid JSON).
	placeholder := map[string]any{
		"request_body_truncated": true,
	}
	if model := extractString(decoded, "model"); model != "" {
		placeholder["model"] = model
	}
	encoded4, err4 := json.Marshal(placeholder)
	if err4 != nil {
		return "", true, bytesLen
	}
	return string(encoded4), true, bytesLen
}
|
||||||
|
|
||||||
|
func redactSensitiveJSON(v any) any {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case map[string]any:
|
||||||
|
out := make(map[string]any, len(t))
|
||||||
|
for k, vv := range t {
|
||||||
|
if isSensitiveKey(k) {
|
||||||
|
out[k] = "[REDACTED]"
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[k] = redactSensitiveJSON(vv)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
case []any:
|
||||||
|
out := make([]any, 0, len(t))
|
||||||
|
for _, vv := range t {
|
||||||
|
out = append(out, redactSensitiveJSON(vv))
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
default:
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSensitiveKey reports whether a JSON key looks like it carries credentials
// or other secrets. Matching is case-insensitive and checks, in order: exact
// well-known field names, credential-like suffixes, then broad substrings
// (deliberately aggressive — privacy over recall).
func isSensitiveKey(key string) bool {
	k := strings.ToLower(strings.TrimSpace(key))
	if k == "" {
		return false
	}

	// Exact matches (common credential fields).
	switch k {
	case "authorization", "proxy-authorization", "x-api-key", "api_key", "apikey",
		"access_token", "refresh_token", "id_token", "session_token", "token",
		"password", "passwd", "passphrase", "secret", "client_secret",
		"private_key", "jwt", "signature", "accesskeyid", "secretaccesskey":
		return true
	}

	// Suffix matches.
	suffixes := []string{
		"_secret", "_token", "_id_token", "_session_token", "_password",
		"_passwd", "_passphrase", "_key", "secret_key", "private_key",
	}
	for _, suffix := range suffixes {
		if strings.HasSuffix(k, suffix) {
			return true
		}
	}

	// Substring matches (conservative, but errs on the side of privacy).
	fragments := []string{
		"secret", "token", "password", "passwd", "passphrase", "privatekey",
		"private_key", "apikey", "api_key", "accesskeyid", "secretaccesskey",
		"bearer", "cookie", "credential", "session", "jwt", "signature",
	}
	for _, fragment := range fragments {
		if strings.Contains(k, fragment) {
			return true
		}
	}

	return false
}
|
||||||
|
|
||||||
|
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
|
||||||
|
// Supported: anthropic/openai: messages; gemini: contents.
|
||||||
|
if out, ok := trimArrayField(root, "messages", maxBytes); ok {
|
||||||
|
return out, true
|
||||||
|
}
|
||||||
|
if out, ok := trimArrayField(root, "contents", maxBytes); ok {
|
||||||
|
return out, true
|
||||||
|
}
|
||||||
|
return root, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// trimArrayField drops elements from the FRONT of root[field] (oldest context
// first) until the whole encoded object fits within maxBytes, keeping at
// least the last element. A binary search over the drop point avoids
// re-marshaling O(n) times. Returns (nil, false) when the field is absent or
// not a non-empty array; otherwise returns a shallow copy of root with the
// trimmed array and true — even if only the single last element still doesn't
// fit (the caller then falls back to shrinkToEssentials).
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
	raw, ok := root[field]
	if !ok {
		return nil, false
	}
	arr, ok := raw.([]any)
	if !ok || len(arr) == 0 {
		return nil, false
	}

	// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
	// We are dropping from the *front* of the array (oldest context first).
	lo := 0
	hi := len(arr) - 1 // inclusive; hi ensures at least one item remains

	var best map[string]any
	found := false

	for lo <= hi {
		mid := (lo + hi) / 2
		candidateArr := arr[mid:]
		if len(candidateArr) == 0 {
			lo = mid + 1
			continue
		}

		// Re-encode a shallow copy with the candidate suffix to measure size.
		next := shallowCopyMap(root)
		next[field] = candidateArr
		encoded, err := json.Marshal(next)
		if err != nil {
			// If marshal fails, try dropping more.
			lo = mid + 1
			continue
		}

		if len(encoded) <= maxBytes {
			best = next
			found = true
			// Try to keep more context by dropping fewer items.
			hi = mid - 1
			continue
		}

		// Need to drop more.
		lo = mid + 1
	}

	if found {
		return best, true
	}

	// Nothing fit (even with only one element); return the smallest slice and let the
	// caller fall back to shrinkToEssentials().
	next := shallowCopyMap(root)
	next[field] = arr[len(arr)-1:]
	return next, true
}
|
||||||
|
|
||||||
|
// shrinkToEssentials builds a minimal copy of the request payload: only the
// core sampling/model parameters plus the last element of the conversation
// array ("messages" or "contents"). Used as the last resort when trimming
// cannot make the payload fit.
func shrinkToEssentials(root map[string]any) map[string]any {
	essentials := make(map[string]any)

	// Carry over scalar request parameters when present.
	for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
		if value, present := root[key]; present {
			essentials[key] = value
		}
	}

	// Keep only the newest conversation entry from either array field.
	for _, field := range []string{"messages", "contents"} {
		value, present := root[field]
		if !present {
			continue
		}
		arr, isSlice := value.([]any)
		if isSlice && len(arr) > 0 {
			essentials[field] = []any{arr[len(arr)-1]}
		}
	}

	return essentials
}
|
||||||
|
|
||||||
|
// shallowCopyMap returns a new map with the same key/value pairs as m.
// Values are not deep-copied: both maps share any underlying slices/maps.
func shallowCopyMap(m map[string]any) map[string]any {
	dup := make(map[string]any, len(m))
	for key, value := range m {
		dup[key] = value
	}
	return dup
}
|
||||||
|
|
||||||
|
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer JSON-safe sanitization when possible.
|
||||||
|
if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
|
||||||
|
return out, trunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-JSON: best-effort truncate.
|
||||||
|
if maxBytes > 0 && len(raw) > maxBytes {
|
||||||
|
return truncateString(raw, maxBytes), true
|
||||||
|
}
|
||||||
|
return raw, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractString returns the trimmed string stored at key in v, when v is a
// map[string]any and the value is a string; otherwise it returns "".
func extractString(v any, key string) string {
	if root, ok := v.(map[string]any); ok {
		if s, ok := root[key].(string); ok {
			return strings.TrimSpace(s)
		}
	}
	return ""
}
|
||||||
465
backend/internal/service/ops_settings.go
Normal file
465
backend/internal/service/ops_settings.go
Normal file
@@ -0,0 +1,465 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Defaults for the alert evaluator's leader-election lock (used when the
// stored distributed-lock settings are missing or blank).
const (
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Email notification config
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
|
||||||
|
defaultCfg := defaultOpsEmailNotificationConfig()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
// Initialize defaults on first read (best-effort).
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsEmailNotificationConfig{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
// Corrupted JSON should not break ops UI; fall back to defaults.
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
normalizeOpsEmailNotificationConfig(cfg)
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateEmailNotificationConfig applies a partial update on top of the
// currently stored config, validates the merged result, normalizes it, and
// persists it. Returns the stored config.
//
// Merge semantics: a nil Alert/Report section leaves that section untouched.
// Within a provided section, a nil Recipients slice preserves the existing
// recipients, while every other field (booleans and numerics included) is
// overwritten from the request.
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if req == nil {
		return nil, errors.New("invalid request")
	}

	// Start from the stored config (or defaults) so omitted sections persist.
	cfg, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}

	if req.Alert != nil {
		cfg.Alert.Enabled = req.Alert.Enabled
		if req.Alert.Recipients != nil {
			cfg.Alert.Recipients = req.Alert.Recipients
		}
		cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
		cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
		cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
		cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
	}

	if req.Report != nil {
		cfg.Report.Enabled = req.Report.Enabled
		if req.Report.Recipients != nil {
			cfg.Report.Recipients = req.Report.Recipients
		}
		cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
		cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
		cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
		cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
		cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
		cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
		cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
		cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
		cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
		cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
	}

	// Validate before normalization so invalid input is rejected rather than
	// silently rewritten; normalization then backfills empty schedules.
	if err := validateOpsEmailNotificationConfig(cfg); err != nil {
		return nil, err
	}

	normalizeOpsEmailNotificationConfig(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
		return nil, err
	}
	return cfg, nil
}
|
||||||
|
|
||||||
|
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
|
||||||
|
return &OpsEmailNotificationConfig{
|
||||||
|
Alert: OpsEmailAlertConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Recipients: []string{},
|
||||||
|
MinSeverity: "",
|
||||||
|
RateLimitPerHour: 0,
|
||||||
|
BatchingWindowSeconds: 0,
|
||||||
|
IncludeResolvedAlerts: false,
|
||||||
|
},
|
||||||
|
Report: OpsEmailReportConfig{
|
||||||
|
Enabled: false,
|
||||||
|
Recipients: []string{},
|
||||||
|
DailySummaryEnabled: false,
|
||||||
|
DailySummarySchedule: "0 9 * * *",
|
||||||
|
WeeklySummaryEnabled: false,
|
||||||
|
WeeklySummarySchedule: "0 9 * * 1",
|
||||||
|
ErrorDigestEnabled: false,
|
||||||
|
ErrorDigestSchedule: "0 9 * * *",
|
||||||
|
ErrorDigestMinCount: 10,
|
||||||
|
AccountHealthEnabled: false,
|
||||||
|
AccountHealthSchedule: "0 9 * * *",
|
||||||
|
AccountHealthErrorRateThreshold: 10.0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if cfg.Alert.Recipients == nil {
|
||||||
|
cfg.Alert.Recipients = []string{}
|
||||||
|
}
|
||||||
|
if cfg.Report.Recipients == nil {
|
||||||
|
cfg.Report.Recipients = []string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
|
||||||
|
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
|
||||||
|
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
|
||||||
|
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
|
||||||
|
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
|
||||||
|
|
||||||
|
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
|
||||||
|
if cfg.Report.DailySummarySchedule == "" {
|
||||||
|
cfg.Report.DailySummarySchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
if cfg.Report.WeeklySummarySchedule == "" {
|
||||||
|
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
|
||||||
|
}
|
||||||
|
if cfg.Report.ErrorDigestSchedule == "" {
|
||||||
|
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
if cfg.Report.AccountHealthSchedule == "" {
|
||||||
|
cfg.Report.AccountHealthSchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
|
||||||
|
if cfg == nil {
|
||||||
|
return errors.New("invalid config")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Alert.RateLimitPerHour < 0 {
|
||||||
|
return errors.New("alert.rate_limit_per_hour must be >= 0")
|
||||||
|
}
|
||||||
|
if cfg.Alert.BatchingWindowSeconds < 0 {
|
||||||
|
return errors.New("alert.batching_window_seconds must be >= 0")
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
|
||||||
|
case "", "critical", "warning", "info":
|
||||||
|
default:
|
||||||
|
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Report.ErrorDigestMinCount < 0 {
|
||||||
|
return errors.New("report.error_digest_min_count must be >= 0")
|
||||||
|
}
|
||||||
|
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
|
||||||
|
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Alert runtime settings
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
|
||||||
|
return &OpsAlertRuntimeSettings{
|
||||||
|
EvaluationIntervalSeconds: 60,
|
||||||
|
DistributedLock: OpsDistributedLockSettings{
|
||||||
|
Enabled: true,
|
||||||
|
Key: opsAlertEvaluatorLeaderLockKeyDefault,
|
||||||
|
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
|
||||||
|
},
|
||||||
|
Silencing: OpsAlertSilencingSettings{
|
||||||
|
Enabled: false,
|
||||||
|
GlobalUntilRFC3339: "",
|
||||||
|
GlobalReason: "",
|
||||||
|
Entries: []OpsAlertSilenceEntry{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.Key = strings.TrimSpace(s.Key)
|
||||||
|
if s.Key == "" {
|
||||||
|
s.Key = defaultKey
|
||||||
|
}
|
||||||
|
if s.TTLSeconds <= 0 {
|
||||||
|
s.TTLSeconds = defaultTTLSeconds
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
|
||||||
|
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
|
||||||
|
if s.Entries == nil {
|
||||||
|
s.Entries = []OpsAlertSilenceEntry{}
|
||||||
|
}
|
||||||
|
for i := range s.Entries {
|
||||||
|
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
|
||||||
|
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
|
||||||
|
if strings.TrimSpace(s.Key) == "" {
|
||||||
|
return errors.New("distributed_lock.key is required")
|
||||||
|
}
|
||||||
|
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
|
||||||
|
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
|
||||||
|
parse := func(raw string) error {
|
||||||
|
if strings.TrimSpace(raw) == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, err := time.Parse(time.RFC3339, raw); err != nil {
|
||||||
|
return errors.New("silencing time must be RFC3339")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := parse(s.GlobalUntilRFC3339); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range s.Entries {
|
||||||
|
if strings.TrimSpace(entry.UntilRFC3339) == "" {
|
||||||
|
return errors.New("silencing.entries.until_rfc3339 is required")
|
||||||
|
}
|
||||||
|
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
|
||||||
|
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
|
||||||
|
defaultCfg := defaultOpsAlertRuntimeSettings()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsAlertRuntimeSettings{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.EvaluationIntervalSeconds <= 0 {
|
||||||
|
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
|
||||||
|
}
|
||||||
|
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
|
||||||
|
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
|
||||||
|
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateOpsAlertRuntimeSettings validates, normalizes, and persists the
// alert evaluator runtime settings, returning a detached copy of the stored
// value (decoded back from the persisted JSON) so callers cannot alias
// internal slices.
//
// Note: the distributed-lock and silencing sub-configs are only validated
// when their Enabled flags are set; normalization still backfills the lock
// key/TTL defaults regardless, so a disabled section cannot persist a blank
// lock key.
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	// The interval is always validated — the evaluator ticks on it even when
	// locking/silencing are disabled.
	if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if cfg.DistributedLock.Enabled {
		if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
			return nil, err
		}
	}
	if cfg.Silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
			return nil, err
		}
	}

	defaultCfg := defaultOpsAlertRuntimeSettings()
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
		return nil, err
	}

	// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
	updated := &OpsAlertRuntimeSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Advanced settings
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
|
||||||
|
return &OpsAdvancedSettings{
|
||||||
|
DataRetention: OpsDataRetentionSettings{
|
||||||
|
CleanupEnabled: false,
|
||||||
|
CleanupSchedule: "0 2 * * *",
|
||||||
|
ErrorLogRetentionDays: 30,
|
||||||
|
MinuteMetricsRetentionDays: 30,
|
||||||
|
HourlyMetricsRetentionDays: 30,
|
||||||
|
},
|
||||||
|
Aggregation: OpsAggregationSettings{
|
||||||
|
AggregationEnabled: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
|
||||||
|
if cfg.DataRetention.CleanupSchedule == "" {
|
||||||
|
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.ErrorLogRetentionDays = 30
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.MinuteMetricsRetentionDays = 30
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.HourlyMetricsRetentionDays = 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
||||||
|
if cfg == nil {
|
||||||
|
return errors.New("invalid config")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
|
||||||
|
return errors.New("error_log_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
|
||||||
|
return errors.New("minute_metrics_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
||||||
|
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
|
||||||
|
defaultCfg := defaultOpsAdvancedSettings()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsAdvancedSettings{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
normalizeOpsAdvancedSettings(cfg)
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return nil, errors.New("setting repository not initialized")
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
if cfg == nil {
|
||||||
|
return nil, errors.New("invalid config")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := validateOpsAdvancedSettings(cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
normalizeOpsAdvancedSettings(cfg)
|
||||||
|
raw, err := json.Marshal(cfg)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
updated := &OpsAdvancedSettings{}
|
||||||
|
_ = json.Unmarshal(raw, updated)
|
||||||
|
return updated, nil
|
||||||
|
}
|
||||||
87
backend/internal/service/ops_settings_models.go
Normal file
87
backend/internal/service/ops_settings_models.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
// Ops settings models stored in DB `settings` table (JSON blobs).

// OpsEmailNotificationConfig is the persisted email notification settings,
// split into alert-triggered emails and scheduled report emails.
type OpsEmailNotificationConfig struct {
	Alert  OpsEmailAlertConfig  `json:"alert"`
	Report OpsEmailReportConfig `json:"report"`
}

// OpsEmailAlertConfig controls alert-triggered email delivery.
type OpsEmailAlertConfig struct {
	Enabled    bool     `json:"enabled"`
	Recipients []string `json:"recipients"`
	// MinSeverity is "critical"/"warning"/"info" or empty (no minimum);
	// see validateOpsEmailNotificationConfig.
	MinSeverity string `json:"min_severity"`
	// RateLimitPerHour and BatchingWindowSeconds must be >= 0.
	RateLimitPerHour      int  `json:"rate_limit_per_hour"`
	BatchingWindowSeconds int  `json:"batching_window_seconds"`
	IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
}

// OpsEmailReportConfig controls the scheduled summary/digest report emails.
// The *_schedule fields hold cron expressions (e.g. "0 9 * * *").
type OpsEmailReportConfig struct {
	Enabled               bool     `json:"enabled"`
	Recipients            []string `json:"recipients"`
	DailySummaryEnabled   bool     `json:"daily_summary_enabled"`
	DailySummarySchedule  string   `json:"daily_summary_schedule"`
	WeeklySummaryEnabled  bool     `json:"weekly_summary_enabled"`
	WeeklySummarySchedule string   `json:"weekly_summary_schedule"`
	ErrorDigestEnabled    bool     `json:"error_digest_enabled"`
	ErrorDigestSchedule   string   `json:"error_digest_schedule"`
	// ErrorDigestMinCount must be >= 0.
	ErrorDigestMinCount   int    `json:"error_digest_min_count"`
	AccountHealthEnabled  bool   `json:"account_health_enabled"`
	AccountHealthSchedule string `json:"account_health_schedule"`
	// AccountHealthErrorRateThreshold is a percentage in [0, 100].
	AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
}

// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
// frontend can still send the full config shape. A nil section is left
// untouched by UpdateEmailNotificationConfig.
type OpsEmailNotificationConfigUpdateRequest struct {
	Alert  *OpsEmailAlertConfig  `json:"alert"`
	Report *OpsEmailReportConfig `json:"report"`
}

// OpsDistributedLockSettings configures the evaluator's leader-election
// lock. Key must be non-blank and TTLSeconds in (0, 86400] when Enabled.
type OpsDistributedLockSettings struct {
	Enabled    bool   `json:"enabled"`
	Key        string `json:"key"`
	TTLSeconds int    `json:"ttl_seconds"`
}

// OpsAlertSilenceEntry silences matching alerts until a deadline.
type OpsAlertSilenceEntry struct {
	// RuleID/Severities scope the entry; semantics of an empty scope are
	// decided by the evaluator — TODO confirm against the consumer.
	RuleID     *int64   `json:"rule_id,omitempty"`
	Severities []string `json:"severities,omitempty"`

	// UntilRFC3339 is required and must parse as RFC3339.
	UntilRFC3339 string `json:"until_rfc3339"`
	Reason       string `json:"reason"`
}

// OpsAlertSilencingSettings configures alert silencing: an optional global
// window plus per-rule/severity entries.
type OpsAlertSilencingSettings struct {
	Enabled bool `json:"enabled"`

	// GlobalUntilRFC3339 is optional; when set it must parse as RFC3339.
	GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
	GlobalReason       string `json:"global_reason"`

	Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}

// OpsAlertRuntimeSettings is the persisted runtime configuration of the
// alert evaluator.
type OpsAlertRuntimeSettings struct {
	// EvaluationIntervalSeconds must be in [1, 86400].
	EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`

	DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
	Silencing       OpsAlertSilencingSettings  `json:"silencing"`
}

// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
	DataRetention OpsDataRetentionSettings `json:"data_retention"`
	Aggregation   OpsAggregationSettings   `json:"aggregation"`
}

// OpsDataRetentionSettings controls the cleanup job schedule and how long
// each data series is kept. Retention days must be in [1, 365].
type OpsDataRetentionSettings struct {
	CleanupEnabled             bool   `json:"cleanup_enabled"`
	CleanupSchedule            string `json:"cleanup_schedule"`
	ErrorLogRetentionDays      int    `json:"error_log_retention_days"`
	MinuteMetricsRetentionDays int    `json:"minute_metrics_retention_days"`
	HourlyMetricsRetentionDays int    `json:"hourly_metrics_retention_days"`
}

// OpsAggregationSettings toggles the metrics aggregation job.
type OpsAggregationSettings struct {
	AggregationEnabled bool `json:"aggregation_enabled"`
}
|
||||||
65
backend/internal/service/ops_trend_models.go
Normal file
65
backend/internal/service/ops_trend_models.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// OpsThroughputTrendPoint is one time bucket of request/token throughput.
type OpsThroughputTrendPoint struct {
	BucketStart   time.Time `json:"bucket_start"`
	RequestCount  int64     `json:"request_count"`
	TokenConsumed int64     `json:"token_consumed"`
	// QPS/TPS are per-second request/token rates for the bucket
	// (computed by the repository layer — confirm derivation there).
	QPS float64 `json:"qps"`
	TPS float64 `json:"tps"`
}

// OpsThroughputPlatformBreakdownItem is per-platform throughput totals.
type OpsThroughputPlatformBreakdownItem struct {
	Platform      string `json:"platform"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputGroupBreakdownItem is per-group throughput totals.
type OpsThroughputGroupBreakdownItem struct {
	GroupID       int64  `json:"group_id"`
	GroupName     string `json:"group_name"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputTrendResponse carries bucketed throughput points plus
// optional breakdowns for dashboard drilldown.
type OpsThroughputTrendResponse struct {
	// Bucket describes the bucket granularity used for Points.
	Bucket string `json:"bucket"`

	Points []*OpsThroughputTrendPoint `json:"points"`

	// Optional drilldown helpers:
	// - When no platform/group is selected: returns totals by platform.
	// - When platform is selected but group is not: returns top groups in that platform.
	ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
	TopGroups  []*OpsThroughputGroupBreakdownItem    `json:"top_groups,omitempty"`
}

// OpsErrorTrendPoint is one time bucket of error counts, split by class.
type OpsErrorTrendPoint struct {
	BucketStart time.Time `json:"bucket_start"`

	ErrorCountTotal      int64 `json:"error_count_total"`
	BusinessLimitedCount int64 `json:"business_limited_count"`
	ErrorCountSLA        int64 `json:"error_count_sla"`

	// Upstream error splits; 429/529 are tracked separately from other
	// upstream failures.
	UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
	Upstream429Count             int64 `json:"upstream_429_count"`
	Upstream529Count             int64 `json:"upstream_529_count"`
}

// OpsErrorTrendResponse carries bucketed error-trend points.
type OpsErrorTrendResponse struct {
	Bucket string                `json:"bucket"`
	Points []*OpsErrorTrendPoint `json:"points"`
}

// OpsErrorDistributionItem aggregates error counts for one HTTP status code.
type OpsErrorDistributionItem struct {
	StatusCode      int   `json:"status_code"`
	Total           int64 `json:"total"`
	SLA             int64 `json:"sla"`
	BusinessLimited int64 `json:"business_limited"`
}

// OpsErrorDistributionResponse is the per-status-code error distribution.
type OpsErrorDistributionResponse struct {
	Total int64                       `json:"total"`
	Items []*OpsErrorDistributionItem `json:"items"`
}
|
||||||
26
backend/internal/service/ops_trends.go
Normal file
26
backend/internal/service/ops_trends.go
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
|
||||||
|
}
|
||||||
94
backend/internal/service/ops_upstream_context.go
Normal file
94
backend/internal/service/ops_upstream_context.go
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Gin context keys used by Ops error logger for capturing upstream error details.
// These keys are set by gateway services and consumed by handler/ops_error_logger.go.
const (
	// OpsUpstreamStatusCodeKey holds the last upstream HTTP status (int).
	OpsUpstreamStatusCodeKey = "ops_upstream_status_code"
	// OpsUpstreamErrorMessageKey holds the last upstream error message (string).
	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
	// OpsUpstreamErrorDetailKey holds the last upstream error detail (string).
	OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
	// OpsUpstreamErrorsKey holds the per-request []*OpsUpstreamErrorEvent.
	OpsUpstreamErrorsKey = "ops_upstream_errors"
)
|
||||||
|
|
||||||
|
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if upstreamStatusCode > 0 {
|
||||||
|
c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode)
|
||||||
|
}
|
||||||
|
if msg := strings.TrimSpace(upstreamMessage); msg != "" {
|
||||||
|
c.Set(OpsUpstreamErrorMessageKey, msg)
|
||||||
|
}
|
||||||
|
if detail := strings.TrimSpace(upstreamDetail); detail != "" {
|
||||||
|
c.Set(OpsUpstreamErrorDetailKey, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request.
// It is stored in ops_error_logs.upstream_errors as a JSON array.
type OpsUpstreamErrorEvent struct {
	// AtUnixMs is the event time in Unix milliseconds; defaulted to "now"
	// by appendOpsUpstreamError when unset.
	AtUnixMs int64 `json:"at_unix_ms,omitempty"`

	// Context
	Platform  string `json:"platform,omitempty"`
	AccountID int64  `json:"account_id,omitempty"`

	// Outcome
	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`

	// Kind: http_error | request_error | retry_exhausted | failover
	Kind string `json:"kind,omitempty"`

	// Message is sanitized before storage (see appendOpsUpstreamError).
	Message string `json:"message,omitempty"`
	Detail  string `json:"detail,omitempty"`
}
|
||||||
|
|
||||||
|
func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if ev.AtUnixMs <= 0 {
|
||||||
|
ev.AtUnixMs = time.Now().UnixMilli()
|
||||||
|
}
|
||||||
|
ev.Platform = strings.TrimSpace(ev.Platform)
|
||||||
|
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
|
||||||
|
ev.Kind = strings.TrimSpace(ev.Kind)
|
||||||
|
ev.Message = strings.TrimSpace(ev.Message)
|
||||||
|
ev.Detail = strings.TrimSpace(ev.Detail)
|
||||||
|
if ev.Message != "" {
|
||||||
|
ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
|
||||||
|
}
|
||||||
|
|
||||||
|
var existing []*OpsUpstreamErrorEvent
|
||||||
|
if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
|
||||||
|
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
|
||||||
|
existing = arr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
evCopy := ev
|
||||||
|
existing = append(existing, &evCopy)
|
||||||
|
c.Set(OpsUpstreamErrorsKey, existing)
|
||||||
|
}
|
||||||
|
|
||||||
|
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
|
||||||
|
if len(events) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Ensure we always store a valid JSON value.
|
||||||
|
raw, err := json.Marshal(events)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s := string(raw)
|
||||||
|
return &s
|
||||||
|
}
|
||||||
24
backend/internal/service/ops_window_stats.go
Normal file
24
backend/internal/service/ops_window_stats.go
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetWindowStats returns lightweight request/token counts for the provided window.
|
||||||
|
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks.
|
||||||
|
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
filter := &OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetWindowStats(ctx, filter)
|
||||||
|
}
|
||||||
@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
|
|||||||
}
|
}
|
||||||
|
|
||||||
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
|
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
|
||||||
|
}
|
||||||
|
|
||||||
switch statusCode {
|
switch statusCode {
|
||||||
case 401:
|
case 401:
|
||||||
// 认证失败:停止调度,记录错误
|
// 认证失败:停止调度,记录错误
|
||||||
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials")
|
msg := "Authentication failed (401): invalid or expired credentials"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Authentication failed (401): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 402:
|
case 402:
|
||||||
// 支付要求:余额不足或计费问题,停止调度
|
// 支付要求:余额不足或计费问题,停止调度
|
||||||
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue")
|
msg := "Payment required (402): insufficient balance or billing issue"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Payment required (402): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 403:
|
case 403:
|
||||||
// 禁止访问:停止调度,记录错误
|
// 禁止访问:停止调度,记录错误
|
||||||
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions")
|
msg := "Access forbidden (403): account may be suspended or lack permissions"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Access forbidden (403): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 429:
|
case 429:
|
||||||
s.handle429(ctx, account, headers)
|
s.handle429(ctx, account, headers)
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
|
|||||||
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
|
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
|
||||||
}
|
}
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
|
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
|
||||||
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
|
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
|
||||||
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
|
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
|
||||||
@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
|
|||||||
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
|
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
|
||||||
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
|
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
|
||||||
|
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
|
||||||
|
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
|
||||||
|
if settings.OpsMetricsIntervalSeconds > 0 {
|
||||||
|
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
|
||||||
|
}
|
||||||
|
|
||||||
err := s.settingRepo.SetMultiple(ctx, updates)
|
err := s.settingRepo.SetMultiple(ctx, updates)
|
||||||
if err == nil && s.onUpdate != nil {
|
if err == nil && s.onUpdate != nil {
|
||||||
s.onUpdate() // Invalidate cache after settings update
|
s.onUpdate() // Invalidate cache after settings update
|
||||||
@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
|
|||||||
// Identity patch defaults
|
// Identity patch defaults
|
||||||
SettingKeyEnableIdentityPatch: "true",
|
SettingKeyEnableIdentityPatch: "true",
|
||||||
SettingKeyIdentityPatchPrompt: "",
|
SettingKeyIdentityPatchPrompt: "",
|
||||||
|
|
||||||
|
// Ops monitoring defaults (vNext)
|
||||||
|
SettingKeyOpsMonitoringEnabled: "true",
|
||||||
|
SettingKeyOpsRealtimeMonitoringEnabled: "true",
|
||||||
|
SettingKeyOpsQueryModeDefault: "auto",
|
||||||
|
SettingKeyOpsMetricsIntervalSeconds: "60",
|
||||||
}
|
}
|
||||||
|
|
||||||
return s.settingRepo.SetMultiple(ctx, defaults)
|
return s.settingRepo.SetMultiple(ctx, defaults)
|
||||||
@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
|
|||||||
}
|
}
|
||||||
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
|
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
|
||||||
|
|
||||||
|
// Ops monitoring settings (default: enabled, fail-open)
|
||||||
|
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
|
||||||
|
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
|
||||||
|
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
|
||||||
|
result.OpsMetricsIntervalSeconds = 60
|
||||||
|
if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
|
||||||
|
if v, err := strconv.Atoi(raw); err == nil {
|
||||||
|
if v < 60 {
|
||||||
|
v = 60
|
||||||
|
}
|
||||||
|
if v > 3600 {
|
||||||
|
v = 3600
|
||||||
|
}
|
||||||
|
result.OpsMetricsIntervalSeconds = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。
|
func isFalseSettingValue(value string) bool {
|
||||||
//
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
// 优先级:
|
case "false", "0", "off", "disabled":
|
||||||
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值
|
return true
|
||||||
// - 否则回退到 config.yaml/env 的值
|
|
||||||
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
|
|
||||||
if s == nil || s.cfg == nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
|
|
||||||
}
|
|
||||||
|
|
||||||
effective := s.cfg.LinuxDo
|
|
||||||
|
|
||||||
keys := []string{
|
|
||||||
SettingKeyLinuxDoConnectEnabled,
|
|
||||||
SettingKeyLinuxDoConnectClientID,
|
|
||||||
SettingKeyLinuxDoConnectClientSecret,
|
|
||||||
SettingKeyLinuxDoConnectRedirectURL,
|
|
||||||
}
|
|
||||||
settings, err := s.settingRepo.GetMultiple(ctx, keys)
|
|
||||||
if err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
|
|
||||||
effective.Enabled = raw == "true"
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.ClientID = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.ClientSecret = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.RedirectURL = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
|
|
||||||
if !effective.Enabled {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
|
|
||||||
}
|
|
||||||
|
|
||||||
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
|
|
||||||
if strings.TrimSpace(effective.ClientID) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.AuthorizeURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.TokenURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.UserInfoURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.RedirectURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
|
|
||||||
}
|
|
||||||
|
|
||||||
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
|
|
||||||
switch method {
|
|
||||||
case "", "client_secret_post", "client_secret_basic":
|
|
||||||
if strings.TrimSpace(effective.ClientSecret) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
|
|
||||||
}
|
|
||||||
case "none":
|
|
||||||
if !effective.UsePKCE {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
|
|
||||||
}
|
|
||||||
default:
|
default:
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
return effective, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// getStringOrDefault 获取字符串值或默认值
|
// getStringOrDefault 获取字符串值或默认值
|
||||||
@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
|
|||||||
}
|
}
|
||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetLinuxDoConnectOAuthConfig 返回用于登录的"最终生效" LinuxDo Connect 配置。
|
||||||
|
//
|
||||||
|
// 优先级:
|
||||||
|
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值
|
||||||
|
// - 否则回退到 config.yaml/env 的值
|
||||||
|
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
|
||||||
|
if s == nil || s.cfg == nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
|
||||||
|
}
|
||||||
|
|
||||||
|
effective := s.cfg.LinuxDo
|
||||||
|
|
||||||
|
keys := []string{
|
||||||
|
SettingKeyLinuxDoConnectEnabled,
|
||||||
|
SettingKeyLinuxDoConnectClientID,
|
||||||
|
SettingKeyLinuxDoConnectClientSecret,
|
||||||
|
SettingKeyLinuxDoConnectRedirectURL,
|
||||||
|
}
|
||||||
|
settings, err := s.settingRepo.GetMultiple(ctx, keys)
|
||||||
|
if err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
|
||||||
|
effective.Enabled = raw == "true"
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.ClientID = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.ClientSecret = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.RedirectURL = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !effective.Enabled {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
|
||||||
|
}
|
||||||
|
|
||||||
|
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
|
||||||
|
if strings.TrimSpace(effective.ClientID) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.AuthorizeURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.TokenURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.UserInfoURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.RedirectURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
|
||||||
|
switch method {
|
||||||
|
case "", "client_secret_post", "client_secret_basic":
|
||||||
|
if strings.TrimSpace(effective.ClientSecret) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
|
||||||
|
}
|
||||||
|
case "none":
|
||||||
|
if !effective.UsePKCE {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
return effective, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ type SystemSettings struct {
|
|||||||
TurnstileSecretKey string
|
TurnstileSecretKey string
|
||||||
TurnstileSecretKeyConfigured bool
|
TurnstileSecretKeyConfigured bool
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
LinuxDoConnectEnabled bool
|
LinuxDoConnectEnabled bool
|
||||||
LinuxDoConnectClientID string
|
LinuxDoConnectClientID string
|
||||||
LinuxDoConnectClientSecret string
|
LinuxDoConnectClientSecret string
|
||||||
@@ -46,6 +46,12 @@ type SystemSettings struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled bool
|
||||||
|
OpsRealtimeMonitoringEnabled bool
|
||||||
|
OpsQueryModeDefault string
|
||||||
|
OpsMetricsIntervalSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
type PublicSettings struct {
|
type PublicSettings struct {
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
package service
|
package service
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
"github.com/google/wire"
|
"github.com/google/wire"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BuildInfo contains build information
|
// BuildInfo contains build information
|
||||||
@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
|
|||||||
return svc
|
return svc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector.
|
||||||
|
func ProvideOpsMetricsCollector(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
accountRepo AccountRepository,
|
||||||
|
concurrencyService *ConcurrencyService,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsMetricsCollector {
|
||||||
|
collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
|
||||||
|
collector.Start()
|
||||||
|
return collector
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation).
|
||||||
|
func ProvideOpsAggregationService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAggregationService {
|
||||||
|
svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService.
|
||||||
|
func ProvideOpsAlertEvaluatorService(
|
||||||
|
opsService *OpsService,
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAlertEvaluatorService {
|
||||||
|
svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled).
|
||||||
|
func ProvideOpsCleanupService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsCleanupService {
|
||||||
|
svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsScheduledReportService creates and starts OpsScheduledReportService.
|
||||||
|
func ProvideOpsScheduledReportService(
|
||||||
|
opsService *OpsService,
|
||||||
|
userService *UserService,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsScheduledReportService {
|
||||||
|
svc := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
|
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
|
||||||
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
|
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
|
||||||
return apiKeyService
|
return apiKeyService
|
||||||
@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
|
|||||||
NewAccountUsageService,
|
NewAccountUsageService,
|
||||||
NewAccountTestService,
|
NewAccountTestService,
|
||||||
NewSettingService,
|
NewSettingService,
|
||||||
|
NewOpsService,
|
||||||
|
ProvideOpsMetricsCollector,
|
||||||
|
ProvideOpsAggregationService,
|
||||||
|
ProvideOpsAlertEvaluatorService,
|
||||||
|
ProvideOpsCleanupService,
|
||||||
|
ProvideOpsScheduledReportService,
|
||||||
NewEmailService,
|
NewEmailService,
|
||||||
ProvideEmailQueueService,
|
ProvideEmailQueueService,
|
||||||
NewTurnstileService,
|
NewTurnstileService,
|
||||||
|
|||||||
717
backend/migrations/033_ops_monitoring_vnext.sql
Normal file
717
backend/migrations/033_ops_monitoring_vnext.sql
Normal file
@@ -0,0 +1,717 @@
|
|||||||
|
-- Ops Monitoring (vNext): squashed migration (030)
|
||||||
|
--
|
||||||
|
-- This repository originally planned Ops vNext as migrations 030-036:
|
||||||
|
-- 030 drop legacy ops tables
|
||||||
|
-- 031 core schema
|
||||||
|
-- 032 pre-aggregation tables
|
||||||
|
-- 033 indexes + optional extensions
|
||||||
|
-- 034 add avg/max to preagg
|
||||||
|
-- 035 add notify_email to alert rules
|
||||||
|
-- 036 seed default alert rules
|
||||||
|
--
|
||||||
|
-- Since these migrations have NOT been applied to any environment yet, we squash them
|
||||||
|
-- into a single 030 migration for easier review and a cleaner migration history.
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
|
||||||
|
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 030_ops_drop_legacy_ops_tables.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
|
||||||
|
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
|
||||||
|
|
||||||
|
-- Core ops tables that may exist in some deployments / branches
|
||||||
|
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_error_logs CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_alert_events CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
|
||||||
|
|
||||||
|
-- Optional legacy tables (best-effort cleanup)
|
||||||
|
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
|
||||||
|
|
||||||
|
-- Optional legacy views/indexes
|
||||||
|
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 031_ops_core_schema.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
|
||||||
|
--
|
||||||
|
-- Design goals:
|
||||||
|
-- - Support global filtering (time/platform/group) across all ops modules.
|
||||||
|
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
|
||||||
|
-- - Make ops background jobs observable via job heartbeats.
|
||||||
|
-- - Keep schema stable and indexes targeted (high-write tables).
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - This migration is idempotent.
|
||||||
|
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) ops_error_logs: error log details (high-write)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_error_logs (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
-- Correlation / identities
|
||||||
|
request_id VARCHAR(64),
|
||||||
|
client_request_id VARCHAR(64),
|
||||||
|
user_id BIGINT,
|
||||||
|
api_key_id BIGINT,
|
||||||
|
account_id BIGINT,
|
||||||
|
group_id BIGINT,
|
||||||
|
client_ip inet,
|
||||||
|
|
||||||
|
-- Dimensions for global filtering
|
||||||
|
platform VARCHAR(32),
|
||||||
|
|
||||||
|
-- Request metadata
|
||||||
|
model VARCHAR(100),
|
||||||
|
request_path VARCHAR(256),
|
||||||
|
stream BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
user_agent TEXT,
|
||||||
|
|
||||||
|
-- Core error classification
|
||||||
|
error_phase VARCHAR(32) NOT NULL,
|
||||||
|
error_type VARCHAR(64) NOT NULL,
|
||||||
|
severity VARCHAR(8) NOT NULL DEFAULT 'P2',
|
||||||
|
status_code INT,
|
||||||
|
|
||||||
|
-- vNext metric semantics
|
||||||
|
is_business_limited BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
|
||||||
|
-- Error details (sanitized/truncated at ingest time)
|
||||||
|
error_message TEXT,
|
||||||
|
error_body TEXT,
|
||||||
|
|
||||||
|
-- Provider/upstream details (optional; useful for trends & account health)
|
||||||
|
error_source VARCHAR(64),
|
||||||
|
error_owner VARCHAR(32),
|
||||||
|
account_status VARCHAR(50),
|
||||||
|
upstream_status_code INT,
|
||||||
|
upstream_error_message TEXT,
|
||||||
|
upstream_error_detail TEXT,
|
||||||
|
provider_error_code VARCHAR(64),
|
||||||
|
provider_error_type VARCHAR(64),
|
||||||
|
network_error_type VARCHAR(50),
|
||||||
|
retry_after_seconds INT,
|
||||||
|
|
||||||
|
-- Timings (ms) - optional
|
||||||
|
duration_ms INT,
|
||||||
|
time_to_first_token_ms BIGINT,
|
||||||
|
auth_latency_ms BIGINT,
|
||||||
|
routing_latency_ms BIGINT,
|
||||||
|
upstream_latency_ms BIGINT,
|
||||||
|
response_latency_ms BIGINT,
|
||||||
|
|
||||||
|
-- Retry context (only stored for error requests)
|
||||||
|
request_body JSONB,
|
||||||
|
request_headers JSONB,
|
||||||
|
request_body_truncated BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
request_body_bytes INT,
|
||||||
|
|
||||||
|
-- Retryability flags (best-effort classification)
|
||||||
|
is_retryable BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
retry_count INT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) ops_retry_attempts: audit log for retries
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
|
||||||
|
requested_by_user_id BIGINT,
|
||||||
|
source_error_id BIGINT,
|
||||||
|
|
||||||
|
-- client|upstream
|
||||||
|
mode VARCHAR(16) NOT NULL,
|
||||||
|
pinned_account_id BIGINT,
|
||||||
|
|
||||||
|
-- queued|running|succeeded|failed
|
||||||
|
status VARCHAR(16) NOT NULL DEFAULT 'queued',
|
||||||
|
started_at TIMESTAMPTZ,
|
||||||
|
finished_at TIMESTAMPTZ,
|
||||||
|
duration_ms BIGINT,
|
||||||
|
|
||||||
|
-- Optional result correlation
|
||||||
|
result_request_id VARCHAR(64),
|
||||||
|
result_error_id BIGINT,
|
||||||
|
result_usage_request_id VARCHAR(64),
|
||||||
|
|
||||||
|
error_message TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 3) ops_system_metrics: system + request window snapshots
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_system_metrics (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
window_minutes INT NOT NULL DEFAULT 1,
|
||||||
|
|
||||||
|
-- Optional dimensions (only if collector chooses to write per-dimension snapshots)
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
-- Core counts
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
-- Rates
|
||||||
|
qps DOUBLE PRECISION,
|
||||||
|
tps DOUBLE PRECISION,
|
||||||
|
|
||||||
|
-- Duration percentiles (ms) - success requests
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
duration_avg_ms DOUBLE PRECISION,
|
||||||
|
duration_max_ms INT,
|
||||||
|
|
||||||
|
-- TTFT percentiles (ms) - success requests (streaming)
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ttft_max_ms INT,
|
||||||
|
|
||||||
|
-- System resources
|
||||||
|
cpu_usage_percent DOUBLE PRECISION,
|
||||||
|
memory_used_mb BIGINT,
|
||||||
|
memory_total_mb BIGINT,
|
||||||
|
memory_usage_percent DOUBLE PRECISION,
|
||||||
|
|
||||||
|
-- Dependency health (best-effort)
|
||||||
|
db_ok BOOLEAN,
|
||||||
|
redis_ok BOOLEAN,
|
||||||
|
|
||||||
|
-- DB pool & runtime
|
||||||
|
db_conn_active INT,
|
||||||
|
db_conn_idle INT,
|
||||||
|
db_conn_waiting INT,
|
||||||
|
goroutine_count INT,
|
||||||
|
|
||||||
|
-- Queue / concurrency
|
||||||
|
concurrency_queue_depth INT
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 4) ops_job_heartbeats: background jobs health
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
|
||||||
|
job_name VARCHAR(64) PRIMARY KEY,
|
||||||
|
|
||||||
|
last_run_at TIMESTAMPTZ,
|
||||||
|
last_success_at TIMESTAMPTZ,
|
||||||
|
last_error_at TIMESTAMPTZ,
|
||||||
|
last_error TEXT,
|
||||||
|
last_duration_ms BIGINT,
|
||||||
|
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 5) ops_alert_rules / ops_alert_events
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_alert_rules (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
name VARCHAR(128) NOT NULL,
|
||||||
|
description TEXT,
|
||||||
|
enabled BOOLEAN NOT NULL DEFAULT true,
|
||||||
|
|
||||||
|
severity VARCHAR(16) NOT NULL DEFAULT 'warning',
|
||||||
|
|
||||||
|
-- Metric definition
|
||||||
|
-- Metric definition
|
||||||
|
metric_type VARCHAR(64) NOT NULL,
|
||||||
|
operator VARCHAR(8) NOT NULL,
|
||||||
|
threshold DOUBLE PRECISION NOT NULL,
|
||||||
|
|
||||||
|
window_minutes INT NOT NULL DEFAULT 5,
|
||||||
|
sustained_minutes INT NOT NULL DEFAULT 5,
|
||||||
|
cooldown_minutes INT NOT NULL DEFAULT 10,
|
||||||
|
|
||||||
|
-- Optional scoping: platform/group filters etc.
|
||||||
|
filters JSONB,
|
||||||
|
|
||||||
|
last_triggered_at TIMESTAMPTZ,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
|
||||||
|
ON ops_alert_rules (name);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
|
||||||
|
ON ops_alert_rules (enabled);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_alert_events (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
rule_id BIGINT,
|
||||||
|
severity VARCHAR(16) NOT NULL,
|
||||||
|
status VARCHAR(16) NOT NULL DEFAULT 'firing',
|
||||||
|
|
||||||
|
title VARCHAR(200),
|
||||||
|
description TEXT,
|
||||||
|
|
||||||
|
metric_value DOUBLE PRECISION,
|
||||||
|
threshold_value DOUBLE PRECISION,
|
||||||
|
dimensions JSONB,
|
||||||
|
|
||||||
|
fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
resolved_at TIMESTAMPTZ,
|
||||||
|
|
||||||
|
email_sent BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
|
||||||
|
ON ops_alert_events (rule_id, status);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
|
||||||
|
ON ops_alert_events (fired_at DESC);
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 032_ops_preaggregation_tables.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): pre-aggregation tables
|
||||||
|
--
|
||||||
|
-- Purpose:
|
||||||
|
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
|
||||||
|
-- percentile_cont scans on raw logs for every dashboard refresh.
|
||||||
|
-- - Support global filter dimensions: overall / platform / group.
|
||||||
|
--
|
||||||
|
-- Design note:
|
||||||
|
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
|
||||||
|
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) ops_metrics_hourly
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
bucket_start TIMESTAMPTZ NOT NULL,
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
-- Duration percentiles (ms)
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
|
||||||
|
-- TTFT percentiles (ms)
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
|
||||||
|
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Uniqueness across three “dimension modes” (overall / platform / group).
|
||||||
|
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
|
||||||
|
ON ops_metrics_hourly (
|
||||||
|
bucket_start,
|
||||||
|
COALESCE(platform, ''),
|
||||||
|
COALESCE(group_id, 0)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
|
||||||
|
ON ops_metrics_hourly (bucket_start DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
|
||||||
|
ON ops_metrics_hourly (platform, bucket_start DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
|
||||||
|
ON ops_metrics_hourly (group_id, bucket_start DESC)
|
||||||
|
WHERE group_id IS NOT NULL AND group_id <> 0;
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) ops_metrics_daily (optional; for longer windows)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
bucket_date DATE NOT NULL,
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
|
||||||
|
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
|
||||||
|
ON ops_metrics_daily (
|
||||||
|
bucket_date,
|
||||||
|
COALESCE(platform, ''),
|
||||||
|
COALESCE(group_id, 0)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
|
||||||
|
ON ops_metrics_daily (bucket_date DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
|
||||||
|
ON ops_metrics_daily (platform, bucket_date DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
|
||||||
|
ON ops_metrics_daily (group_id, bucket_date DESC)
|
||||||
|
WHERE group_id IS NOT NULL AND group_id <> 0;
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 033_ops_indexes_and_extensions.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): indexes and optional extensions
|
||||||
|
--
|
||||||
|
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
|
||||||
|
-- so environments without extension privileges won't fail the whole migration chain.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) Core btree indexes (always safe)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
-- ops_error_logs
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
|
||||||
|
ON ops_error_logs (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
|
||||||
|
ON ops_error_logs (platform, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
|
||||||
|
ON ops_error_logs (group_id, created_at DESC)
|
||||||
|
WHERE group_id IS NOT NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
|
||||||
|
ON ops_error_logs (account_id, created_at DESC)
|
||||||
|
WHERE account_id IS NOT NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
|
||||||
|
ON ops_error_logs (status_code, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
|
||||||
|
ON ops_error_logs (error_phase, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
|
||||||
|
ON ops_error_logs (error_type, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
|
||||||
|
ON ops_error_logs (request_id);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
|
||||||
|
ON ops_error_logs (client_request_id);
|
||||||
|
|
||||||
|
-- ops_system_metrics
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
|
||||||
|
ON ops_system_metrics (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
|
||||||
|
ON ops_system_metrics (window_minutes, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
|
||||||
|
ON ops_system_metrics (platform, created_at DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
|
||||||
|
ON ops_system_metrics (group_id, created_at DESC)
|
||||||
|
WHERE group_id IS NOT NULL;
|
||||||
|
|
||||||
|
-- ops_retry_attempts
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
|
||||||
|
ON ops_retry_attempts (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
|
||||||
|
ON ops_retry_attempts (source_error_id, created_at DESC)
|
||||||
|
WHERE source_error_id IS NOT NULL;
|
||||||
|
|
||||||
|
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
|
||||||
|
ON ops_retry_attempts (source_error_id)
|
||||||
|
WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
BEGIN
|
||||||
|
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||||
|
EXCEPTION WHEN OTHERS THEN
|
||||||
|
-- Missing privileges or extension package should not block migrations.
|
||||||
|
RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
|
||||||
|
END;
|
||||||
|
|
||||||
|
IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
|
||||||
|
-- request_id / client_request_id fuzzy search
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
|
||||||
|
ON ops_error_logs USING gin (request_id gin_trgm_ops)';
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
|
||||||
|
ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
|
||||||
|
|
||||||
|
-- error_message fuzzy search
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
|
||||||
|
ON ops_error_logs USING gin (error_message gin_trgm_ops)';
|
||||||
|
END IF;
|
||||||
|
END $$;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 034_ops_preaggregation_add_avg_max.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
|
||||||
|
--
|
||||||
|
-- Why:
|
||||||
|
-- - The dashboard overview returns avg/max for duration/TTFT.
|
||||||
|
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
|
||||||
|
-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
|
||||||
|
--
|
||||||
|
-- This migration is idempotent and safe to run multiple times.
|
||||||
|
--
|
||||||
|
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
|
||||||
|
-- approximate long-window summaries.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- Hourly table
|
||||||
|
ALTER TABLE ops_metrics_hourly
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
|
||||||
|
|
||||||
|
-- Daily table
|
||||||
|
ALTER TABLE ops_metrics_daily
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 035_ops_alert_rules_notify_email.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): alert rule notify settings
|
||||||
|
--
|
||||||
|
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
|
||||||
|
-- Migration is idempotent.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
ALTER TABLE ops_alert_rules
|
||||||
|
ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 036_ops_seed_default_alert_rules.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
|
||||||
|
--
|
||||||
|
-- Goal:
|
||||||
|
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
|
||||||
|
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
|
||||||
|
-- - Metric semantics follow vNext:
|
||||||
|
-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
|
||||||
|
-- - upstream_error_rate excludes 429/529.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- 1) High error rate (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'错误率过高',
|
||||||
|
'当错误率超过 5% 且持续 5 分钟时触发告警',
|
||||||
|
true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 2) Low success rate (P0)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'成功率过低',
|
||||||
|
'当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
|
||||||
|
true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 3) P99 latency too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'P99延迟过高',
|
||||||
|
'当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
|
||||||
|
true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 4) P95 latency too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'P95延迟过高',
|
||||||
|
'当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
|
||||||
|
true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 5) CPU usage too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'CPU使用率过高',
|
||||||
|
'当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
|
||||||
|
true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 6) Memory usage too high (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'内存使用率过高',
|
||||||
|
'当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
|
||||||
|
true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 7) Concurrency queue buildup (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'并发队列积压',
|
||||||
|
'当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
|
||||||
|
true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 8) Extremely high error rate (P0)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'错误率极高',
|
||||||
|
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
|
||||||
|
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
|
||||||
|
-- This migration is intentionally idempotent.
|
||||||
|
|
||||||
|
ALTER TABLE ops_system_metrics
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
|
||||||
9
backend/migrations/034_ops_upstream_error_events.sql
Normal file
9
backend/migrations/034_ops_upstream_error_events.sql
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
|
||||||
|
--
|
||||||
|
-- This is intentionally idempotent.
|
||||||
|
|
||||||
|
ALTER TABLE ops_error_logs
|
||||||
|
ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
|
||||||
|
'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
|
||||||
37
config.yaml
37
config.yaml
@@ -159,7 +159,7 @@ gateway:
|
|||||||
max_line_size: 41943040
|
max_line_size: 41943040
|
||||||
# Log upstream error response body summary (safe/truncated; does not log request content)
|
# Log upstream error response body summary (safe/truncated; does not log request content)
|
||||||
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
||||||
log_upstream_error_body: false
|
log_upstream_error_body: true
|
||||||
# Max bytes to log from upstream error body
|
# Max bytes to log from upstream error body
|
||||||
# 记录上游错误响应体的最大字节数
|
# 记录上游错误响应体的最大字节数
|
||||||
log_upstream_error_body_max_bytes: 2048
|
log_upstream_error_body_max_bytes: 2048
|
||||||
@@ -302,6 +302,41 @@ redis:
|
|||||||
# 数据库编号(0-15)
|
# 数据库编号(0-15)
|
||||||
db: 0
|
db: 0
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ops Monitoring (Optional)
|
||||||
|
# 运维监控 (可选)
|
||||||
|
# =============================================================================
|
||||||
|
ops:
|
||||||
|
# Hard switch: disable all ops background jobs and APIs when false
|
||||||
|
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
|
||||||
|
# 优先使用预聚合表(用于长时间窗口查询性能)
|
||||||
|
use_preaggregated_tables: false
|
||||||
|
|
||||||
|
# Data cleanup configuration
|
||||||
|
# 数据清理配置(vNext 默认统一保留 30 天)
|
||||||
|
cleanup:
|
||||||
|
enabled: true
|
||||||
|
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
|
||||||
|
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
|
||||||
|
schedule: "0 2 * * *"
|
||||||
|
error_log_retention_days: 30
|
||||||
|
minute_metrics_retention_days: 30
|
||||||
|
hourly_metrics_retention_days: 30
|
||||||
|
|
||||||
|
# Pre-aggregation configuration
|
||||||
|
# 预聚合任务配置
|
||||||
|
aggregation:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
|
||||||
|
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
|
||||||
|
metrics_collector_cache:
|
||||||
|
enabled: true
|
||||||
|
ttl: 65s
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# JWT Configuration
|
# JWT Configuration
|
||||||
# JWT 配置
|
# JWT 配置
|
||||||
|
|||||||
@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
|
|||||||
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
|
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
|
||||||
GEMINI_QUOTA_POLICY=
|
GEMINI_QUOTA_POLICY=
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Ops Monitoring Configuration (运维监控配置)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Enable ops monitoring features (background jobs and APIs)
|
||||||
|
# 是否启用运维监控功能(后台任务和接口)
|
||||||
|
# Set to false to hide ops menu in sidebar and disable all ops features
|
||||||
|
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
|
||||||
|
OPS_ENABLED=true
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Update Configuration (在线更新配置)
|
# Update Configuration (在线更新配置)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -159,7 +159,7 @@ gateway:
|
|||||||
max_line_size: 41943040
|
max_line_size: 41943040
|
||||||
# Log upstream error response body summary (safe/truncated; does not log request content)
|
# Log upstream error response body summary (safe/truncated; does not log request content)
|
||||||
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
||||||
log_upstream_error_body: false
|
log_upstream_error_body: true
|
||||||
# Max bytes to log from upstream error body
|
# Max bytes to log from upstream error body
|
||||||
# 记录上游错误响应体的最大字节数
|
# 记录上游错误响应体的最大字节数
|
||||||
log_upstream_error_body_max_bytes: 2048
|
log_upstream_error_body_max_bytes: 2048
|
||||||
@@ -302,6 +302,19 @@ redis:
|
|||||||
# 数据库编号(0-15)
|
# 数据库编号(0-15)
|
||||||
db: 0
|
db: 0
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ops Monitoring (Optional)
|
||||||
|
# 运维监控 (可选)
|
||||||
|
# =============================================================================
|
||||||
|
ops:
|
||||||
|
# Enable ops monitoring features (background jobs and APIs)
|
||||||
|
# 是否启用运维监控功能(后台任务和接口)
|
||||||
|
# Set to false to hide ops menu in sidebar and disable all ops features
|
||||||
|
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
|
||||||
|
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
|
||||||
|
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
|
||||||
|
enabled: true
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# JWT Configuration
|
# JWT Configuration
|
||||||
# JWT 配置
|
# JWT 配置
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import usageAPI from './usage'
|
|||||||
import geminiAPI from './gemini'
|
import geminiAPI from './gemini'
|
||||||
import antigravityAPI from './antigravity'
|
import antigravityAPI from './antigravity'
|
||||||
import userAttributesAPI from './userAttributes'
|
import userAttributesAPI from './userAttributes'
|
||||||
|
import opsAPI from './ops'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unified admin API object for convenient access
|
* Unified admin API object for convenient access
|
||||||
@@ -35,7 +36,8 @@ export const adminAPI = {
|
|||||||
usage: usageAPI,
|
usage: usageAPI,
|
||||||
gemini: geminiAPI,
|
gemini: geminiAPI,
|
||||||
antigravity: antigravityAPI,
|
antigravity: antigravityAPI,
|
||||||
userAttributes: userAttributesAPI
|
userAttributes: userAttributesAPI,
|
||||||
|
ops: opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export {
|
export {
|
||||||
@@ -52,7 +54,8 @@ export {
|
|||||||
usageAPI,
|
usageAPI,
|
||||||
geminiAPI,
|
geminiAPI,
|
||||||
antigravityAPI,
|
antigravityAPI,
|
||||||
userAttributesAPI
|
userAttributesAPI,
|
||||||
|
opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export default adminAPI
|
export default adminAPI
|
||||||
|
|||||||
958
frontend/src/api/admin/ops.ts
Normal file
958
frontend/src/api/admin/ops.ts
Normal file
@@ -0,0 +1,958 @@
|
|||||||
|
/**
|
||||||
|
* Admin Ops API endpoints (vNext)
|
||||||
|
* - Error logs list/detail + retry (client/upstream)
|
||||||
|
* - Dashboard overview (raw path)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { apiClient } from '../client'
|
||||||
|
import type { PaginatedResponse } from '@/types'
|
||||||
|
|
||||||
|
export type OpsRetryMode = 'client' | 'upstream'
|
||||||
|
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
|
||||||
|
|
||||||
|
export interface OpsRequestOptions {
|
||||||
|
signal?: AbortSignal
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryRequest {
|
||||||
|
mode: OpsRetryMode
|
||||||
|
pinned_account_id?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryResult {
|
||||||
|
attempt_id: number
|
||||||
|
mode: OpsRetryMode
|
||||||
|
status: 'running' | 'succeeded' | 'failed' | string
|
||||||
|
|
||||||
|
pinned_account_id?: number | null
|
||||||
|
used_account_id?: number | null
|
||||||
|
|
||||||
|
http_status_code: number
|
||||||
|
upstream_request_id: string
|
||||||
|
|
||||||
|
response_preview: string
|
||||||
|
response_truncated: boolean
|
||||||
|
|
||||||
|
error_message: string
|
||||||
|
|
||||||
|
started_at: string
|
||||||
|
finished_at: string
|
||||||
|
duration_ms: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDashboardOverview {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
health_score?: number
|
||||||
|
|
||||||
|
system_metrics?: OpsSystemMetricsSnapshot | null
|
||||||
|
job_heartbeats?: OpsJobHeartbeat[] | null
|
||||||
|
|
||||||
|
success_count: number
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
request_count_total: number
|
||||||
|
request_count_sla: number
|
||||||
|
|
||||||
|
token_consumed: number
|
||||||
|
|
||||||
|
sla: number
|
||||||
|
error_rate: number
|
||||||
|
upstream_error_rate: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
|
||||||
|
qps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
tps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
|
||||||
|
duration: OpsPercentiles
|
||||||
|
ttft: OpsPercentiles
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsPercentiles {
|
||||||
|
p50_ms?: number | null
|
||||||
|
p90_ms?: number | null
|
||||||
|
p95_ms?: number | null
|
||||||
|
p99_ms?: number | null
|
||||||
|
avg_ms?: number | null
|
||||||
|
max_ms?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
qps: number
|
||||||
|
tps: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputPlatformBreakdownItem {
|
||||||
|
platform: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputGroupBreakdownItem {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsThroughputTrendPoint[]
|
||||||
|
by_platform?: OpsThroughputPlatformBreakdownItem[]
|
||||||
|
top_groups?: OpsThroughputGroupBreakdownItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestKind = 'success' | 'error'
|
||||||
|
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
|
||||||
|
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
|
||||||
|
|
||||||
|
export interface OpsRequestDetail {
|
||||||
|
kind: OpsRequestKind
|
||||||
|
created_at: string
|
||||||
|
request_id: string
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
model?: string
|
||||||
|
duration_ms?: number | null
|
||||||
|
status_code?: number | null
|
||||||
|
|
||||||
|
error_id?: number | null
|
||||||
|
phase?: string
|
||||||
|
severity?: string
|
||||||
|
message?: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRequestDetailsParams {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
|
||||||
|
kind?: OpsRequestDetailsKind
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
user_id?: number
|
||||||
|
api_key_id?: number
|
||||||
|
account_id?: number
|
||||||
|
|
||||||
|
model?: string
|
||||||
|
request_id?: string
|
||||||
|
q?: string
|
||||||
|
|
||||||
|
min_duration_ms?: number
|
||||||
|
max_duration_ms?: number
|
||||||
|
|
||||||
|
sort?: OpsRequestDetailsSort
|
||||||
|
|
||||||
|
page?: number
|
||||||
|
page_size?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramBucket {
|
||||||
|
range: string
|
||||||
|
count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramResponse {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
total_requests: number
|
||||||
|
buckets: OpsLatencyHistogramBucket[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsErrorTrendPoint[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionItem {
|
||||||
|
status_code: number
|
||||||
|
total: number
|
||||||
|
sla: number
|
||||||
|
business_limited: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionResponse {
|
||||||
|
total: number
|
||||||
|
items: OpsErrorDistributionItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsSystemMetricsSnapshot {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
window_minutes: number
|
||||||
|
|
||||||
|
cpu_usage_percent?: number | null
|
||||||
|
memory_used_mb?: number | null
|
||||||
|
memory_total_mb?: number | null
|
||||||
|
memory_usage_percent?: number | null
|
||||||
|
|
||||||
|
db_ok?: boolean | null
|
||||||
|
redis_ok?: boolean | null
|
||||||
|
|
||||||
|
// Config-derived limits (best-effort) for rendering "current vs max".
|
||||||
|
db_max_open_conns?: number | null
|
||||||
|
redis_pool_size?: number | null
|
||||||
|
|
||||||
|
redis_conn_total?: number | null
|
||||||
|
redis_conn_idle?: number | null
|
||||||
|
|
||||||
|
db_conn_active?: number | null
|
||||||
|
db_conn_idle?: number | null
|
||||||
|
db_conn_waiting?: number | null
|
||||||
|
|
||||||
|
goroutine_count?: number | null
|
||||||
|
concurrency_queue_depth?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsJobHeartbeat {
|
||||||
|
job_name: string
|
||||||
|
last_run_at?: string | null
|
||||||
|
last_success_at?: string | null
|
||||||
|
last_error_at?: string | null
|
||||||
|
last_error?: string | null
|
||||||
|
last_duration_ms?: number | null
|
||||||
|
updated_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PlatformConcurrencyInfo {
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupConcurrencyInfo {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountConcurrencyInfo {
|
||||||
|
account_id: number
|
||||||
|
account_name?: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsConcurrencyStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformConcurrencyInfo>
|
||||||
|
group: Record<string, GroupConcurrencyInfo>
|
||||||
|
account: Record<string, AccountConcurrencyInfo>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
|
||||||
|
const params: Record<string, any> = {}
|
||||||
|
if (platform) {
|
||||||
|
params.platform = platform
|
||||||
|
}
|
||||||
|
if (typeof groupId === 'number' && groupId > 0) {
|
||||||
|
params.group_id = groupId
|
||||||
|
}
|
||||||
|
|
||||||
|
const { data } = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PlatformAvailability {
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupAvailability {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountAvailability {
|
||||||
|
account_id: number
|
||||||
|
account_name: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
status: string
|
||||||
|
is_available: boolean
|
||||||
|
is_rate_limited: boolean
|
||||||
|
rate_limit_reset_at?: string
|
||||||
|
rate_limit_remaining_sec?: number
|
||||||
|
is_overloaded: boolean
|
||||||
|
overload_until?: string
|
||||||
|
overload_remaining_sec?: number
|
||||||
|
has_error: boolean
|
||||||
|
error_message?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAccountAvailabilityStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformAvailability>
|
||||||
|
group: Record<string, GroupAvailability>
|
||||||
|
account: Record<string, AccountAvailability>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
|
||||||
|
const params: Record<string, any> = {}
|
||||||
|
if (platform) {
|
||||||
|
params.platform = platform
|
||||||
|
}
|
||||||
|
if (typeof groupId === 'number' && groupId > 0) {
|
||||||
|
params.group_id = groupId
|
||||||
|
}
|
||||||
|
const { data } = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Subscribe to realtime QPS updates via WebSocket.
|
||||||
|
*
|
||||||
|
* Note: browsers cannot set Authorization headers for WebSockets.
|
||||||
|
* We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
|
||||||
|
* ["sub2api-admin", "jwt.<token>"]
|
||||||
|
*/
|
||||||
|
export interface SubscribeQPSOptions {
|
||||||
|
token?: string | null
|
||||||
|
onOpen?: () => void
|
||||||
|
onClose?: (event: CloseEvent) => void
|
||||||
|
onError?: (event: Event) => void
|
||||||
|
/**
|
||||||
|
* Called when the server closes with an application close code that indicates
|
||||||
|
* reconnecting is not useful (e.g. feature flag disabled).
|
||||||
|
*/
|
||||||
|
onFatalClose?: (event: CloseEvent) => void
|
||||||
|
/**
|
||||||
|
* More granular status updates for UI (connecting/reconnecting/offline/etc).
|
||||||
|
*/
|
||||||
|
onStatusChange?: (status: OpsWSStatus) => void
|
||||||
|
/**
|
||||||
|
* Called when a reconnect is scheduled (helps display "retry in Xs").
|
||||||
|
*/
|
||||||
|
onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
|
||||||
|
wsBaseUrl?: string
|
||||||
|
/**
|
||||||
|
* Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
|
||||||
|
* Set to 0 to disable reconnect.
|
||||||
|
*/
|
||||||
|
maxReconnectAttempts?: number
|
||||||
|
reconnectBaseDelayMs?: number
|
||||||
|
reconnectMaxDelayMs?: number
|
||||||
|
/**
|
||||||
|
* Stale connection detection (heartbeat-by-observation).
|
||||||
|
* If no messages are received within this window, the socket is closed to trigger a reconnect.
|
||||||
|
* Set to 0 to disable.
|
||||||
|
*/
|
||||||
|
staleTimeoutMs?: number
|
||||||
|
/**
|
||||||
|
* How often to check staleness. Only used when `staleTimeoutMs > 0`.
|
||||||
|
*/
|
||||||
|
staleCheckIntervalMs?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
|
||||||
|
|
||||||
|
export const OPS_WS_CLOSE_CODES = {
|
||||||
|
REALTIME_DISABLED: 4001
|
||||||
|
} as const
|
||||||
|
|
||||||
|
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
|
||||||
|
|
||||||
|
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
|
||||||
|
let ws: WebSocket | null = null
|
||||||
|
let reconnectAttempts = 0
|
||||||
|
const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
|
||||||
|
? (options.maxReconnectAttempts as number)
|
||||||
|
: Infinity
|
||||||
|
const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
|
||||||
|
const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
|
||||||
|
let reconnectTimer: ReturnType<typeof setTimeout> | null = null
|
||||||
|
let shouldReconnect = true
|
||||||
|
let isConnecting = false
|
||||||
|
let hasConnectedOnce = false
|
||||||
|
let lastMessageAt = 0
|
||||||
|
const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
|
||||||
|
const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
|
||||||
|
let staleTimer: ReturnType<typeof setInterval> | null = null
|
||||||
|
|
||||||
|
const setStatus = (status: OpsWSStatus) => {
|
||||||
|
options.onStatusChange?.(status)
|
||||||
|
}
|
||||||
|
|
||||||
|
const clearReconnectTimer = () => {
|
||||||
|
if (reconnectTimer) {
|
||||||
|
clearTimeout(reconnectTimer)
|
||||||
|
reconnectTimer = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const clearStaleTimer = () => {
|
||||||
|
if (staleTimer) {
|
||||||
|
clearInterval(staleTimer)
|
||||||
|
staleTimer = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const startStaleTimer = () => {
|
||||||
|
clearStaleTimer()
|
||||||
|
if (!staleTimeoutMs || staleTimeoutMs <= 0) return
|
||||||
|
staleTimer = setInterval(() => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (!ws || ws.readyState !== WebSocket.OPEN) return
|
||||||
|
if (!lastMessageAt) return
|
||||||
|
const ageMs = Date.now() - lastMessageAt
|
||||||
|
if (ageMs > staleTimeoutMs) {
|
||||||
|
// Treat as a half-open connection; closing triggers the normal reconnect path.
|
||||||
|
ws.close()
|
||||||
|
}
|
||||||
|
}, staleCheckIntervalMs)
|
||||||
|
}
|
||||||
|
|
||||||
|
const scheduleReconnect = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
|
||||||
|
|
||||||
|
// If we're offline, wait for the browser to come back online.
|
||||||
|
if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
|
||||||
|
setStatus('offline')
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
|
||||||
|
const delay = Math.min(expDelay, maxDelayMs)
|
||||||
|
const jitter = Math.floor(Math.random() * 250)
|
||||||
|
clearReconnectTimer()
|
||||||
|
reconnectTimer = setTimeout(() => {
|
||||||
|
reconnectAttempts++
|
||||||
|
connect()
|
||||||
|
}, delay + jitter)
|
||||||
|
options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOnline = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
|
||||||
|
connect()
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOffline = () => {
|
||||||
|
setStatus('offline')
|
||||||
|
}
|
||||||
|
|
||||||
|
const connect = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (isConnecting) return
|
||||||
|
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
|
||||||
|
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
|
||||||
|
|
||||||
|
isConnecting = true
|
||||||
|
setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
|
||||||
|
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
|
||||||
|
const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
|
||||||
|
const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
|
||||||
|
|
||||||
|
// Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
|
||||||
|
// Browsers cannot set Authorization headers for WebSockets, so we pass the token via
|
||||||
|
// Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
|
||||||
|
const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
|
||||||
|
const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
|
||||||
|
if (rawToken) protocols.push(`jwt.${rawToken}`)
|
||||||
|
|
||||||
|
ws = new WebSocket(wsURL.toString(), protocols)
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
reconnectAttempts = 0
|
||||||
|
isConnecting = false
|
||||||
|
hasConnectedOnce = true
|
||||||
|
clearReconnectTimer()
|
||||||
|
lastMessageAt = Date.now()
|
||||||
|
startStaleTimer()
|
||||||
|
setStatus('connected')
|
||||||
|
options.onOpen?.()
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onmessage = (e) => {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(e.data)
|
||||||
|
lastMessageAt = Date.now()
|
||||||
|
onMessage(data)
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[OpsWS] Failed to parse message:', err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onerror = (error) => {
|
||||||
|
console.error('[OpsWS] Connection error:', error)
|
||||||
|
options.onError?.(error)
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onclose = (event) => {
|
||||||
|
isConnecting = false
|
||||||
|
options.onClose?.(event)
|
||||||
|
clearStaleTimer()
|
||||||
|
ws = null
|
||||||
|
|
||||||
|
// If the server explicitly tells us to stop reconnecting, honor it.
|
||||||
|
if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
|
||||||
|
shouldReconnect = false
|
||||||
|
clearReconnectTimer()
|
||||||
|
setStatus('closed')
|
||||||
|
options.onFatalClose?.(event)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
scheduleReconnect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
window.addEventListener('online', handleOnline)
|
||||||
|
window.addEventListener('offline', handleOffline)
|
||||||
|
connect()
|
||||||
|
|
||||||
|
return () => {
|
||||||
|
shouldReconnect = false
|
||||||
|
window.removeEventListener('online', handleOnline)
|
||||||
|
window.removeEventListener('offline', handleOffline)
|
||||||
|
clearReconnectTimer()
|
||||||
|
clearStaleTimer()
|
||||||
|
if (ws) ws.close()
|
||||||
|
ws = null
|
||||||
|
setStatus('closed')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsSeverity = string
|
||||||
|
export type OpsPhase = string
|
||||||
|
|
||||||
|
export type AlertSeverity = 'critical' | 'warning' | 'info'
|
||||||
|
export type ThresholdMode = 'count' | 'percentage' | 'both'
|
||||||
|
export type MetricType =
|
||||||
|
| 'success_rate'
|
||||||
|
| 'error_rate'
|
||||||
|
| 'upstream_error_rate'
|
||||||
|
| 'p95_latency_ms'
|
||||||
|
| 'p99_latency_ms'
|
||||||
|
| 'cpu_usage_percent'
|
||||||
|
| 'memory_usage_percent'
|
||||||
|
| 'concurrency_queue_depth'
|
||||||
|
| 'group_available_accounts'
|
||||||
|
| 'group_available_ratio'
|
||||||
|
| 'group_rate_limit_ratio'
|
||||||
|
| 'account_rate_limited_count'
|
||||||
|
| 'account_error_count'
|
||||||
|
| 'account_error_ratio'
|
||||||
|
| 'overload_account_count'
|
||||||
|
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
|
||||||
|
|
||||||
|
export interface AlertRule {
|
||||||
|
id?: number
|
||||||
|
name: string
|
||||||
|
description?: string
|
||||||
|
enabled: boolean
|
||||||
|
metric_type: MetricType
|
||||||
|
operator: Operator
|
||||||
|
threshold: number
|
||||||
|
window_minutes: number
|
||||||
|
sustained_minutes: number
|
||||||
|
severity: OpsSeverity
|
||||||
|
cooldown_minutes: number
|
||||||
|
notify_email: boolean
|
||||||
|
filters?: Record<string, any>
|
||||||
|
created_at?: string
|
||||||
|
updated_at?: string
|
||||||
|
last_triggered_at?: string | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AlertEvent {
|
||||||
|
id: number
|
||||||
|
rule_id: number
|
||||||
|
severity: OpsSeverity | string
|
||||||
|
status: 'firing' | 'resolved' | string
|
||||||
|
title?: string
|
||||||
|
description?: string
|
||||||
|
metric_value?: number
|
||||||
|
threshold_value?: number
|
||||||
|
dimensions?: Record<string, any>
|
||||||
|
fired_at: string
|
||||||
|
resolved_at?: string | null
|
||||||
|
email_sent: boolean
|
||||||
|
created_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface EmailNotificationConfig {
|
||||||
|
alert: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
min_severity: AlertSeverity | ''
|
||||||
|
rate_limit_per_hour: number
|
||||||
|
batching_window_seconds: number
|
||||||
|
include_resolved_alerts: boolean
|
||||||
|
}
|
||||||
|
report: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
daily_summary_enabled: boolean
|
||||||
|
daily_summary_schedule: string
|
||||||
|
weekly_summary_enabled: boolean
|
||||||
|
weekly_summary_schedule: string
|
||||||
|
error_digest_enabled: boolean
|
||||||
|
error_digest_schedule: string
|
||||||
|
error_digest_min_count: number
|
||||||
|
account_health_enabled: boolean
|
||||||
|
account_health_schedule: string
|
||||||
|
account_health_error_rate_threshold: number
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDistributedLockSettings {
|
||||||
|
enabled: boolean
|
||||||
|
key: string
|
||||||
|
ttl_seconds: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAlertRuntimeSettings {
|
||||||
|
evaluation_interval_seconds: number
|
||||||
|
distributed_lock: OpsDistributedLockSettings
|
||||||
|
silencing: {
|
||||||
|
enabled: boolean
|
||||||
|
global_until_rfc3339: string
|
||||||
|
global_reason: string
|
||||||
|
entries?: Array<{
|
||||||
|
rule_id?: number
|
||||||
|
severities?: Array<OpsSeverity | string>
|
||||||
|
until_rfc3339: string
|
||||||
|
reason: string
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAdvancedSettings {
|
||||||
|
data_retention: OpsDataRetentionSettings
|
||||||
|
aggregation: OpsAggregationSettings
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDataRetentionSettings {
|
||||||
|
cleanup_enabled: boolean
|
||||||
|
cleanup_schedule: string
|
||||||
|
error_log_retention_days: number
|
||||||
|
minute_metrics_retention_days: number
|
||||||
|
hourly_metrics_retention_days: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAggregationSettings {
|
||||||
|
aggregation_enabled: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorLog {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
phase: OpsPhase
|
||||||
|
type: string
|
||||||
|
severity: OpsSeverity
|
||||||
|
status_code: number
|
||||||
|
platform: string
|
||||||
|
model: string
|
||||||
|
latency_ms?: number | null
|
||||||
|
client_request_id: string
|
||||||
|
request_id: string
|
||||||
|
message: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
client_ip?: string | null
|
||||||
|
request_path?: string
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDetail extends OpsErrorLog {
|
||||||
|
error_body: string
|
||||||
|
user_agent: string
|
||||||
|
|
||||||
|
// Upstream context (optional; enriched by gateway services)
|
||||||
|
upstream_status_code?: number | null
|
||||||
|
upstream_error_message?: string
|
||||||
|
upstream_error_detail?: string
|
||||||
|
upstream_errors?: string
|
||||||
|
|
||||||
|
auth_latency_ms?: number | null
|
||||||
|
routing_latency_ms?: number | null
|
||||||
|
upstream_latency_ms?: number | null
|
||||||
|
response_latency_ms?: number | null
|
||||||
|
time_to_first_token_ms?: number | null
|
||||||
|
|
||||||
|
request_body: string
|
||||||
|
request_body_truncated: boolean
|
||||||
|
request_body_bytes?: number | null
|
||||||
|
|
||||||
|
is_business_limited: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
|
||||||
|
|
||||||
|
export async function getDashboardOverview(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsDashboardOverview> {
|
||||||
|
const { data } = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getThroughputTrend(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsThroughputTrendResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getLatencyHistogram(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsLatencyHistogramResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorTrend(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsErrorTrendResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorDistribution(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsErrorDistributionResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listErrorLogs(params: {
|
||||||
|
page?: number
|
||||||
|
page_size?: number
|
||||||
|
time_range?: string
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
phase?: string
|
||||||
|
q?: string
|
||||||
|
status_codes?: string
|
||||||
|
}): Promise<OpsErrorLogsResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
|
||||||
|
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Alert rules
|
||||||
|
export async function listAlertRules(): Promise<AlertRule[]> {
|
||||||
|
const { data } = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
|
||||||
|
const { data } = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
|
||||||
|
const { data } = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function deleteAlertRule(id: number): Promise<void> {
|
||||||
|
await apiClient.delete(`/admin/ops/alert-rules/${id}`)
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
|
||||||
|
const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Email notification config
|
||||||
|
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
|
||||||
|
const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
|
||||||
|
const { data } = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runtime settings (DB-backed)
|
||||||
|
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
|
||||||
|
const { data } = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
|
||||||
|
const { data } = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advanced settings (DB-backed)
|
||||||
|
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
|
||||||
|
const { data } = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
|
||||||
|
const { data } = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export const opsAPI = {
|
||||||
|
getDashboardOverview,
|
||||||
|
getThroughputTrend,
|
||||||
|
getLatencyHistogram,
|
||||||
|
getErrorTrend,
|
||||||
|
getErrorDistribution,
|
||||||
|
getConcurrencyStats,
|
||||||
|
getAccountAvailabilityStats,
|
||||||
|
subscribeQPS,
|
||||||
|
listErrorLogs,
|
||||||
|
getErrorLogDetail,
|
||||||
|
retryErrorRequest,
|
||||||
|
listRequestDetails,
|
||||||
|
listAlertRules,
|
||||||
|
createAlertRule,
|
||||||
|
updateAlertRule,
|
||||||
|
deleteAlertRule,
|
||||||
|
listAlertEvents,
|
||||||
|
getEmailNotificationConfig,
|
||||||
|
updateEmailNotificationConfig,
|
||||||
|
getAlertRuntimeSettings,
|
||||||
|
updateAlertRuntimeSettings,
|
||||||
|
getAdvancedSettings,
|
||||||
|
updateAdvancedSettings
|
||||||
|
}
|
||||||
|
|
||||||
|
export default opsAPI
|
||||||
@@ -35,14 +35,29 @@ export interface SystemSettings {
|
|||||||
turnstile_enabled: boolean
|
turnstile_enabled: boolean
|
||||||
turnstile_site_key: string
|
turnstile_site_key: string
|
||||||
turnstile_secret_key_configured: boolean
|
turnstile_secret_key_configured: boolean
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
|
||||||
|
// LinuxDo Connect OAuth settings
|
||||||
linuxdo_connect_enabled: boolean
|
linuxdo_connect_enabled: boolean
|
||||||
linuxdo_connect_client_id: string
|
linuxdo_connect_client_id: string
|
||||||
linuxdo_connect_client_secret_configured: boolean
|
linuxdo_connect_client_secret_configured: boolean
|
||||||
linuxdo_connect_redirect_url: string
|
linuxdo_connect_redirect_url: string
|
||||||
|
|
||||||
|
// Model fallback configuration
|
||||||
|
enable_model_fallback: boolean
|
||||||
|
fallback_model_anthropic: string
|
||||||
|
fallback_model_openai: string
|
||||||
|
fallback_model_gemini: string
|
||||||
|
fallback_model_antigravity: string
|
||||||
|
|
||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
enable_identity_patch: boolean
|
enable_identity_patch: boolean
|
||||||
identity_patch_prompt: string
|
identity_patch_prompt: string
|
||||||
|
|
||||||
|
// Ops Monitoring (vNext)
|
||||||
|
ops_monitoring_enabled: boolean
|
||||||
|
ops_realtime_monitoring_enabled: boolean
|
||||||
|
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
|
||||||
|
ops_metrics_interval_seconds: number
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface UpdateSettingsRequest {
|
export interface UpdateSettingsRequest {
|
||||||
@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest {
|
|||||||
linuxdo_connect_client_id?: string
|
linuxdo_connect_client_id?: string
|
||||||
linuxdo_connect_client_secret?: string
|
linuxdo_connect_client_secret?: string
|
||||||
linuxdo_connect_redirect_url?: string
|
linuxdo_connect_redirect_url?: string
|
||||||
|
enable_model_fallback?: boolean
|
||||||
|
fallback_model_anthropic?: string
|
||||||
|
fallback_model_openai?: string
|
||||||
|
fallback_model_gemini?: string
|
||||||
|
fallback_model_antigravity?: string
|
||||||
enable_identity_patch?: boolean
|
enable_identity_patch?: boolean
|
||||||
identity_patch_prompt?: string
|
identity_patch_prompt?: string
|
||||||
|
ops_monitoring_enabled?: boolean
|
||||||
|
ops_realtime_monitoring_enabled?: boolean
|
||||||
|
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
|
||||||
|
ops_metrics_interval_seconds?: number
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
|
|||||||
return response
|
return response
|
||||||
},
|
},
|
||||||
(error: AxiosError<ApiResponse<unknown>>) => {
|
(error: AxiosError<ApiResponse<unknown>>) => {
|
||||||
|
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
|
||||||
|
// Otherwise we'd misclassify it as a generic "network error".
|
||||||
|
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
|
||||||
|
return Promise.reject(error)
|
||||||
|
}
|
||||||
|
|
||||||
// Handle common errors
|
// Handle common errors
|
||||||
if (error.response) {
|
if (error.response) {
|
||||||
const { status, data } = error.response
|
const { status, data } = error.response
|
||||||
|
const url = String(error.config?.url || '')
|
||||||
|
|
||||||
|
// Validate `data` shape to avoid HTML error pages breaking our error handling.
|
||||||
|
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
|
||||||
|
|
||||||
|
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
|
||||||
|
// from ops pages to avoid broken UI states.
|
||||||
|
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
|
||||||
|
try {
|
||||||
|
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
|
||||||
|
} catch {
|
||||||
|
// ignore localStorage failures
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
|
||||||
|
} catch {
|
||||||
|
// ignore event failures
|
||||||
|
}
|
||||||
|
|
||||||
|
if (window.location.pathname.startsWith('/admin/ops')) {
|
||||||
|
window.location.href = '/admin/settings'
|
||||||
|
}
|
||||||
|
|
||||||
|
return Promise.reject({
|
||||||
|
status,
|
||||||
|
code: 'OPS_DISABLED',
|
||||||
|
message: apiData.message || error.message,
|
||||||
|
url
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// 401: Unauthorized - clear token and redirect to login
|
// 401: Unauthorized - clear token and redirect to login
|
||||||
if (status === 401) {
|
if (status === 401) {
|
||||||
@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
|
|||||||
// Return structured error
|
// Return structured error
|
||||||
return Promise.reject({
|
return Promise.reject({
|
||||||
status,
|
status,
|
||||||
code: data?.code,
|
code: apiData.code,
|
||||||
message: data?.message || error.message
|
message: apiData.message || apiData.detail || error.message
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
44
frontend/src/components/common/HelpTooltip.vue
Normal file
44
frontend/src/components/common/HelpTooltip.vue
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
<script setup lang="ts">
|
||||||
|
import { ref } from 'vue'
|
||||||
|
|
||||||
|
defineProps<{
|
||||||
|
content?: string
|
||||||
|
}>()
|
||||||
|
|
||||||
|
const show = ref(false)
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<template>
|
||||||
|
<div
|
||||||
|
class="group relative ml-1 inline-flex items-center align-middle"
|
||||||
|
@mouseenter="show = true"
|
||||||
|
@mouseleave="show = false"
|
||||||
|
>
|
||||||
|
<!-- Trigger Icon -->
|
||||||
|
<slot name="trigger">
|
||||||
|
<svg
|
||||||
|
class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
|
||||||
|
fill="none"
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
stroke="currentColor"
|
||||||
|
stroke-width="2"
|
||||||
|
>
|
||||||
|
<path
|
||||||
|
stroke-linecap="round"
|
||||||
|
stroke-linejoin="round"
|
||||||
|
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
|
||||||
|
/>
|
||||||
|
</svg>
|
||||||
|
</slot>
|
||||||
|
|
||||||
|
<!-- Popover Content -->
|
||||||
|
<div
|
||||||
|
v-show="show"
|
||||||
|
class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
|
||||||
|
>
|
||||||
|
<slot>{{ content }}</slot>
|
||||||
|
<div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
@@ -67,12 +67,13 @@
|
|||||||
:aria-selected="isSelected(option)"
|
:aria-selected="isSelected(option)"
|
||||||
:aria-disabled="isOptionDisabled(option)"
|
:aria-disabled="isOptionDisabled(option)"
|
||||||
@click.stop="!isOptionDisabled(option) && selectOption(option)"
|
@click.stop="!isOptionDisabled(option) && selectOption(option)"
|
||||||
@mouseenter="focusedIndex = index"
|
@mouseenter="handleOptionMouseEnter(option, index)"
|
||||||
:class="[
|
:class="[
|
||||||
'select-option',
|
'select-option',
|
||||||
|
isGroupHeaderOption(option) && 'select-option-group',
|
||||||
isSelected(option) && 'select-option-selected',
|
isSelected(option) && 'select-option-selected',
|
||||||
isOptionDisabled(option) && 'select-option-disabled',
|
isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
|
||||||
focusedIndex === index && 'select-option-focused'
|
focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
|
||||||
]"
|
]"
|
||||||
>
|
>
|
||||||
<slot name="option" :option="option" :selected="isSelected(option)">
|
<slot name="option" :option="option" :selected="isSelected(option)">
|
||||||
@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const isGroupHeaderOption = (option: any): boolean => {
|
||||||
|
if (typeof option === 'object' && option !== null) {
|
||||||
|
return option.kind === 'group'
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
const selectedOption = computed(() => {
|
const selectedOption = computed(() => {
|
||||||
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
|
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
|
||||||
})
|
})
|
||||||
@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
|
|||||||
return getOptionValue(option) === props.modelValue
|
return getOptionValue(option) === props.modelValue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const findNextEnabledIndex = (startIndex: number): number => {
|
||||||
|
const opts = filteredOptions.value
|
||||||
|
if (opts.length === 0) return -1
|
||||||
|
for (let offset = 0; offset < opts.length; offset++) {
|
||||||
|
const idx = (startIndex + offset) % opts.length
|
||||||
|
if (!isOptionDisabled(opts[idx])) return idx
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
const findPrevEnabledIndex = (startIndex: number): number => {
|
||||||
|
const opts = filteredOptions.value
|
||||||
|
if (opts.length === 0) return -1
|
||||||
|
for (let offset = 0; offset < opts.length; offset++) {
|
||||||
|
const idx = (startIndex - offset + opts.length) % opts.length
|
||||||
|
if (!isOptionDisabled(opts[idx])) return idx
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOptionMouseEnter = (option: any, index: number) => {
|
||||||
|
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
|
||||||
|
focusedIndex.value = index
|
||||||
|
}
|
||||||
|
|
||||||
// Update trigger rect periodically while open to follow scroll/resize
|
// Update trigger rect periodically while open to follow scroll/resize
|
||||||
const updateTriggerRect = () => {
|
const updateTriggerRect = () => {
|
||||||
if (containerRef.value) {
|
if (containerRef.value) {
|
||||||
@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
|
|||||||
if (open) {
|
if (open) {
|
||||||
calculateDropdownPosition()
|
calculateDropdownPosition()
|
||||||
// Reset focused index to current selection or first item
|
// Reset focused index to current selection or first item
|
||||||
const selectedIdx = filteredOptions.value.findIndex(isSelected)
|
if (filteredOptions.value.length === 0) {
|
||||||
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0
|
focusedIndex.value = -1
|
||||||
|
} else {
|
||||||
|
const selectedIdx = filteredOptions.value.findIndex(isSelected)
|
||||||
|
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
|
||||||
|
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
|
||||||
|
? findNextEnabledIndex(initialIdx + 1)
|
||||||
|
: initialIdx
|
||||||
|
}
|
||||||
|
|
||||||
if (props.searchable) {
|
if (props.searchable) {
|
||||||
nextTick(() => searchInputRef.value?.focus())
|
nextTick(() => searchInputRef.value?.focus())
|
||||||
@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
|
|||||||
switch (e.key) {
|
switch (e.key) {
|
||||||
case 'ArrowDown':
|
case 'ArrowDown':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length
|
focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
|
||||||
scrollToFocused()
|
if (focusedIndex.value >= 0) scrollToFocused()
|
||||||
break
|
break
|
||||||
case 'ArrowUp':
|
case 'ArrowUp':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length
|
focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
|
||||||
scrollToFocused()
|
if (focusedIndex.value >= 0) scrollToFocused()
|
||||||
break
|
break
|
||||||
case 'Enter':
|
case 'Enter':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
@@ -441,6 +481,17 @@ onUnmounted(() => {
|
|||||||
@apply cursor-not-allowed opacity-40;
|
@apply cursor-not-allowed opacity-40;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.select-dropdown-portal .select-option-group {
|
||||||
|
@apply cursor-default select-none;
|
||||||
|
@apply bg-gray-50 dark:bg-dark-900;
|
||||||
|
@apply text-[11px] font-bold uppercase tracking-wider;
|
||||||
|
@apply text-gray-500 dark:text-gray-400;
|
||||||
|
}
|
||||||
|
|
||||||
|
.select-dropdown-portal .select-option-group:hover {
|
||||||
|
@apply bg-gray-50 dark:bg-dark-900;
|
||||||
|
}
|
||||||
|
|
||||||
.select-dropdown-portal .select-option-label {
|
.select-dropdown-portal .select-option-label {
|
||||||
@apply flex-1 min-w-0 truncate text-left;
|
@apply flex-1 min-w-0 truncate text-left;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,8 +28,8 @@
|
|||||||
{{ platformDescription }}
|
{{ platformDescription }}
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<!-- Client Tabs (only for Antigravity platform) -->
|
<!-- Client Tabs -->
|
||||||
<div v-if="platform === 'antigravity'" class="border-b border-gray-200 dark:border-dark-700">
|
<div v-if="clientTabs.length" class="border-b border-gray-200 dark:border-dark-700">
|
||||||
<nav class="-mb-px flex space-x-6" aria-label="Client">
|
<nav class="-mb-px flex space-x-6" aria-label="Client">
|
||||||
<button
|
<button
|
||||||
v-for="tab in clientTabs"
|
v-for="tab in clientTabs"
|
||||||
@@ -51,7 +51,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- OS/Shell Tabs -->
|
<!-- OS/Shell Tabs -->
|
||||||
<div class="border-b border-gray-200 dark:border-dark-700">
|
<div v-if="showShellTabs" class="border-b border-gray-200 dark:border-dark-700">
|
||||||
<nav class="-mb-px flex space-x-4" aria-label="Tabs">
|
<nav class="-mb-px flex space-x-4" aria-label="Tabs">
|
||||||
<button
|
<button
|
||||||
v-for="tab in currentTabs"
|
v-for="tab in currentTabs"
|
||||||
@@ -111,7 +111,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Usage Note -->
|
<!-- Usage Note -->
|
||||||
<div class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
|
<div v-if="showPlatformNote" class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
|
||||||
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
|
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
|
||||||
<p class="text-sm text-blue-700 dark:text-blue-300">
|
<p class="text-sm text-blue-700 dark:text-blue-300">
|
||||||
{{ platformNote }}
|
{{ platformNote }}
|
||||||
@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard()
|
|||||||
|
|
||||||
const copiedIndex = ref<number | null>(null)
|
const copiedIndex = ref<number | null>(null)
|
||||||
const activeTab = ref<string>('unix')
|
const activeTab = ref<string>('unix')
|
||||||
const activeClientTab = ref<string>('claude') // Level 1 tab for antigravity platform
|
const activeClientTab = ref<string>('claude')
|
||||||
|
|
||||||
// Reset tabs when platform changes
|
// Reset tabs when platform changes
|
||||||
watch(() => props.platform, (newPlatform) => {
|
const defaultClientTab = computed(() => {
|
||||||
activeTab.value = 'unix'
|
switch (props.platform) {
|
||||||
if (newPlatform === 'antigravity') {
|
case 'openai':
|
||||||
activeClientTab.value = 'claude'
|
return 'codex'
|
||||||
|
case 'gemini':
|
||||||
|
return 'gemini'
|
||||||
|
case 'antigravity':
|
||||||
|
return 'claude'
|
||||||
|
default:
|
||||||
|
return 'claude'
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
// Reset shell tab when client changes (for antigravity)
|
watch(() => props.platform, () => {
|
||||||
|
activeTab.value = 'unix'
|
||||||
|
activeClientTab.value = defaultClientTab.value
|
||||||
|
}, { immediate: true })
|
||||||
|
|
||||||
|
// Reset shell tab when client changes
|
||||||
watch(activeClientTab, () => {
|
watch(activeClientTab, () => {
|
||||||
activeTab.value = 'unix'
|
activeTab.value = 'unix'
|
||||||
})
|
})
|
||||||
@@ -251,11 +262,32 @@ const SparkleIcon = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Client tabs for Antigravity platform (Level 1)
|
const clientTabs = computed((): TabConfig[] => {
|
||||||
const clientTabs = computed((): TabConfig[] => [
|
if (!props.platform) return []
|
||||||
{ id: 'claude', label: t('keys.useKeyModal.antigravity.claudeCode'), icon: TerminalIcon },
|
switch (props.platform) {
|
||||||
{ id: 'gemini', label: t('keys.useKeyModal.antigravity.geminiCli'), icon: SparkleIcon }
|
case 'openai':
|
||||||
])
|
return [
|
||||||
|
{ id: 'codex', label: t('keys.useKeyModal.cliTabs.codexCli'), icon: TerminalIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
case 'gemini':
|
||||||
|
return [
|
||||||
|
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
case 'antigravity':
|
||||||
|
return [
|
||||||
|
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
|
||||||
|
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
default:
|
||||||
|
return [
|
||||||
|
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
// Shell tabs (3 types for environment variable based configs)
|
// Shell tabs (3 types for environment variable based configs)
|
||||||
const shellTabs: TabConfig[] = [
|
const shellTabs: TabConfig[] = [
|
||||||
@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [
|
|||||||
{ id: 'windows', label: 'Windows', icon: WindowsIcon }
|
{ id: 'windows', label: 'Windows', icon: WindowsIcon }
|
||||||
]
|
]
|
||||||
|
|
||||||
|
const showShellTabs = computed(() => activeClientTab.value !== 'opencode')
|
||||||
|
|
||||||
const currentTabs = computed(() => {
|
const currentTabs = computed(() => {
|
||||||
|
if (!showShellTabs.value) return []
|
||||||
if (props.platform === 'openai') {
|
if (props.platform === 'openai') {
|
||||||
return openaiTabs // 2 tabs: unix, windows
|
return openaiTabs
|
||||||
}
|
}
|
||||||
// All other platforms (anthropic, gemini, antigravity) use shell tabs
|
|
||||||
return shellTabs
|
return shellTabs
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -308,6 +342,8 @@ const platformNote = computed(() => {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
const showPlatformNote = computed(() => activeClientTab.value !== 'opencode')
|
||||||
|
|
||||||
const escapeHtml = (value: string) => value
|
const escapeHtml = (value: string) => value
|
||||||
.replace(/&/g, '&')
|
.replace(/&/g, '&')
|
||||||
.replace(/</g, '<')
|
.replace(/</g, '<')
|
||||||
@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value)
|
|||||||
const currentFiles = computed((): FileConfig[] => {
|
const currentFiles = computed((): FileConfig[] => {
|
||||||
const baseUrl = props.baseUrl || window.location.origin
|
const baseUrl = props.baseUrl || window.location.origin
|
||||||
const apiKey = props.apiKey
|
const apiKey = props.apiKey
|
||||||
|
const baseRoot = baseUrl.replace(/\/v1\/?$/, '').replace(/\/+$/, '')
|
||||||
|
const ensureV1 = (value: string) => {
|
||||||
|
const trimmed = value.replace(/\/+$/, '')
|
||||||
|
return trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`
|
||||||
|
}
|
||||||
|
const apiBase = ensureV1(baseRoot)
|
||||||
|
const antigravityBase = ensureV1(`${baseRoot}/antigravity`)
|
||||||
|
const antigravityGeminiBase = (() => {
|
||||||
|
const trimmed = `${baseRoot}/antigravity`.replace(/\/+$/, '')
|
||||||
|
return trimmed.endsWith('/v1beta') ? trimmed : `${trimmed}/v1beta`
|
||||||
|
})()
|
||||||
|
|
||||||
|
if (activeClientTab.value === 'opencode') {
|
||||||
|
switch (props.platform) {
|
||||||
|
case 'anthropic':
|
||||||
|
return [generateOpenCodeConfig('anthropic', apiBase, apiKey)]
|
||||||
|
case 'openai':
|
||||||
|
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
|
||||||
|
case 'gemini':
|
||||||
|
return [generateOpenCodeConfig('gemini', apiBase, apiKey)]
|
||||||
|
case 'antigravity':
|
||||||
|
return [
|
||||||
|
generateOpenCodeConfig('antigravity-claude', antigravityBase, apiKey, 'opencode.json (Claude)'),
|
||||||
|
generateOpenCodeConfig('antigravity-gemini', antigravityGeminiBase, apiKey, 'opencode.json (Gemini)')
|
||||||
|
]
|
||||||
|
default:
|
||||||
|
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
switch (props.platform) {
|
switch (props.platform) {
|
||||||
case 'openai':
|
case 'openai':
|
||||||
@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => {
|
|||||||
case 'gemini':
|
case 'gemini':
|
||||||
return [generateGeminiCliContent(baseUrl, apiKey)]
|
return [generateGeminiCliContent(baseUrl, apiKey)]
|
||||||
case 'antigravity':
|
case 'antigravity':
|
||||||
// Both Claude Code and Gemini CLI need /antigravity suffix for antigravity platform
|
if (activeClientTab.value === 'gemini') {
|
||||||
if (activeClientTab.value === 'claude') {
|
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
|
||||||
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
|
|
||||||
}
|
}
|
||||||
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
|
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
|
||||||
default: // anthropic
|
default:
|
||||||
return generateAnthropicFiles(baseUrl, apiKey)
|
return generateAnthropicFiles(baseUrl, apiKey)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -456,6 +520,76 @@ requires_openai_auth = true`
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function generateOpenCodeConfig(platform: string, baseUrl: string, apiKey: string, pathLabel?: string): FileConfig {
|
||||||
|
const provider: Record<string, any> = {
|
||||||
|
[platform]: {
|
||||||
|
options: {
|
||||||
|
baseURL: baseUrl,
|
||||||
|
apiKey,
|
||||||
|
...(platform === 'openai' ? { store: false } : {})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const openaiModels = {
|
||||||
|
'gpt-5.2-codex': {
|
||||||
|
name: 'GPT-5.2 Codex',
|
||||||
|
variants: {
|
||||||
|
low: {},
|
||||||
|
medium: {},
|
||||||
|
high: {},
|
||||||
|
xhigh: {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const geminiModels = {
|
||||||
|
'gemini-3-pro-high': { name: 'Gemini 3 Pro High' },
|
||||||
|
'gemini-3-pro-low': { name: 'Gemini 3 Pro Low' },
|
||||||
|
'gemini-3-pro-preview': { name: 'Gemini 3 Pro Preview' },
|
||||||
|
'gemini-3-pro-image': { name: 'Gemini 3 Pro Image' },
|
||||||
|
'gemini-3-flash': { name: 'Gemini 3 Flash' },
|
||||||
|
'gemini-2.5-flash-thinking': { name: 'Gemini 2.5 Flash Thinking' },
|
||||||
|
'gemini-2.5-flash': { name: 'Gemini 2.5 Flash' },
|
||||||
|
'gemini-2.5-flash-lite': { name: 'Gemini 2.5 Flash Lite' }
|
||||||
|
}
|
||||||
|
const claudeModels = {
|
||||||
|
'claude-opus-4-5-thinking': { name: 'Claude Opus 4.5 Thinking' },
|
||||||
|
'claude-sonnet-4-5-thinking': { name: 'Claude Sonnet 4.5 Thinking' },
|
||||||
|
'claude-sonnet-4-5': { name: 'Claude Sonnet 4.5' }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (platform === 'gemini') {
|
||||||
|
provider[platform].npm = '@ai-sdk/google'
|
||||||
|
provider[platform].models = geminiModels
|
||||||
|
} else if (platform === 'anthropic') {
|
||||||
|
provider[platform].npm = '@ai-sdk/anthropic'
|
||||||
|
} else if (platform === 'antigravity-claude') {
|
||||||
|
provider[platform].npm = '@ai-sdk/anthropic'
|
||||||
|
provider[platform].name = 'Antigravity (Claude)'
|
||||||
|
provider[platform].models = claudeModels
|
||||||
|
} else if (platform === 'antigravity-gemini') {
|
||||||
|
provider[platform].npm = '@ai-sdk/google'
|
||||||
|
provider[platform].name = 'Antigravity (Gemini)'
|
||||||
|
provider[platform].models = geminiModels
|
||||||
|
} else if (platform === 'openai') {
|
||||||
|
provider[platform].models = openaiModels
|
||||||
|
}
|
||||||
|
|
||||||
|
const content = JSON.stringify(
|
||||||
|
{
|
||||||
|
provider,
|
||||||
|
$schema: 'https://opencode.ai/config.json'
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
path: pathLabel ?? 'opencode.json',
|
||||||
|
content,
|
||||||
|
hint: t('keys.useKeyModal.opencode.hint')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const copyContent = async (content: string, index: number) => {
|
const copyContent = async (content: string, index: number) => {
|
||||||
const success = await clipboardCopy(content, t('keys.copied'))
|
const success = await clipboardCopy(content, t('keys.copied'))
|
||||||
if (success) {
|
if (success) {
|
||||||
|
|||||||
@@ -144,10 +144,10 @@
|
|||||||
</template>
|
</template>
|
||||||
|
|
||||||
<script setup lang="ts">
|
<script setup lang="ts">
|
||||||
import { computed, h, ref } from 'vue'
|
import { computed, h, onMounted, ref, watch } from 'vue'
|
||||||
import { useRoute } from 'vue-router'
|
import { useRoute } from 'vue-router'
|
||||||
import { useI18n } from 'vue-i18n'
|
import { useI18n } from 'vue-i18n'
|
||||||
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
|
import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
|
||||||
import VersionBadge from '@/components/common/VersionBadge.vue'
|
import VersionBadge from '@/components/common/VersionBadge.vue'
|
||||||
|
|
||||||
const { t } = useI18n()
|
const { t } = useI18n()
|
||||||
@@ -156,6 +156,7 @@ const route = useRoute()
|
|||||||
const appStore = useAppStore()
|
const appStore = useAppStore()
|
||||||
const authStore = useAuthStore()
|
const authStore = useAuthStore()
|
||||||
const onboardingStore = useOnboardingStore()
|
const onboardingStore = useOnboardingStore()
|
||||||
|
const adminSettingsStore = useAdminSettingsStore()
|
||||||
|
|
||||||
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
|
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
|
||||||
const mobileOpen = computed(() => appStore.mobileOpen)
|
const mobileOpen = computed(() => appStore.mobileOpen)
|
||||||
@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
|
|||||||
const adminNavItems = computed(() => {
|
const adminNavItems = computed(() => {
|
||||||
const baseItems = [
|
const baseItems = [
|
||||||
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
|
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
|
||||||
|
...(adminSettingsStore.opsMonitoringEnabled
|
||||||
|
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
|
||||||
|
: []),
|
||||||
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
|
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
|
||||||
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
|
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
|
||||||
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
|
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
|
||||||
@@ -511,6 +515,23 @@ if (
|
|||||||
isDark.value = true
|
isDark.value = true
|
||||||
document.documentElement.classList.add('dark')
|
document.documentElement.classList.add('dark')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fetch admin settings (for feature-gated nav items like Ops).
|
||||||
|
watch(
|
||||||
|
isAdmin,
|
||||||
|
(v) => {
|
||||||
|
if (v) {
|
||||||
|
adminSettingsStore.fetch()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ immediate: true }
|
||||||
|
)
|
||||||
|
|
||||||
|
onMounted(() => {
|
||||||
|
if (isAdmin.value) {
|
||||||
|
adminSettingsStore.fetch()
|
||||||
|
}
|
||||||
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style scoped>
|
<style scoped>
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ export default {
|
|||||||
noData: 'No data',
|
noData: 'No data',
|
||||||
success: 'Success',
|
success: 'Success',
|
||||||
error: 'Error',
|
error: 'Error',
|
||||||
|
critical: 'Critical',
|
||||||
warning: 'Warning',
|
warning: 'Warning',
|
||||||
info: 'Info',
|
info: 'Info',
|
||||||
active: 'Active',
|
active: 'Active',
|
||||||
@@ -145,9 +146,11 @@ export default {
|
|||||||
copiedToClipboard: 'Copied to clipboard',
|
copiedToClipboard: 'Copied to clipboard',
|
||||||
copyFailed: 'Failed to copy',
|
copyFailed: 'Failed to copy',
|
||||||
contactSupport: 'Contact Support',
|
contactSupport: 'Contact Support',
|
||||||
|
add: 'Add',
|
||||||
|
invalidEmail: 'Please enter a valid email address',
|
||||||
optional: 'optional',
|
optional: 'optional',
|
||||||
selectOption: 'Select an option',
|
selectOption: 'Select an option',
|
||||||
searchPlaceholder: 'Search...',
|
searchPlaceholder: 'Search...',
|
||||||
noOptionsFound: 'No options found',
|
noOptionsFound: 'No options found',
|
||||||
noGroupsAvailable: 'No groups available',
|
noGroupsAvailable: 'No groups available',
|
||||||
unknownError: 'Unknown error occurred',
|
unknownError: 'Unknown error occurred',
|
||||||
@@ -178,6 +181,7 @@ export default {
|
|||||||
accounts: 'Accounts',
|
accounts: 'Accounts',
|
||||||
proxies: 'Proxies',
|
proxies: 'Proxies',
|
||||||
redeemCodes: 'Redeem Codes',
|
redeemCodes: 'Redeem Codes',
|
||||||
|
ops: 'Ops',
|
||||||
promoCodes: 'Promo Codes',
|
promoCodes: 'Promo Codes',
|
||||||
settings: 'Settings',
|
settings: 'Settings',
|
||||||
myAccount: 'My Account',
|
myAccount: 'My Account',
|
||||||
@@ -364,6 +368,12 @@ export default {
|
|||||||
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
|
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
|
||||||
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
|
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
|
||||||
},
|
},
|
||||||
|
cliTabs: {
|
||||||
|
claudeCode: 'Claude Code',
|
||||||
|
geminiCli: 'Gemini CLI',
|
||||||
|
codexCli: 'Codex CLI',
|
||||||
|
opencode: 'OpenCode',
|
||||||
|
},
|
||||||
antigravity: {
|
antigravity: {
|
||||||
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
|
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
|
||||||
claudeCode: 'Claude Code',
|
claudeCode: 'Claude Code',
|
||||||
@@ -376,6 +386,11 @@ export default {
|
|||||||
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
|
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
|
||||||
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
|
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
|
||||||
},
|
},
|
||||||
|
opencode: {
|
||||||
|
title: 'OpenCode Example',
|
||||||
|
subtitle: 'opencode.json',
|
||||||
|
hint: 'This is a group configuration example. Adjust model and options as needed.',
|
||||||
|
},
|
||||||
},
|
},
|
||||||
customKeyLabel: 'Custom Key',
|
customKeyLabel: 'Custom Key',
|
||||||
customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
|
customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
|
||||||
@@ -1826,6 +1841,524 @@ export default {
|
|||||||
ipAddress: 'IP'
|
ipAddress: 'IP'
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Ops Monitoring
|
||||||
|
ops: {
|
||||||
|
title: 'Ops Monitoring',
|
||||||
|
description: 'Operational monitoring and troubleshooting',
|
||||||
|
// Dashboard
|
||||||
|
systemHealth: 'System Health',
|
||||||
|
overview: 'Overview',
|
||||||
|
noSystemMetrics: 'No system metrics collected yet.',
|
||||||
|
collectedAt: 'Collected at:',
|
||||||
|
window: 'window',
|
||||||
|
cpu: 'CPU',
|
||||||
|
memory: 'Memory',
|
||||||
|
db: 'DB',
|
||||||
|
redis: 'Redis',
|
||||||
|
goroutines: 'Goroutines',
|
||||||
|
jobs: 'Jobs',
|
||||||
|
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
|
||||||
|
active: 'active',
|
||||||
|
idle: 'idle',
|
||||||
|
waiting: 'waiting',
|
||||||
|
conns: 'conns',
|
||||||
|
queue: 'queue',
|
||||||
|
ok: 'ok',
|
||||||
|
lastRun: 'last_run:',
|
||||||
|
lastSuccess: 'last_success:',
|
||||||
|
lastError: 'last_error:',
|
||||||
|
noData: 'No data.',
|
||||||
|
loadingText: 'loading',
|
||||||
|
ready: 'ready',
|
||||||
|
requestsTotal: 'Requests (total)',
|
||||||
|
slaScope: 'SLA scope:',
|
||||||
|
tokens: 'Tokens',
|
||||||
|
tps: 'TPS:',
|
||||||
|
current: 'current',
|
||||||
|
peak: 'peak',
|
||||||
|
average: 'average',
|
||||||
|
totalRequests: 'Total Requests',
|
||||||
|
avgQps: 'Avg QPS',
|
||||||
|
avgTps: 'Avg TPS',
|
||||||
|
avgLatency: 'Avg Latency',
|
||||||
|
avgTtft: 'Avg TTFT',
|
||||||
|
exceptions: 'Exceptions',
|
||||||
|
requestErrors: 'Request Errors',
|
||||||
|
errorCount: 'Error Count',
|
||||||
|
upstreamErrors: 'Upstream Errors',
|
||||||
|
errorCountExcl429529: 'Error Count (excl 429/529)',
|
||||||
|
sla: 'SLA (excl business limits)',
|
||||||
|
businessLimited: 'business_limited:',
|
||||||
|
errors: 'Errors',
|
||||||
|
errorRate: 'error_rate:',
|
||||||
|
upstreamRate: 'upstream_rate:',
|
||||||
|
latencyDuration: 'Latency (duration_ms)',
|
||||||
|
ttftLabel: 'TTFT (first_token_ms)',
|
||||||
|
p50: 'p50:',
|
||||||
|
p90: 'p90:',
|
||||||
|
p95: 'p95:',
|
||||||
|
p99: 'p99:',
|
||||||
|
avg: 'avg:',
|
||||||
|
max: 'max:',
|
||||||
|
qps: 'QPS',
|
||||||
|
requests: 'Requests',
|
||||||
|
upstream: 'Upstream',
|
||||||
|
client: 'Client',
|
||||||
|
system: 'System',
|
||||||
|
other: 'Other',
|
||||||
|
errorsSla: 'Errors (SLA scope)',
|
||||||
|
upstreamExcl429529: 'Upstream (excl 429/529)',
|
||||||
|
failedToLoadData: 'Failed to load ops data.',
|
||||||
|
failedToLoadOverview: 'Failed to load overview',
|
||||||
|
failedToLoadThroughputTrend: 'Failed to load throughput trend',
|
||||||
|
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
|
||||||
|
failedToLoadErrorTrend: 'Failed to load error trend',
|
||||||
|
failedToLoadErrorDistribution: 'Failed to load error distribution',
|
||||||
|
failedToLoadErrorDetail: 'Failed to load error detail',
|
||||||
|
retryFailed: 'Retry failed',
|
||||||
|
tpsK: 'TPS (K)',
|
||||||
|
top: 'Top:',
|
||||||
|
throughputTrend: 'Throughput Trend',
|
||||||
|
latencyHistogram: 'Latency Histogram',
|
||||||
|
errorTrend: 'Error Trend',
|
||||||
|
errorDistribution: 'Error Distribution',
|
||||||
|
// Health Score & Diagnosis
|
||||||
|
health: 'Health',
|
||||||
|
healthCondition: 'Health Condition',
|
||||||
|
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
|
||||||
|
healthyStatus: 'Healthy',
|
||||||
|
riskyStatus: 'At Risk',
|
||||||
|
idleStatus: 'Idle',
|
||||||
|
timeRange: {
|
||||||
|
'5m': 'Last 5 minutes',
|
||||||
|
'30m': 'Last 30 minutes',
|
||||||
|
'1h': 'Last 1 hour',
|
||||||
|
'6h': 'Last 6 hours',
|
||||||
|
'24h': 'Last 24 hours'
|
||||||
|
},
|
||||||
|
diagnosis: {
|
||||||
|
title: 'Smart Diagnosis',
|
||||||
|
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||||
|
idle: 'System is currently idle',
|
||||||
|
idleImpact: 'No active traffic',
|
||||||
|
// Resource diagnostics
|
||||||
|
dbDown: 'Database connection failed',
|
||||||
|
dbDownImpact: 'All database operations will fail',
|
||||||
|
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
|
||||||
|
redisDown: 'Redis connection failed',
|
||||||
|
redisDownImpact: 'Cache functionality degraded, performance may decline',
|
||||||
|
redisDownAction: 'Check Redis service status and network connectivity',
|
||||||
|
cpuCritical: 'CPU usage critically high ({usage}%)',
|
||||||
|
cpuCriticalImpact: 'System response slowing, may affect all requests',
|
||||||
|
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
|
||||||
|
cpuHigh: 'CPU usage elevated ({usage}%)',
|
||||||
|
cpuHighImpact: 'System load is high, needs attention',
|
||||||
|
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
|
||||||
|
memoryCritical: 'Memory usage critically high ({usage}%)',
|
||||||
|
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
|
||||||
|
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
|
||||||
|
memoryHigh: 'Memory usage elevated ({usage}%)',
|
||||||
|
memoryHighImpact: 'Memory pressure is high, needs attention',
|
||||||
|
memoryHighAction: 'Monitor memory trends, check for memory leaks',
|
||||||
|
// Latency diagnostics
|
||||||
|
latencyCritical: 'Response latency critically high ({latency}ms)',
|
||||||
|
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
|
||||||
|
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
|
||||||
|
latencyHigh: 'Response latency elevated ({latency}ms)',
|
||||||
|
latencyHighImpact: 'User experience degraded, needs optimization',
|
||||||
|
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
|
||||||
|
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
|
||||||
|
ttftHighImpact: 'User perceived latency increased',
|
||||||
|
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
|
||||||
|
// Error rate diagnostics
|
||||||
|
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
|
||||||
|
upstreamCriticalImpact: 'May affect many user requests',
|
||||||
|
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
|
||||||
|
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
|
||||||
|
upstreamHighImpact: 'Recommend checking upstream service status',
|
||||||
|
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
|
||||||
|
errorHigh: 'Error rate too high ({rate}%)',
|
||||||
|
errorHighImpact: 'Many requests failing',
|
||||||
|
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
|
||||||
|
errorElevated: 'Error rate elevated ({rate}%)',
|
||||||
|
errorElevatedImpact: 'Recommend checking error logs',
|
||||||
|
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
|
||||||
|
// SLA diagnostics
|
||||||
|
slaCritical: 'SLA critically below target ({sla}%)',
|
||||||
|
slaCriticalImpact: 'User experience severely degraded',
|
||||||
|
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
|
||||||
|
slaLow: 'SLA below target ({sla}%)',
|
||||||
|
slaLowImpact: 'Service quality needs attention',
|
||||||
|
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
|
||||||
|
// Health score diagnostics
|
||||||
|
healthCritical: 'Overall health score critically low ({score})',
|
||||||
|
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
|
||||||
|
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
|
||||||
|
healthLow: 'Overall health score low ({score})',
|
||||||
|
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
|
||||||
|
healthLowAction: 'Monitor metric trends, prevent issue escalation',
|
||||||
|
healthy: 'All system metrics normal',
|
||||||
|
healthyImpact: 'Service running stable'
|
||||||
|
},
|
||||||
|
// Error Log
|
||||||
|
errorLog: {
|
||||||
|
timeId: 'Time / ID',
|
||||||
|
context: 'Context',
|
||||||
|
status: 'Status',
|
||||||
|
message: 'Message',
|
||||||
|
latency: 'Latency',
|
||||||
|
action: 'Action',
|
||||||
|
noErrors: 'No errors in this window.',
|
||||||
|
grp: 'GRP:',
|
||||||
|
acc: 'ACC:',
|
||||||
|
details: 'Details',
|
||||||
|
phase: 'Phase'
|
||||||
|
},
|
||||||
|
// Error Details Modal
|
||||||
|
errorDetails: {
|
||||||
|
upstreamErrors: 'Upstream Errors',
|
||||||
|
requestErrors: 'Request Errors',
|
||||||
|
total: 'Total:',
|
||||||
|
searchPlaceholder: 'Search request_id / client_request_id / message',
|
||||||
|
accountIdPlaceholder: 'account_id'
|
||||||
|
},
|
||||||
|
// Error Detail Modal
|
||||||
|
errorDetail: {
|
||||||
|
loading: 'Loading…',
|
||||||
|
requestId: 'Request ID',
|
||||||
|
time: 'Time',
|
||||||
|
phase: 'Phase',
|
||||||
|
status: 'Status',
|
||||||
|
message: 'Message',
|
||||||
|
basicInfo: 'Basic Info',
|
||||||
|
platform: 'Platform',
|
||||||
|
model: 'Model',
|
||||||
|
latency: 'Latency',
|
||||||
|
ttft: 'TTFT',
|
||||||
|
businessLimited: 'Business Limited',
|
||||||
|
requestPath: 'Request Path',
|
||||||
|
timings: 'Timings',
|
||||||
|
auth: 'Auth',
|
||||||
|
routing: 'Routing',
|
||||||
|
upstream: 'Upstream',
|
||||||
|
response: 'Response',
|
||||||
|
retry: 'Retry',
|
||||||
|
retryClient: 'Retry (Client)',
|
||||||
|
retryUpstream: 'Retry (Upstream pinned)',
|
||||||
|
pinnedAccountId: 'Pinned account_id',
|
||||||
|
retryNotes: 'Retry Notes',
|
||||||
|
requestBody: 'Request Body',
|
||||||
|
errorBody: 'Error Body',
|
||||||
|
trimmed: 'trimmed',
|
||||||
|
confirmRetry: 'Confirm Retry',
|
||||||
|
retrySuccess: 'Retry succeeded',
|
||||||
|
retryFailed: 'Retry failed',
|
||||||
|
na: 'N/A',
|
||||||
|
retryHint: 'Retry will resend the request with the same parameters',
|
||||||
|
retryClientHint: 'Use client retry (no account pinning)',
|
||||||
|
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
|
||||||
|
pinnedAccountIdHint: '(auto from error log)',
|
||||||
|
retryNote1: 'Retry will use the same request body and parameters',
|
||||||
|
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
|
||||||
|
retryNote3: 'Client retry will reselect an account',
|
||||||
|
confirmRetryMessage: 'Confirm retry this request?',
|
||||||
|
confirmRetryHint: 'Will resend with the same request parameters'
|
||||||
|
},
|
||||||
|
requestDetails: {
|
||||||
|
title: 'Request Details',
|
||||||
|
details: 'Details',
|
||||||
|
rangeLabel: 'Window: {range}',
|
||||||
|
rangeMinutes: '{n} minutes',
|
||||||
|
rangeHours: '{n} hours',
|
||||||
|
empty: 'No requests in this window.',
|
||||||
|
emptyHint: 'Try a different time range or remove filters.',
|
||||||
|
failedToLoad: 'Failed to load request details',
|
||||||
|
requestIdCopied: 'Request ID copied',
|
||||||
|
copyFailed: 'Copy failed',
|
||||||
|
copy: 'Copy',
|
||||||
|
viewError: 'View Error',
|
||||||
|
kind: {
|
||||||
|
success: 'SUCCESS',
|
||||||
|
error: 'ERROR'
|
||||||
|
},
|
||||||
|
table: {
|
||||||
|
time: 'Time',
|
||||||
|
kind: 'Kind',
|
||||||
|
platform: 'Platform',
|
||||||
|
model: 'Model',
|
||||||
|
duration: 'Duration',
|
||||||
|
status: 'Status',
|
||||||
|
requestId: 'Request ID',
|
||||||
|
actions: 'Actions'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
alertEvents: {
|
||||||
|
title: 'Alert Events',
|
||||||
|
description: 'Recent alert firing/resolution records (email-only)',
|
||||||
|
loading: 'Loading...',
|
||||||
|
empty: 'No alert events',
|
||||||
|
loadFailed: 'Failed to load alert events',
|
||||||
|
table: {
|
||||||
|
time: 'Time',
|
||||||
|
status: 'Status',
|
||||||
|
severity: 'Severity',
|
||||||
|
title: 'Title',
|
||||||
|
metric: 'Metric / Threshold',
|
||||||
|
email: 'Email Sent'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
alertRules: {
|
||||||
|
title: 'Alert Rules',
|
||||||
|
description: 'Create and manage threshold-based system alerts (email-only)',
|
||||||
|
loading: 'Loading...',
|
||||||
|
empty: 'No alert rules',
|
||||||
|
loadFailed: 'Failed to load alert rules',
|
||||||
|
saveFailed: 'Failed to save alert rule',
|
||||||
|
deleteFailed: 'Failed to delete alert rule',
|
||||||
|
create: 'Create Rule',
|
||||||
|
createTitle: 'Create Alert Rule',
|
||||||
|
editTitle: 'Edit Alert Rule',
|
||||||
|
deleteConfirmTitle: 'Delete this rule?',
|
||||||
|
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
|
||||||
|
metricGroups: {
|
||||||
|
system: 'System Metrics',
|
||||||
|
group: 'Group-level Metrics (requires group_id)',
|
||||||
|
account: 'Account-level Metrics'
|
||||||
|
},
|
||||||
|
metrics: {
|
||||||
|
successRate: 'Success Rate (%)',
|
||||||
|
errorRate: 'Error Rate (%)',
|
||||||
|
upstreamErrorRate: 'Upstream Error Rate (%)',
|
||||||
|
p95: 'P95 Latency (ms)',
|
||||||
|
p99: 'P99 Latency (ms)',
|
||||||
|
cpu: 'CPU Usage (%)',
|
||||||
|
memory: 'Memory Usage (%)',
|
||||||
|
queueDepth: 'Concurrency Queue Depth',
|
||||||
|
groupAvailableAccounts: 'Group Available Accounts',
|
||||||
|
groupAvailableRatio: 'Group Available Ratio (%)',
|
||||||
|
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
|
||||||
|
accountRateLimitedCount: 'Rate-limited Accounts',
|
||||||
|
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
|
||||||
|
accountErrorRatio: 'Error Account Ratio (%)',
|
||||||
|
overloadAccountCount: 'Overloaded Accounts'
|
||||||
|
},
|
||||||
|
metricDescriptions: {
|
||||||
|
successRate: 'Percentage of successful requests in the window (0-100).',
|
||||||
|
errorRate: 'Percentage of failed requests in the window (0-100).',
|
||||||
|
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
|
||||||
|
p95: 'P95 request latency within the window (ms).',
|
||||||
|
p99: 'P99 request latency within the window (ms).',
|
||||||
|
cpu: 'Current instance CPU usage (0-100).',
|
||||||
|
memory: 'Current instance memory usage (0-100).',
|
||||||
|
queueDepth: 'Concurrency queue depth within the window (queued requests).',
|
||||||
|
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
|
||||||
|
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
|
||||||
|
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
|
||||||
|
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
|
||||||
|
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
|
||||||
|
accountErrorRatio: 'Error account ratio within the window (0-100).',
|
||||||
|
overloadAccountCount: 'Number of overloaded accounts within the window.'
|
||||||
|
},
|
||||||
|
hints: {
|
||||||
|
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
|
||||||
|
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
|
||||||
|
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
|
||||||
|
},
|
||||||
|
table: {
|
||||||
|
name: 'Name',
|
||||||
|
metric: 'Metric',
|
||||||
|
severity: 'Severity',
|
||||||
|
enabled: 'Enabled',
|
||||||
|
actions: 'Actions'
|
||||||
|
},
|
||||||
|
form: {
|
||||||
|
name: 'Name',
|
||||||
|
description: 'Description',
|
||||||
|
metric: 'Metric',
|
||||||
|
operator: 'Operator',
|
||||||
|
groupId: 'Group (group_id)',
|
||||||
|
groupPlaceholder: 'Select a group',
|
||||||
|
allGroups: 'All groups',
|
||||||
|
threshold: 'Threshold',
|
||||||
|
severity: 'Severity',
|
||||||
|
window: 'Window (minutes)',
|
||||||
|
sustained: 'Sustained (samples)',
|
||||||
|
cooldown: 'Cooldown (minutes)',
|
||||||
|
enabled: 'Enabled',
|
||||||
|
notifyEmail: 'Send email notifications'
|
||||||
|
},
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid rule',
|
||||||
|
nameRequired: 'Name is required',
|
||||||
|
metricRequired: 'Metric is required',
|
||||||
|
groupIdRequired: 'group_id is required for group-level metrics',
|
||||||
|
operatorRequired: 'Operator is required',
|
||||||
|
thresholdRequired: 'Threshold must be a number',
|
||||||
|
windowRange: 'Window must be one of: 1, 5, 60 minutes',
|
||||||
|
sustainedRange: 'Sustained must be between 1 and 1440 samples',
|
||||||
|
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
runtime: {
|
||||||
|
title: 'Ops Runtime Settings',
|
||||||
|
description: 'Stored in database; changes take effect without editing config files.',
|
||||||
|
loading: 'Loading...',
|
||||||
|
noData: 'No runtime settings available',
|
||||||
|
loadFailed: 'Failed to load runtime settings',
|
||||||
|
saveSuccess: 'Runtime settings saved',
|
||||||
|
saveFailed: 'Failed to save runtime settings',
|
||||||
|
alertTitle: 'Alert Evaluator',
|
||||||
|
groupAvailabilityTitle: 'Group Availability Monitor',
|
||||||
|
evalIntervalSeconds: 'Evaluation Interval (seconds)',
|
||||||
|
silencing: {
|
||||||
|
title: 'Alert Silencing (Maintenance Mode)',
|
||||||
|
enabled: 'Enable silencing',
|
||||||
|
globalUntil: 'Silence until (RFC3339)',
|
||||||
|
untilPlaceholder: '2026-01-05T00:00:00Z',
|
||||||
|
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
|
||||||
|
reason: 'Reason',
|
||||||
|
reasonPlaceholder: 'e.g., planned maintenance',
|
||||||
|
entries: {
|
||||||
|
title: 'Advanced: targeted silencing',
|
||||||
|
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
|
||||||
|
add: 'Add Entry',
|
||||||
|
empty: 'No targeted entries',
|
||||||
|
entryTitle: 'Entry #{n}',
|
||||||
|
ruleId: 'Rule ID (optional)',
|
||||||
|
ruleIdPlaceholder: 'e.g., 1',
|
||||||
|
severities: 'Severities (optional)',
|
||||||
|
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
|
||||||
|
until: 'Until (RFC3339)',
|
||||||
|
reason: 'Reason',
|
||||||
|
validation: {
|
||||||
|
untilRequired: 'Entry until time is required',
|
||||||
|
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
|
||||||
|
ruleIdPositive: 'Entry rule_id must be a positive integer',
|
||||||
|
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
validation: {
|
||||||
|
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
lockEnabled: 'Distributed Lock Enabled',
|
||||||
|
lockKey: 'Distributed Lock Key',
|
||||||
|
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
|
||||||
|
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
|
||||||
|
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
|
||||||
|
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid settings',
|
||||||
|
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
|
||||||
|
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
|
||||||
|
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
|
||||||
|
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
|
||||||
|
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
email: {
|
||||||
|
title: 'Email Notification',
|
||||||
|
description: 'Configure alert/report email notifications (stored in database).',
|
||||||
|
loading: 'Loading...',
|
||||||
|
noData: 'No email notification config',
|
||||||
|
loadFailed: 'Failed to load email notification config',
|
||||||
|
saveSuccess: 'Email notification config saved',
|
||||||
|
saveFailed: 'Failed to save email notification config',
|
||||||
|
alertTitle: 'Alert Emails',
|
||||||
|
reportTitle: 'Report Emails',
|
||||||
|
recipients: 'Recipients',
|
||||||
|
recipientsHint: 'If empty, the system may fallback to the first admin email.',
|
||||||
|
minSeverity: 'Min Severity',
|
||||||
|
minSeverityAll: 'All severities',
|
||||||
|
rateLimitPerHour: 'Rate limit per hour',
|
||||||
|
batchWindowSeconds: 'Batch window (seconds)',
|
||||||
|
includeResolved: 'Include resolved alerts',
|
||||||
|
dailySummary: 'Daily summary',
|
||||||
|
weeklySummary: 'Weekly summary',
|
||||||
|
errorDigest: 'Error digest',
|
||||||
|
errorDigestMinCount: 'Min errors for digest',
|
||||||
|
accountHealth: 'Account health',
|
||||||
|
accountHealthThreshold: 'Error rate threshold (%)',
|
||||||
|
cronPlaceholder: 'Cron expression',
|
||||||
|
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid email notification config',
|
||||||
|
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
|
||||||
|
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
|
||||||
|
invalidRecipients: 'One or more recipient emails are invalid',
|
||||||
|
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
|
||||||
|
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
|
||||||
|
cronRequired: 'A cron expression is required when schedule is enabled',
|
||||||
|
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
|
||||||
|
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
|
||||||
|
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
concurrency: {
|
||||||
|
title: 'Concurrency / Queue',
|
||||||
|
byPlatform: 'By Platform',
|
||||||
|
byGroup: 'By Group',
|
||||||
|
byAccount: 'By Account',
|
||||||
|
totalRows: '{count} rows',
|
||||||
|
disabledHint: 'Realtime monitoring is disabled in settings.',
|
||||||
|
empty: 'No data',
|
||||||
|
queued: 'Queue {count}',
|
||||||
|
rateLimited: 'Rate-limited {count}',
|
||||||
|
errorAccounts: 'Errors {count}',
|
||||||
|
loadFailed: 'Failed to load concurrency data'
|
||||||
|
},
|
||||||
|
realtime: {
|
||||||
|
title: 'Realtime',
|
||||||
|
connected: 'Realtime connected',
|
||||||
|
connecting: 'Realtime connecting',
|
||||||
|
reconnecting: 'Realtime reconnecting',
|
||||||
|
offline: 'Realtime offline',
|
||||||
|
closed: 'Realtime closed',
|
||||||
|
reconnectIn: 'retry in {seconds}s'
|
||||||
|
},
|
||||||
|
queryMode: {
|
||||||
|
auto: 'Auto',
|
||||||
|
raw: 'Raw',
|
||||||
|
preagg: 'Preagg'
|
||||||
|
},
|
||||||
|
accountAvailability: {
|
||||||
|
available: 'Available',
|
||||||
|
unavailable: 'Unavailable',
|
||||||
|
accountError: 'Error'
|
||||||
|
},
|
||||||
|
tooltips: {
|
||||||
|
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
|
||||||
|
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
|
||||||
|
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
|
||||||
|
errorDistribution: 'Error distribution by status code.',
|
||||||
|
goroutines:
|
||||||
|
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
|
||||||
|
cpu: 'CPU usage percentage, showing system processor load.',
|
||||||
|
memory: 'Memory usage, including used and total available memory.',
|
||||||
|
db: 'Database connection pool status, including active, idle, and waiting connections.',
|
||||||
|
redis: 'Redis connection pool status, showing active and idle connections.',
|
||||||
|
jobs: 'Background job execution status, including last run time, success time, and error information.',
|
||||||
|
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
|
||||||
|
tokens: 'Total number of tokens processed in the current time window.',
|
||||||
|
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
|
||||||
|
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
|
||||||
|
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
|
||||||
|
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
|
||||||
|
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
|
||||||
|
},
|
||||||
|
charts: {
|
||||||
|
emptyRequest: 'No requests in this window.',
|
||||||
|
emptyError: 'No errors in this window.',
|
||||||
|
resetZoom: 'Reset',
|
||||||
|
resetZoomHint: 'Reset zoom (if enabled)',
|
||||||
|
downloadChart: 'Download',
|
||||||
|
downloadChartHint: 'Download chart as image'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
// Settings
|
// Settings
|
||||||
settings: {
|
settings: {
|
||||||
title: 'System Settings',
|
title: 'System Settings',
|
||||||
@@ -1940,6 +2473,22 @@ export default {
|
|||||||
sending: 'Sending...',
|
sending: 'Sending...',
|
||||||
enterRecipientHint: 'Please enter a recipient email address'
|
enterRecipientHint: 'Please enter a recipient email address'
|
||||||
},
|
},
|
||||||
|
opsMonitoring: {
|
||||||
|
title: 'Ops Monitoring',
|
||||||
|
description: 'Enable ops monitoring for troubleshooting and health visibility',
|
||||||
|
disabled: 'Ops monitoring is disabled',
|
||||||
|
enabled: 'Enable Ops Monitoring',
|
||||||
|
enabledHint: 'Enable the ops monitoring module (admin only)',
|
||||||
|
realtimeEnabled: 'Enable Realtime Monitoring',
|
||||||
|
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
|
||||||
|
queryMode: 'Default Query Mode',
|
||||||
|
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
|
||||||
|
queryModeAuto: 'Auto (recommended)',
|
||||||
|
queryModeRaw: 'Raw (most accurate, slower)',
|
||||||
|
queryModePreagg: 'Preagg (fastest, requires aggregation)',
|
||||||
|
metricsInterval: 'Metrics Collection Interval (seconds)',
|
||||||
|
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
|
||||||
|
},
|
||||||
adminApiKey: {
|
adminApiKey: {
|
||||||
title: 'Admin API Key',
|
title: 'Admin API Key',
|
||||||
description: 'Global API key for external system integration with full admin access',
|
description: 'Global API key for external system integration with full admin access',
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user