diff --git a/.gitignore b/.gitignore index a50f3ecc..fe715240 100644 --- a/.gitignore +++ b/.gitignore @@ -126,6 +126,4 @@ backend/cmd/server/server deploy/docker-compose.override.yml .gocache/ vite.config.js -!docs/ docs/* -!docs/dependency-security.md diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 00000000..c1c2a854 --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,2 @@ +.cache/ +.DS_Store diff --git a/backend/.golangci.yml b/backend/.golangci.yml index 52072b16..3ec692a8 100644 --- a/backend/.golangci.yml +++ b/backend/.golangci.yml @@ -18,6 +18,12 @@ linters: list-mode: original files: - "**/internal/service/**" + - "!**/internal/service/ops_aggregation_service.go" + - "!**/internal/service/ops_alert_evaluator_service.go" + - "!**/internal/service/ops_cleanup_service.go" + - "!**/internal/service/ops_metrics_collector.go" + - "!**/internal/service/ops_scheduled_report_service.go" + - "!**/internal/service/wire.go" deny: - pkg: github.com/Wei-Shaw/sub2api/internal/repository desc: "service must not import repository" diff --git a/backend/cmd/server/wire.go b/backend/cmd/server/wire.go index 9447de45..85a9b785 100644 --- a/backend/cmd/server/wire.go +++ b/backend/cmd/server/wire.go @@ -62,6 +62,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, + opsScheduledReport *service.OpsScheduledReportService, tokenRefresh *service.TokenRefreshService, accountExpiry *service.AccountExpiryService, pricing *service.PricingService, @@ -81,6 +86,36 @@ func provideCleanup( name string fn func() error }{ + {"OpsScheduledReportService", func() error { + if opsScheduledReport != nil { + opsScheduledReport.Stop() + } + return nil + }}, + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go index bf015117..e66e0e05 100644 --- a/backend/cmd/server/wire_gen.go +++ b/backend/cmd/server/wire_gen.go @@ -120,7 +120,22 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { proxyHandler := admin.NewProxyHandler(adminService) adminRedeemHandler := admin.NewRedeemHandler(adminService) promoHandler := admin.NewPromoHandler(promoService) - settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService) + opsRepository := repository.NewOpsRepository(db) + pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig) + pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) + if err != nil { + return nil, err + } + billingService := service.NewBillingService(configConfig, pricingService) + identityCache := repository.NewIdentityCache(redisClient) + identityService := service.NewIdentityService(identityCache) + deferredService := 
service.ProvideDeferredService(accountRepository, timingWheelService) + gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService) + openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService) + geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) + opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService) + settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService, opsService) + opsHandler := admin.NewOpsHandler(opsService) updateCache := repository.NewUpdateCache(redisClient) gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig) serviceBuildInfo := provideServiceBuildInfo(buildInfo) @@ -132,31 +147,24 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { userAttributeValueRepository := repository.NewUserAttributeValueRepository(client) userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository) userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService) - adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) - pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig) - pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) - if err != nil { - return nil, err - } - billingService := service.NewBillingService(configConfig, pricingService) - identityCache := repository.NewIdentityCache(redisClient) - identityService := service.NewIdentityService(identityCache) - deferredService := service.ProvideDeferredService(accountRepository, timingWheelService) - gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService) - geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) + adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) gatewayHandler := handler.NewGatewayHandler(gatewayService, 
geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig) - openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService) openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig) handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo) handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler) jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService) adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService) apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig) - engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, settingService, redisClient) + engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient) httpServer := server.ProvideHTTPServer(configConfig, engine) + opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig) + opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig) + opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig) + opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig) + opsScheduledReportService := service.ProvideOpsScheduledReportService(opsService, userService, emailService, redisClient, configConfig) tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig) accountExpiryService := service.ProvideAccountExpiryService(accountRepository) - v := provideCleanup(client, redisClient, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) + v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, opsScheduledReportService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) application := &Application{ Server: httpServer, Cleanup: v, @@ -181,6 +189,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, + opsScheduledReport *service.OpsScheduledReportService, tokenRefresh *service.TokenRefreshService, 
accountExpiry *service.AccountExpiryService, pricing *service.PricingService, @@ -199,6 +212,36 @@ func provideCleanup( name string fn func() error }{ + {"OpsScheduledReportService", func() error { + if opsScheduledReport != nil { + opsScheduledReport.Stop() + } + return nil + }}, + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil diff --git a/backend/go.mod b/backend/go.mod index 82a8e88e..4ac6ba14 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -8,9 +8,11 @@ require ( github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 github.com/google/wire v0.7.0 + github.com/gorilla/websocket v1.5.3 github.com/imroc/req/v3 v3.57.0 github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.17.2 + github.com/shirou/gopsutil/v4 v4.25.6 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0 @@ -106,9 +108,9 @@ require ( github.com/quic-go/quic-go v0.57.1 // indirect github.com/refraction-networking/utls v1.8.1 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/shirou/gopsutil/v4 v4.25.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 0fd47498..415e73a7 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -117,6 +117,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4= github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -224,6 +226,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 7bdd8fe3..ffca51df 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -43,6 +43,7 @@ type Config struct { Turnstile TurnstileConfig `mapstructure:"turnstile"` Database DatabaseConfig `mapstructure:"database"` Redis RedisConfig `mapstructure:"redis"` + Ops OpsConfig `mapstructure:"ops"` JWT JWTConfig `mapstructure:"jwt"` LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"` Default DefaultConfig `mapstructure:"default"` @@ -60,14 +61,6 @@ type Config struct { Update UpdateConfig `mapstructure:"update"` } -// UpdateConfig 在线更新相关配置 -type UpdateConfig struct { - // ProxyURL 用于访问 GitHub 的代理地址 - // 支持 http/https/socks5/socks5h 协议 - // 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080" - ProxyURL string `mapstructure:"proxy_url"` -} - type GeminiConfig struct { OAuth GeminiOAuthConfig `mapstructure:"oauth"` Quota GeminiQuotaConfig `mapstructure:"quota"` @@ -90,6 +83,33 @@ type GeminiTierQuotaConfig struct { CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"` } +type UpdateConfig struct { + // ProxyURL 用于访问 GitHub 的代理地址 + // 支持 http/https/socks5/socks5h 协议 + // 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080" + ProxyURL string `mapstructure:"proxy_url"` +} + +type LinuxDoConnectConfig struct { + Enabled bool `mapstructure:"enabled"` + ClientID string `mapstructure:"client_id"` + ClientSecret string `mapstructure:"client_secret"` + AuthorizeURL string `mapstructure:"authorize_url"` + TokenURL string `mapstructure:"token_url"` + UserInfoURL string `mapstructure:"userinfo_url"` + Scopes string `mapstructure:"scopes"` + RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记) + FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback) + TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none + UsePKCE bool `mapstructure:"use_pkce"` + + // 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。 + // 为空时,服务端会尝试一组常见字段名。 + UserInfoEmailPath string `mapstructure:"userinfo_email_path"` + UserInfoIDPath string `mapstructure:"userinfo_id_path"` + UserInfoUsernamePath string `mapstructure:"userinfo_username_path"` +} + // TokenRefreshConfig OAuth token自动刷新配置 type TokenRefreshConfig struct { // 是否启用自动刷新 @@ -332,6 +352,47 @@ func (r *RedisConfig) Address() string { return fmt.Sprintf("%s:%d", r.Host, r.Port) } +type OpsConfig struct { + // Enabled controls whether ops features should run. + // + // NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off. + // This config flag is the "hard switch" for deployments that want to disable ops completely. + Enabled bool `mapstructure:"enabled"` + + // UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries. + UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"` + + // Cleanup controls periodic deletion of old ops data to prevent unbounded growth. + Cleanup OpsCleanupConfig `mapstructure:"cleanup"` + + // MetricsCollectorCache controls Redis caching for expensive per-window collector queries. + MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"` + + // Pre-aggregation configuration. 
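+ // Toggles the background OpsAggregationService wired in cmd/server, which produces the pre-aggregated rollups.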
+ Aggregation OpsAggregationConfig `mapstructure:"aggregation"` +} + +type OpsCleanupConfig struct { + Enabled bool `mapstructure:"enabled"` + Schedule string `mapstructure:"schedule"` + + // Retention days (0 disables that cleanup target). + // + // vNext requirement: default 30 days across ops datasets. + ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"` + MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"` +} + +type OpsAggregationConfig struct { + Enabled bool `mapstructure:"enabled"` +} + +type OpsMetricsCollectorCacheConfig struct { + Enabled bool `mapstructure:"enabled"` + TTL time.Duration `mapstructure:"ttl"` +} + type JWTConfig struct { Secret string `mapstructure:"secret"` ExpireHour int `mapstructure:"expire_hour"` @@ -341,30 +402,6 @@ type TurnstileConfig struct { Required bool `mapstructure:"required"` } -// LinuxDoConnectConfig 用于 LinuxDo Connect OAuth 登录(终端用户 SSO)。 -// -// 注意:这与上游账号的 OAuth(例如 OpenAI/Gemini 账号接入)不是一回事。 -// 这里是用于登录 Sub2API 本身的用户体系。 -type LinuxDoConnectConfig struct { - Enabled bool `mapstructure:"enabled"` - ClientID string `mapstructure:"client_id"` - ClientSecret string `mapstructure:"client_secret"` - AuthorizeURL string `mapstructure:"authorize_url"` - TokenURL string `mapstructure:"token_url"` - UserInfoURL string `mapstructure:"userinfo_url"` - Scopes string `mapstructure:"scopes"` - RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记) - FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback) - TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none - UsePKCE bool `mapstructure:"use_pkce"` - - // 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。 - // 为空时,服务端会尝试一组常见字段名。 - UserInfoEmailPath string `mapstructure:"userinfo_email_path"` - UserInfoIDPath string `mapstructure:"userinfo_id_path"` - UserInfoUsernamePath string `mapstructure:"userinfo_username_path"` -} - type DefaultConfig struct { AdminEmail string `mapstructure:"admin_email"` AdminPassword string `mapstructure:"admin_password"` @@ -531,81 +568,6 @@ func Load() (*Config, error) { return &cfg, nil } -// ValidateAbsoluteHTTPURL 校验一个绝对 http(s) URL(禁止 fragment)。 -func ValidateAbsoluteHTTPURL(raw string) error { - raw = strings.TrimSpace(raw) - if raw == "" { - return fmt.Errorf("empty url") - } - u, err := url.Parse(raw) - if err != nil { - return err - } - if !u.IsAbs() { - return fmt.Errorf("must be absolute") - } - if !isHTTPScheme(u.Scheme) { - return fmt.Errorf("unsupported scheme: %s", u.Scheme) - } - if strings.TrimSpace(u.Host) == "" { - return fmt.Errorf("missing host") - } - if u.Fragment != "" { - return fmt.Errorf("must not include fragment") - } - return nil -} - -// ValidateFrontendRedirectURL 校验前端回调地址: -// - 允许同源相对路径(以 / 开头) -// - 或绝对 http(s) URL(禁止 fragment) -func ValidateFrontendRedirectURL(raw string) error { - raw = strings.TrimSpace(raw) - if raw == "" { - return fmt.Errorf("empty url") - } - if strings.ContainsAny(raw, "\r\n") { - return fmt.Errorf("contains invalid characters") - } - if strings.HasPrefix(raw, "/") { - if strings.HasPrefix(raw, "//") { - return fmt.Errorf("must not start with //") - } - return nil - } - u, err := url.Parse(raw) - if err != nil { - return err - } - if !u.IsAbs() { - return fmt.Errorf("must be absolute http(s) url or relative path") - } - if !isHTTPScheme(u.Scheme) { - return fmt.Errorf("unsupported scheme: %s", 
u.Scheme) - } - if strings.TrimSpace(u.Host) == "" { - return fmt.Errorf("missing host") - } - if u.Fragment != "" { - return fmt.Errorf("must not include fragment") - } - return nil -} - -func isHTTPScheme(scheme string) bool { - return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https") -} - -func warnIfInsecureURL(field, raw string) { - u, err := url.Parse(strings.TrimSpace(raw)) - if err != nil { - return - } - if strings.EqualFold(u.Scheme, "http") { - log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field) - } -} - func setDefaults() { viper.SetDefault("run_mode", RunModeStandard) @@ -655,7 +617,7 @@ func setDefaults() { // Turnstile viper.SetDefault("turnstile.required", false) - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 viper.SetDefault("linuxdo_connect.enabled", false) viper.SetDefault("linuxdo_connect.client_id", "") viper.SetDefault("linuxdo_connect.client_secret", "") @@ -694,6 +656,20 @@ func setDefaults() { viper.SetDefault("redis.pool_size", 128) viper.SetDefault("redis.min_idle_conns", 10) + // Ops (vNext) + viper.SetDefault("ops.enabled", true) + viper.SetDefault("ops.use_preaggregated_tables", false) + viper.SetDefault("ops.cleanup.enabled", true) + viper.SetDefault("ops.cleanup.schedule", "0 2 * * *") + // Retention days: vNext defaults to 30 days across ops datasets. + viper.SetDefault("ops.cleanup.error_log_retention_days", 30) + viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30) + viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30) + viper.SetDefault("ops.aggregation.enabled", true) + viper.SetDefault("ops.metrics_collector_cache.enabled", true) + // TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits. 
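+ // 65s = the 1m collection interval plus a 5s safety margin.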
+ viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second) + // JWT viper.SetDefault("jwt.secret", "") viper.SetDefault("jwt.expire_hour", 24) @@ -750,7 +726,7 @@ func setDefaults() { // Gateway viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久 - viper.SetDefault("gateway.log_upstream_error_body", false) + viper.SetDefault("gateway.log_upstream_error_body", true) viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048) viper.SetDefault("gateway.inject_beta_for_apikey", false) viper.SetDefault("gateway.failover_on_400", false) @@ -766,7 +742,7 @@ func setDefaults() { viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求) viper.SetDefault("gateway.stream_data_interval_timeout", 180) viper.SetDefault("gateway.stream_keepalive_interval", 10) - viper.SetDefault("gateway.max_line_size", 40*1024*1024) + viper.SetDefault("gateway.max_line_size", 10*1024*1024) viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3) viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second) viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second) @@ -789,10 +765,6 @@ func setDefaults() { viper.SetDefault("gemini.oauth.client_secret", "") viper.SetDefault("gemini.oauth.scopes", "") viper.SetDefault("gemini.quota.policy", "") - - // Update - 在线更新配置 - // 代理地址为空表示直连 GitHub(适用于海外服务器) - viper.SetDefault("update.proxy_url", "") } func (c *Config) Validate() error { @@ -833,7 +805,8 @@ func (c *Config) Validate() error { if method == "none" && !c.LinuxDo.UsePKCE { return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none") } - if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && strings.TrimSpace(c.LinuxDo.ClientSecret) == "" { + if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && + strings.TrimSpace(c.LinuxDo.ClientSecret) == "" { return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic") } if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" { @@ -1048,6 +1021,21 @@ func (c *Config) Validate() error { if c.Gateway.Scheduling.SlotCleanupInterval < 0 { return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative") } + if c.Ops.MetricsCollectorCache.TTL < 0 { + return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative") + } + if c.Ops.Cleanup.ErrorLogRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative") + } + if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" { + return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true") + } if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 { return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds") } @@ -1124,3 +1112,77 @@ func GetServerAddress() string { port := v.GetInt("server.port") return fmt.Sprintf("%s:%d", host, port) } + +// ValidateAbsoluteHTTPURL 验证是否为有效的绝对 HTTP(S) URL +func ValidateAbsoluteHTTPURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + 
return fmt.Errorf("empty url") + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// ValidateFrontendRedirectURL 验证前端重定向 URL(可以是绝对 URL 或相对路径) +func ValidateFrontendRedirectURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + return fmt.Errorf("empty url") + } + if strings.ContainsAny(raw, "\r\n") { + return fmt.Errorf("contains invalid characters") + } + if strings.HasPrefix(raw, "/") { + if strings.HasPrefix(raw, "//") { + return fmt.Errorf("must not start with //") + } + return nil + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute http(s) url or relative path") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// isHTTPScheme 检查是否为 HTTP 或 HTTPS 协议 +func isHTTPScheme(scheme string) bool { + return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https") +} + +func warnIfInsecureURL(field, raw string) { + u, err := url.Parse(strings.TrimSpace(raw)) + if err != nil { + return + } + if strings.EqualFold(u.Scheme, "http") { + log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field) + } +} diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go new file mode 100644 index 00000000..1e33ddd5 --- /dev/null +++ b/backend/internal/handler/admin/ops_alerts_handler.go @@ -0,0 +1,432 @@ +package admin + +import ( + "encoding/json" + "fmt" + "math" + "net/http" + "strconv" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gin-gonic/gin/binding" +) + +var validOpsAlertMetricTypes = []string{ + "success_rate", + "error_rate", + "upstream_error_rate", + "p95_latency_ms", + "p99_latency_ms", + "cpu_usage_percent", + "memory_usage_percent", + "concurrency_queue_depth", +} + +var validOpsAlertMetricTypeSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertMetricTypes)) + for _, v := range validOpsAlertMetricTypes { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="} + +var validOpsAlertOperatorSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertOperators)) + for _, v := range validOpsAlertOperators { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"} + +var validOpsAlertSeveritySet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertSeverities)) + for _, v := range validOpsAlertSeverities { + set[v] = struct{}{} + } + return set +}() + +type opsAlertRuleValidatedInput struct { + Name string + MetricType string + Operator string + Threshold float64 + + Severity string + + WindowMinutes int + SustainedMinutes int + CooldownMinutes int + + Enabled bool + NotifyEmail bool + + WindowProvided bool + SustainedProvided bool 
+ CooldownProvided bool + SeverityProvided bool + EnabledProvided bool + NotifyProvided bool +} + +func isPercentOrRateMetric(metricType string) bool { + switch metricType { + case "success_rate", + "error_rate", + "upstream_error_rate", + "cpu_usage_percent", + "memory_usage_percent": + return true + default: + return false + } +} + +func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) { + if raw == nil { + return nil, fmt.Errorf("invalid request body") + } + + requiredFields := []string{"name", "metric_type", "operator", "threshold"} + for _, field := range requiredFields { + if _, ok := raw[field]; !ok { + return nil, fmt.Errorf("%s is required", field) + } + } + + var name string + if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" { + return nil, fmt.Errorf("name is required") + } + name = strings.TrimSpace(name) + + var metricType string + if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" { + return nil, fmt.Errorf("metric_type is required") + } + metricType = strings.TrimSpace(metricType) + if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok { + return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", ")) + } + + var operator string + if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" { + return nil, fmt.Errorf("operator is required") + } + operator = strings.TrimSpace(operator) + if _, ok := validOpsAlertOperatorSet[operator]; !ok { + return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", ")) + } + + var threshold float64 + if err := json.Unmarshal(raw["threshold"], &threshold); err != nil { + return nil, fmt.Errorf("threshold must be a number") + } + if math.IsNaN(threshold) || math.IsInf(threshold, 0) { + return nil, fmt.Errorf("threshold must be a finite number") + } + if isPercentOrRateMetric(metricType) { + if threshold < 0 || threshold > 100 { + return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType) + } + } else if threshold < 0 { + return nil, fmt.Errorf("threshold must be >= 0") + } + + validated := &opsAlertRuleValidatedInput{ + Name: name, + MetricType: metricType, + Operator: operator, + Threshold: threshold, + } + + if v, ok := raw["severity"]; ok { + validated.SeverityProvided = true + var sev string + if err := json.Unmarshal(v, &sev); err != nil { + return nil, fmt.Errorf("severity must be a string") + } + sev = strings.ToUpper(strings.TrimSpace(sev)) + if sev != "" { + if _, ok := validOpsAlertSeveritySet[sev]; !ok { + return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", ")) + } + validated.Severity = sev + } + } + if validated.Severity == "" { + validated.Severity = "P2" + } + + if v, ok := raw["enabled"]; ok { + validated.EnabledProvided = true + if err := json.Unmarshal(v, &validated.Enabled); err != nil { + return nil, fmt.Errorf("enabled must be a boolean") + } + } else { + validated.Enabled = true + } + + if v, ok := raw["notify_email"]; ok { + validated.NotifyProvided = true + if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil { + return nil, fmt.Errorf("notify_email must be a boolean") + } + } else { + validated.NotifyEmail = true + } + + if v, ok := raw["window_minutes"]; ok { + validated.WindowProvided = true + if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil { + return 
nil, fmt.Errorf("window_minutes must be an integer") + } + switch validated.WindowMinutes { + case 1, 5, 60: + default: + return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60") + } + } else { + validated.WindowMinutes = 1 + } + + if v, ok := raw["sustained_minutes"]; ok { + validated.SustainedProvided = true + if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil { + return nil, fmt.Errorf("sustained_minutes must be an integer") + } + if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 { + return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440") + } + } else { + validated.SustainedMinutes = 1 + } + + if v, ok := raw["cooldown_minutes"]; ok { + validated.CooldownProvided = true + if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil { + return nil, fmt.Errorf("cooldown_minutes must be an integer") + } + if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 { + return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440") + } + } else { + validated.CooldownMinutes = 0 + } + + return validated, nil +} + +// ListAlertRules returns all ops alert rules. +// GET /api/v1/admin/ops/alert-rules +func (h *OpsHandler) ListAlertRules(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + rules, err := h.opsService.ListAlertRules(c.Request.Context()) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, rules) +} + +// CreateAlertRule creates an ops alert rule. +// POST /api/v1/admin/ops/alert-rules +func (h *OpsHandler) CreateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, created) +} + +// UpdateAlertRule updates an existing ops alert rule. 
+// PUT /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) UpdateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.ID = id + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, updated) +} + +// DeleteAlertRule deletes an ops alert rule. +// DELETE /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) DeleteAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"deleted": true}) +} + +// ListAlertEvents lists recent ops alert events. +// GET /api/v1/admin/ops/alert-events +func (h *OpsHandler) ListAlertEvents(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + limit := 100 + if raw := strings.TrimSpace(c.Query("limit")); raw != "" { + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + filter := &service.OpsAlertEventFilter{ + Limit: limit, + Status: strings.TrimSpace(c.Query("status")), + Severity: strings.TrimSpace(c.Query("severity")), + } + + // Optional global filter support (platform/group/time range). 
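+ // Time filters below are applied only when start_time/end_time/time_range is explicitly provided.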
+ if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil { + // Only apply when explicitly provided to avoid surprising default narrowing. + if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" { + filter.StartTime = &startTime + filter.EndTime = &endTime + } + } else { + response.BadRequest(c, err.Error()) + return + } + + events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, events) +} diff --git a/backend/internal/handler/admin/ops_dashboard_handler.go b/backend/internal/handler/admin/ops_dashboard_handler.go new file mode 100644 index 00000000..2c87f734 --- /dev/null +++ b/backend/internal/handler/admin/ops_dashboard_handler.go @@ -0,0 +1,243 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetDashboardOverview returns vNext ops dashboard overview (raw path). +// GET /api/v1/admin/ops/dashboard/overview +func (h *OpsHandler) GetDashboardOverview(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardThroughputTrend returns throughput time series (raw path). 
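+// Bucket size is derived from the requested window via pickThroughputBucketSeconds (60s / 5m / 1h).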
+// GET /api/v1/admin/ops/dashboard/throughput-trend +func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests). +// GET /api/v1/admin/ops/dashboard/latency-histogram +func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorTrend returns error counts time series (raw path). 
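+// Shares the same filters and bucket sizing as the throughput trend.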
+// GET /api/v1/admin/ops/dashboard/error-trend +func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorDistribution returns error distribution by status code (raw path). +// GET /api/v1/admin/ops/dashboard/error-distribution +func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +func pickThroughputBucketSeconds(window time.Duration) int { + // Keep buckets predictable and avoid huge responses. + switch { + case window <= 2*time.Hour: + return 60 + case window <= 24*time.Hour: + return 300 + default: + return 3600 + } +} + +func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.Query("mode")) + if raw == "" { + // Empty means "use server default" (DB setting ops_query_mode_default). 
+ return "" + } + return service.ParseOpsQueryMode(raw) +} diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go new file mode 100644 index 00000000..bff7426a --- /dev/null +++ b/backend/internal/handler/admin/ops_handler.go @@ -0,0 +1,364 @@ +package admin + +import ( + "errors" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +type OpsHandler struct { + opsService *service.OpsService +} + +func NewOpsHandler(opsService *service.OpsService) *OpsHandler { + return &OpsHandler{opsService: opsService} +} + +// GetErrorLogs lists ops error logs. +// GET /api/v1/admin/ops/errors +func (h *OpsHandler) GetErrorLogs(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + // Ops list can be larger than standard admin tables. + if pageSize > 500 { + pageSize = 500 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsErrorLogFilter{ + Page: page, + PageSize: pageSize, + } + if !startTime.IsZero() { + filter.StartTime = &startTime + } + if !endTime.IsZero() { + filter.EndTime = &endTime + } + + if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if phase := strings.TrimSpace(c.Query("phase")); phase != "" { + filter.Phase = phase + } + if q := strings.TrimSpace(c.Query("q")); q != "" { + filter.Query = q + } + if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" { + parts := strings.Split(statusCodesStr, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts { + p := strings.TrimSpace(part) + if p == "" { + continue + } + n, err := strconv.Atoi(p) + if err != nil || n < 0 { + response.BadRequest(c, "Invalid status_codes") + return + } + out = append(out, n) + } + filter.StatusCodes = out + } + + result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize) +} + +// GetErrorLogByID returns a single error log detail. 
+// GET /api/v1/admin/ops/errors/:id +func (h *OpsHandler) GetErrorLogByID(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, detail) +} + +// ListRequestDetails returns a request-level list (success + error) for drill-down. +// GET /api/v1/admin/ops/requests +func (h *OpsHandler) ListRequestDetails(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + if pageSize > 100 { + pageSize = 100 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsRequestDetailFilter{ + Page: page, + PageSize: pageSize, + StartTime: &startTime, + EndTime: &endTime, + } + + filter.Kind = strings.TrimSpace(c.Query("kind")) + filter.Platform = strings.TrimSpace(c.Query("platform")) + filter.Model = strings.TrimSpace(c.Query("model")) + filter.RequestID = strings.TrimSpace(c.Query("request_id")) + filter.Query = strings.TrimSpace(c.Query("q")) + filter.Sort = strings.TrimSpace(c.Query("sort")) + + if v := strings.TrimSpace(c.Query("user_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid user_id") + return + } + filter.UserID = &id + } + if v := strings.TrimSpace(c.Query("api_key_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid api_key_id") + return + } + filter.APIKeyID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid min_duration_ms") + return + } + filter.MinDurationMs = &parsed + } + if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid max_duration_ms") + return + } + filter.MaxDurationMs = &parsed + } + + out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter) + if err != nil { + // Invalid sort/kind/platform etc should be a bad request; keep it simple. 
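+ // NOTE: relies on the service layer including "invalid" in its validation error messages.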
+ if strings.Contains(strings.ToLower(err.Error()), "invalid") { + response.BadRequest(c, err.Error()) + return + } + response.Error(c, http.StatusInternalServerError, "Failed to list request details") + return + } + + response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize) +} + +type opsRetryRequest struct { + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` +} + +// RetryErrorRequest retries a failed request using stored request_body. +// POST /api/v1/admin/ops/errors/:id/retry +func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + req := opsRetryRequest{Mode: service.OpsRetryModeClient} + if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + if strings.TrimSpace(req.Mode) == "" { + req.Mode = service.OpsRetryModeClient + } + + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, result) +} + +func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { + startStr := strings.TrimSpace(c.Query("start_time")) + endStr := strings.TrimSpace(c.Query("end_time")) + + parseTS := func(s string) (time.Time, error) { + if s == "" { + return time.Time{}, nil + } + if t, err := time.Parse(time.RFC3339Nano, s); err == nil { + return t, nil + } + return time.Parse(time.RFC3339, s) + } + + start, err := parseTS(startStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + end, err := parseTS(endStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + + // start/end explicitly provided (even partially) + if startStr != "" || endStr != "" { + if end.IsZero() { + end = time.Now() + } + if start.IsZero() { + dur, _ := parseOpsDuration(defaultRange) + start = end.Add(-dur) + } + if start.After(end) { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time") + } + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil + } + + // time_range fallback + tr := strings.TrimSpace(c.Query("time_range")) + if tr == "" { + tr = defaultRange + } + dur, ok := parseOpsDuration(tr) + if !ok { + dur, _ = parseOpsDuration(defaultRange) + } + + end = time.Now() + start = end.Add(-dur) + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil +} + +func parseOpsDuration(v string) (time.Duration, bool) { + switch strings.TrimSpace(v) { + case "5m": + return 5 * time.Minute, true + case "30m": + return 30 * time.Minute, true + case "1h": + return time.Hour, true + case "6h": + return 6 * time.Hour, true + case "24h": + return 24 * time.Hour, true + default: + return 0, false 
+ } +} diff --git a/backend/internal/handler/admin/ops_realtime_handler.go b/backend/internal/handler/admin/ops_realtime_handler.go new file mode 100644 index 00000000..0c23c13b --- /dev/null +++ b/backend/internal/handler/admin/ops_realtime_handler.go @@ -0,0 +1,120 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// GET /api/v1/admin/ops/concurrency +func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformConcurrencyInfo{}, + "group": map[int64]*service.GroupConcurrencyInfo{}, + "account": map[int64]*service.AccountConcurrencyInfo{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platformFilter := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platform, + "group": group, + "account": account, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} + +// GetAccountAvailability returns account availability statistics. 
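// The JSON written by this handler mirrors GetConcurrencyStats: an "enabled" flag, three maps
// keyed by platform name / group id / account id, and a "timestamp" that is omitted on the
// enabled path when no collection time is known. A hypothetical client-side mirror of the
// payload (the envelope added by response.Success is not shown; the concurrency endpoint is
// analogous with *ConcurrencyInfo value types):
//
//	type availabilityPayload struct {
//		Enabled   bool                                     `json:"enabled"`
//		Platform  map[string]*service.PlatformAvailability `json:"platform"`
//		Group     map[int64]*service.GroupAvailability     `json:"group"`
//		Account   map[int64]*service.AccountAvailability   `json:"account"`
//		Timestamp *time.Time                               `json:"timestamp,omitempty"`
//	}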
+// GET /api/v1/admin/ops/account-availability +// +// Query params: +// - platform: optional +// - group_id: optional +func (h *OpsHandler) GetAccountAvailability(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformAvailability{}, + "group": map[int64]*service.GroupAvailability{}, + "account": map[int64]*service.AccountAvailability{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platform := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platformStats, + "group": groupStats, + "account": accountStats, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go new file mode 100644 index 00000000..0e0ecb72 --- /dev/null +++ b/backend/internal/handler/admin/ops_settings_handler.go @@ -0,0 +1,148 @@ +package admin + +import ( + "net/http" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetEmailNotificationConfig returns Ops email notification config (DB-backed). +// GET /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get email notification config") + return + } + response.Success(c, cfg) +} + +// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed). +// PUT /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsEmailNotificationConfigUpdateRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req) + if err != nil { + // Most failures here are validation errors from request payload; treat as 400. 
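// Malformed JSON has already been rejected by ShouldBindJSON above, so err here comes from the
// service layer; returning it verbatim with a 400 keeps validation messages visible to the admin
// UI, at the cost of also labelling the rare persistence failure as a client error.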
+ response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + +// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed). +// GET /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings") + return + } + response.Success(c, cfg) +} + +// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed). +// PUT /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAlertRuntimeSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + +// GetAdvancedSettings returns Ops advanced settings (DB-backed). +// GET /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings") + return + } + response.Success(c, cfg) +} + +// UpdateAdvancedSettings updates Ops advanced settings (DB-backed). 
+// PUT /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAdvancedSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} diff --git a/backend/internal/handler/admin/ops_ws_handler.go b/backend/internal/handler/admin/ops_ws_handler.go new file mode 100644 index 00000000..db7442e5 --- /dev/null +++ b/backend/internal/handler/admin/ops_ws_handler.go @@ -0,0 +1,771 @@ +package admin + +import ( + "context" + "encoding/json" + "log" + "math" + "net" + "net/http" + "net/netip" + "net/url" + "os" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gorilla/websocket" +) + +type OpsWSProxyConfig struct { + TrustProxy bool + TrustedProxies []netip.Prefix + OriginPolicy string +} + +const ( + envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY" + envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES" + envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY" + envOpsWSMaxConns = "OPS_WS_MAX_CONNS" + envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP" +) + +const ( + OriginPolicyStrict = "strict" + OriginPolicyPermissive = "permissive" +) + +var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv() + +var upgrader = websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return isAllowedOpsWSOrigin(r) + }, + // Subprotocol negotiation: + // - The frontend passes ["sub2api-admin", "jwt."]. + // - We always select "sub2api-admin" so the token is never echoed back in the handshake response. + Subprotocols: []string{"sub2api-admin"}, +} + +const ( + qpsWSPushInterval = 2 * time.Second + qpsWSRefreshInterval = 5 * time.Second + qpsWSRequestCountWindow = 1 * time.Minute + + defaultMaxWSConns = 100 + defaultMaxWSConnsPerIP = 20 +) + +var wsConnCount atomic.Int32 +var wsConnCountByIP sync.Map // map[string]*atomic.Int32 + +const qpsWSIdleStopDelay = 30 * time.Second + +const ( + opsWSCloseRealtimeDisabled = 4001 +) + +var qpsWSIdleStopMu sync.Mutex +var qpsWSIdleStopTimer *time.Timer + +func cancelQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopTimer.Stop() + qpsWSIdleStopTimer = nil + } + qpsWSIdleStopMu.Unlock() +} + +func scheduleQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopMu.Unlock() + return + } + qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() { + // Only stop if truly idle at fire time. + if wsConnCount.Load() == 0 { + qpsWSCache.Stop() + } + qpsWSIdleStopMu.Lock() + qpsWSIdleStopTimer = nil + qpsWSIdleStopMu.Unlock() + }) + qpsWSIdleStopMu.Unlock() +} + +type opsWSRuntimeLimits struct { + MaxConns int32 + MaxConnsPerIP int32 +} + +var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv() + +const ( + qpsWSWriteTimeout = 10 * time.Second + qpsWSPongWait = 60 * time.Second + qpsWSPingInterval = 30 * time.Second + + // We don't expect clients to send application messages; we only read to process control frames (Pong/Close). 
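// For reference, a minimal admin client for the endpoint below (GET /api/v1/admin/ops/ws/qps),
// written against gorilla/websocket and assuming the JWT rides in the second offered
// subprotocol with the "jwt." prefix, as the negotiation comment above suggests. The server
// pushes a "qps_update" message roughly every 2s, computed over a 1-minute window that is
// refreshed at most every 5s:
//
//	d := websocket.Dialer{Subprotocols: []string{"sub2api-admin", "jwt." + adminToken}}
//	conn, _, err := d.Dial("wss://ops.example.invalid/api/v1/admin/ops/ws/qps", nil)
//	if err != nil {
//		log.Fatalf("dial: %v", err)
//	}
//	defer conn.Close()
//	for {
//		_, msg, err := conn.ReadMessage()
//		if err != nil {
//			// Close code 4001 means realtime monitoring is disabled server-side.
//			log.Printf("closed: %v", err)
//			return
//		}
//		log.Printf("payload: %s", msg) // {"type":"qps_update","timestamp":...,"data":{"qps":...}}
//	}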
+ qpsWSMaxReadBytes = 1024 +) + +type opsWSQPSCache struct { + refreshInterval time.Duration + requestCountWindow time.Duration + + lastUpdatedUnixNano atomic.Int64 + payload atomic.Value // []byte + + opsService *service.OpsService + cancel context.CancelFunc + done chan struct{} + + mu sync.Mutex + running bool +} + +var qpsWSCache = &opsWSQPSCache{ + refreshInterval: qpsWSRefreshInterval, + requestCountWindow: qpsWSRequestCountWindow, +} + +func (c *opsWSQPSCache) start(opsService *service.OpsService) { + if c == nil || opsService == nil { + return + } + + for { + c.mu.Lock() + if c.running { + c.mu.Unlock() + return + } + + // If a previous refresh loop is currently stopping, wait for it to fully exit. + done := c.done + if done != nil { + c.mu.Unlock() + <-done + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() + continue + } + + c.opsService = opsService + ctx, cancel := context.WithCancel(context.Background()) + c.cancel = cancel + c.done = make(chan struct{}) + done = c.done + c.running = true + c.mu.Unlock() + + go func() { + defer close(done) + c.refreshLoop(ctx) + }() + return + } +} + +// Stop stops the background refresh loop. +// It is safe to call multiple times. +func (c *opsWSQPSCache) Stop() { + if c == nil { + return + } + + c.mu.Lock() + if !c.running { + done := c.done + c.mu.Unlock() + if done != nil { + <-done + } + return + } + cancel := c.cancel + c.cancel = nil + c.running = false + c.opsService = nil + done := c.done + c.mu.Unlock() + + if cancel != nil { + cancel() + } + if done != nil { + <-done + } + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() +} + +func (c *opsWSQPSCache) refreshLoop(ctx context.Context) { + ticker := time.NewTicker(c.refreshInterval) + defer ticker.Stop() + + c.refresh(ctx) + for { + select { + case <-ticker.C: + c.refresh(ctx) + case <-ctx.Done(): + return + } + } +} + +func (c *opsWSQPSCache) refresh(parentCtx context.Context) { + if c == nil { + return + } + + c.mu.Lock() + opsService := c.opsService + c.mu.Unlock() + if opsService == nil { + return + } + + if parentCtx == nil { + parentCtx = context.Background() + } + ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second) + defer cancel() + + now := time.Now().UTC() + stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now) + if err != nil || stats == nil { + if err != nil { + log.Printf("[OpsWS] refresh: get window stats failed: %v", err) + } + return + } + + requestCount := stats.SuccessCount + stats.ErrorCountTotal + qps := 0.0 + tps := 0.0 + if c.requestCountWindow > 0 { + seconds := c.requestCountWindow.Seconds() + qps = roundTo1DP(float64(requestCount) / seconds) + tps = roundTo1DP(float64(stats.TokenConsumed) / seconds) + } + + payload := gin.H{ + "type": "qps_update", + "timestamp": now.Format(time.RFC3339), + "data": gin.H{ + "qps": qps, + "tps": tps, + "request_count": requestCount, + }, + } + + msg, err := json.Marshal(payload) + if err != nil { + log.Printf("[OpsWS] refresh: marshal payload failed: %v", err) + return + } + + c.payload.Store(msg) + c.lastUpdatedUnixNano.Store(now.UnixNano()) +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func (c *opsWSQPSCache) getPayload() []byte { + if c == nil { + return nil + } + if cached, ok := c.payload.Load().([]byte); ok && cached != nil { + return cached + } + return nil +} + +func closeWS(conn *websocket.Conn, code int, reason string) { + if conn == nil { + return + } + msg := 
websocket.FormatCloseMessage(code, reason) + _ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout)) + _ = conn.Close() +} + +// QPSWSHandler handles realtime QPS push via WebSocket. +// GET /api/v1/admin/ops/ws/qps +func (h *OpsHandler) QPSWSHandler(c *gin.Context) { + clientIP := requestClientIP(c.Request) + + if h == nil || h.opsService == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"}) + return + } + + // If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close + // with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops. + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"}) + return + } + closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled") + return + } + + cancelQPSWSIdleStop() + // Lazily start the background refresh loop so unit tests that never hit the + // websocket route don't spawn goroutines that depend on DB/Redis stubs. + qpsWSCache.start(h.opsService) + + // Reserve a global slot before upgrading the connection to keep the limit strict. + if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) { + log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer func() { + if wsConnCount.Add(-1) == 0 { + scheduleQPSWSIdleStop() + } + }() + + if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" { + if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) { + log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer releaseOpsWSIPSlot(clientIP) + } + + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + log.Printf("[OpsWS] upgrade failed: %v", err) + return + } + + defer func() { + _ = conn.Close() + }() + + handleQPSWebSocket(c.Request.Context(), conn) +} + +func tryAcquireOpsWSTotalSlot(limit int32) bool { + if limit <= 0 { + return true + } + for { + current := wsConnCount.Load() + if current >= limit { + return false + } + if wsConnCount.CompareAndSwap(current, current+1) { + return true + } + } +} + +func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool { + if strings.TrimSpace(clientIP) == "" || limit <= 0 { + return true + } + + v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{}) + counter, ok := v.(*atomic.Int32) + if !ok { + return false + } + + for { + current := counter.Load() + if current >= limit { + return false + } + if counter.CompareAndSwap(current, current+1) { + return true + } + } +} + +func releaseOpsWSIPSlot(clientIP string) { + if strings.TrimSpace(clientIP) == "" { + return + } + + v, ok := wsConnCountByIP.Load(clientIP) + if !ok { + return + } + counter, ok := v.(*atomic.Int32) + if !ok { + return + } + next := counter.Add(-1) + if next <= 0 { + // Best-effort cleanup; safe even if a new slot was acquired concurrently. 
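// "Best effort" concretely: a concurrent acquire may have bumped this same counter between the
// Add(-1) above and the Delete below, in which case that connection ends up tracked by a fresh
// counter created on the next LoadOrStore. The per-IP cap can therefore be exceeded briefly,
// but counts re-converge as those connections close, and the global wsConnCount limit still holds.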
+ wsConnCountByIP.Delete(clientIP) + } +} + +func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) { + if conn == nil { + return + } + + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + var closeOnce sync.Once + closeConn := func() { + closeOnce.Do(func() { + _ = conn.Close() + }) + } + + closeFrameCh := make(chan []byte, 1) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + defer cancel() + + conn.SetReadLimit(qpsWSMaxReadBytes) + if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil { + log.Printf("[OpsWS] set read deadline failed: %v", err) + return + } + conn.SetPongHandler(func(string) error { + return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)) + }) + conn.SetCloseHandler(func(code int, text string) error { + select { + case closeFrameCh <- websocket.FormatCloseMessage(code, text): + default: + } + cancel() + return nil + }) + + for { + _, _, err := conn.ReadMessage() + if err != nil { + if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) { + log.Printf("[OpsWS] read failed: %v", err) + } + return + } + } + }() + + // Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval). + pushTicker := time.NewTicker(qpsWSPushInterval) + defer pushTicker.Stop() + + // Heartbeat ping every 30 seconds. + pingTicker := time.NewTicker(qpsWSPingInterval) + defer pingTicker.Stop() + + writeWithTimeout := func(messageType int, data []byte) error { + if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil { + return err + } + return conn.WriteMessage(messageType, data) + } + + sendClose := func(closeFrame []byte) { + if closeFrame == nil { + closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "") + } + _ = writeWithTimeout(websocket.CloseMessage, closeFrame) + } + + for { + select { + case <-pushTicker.C: + msg := qpsWSCache.getPayload() + if msg == nil { + continue + } + if err := writeWithTimeout(websocket.TextMessage, msg); err != nil { + log.Printf("[OpsWS] write failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case <-pingTicker.C: + if err := writeWithTimeout(websocket.PingMessage, nil); err != nil { + log.Printf("[OpsWS] ping failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case closeFrame := <-closeFrameCh: + sendClose(closeFrame) + closeConn() + wg.Wait() + return + + case <-ctx.Done(): + var closeFrame []byte + select { + case closeFrame = <-closeFrameCh: + default: + } + sendClose(closeFrame) + + closeConn() + wg.Wait() + return + } + } +} + +func isAllowedOpsWSOrigin(r *http.Request) bool { + if r == nil { + return false + } + origin := strings.TrimSpace(r.Header.Get("Origin")) + if origin == "" { + switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) { + case OriginPolicyStrict: + return false + case OriginPolicyPermissive, "": + return true + default: + return true + } + } + parsed, err := url.Parse(origin) + if err != nil || parsed.Hostname() == "" { + return false + } + originHost := strings.ToLower(parsed.Hostname()) + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + reqHost := hostWithoutPort(r.Host) + if trustProxyHeaders { + xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host")) + if xfHost != "" { + xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0]) + if xfHost != "" { + reqHost = hostWithoutPort(xfHost) + } + } + } + reqHost = 
strings.ToLower(reqHost) + if reqHost == "" { + return false + } + return originHost == reqHost +} + +func shouldTrustOpsWSProxyHeaders(r *http.Request) bool { + if r == nil { + return false + } + if !opsWSProxyConfig.TrustProxy { + return false + } + peerIP, ok := requestPeerIP(r) + if !ok { + return false + } + return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies) +} + +func requestPeerIP(r *http.Request) (netip.Addr, bool) { + if r == nil { + return netip.Addr{}, false + } + host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)) + if err != nil { + host = strings.TrimSpace(r.RemoteAddr) + } + host = strings.TrimPrefix(host, "[") + host = strings.TrimSuffix(host, "]") + if host == "" { + return netip.Addr{}, false + } + addr, err := netip.ParseAddr(host) + if err != nil { + return netip.Addr{}, false + } + return addr.Unmap(), true +} + +func requestClientIP(r *http.Request) string { + if r == nil { + return "" + } + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + if trustProxyHeaders { + xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For")) + if xff != "" { + // Use the left-most entry (original client). If multiple proxies add values, they are comma-separated. + xff = strings.TrimSpace(strings.Split(xff, ",")[0]) + xff = strings.TrimPrefix(xff, "[") + xff = strings.TrimSuffix(xff, "]") + if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() { + return addr.Unmap().String() + } + } + } + + if peer, ok := requestPeerIP(r); ok && peer.IsValid() { + return peer.String() + } + return "" +} + +func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool { + if !addr.IsValid() { + return false + } + for _, p := range trusted { + if p.Contains(addr) { + return true + } + } + return false +} + +func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig { + cfg := OpsWSProxyConfig{ + TrustProxy: true, + TrustedProxies: defaultTrustedProxies(), + OriginPolicy: OriginPolicyPermissive, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" { + if parsed, err := strconv.ParseBool(v); err == nil { + cfg.TrustProxy = parsed + } else { + log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy) + } + } + + if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" { + prefixes, invalid := parseTrustedProxyList(raw) + if len(invalid) > 0 { + log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", ")) + } + cfg.TrustedProxies = prefixes + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" { + normalized := strings.ToLower(v) + switch normalized { + case OriginPolicyStrict, OriginPolicyPermissive: + cfg.OriginPolicy = normalized + default: + log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy) + } + } + + return cfg +} + +func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits { + cfg := opsWSRuntimeLimits{ + MaxConns: defaultMaxWSConns, + MaxConnsPerIP: defaultMaxWSConnsPerIP, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 { + cfg.MaxConns = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns) + } + } + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil 
&& parsed >= 0 { + cfg.MaxConnsPerIP = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP) + } + } + return cfg +} + +func defaultTrustedProxies() []netip.Prefix { + prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128") + return prefixes +} + +func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) { + for _, token := range strings.Split(raw, ",") { + item := strings.TrimSpace(token) + if item == "" { + continue + } + + var ( + p netip.Prefix + err error + ) + if strings.Contains(item, "/") { + p, err = netip.ParsePrefix(item) + } else { + var addr netip.Addr + addr, err = netip.ParseAddr(item) + if err == nil { + addr = addr.Unmap() + bits := 128 + if addr.Is4() { + bits = 32 + } + p = netip.PrefixFrom(addr, bits) + } + } + + if err != nil || !p.IsValid() { + invalid = append(invalid, item) + continue + } + + prefixes = append(prefixes, p.Masked()) + } + return prefixes, invalid +} + +func hostWithoutPort(hostport string) string { + hostport = strings.TrimSpace(hostport) + if hostport == "" { + return "" + } + if host, _, err := net.SplitHostPort(hostport); err == nil { + return host + } + if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") { + return strings.Trim(hostport, "[]") + } + parts := strings.Split(hostport, ":") + return parts[0] +} diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go index e1584acb..2f9785ee 100644 --- a/backend/internal/handler/admin/setting_handler.go +++ b/backend/internal/handler/admin/setting_handler.go @@ -19,14 +19,16 @@ type SettingHandler struct { settingService *service.SettingService emailService *service.EmailService turnstileService *service.TurnstileService + opsService *service.OpsService } // NewSettingHandler 创建系统设置处理器 -func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService) *SettingHandler { +func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService, opsService *service.OpsService) *SettingHandler { return &SettingHandler{ settingService: settingService, emailService: emailService, turnstileService: turnstileService, + opsService: opsService, } } @@ -39,6 +41,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { return } + // Check if ops monitoring is enabled (respects config.ops.enabled) + opsEnabled := h.opsService != nil && h.opsService.IsMonitoringEnabled(c.Request.Context()) + response.Success(c, dto.SystemSettings{ RegistrationEnabled: settings.RegistrationEnabled, EmailVerifyEnabled: settings.EmailVerifyEnabled, @@ -72,6 +77,10 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { FallbackModelAntigravity: settings.FallbackModelAntigravity, EnableIdentityPatch: settings.EnableIdentityPatch, IdentityPatchPrompt: settings.IdentityPatchPrompt, + OpsMonitoringEnabled: opsEnabled && settings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: settings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds, }) } @@ -95,7 +104,7 @@ type UpdateSettingsRequest struct { TurnstileSiteKey string `json:"turnstile_site_key"` TurnstileSecretKey string `json:"turnstile_secret_key"` - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 LinuxDoConnectEnabled bool 
`json:"linuxdo_connect_enabled"` LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"` LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"` @@ -124,6 +133,12 @@ type UpdateSettingsRequest struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault *string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"` } // UpdateSettings 更新系统设置 @@ -208,6 +223,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { } } + // Ops metrics collector interval validation (seconds). + if req.OpsMetricsIntervalSeconds != nil { + v := *req.OpsMetricsIntervalSeconds + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + req.OpsMetricsIntervalSeconds = &v + } + settings := &service.SystemSettings{ RegistrationEnabled: req.RegistrationEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled, @@ -241,6 +268,30 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: req.FallbackModelAntigravity, EnableIdentityPatch: req.EnableIdentityPatch, IdentityPatchPrompt: req.IdentityPatchPrompt, + OpsMonitoringEnabled: func() bool { + if req.OpsMonitoringEnabled != nil { + return *req.OpsMonitoringEnabled + } + return previousSettings.OpsMonitoringEnabled + }(), + OpsRealtimeMonitoringEnabled: func() bool { + if req.OpsRealtimeMonitoringEnabled != nil { + return *req.OpsRealtimeMonitoringEnabled + } + return previousSettings.OpsRealtimeMonitoringEnabled + }(), + OpsQueryModeDefault: func() string { + if req.OpsQueryModeDefault != nil { + return *req.OpsQueryModeDefault + } + return previousSettings.OpsQueryModeDefault + }(), + OpsMetricsIntervalSeconds: func() int { + if req.OpsMetricsIntervalSeconds != nil { + return *req.OpsMetricsIntervalSeconds + } + return previousSettings.OpsMetricsIntervalSeconds + }(), } if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { @@ -290,6 +341,10 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity, EnableIdentityPatch: updatedSettings.EnableIdentityPatch, IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt, + OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds, }) } @@ -411,6 +466,18 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings, if before.IdentityPatchPrompt != after.IdentityPatchPrompt { changed = append(changed, "identity_patch_prompt") } + if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled { + changed = append(changed, "ops_monitoring_enabled") + } + if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled { + changed = append(changed, "ops_realtime_monitoring_enabled") + } + if before.OpsQueryModeDefault != after.OpsQueryModeDefault { + changed = append(changed, "ops_query_mode_default") + } + if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds { + changed = append(changed, "ops_metrics_interval_seconds") + } return changed } diff --git 
a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go index c95bb6e5..d95fb121 100644 --- a/backend/internal/handler/dto/settings.go +++ b/backend/internal/handler/dto/settings.go @@ -43,6 +43,12 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"` } type PublicSettings struct { diff --git a/backend/internal/handler/gateway_handler.go b/backend/internal/handler/gateway_handler.go index 0d38db17..284a4f8f 100644 --- a/backend/internal/handler/gateway_handler.go +++ b/backend/internal/handler/gateway_handler.go @@ -15,7 +15,6 @@ import ( "github.com/Wei-Shaw/sub2api/internal/pkg/antigravity" "github.com/Wei-Shaw/sub2api/internal/pkg/claude" pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" "github.com/Wei-Shaw/sub2api/internal/pkg/openai" middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -89,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") @@ -97,8 +98,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { reqModel := parsedReq.Model reqStream := parsedReq.Stream - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) + setOpsRequestContext(c, reqModel, reqStream, body) // 验证 model 必填 if reqModel == "" { @@ -112,15 +112,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 获取订阅信息(可能为nil)- 提前获取用于后续检查 subscription, _ := middleware2.GetSubscriptionFromContext(c) - // 获取 User-Agent - userAgent := c.Request.UserAgent() - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - // 0. 检查wait队列是否已满 maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -128,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // 确保在函数退出时减少wait计数 - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + // Ensure we decrement if we exit before acquiring the user slot. + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. 首先获取用户并发槽位 userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -138,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting in the queue. 
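// Invariant behind the waitCounted flag: the wait counter only tracks requests that are queued
// but not yet holding a user slot. It is decremented here as soon as the slot is acquired
// (instead of via an unconditional defer, which previously kept the request counted as
// "waiting" for its whole lifetime), while the defer above still covers early-error exits.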
+ if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -184,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -200,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -213,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + // Ensure the wait counter is decremented if we exit before acquiring the slot. + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -229,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + // Slot acquired: no longer waiting in queue. 
+ if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -254,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -277,7 +287,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { } // 异步记录使用量(subscription已在函数开头获取) - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -286,12 +296,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } @@ -313,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -329,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 
获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -342,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -358,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -383,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -406,7 +415,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { } // 异步记录使用量(subscription已在函数开头获取) - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -415,12 +424,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } @@ -686,21 +693,22 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") return } - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) - // 验证 model 必填 if parsedReq.Model == "" { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is 
required") return } + setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body) + // 获取订阅信息(可能为nil) subscription, _ := middleware2.GetSubscriptionFromContext(c) @@ -721,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error()) return } + setOpsSelectedAccount(c, account.ID) // 转发请求(不记录使用量) if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil { diff --git a/backend/internal/handler/gemini_v1beta_handler.go b/backend/internal/handler/gemini_v1beta_handler.go index 986b174b..d639beb3 100644 --- a/backend/internal/handler/gemini_v1beta_handler.go +++ b/backend/internal/handler/gemini_v1beta_handler.go @@ -12,7 +12,6 @@ import ( "github.com/Wei-Shaw/sub2api/internal/pkg/antigravity" "github.com/Wei-Shaw/sub2api/internal/pkg/gemini" "github.com/Wei-Shaw/sub2api/internal/pkg/googleapi" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -162,28 +161,32 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } + setOpsRequestContext(c, modelName, stream, body) + // Get subscription (may be nil) subscription, _ := middleware.GetSubscriptionFromContext(c) - // 获取 User-Agent - userAgent := c.Request.UserAgent() - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - // For Gemini native API, do not send Claude-style ping frames. geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0) // 0) wait queue check maxWait := service.CalculateMaxWait(authSubject.Concurrency) canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) } else if !canWait { googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return } - defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + } + }() // 1) user concurrency slot streamStarted := false @@ -192,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { googleError(c, http.StatusTooManyRequests, err.Error()) return } + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -207,10 +214,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { // 3) select account (sticky session based on request body) parsedReq, _ := service.ParseGatewayRequest(body) - - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) - sessionHash := h.gatewayService.GenerateSessionHash(parsedReq) sessionKey := sessionHash if sessionHash != "" { @@ -232,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 4) account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts") return } + 
accountWaitCounted := false canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -248,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout( c, @@ -264,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } googleError(c, http.StatusTooManyRequests, err.Error()) return } + if accountWaitCounted { + geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 5) forward (根据平台分流) var result *service.ForwardResult @@ -288,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -311,7 +315,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { } // 6) record usage async - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -320,12 +324,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } diff --git a/backend/internal/handler/handler.go b/backend/internal/handler/handler.go index 047703f3..5b1b317d 100644 --- a/backend/internal/handler/handler.go +++ b/backend/internal/handler/handler.go @@ -18,6 +18,7 @@ type AdminHandlers struct { Redeem *admin.RedeemHandler Promo *admin.PromoHandler Setting *admin.SettingHandler + Ops *admin.OpsHandler System *admin.SystemHandler Subscription *admin.SubscriptionHandler Usage *admin.UsageHandler diff --git a/backend/internal/handler/openai_gateway_handler.go b/backend/internal/handler/openai_gateway_handler.go index 8c7d7d52..5f3474b0 100644 --- a/backend/internal/handler/openai_gateway_handler.go +++ b/backend/internal/handler/openai_gateway_handler.go @@ -12,7 +12,6 @@ import ( "time" "github.com/Wei-Shaw/sub2api/internal/config" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" 
"github.com/Wei-Shaw/sub2api/internal/pkg/openai" middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -77,6 +76,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + // Parse request body to map for potential modification var reqBody map[string]any if err := json.Unmarshal(body, &reqBody); err != nil { @@ -95,10 +96,6 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } userAgent := c.GetHeader("User-Agent") - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - if !openai.IsCodexCLIRequest(userAgent) { existingInstructions, _ := reqBody["instructions"].(string) if strings.TrimSpace(existingInstructions) == "" { @@ -114,6 +111,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } } + setOpsRequestContext(c, reqModel, reqStream, body) + // Track if we've started streaming (for error handling) streamStarted := false @@ -123,6 +122,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { // 0. Check if wait queue is full maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -130,8 +130,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // Ensure wait count is decremented when function exits - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. First acquire user concurrency slot userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -140,6 +146,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting. + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -177,15 +188,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } account := selection.Account log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name) + setOpsSelectedAccount(c, account.ID) // 3. 
Acquire account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -193,12 +205,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -209,29 +224,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // Forward request result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body) if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -252,7 +264,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } // Async record usage - go func(result *service.OpenAIForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.OpenAIForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{ @@ -261,12 +273,10 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go new file mode 100644 index 00000000..7115059a --- /dev/null +++ b/backend/internal/handler/ops_error_logger.go @@ -0,0 +1,954 @@ +package handler + +import ( + "bytes" + "context" + "encoding/json" + "log" + "runtime" + "runtime/debug" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + 
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +const ( + opsModelKey = "ops_model" + opsStreamKey = "ops_stream" + opsRequestBodyKey = "ops_request_body" + opsAccountIDKey = "ops_account_id" +) + +const ( + opsErrorLogTimeout = 5 * time.Second + opsErrorLogDrainTimeout = 10 * time.Second + + opsErrorLogMinWorkerCount = 4 + opsErrorLogMaxWorkerCount = 32 + + opsErrorLogQueueSizePerWorker = 128 + opsErrorLogMinQueueSize = 256 + opsErrorLogMaxQueueSize = 8192 +) + +type opsErrorLogJob struct { + ops *service.OpsService + entry *service.OpsInsertErrorLogInput + requestBody []byte +} + +var ( + opsErrorLogOnce sync.Once + opsErrorLogQueue chan opsErrorLogJob + + opsErrorLogStopOnce sync.Once + opsErrorLogWorkersWg sync.WaitGroup + opsErrorLogMu sync.RWMutex + opsErrorLogStopping bool + opsErrorLogQueueLen atomic.Int64 + opsErrorLogEnqueued atomic.Int64 + opsErrorLogDropped atomic.Int64 + opsErrorLogProcessed atomic.Int64 + + opsErrorLogLastDropLogAt atomic.Int64 + + opsErrorLogShutdownCh = make(chan struct{}) + opsErrorLogShutdownOnce sync.Once + opsErrorLogDrained atomic.Bool +) + +func startOpsErrorLogWorkers() { + opsErrorLogMu.Lock() + defer opsErrorLogMu.Unlock() + + if opsErrorLogStopping { + return + } + + workerCount, queueSize := opsErrorLogConfig() + opsErrorLogQueue = make(chan opsErrorLogJob, queueSize) + opsErrorLogQueueLen.Store(0) + + opsErrorLogWorkersWg.Add(workerCount) + for i := 0; i < workerCount; i++ { + go func() { + defer opsErrorLogWorkersWg.Done() + for job := range opsErrorLogQueue { + opsErrorLogQueueLen.Add(-1) + if job.ops == nil || job.entry == nil { + continue + } + func() { + defer func() { + if r := recover(); r != nil { + log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack()) + } + }() + ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout) + _ = job.ops.RecordError(ctx, job.entry, job.requestBody) + cancel() + opsErrorLogProcessed.Add(1) + }() + } + }() + } +} + +func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) { + if ops == nil || entry == nil { + return + } + select { + case <-opsErrorLogShutdownCh: + return + default: + } + + opsErrorLogMu.RLock() + stopping := opsErrorLogStopping + opsErrorLogMu.RUnlock() + if stopping { + return + } + + opsErrorLogOnce.Do(startOpsErrorLogWorkers) + + opsErrorLogMu.RLock() + defer opsErrorLogMu.RUnlock() + if opsErrorLogStopping || opsErrorLogQueue == nil { + return + } + + select { + case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}: + opsErrorLogQueueLen.Add(1) + opsErrorLogEnqueued.Add(1) + default: + // Queue is full; drop to avoid blocking request handling. 
+ opsErrorLogDropped.Add(1) + maybeLogOpsErrorLogDrop() + } +} + +func StopOpsErrorLogWorkers() bool { + opsErrorLogStopOnce.Do(func() { + opsErrorLogShutdownOnce.Do(func() { + close(opsErrorLogShutdownCh) + }) + opsErrorLogDrained.Store(stopOpsErrorLogWorkers()) + }) + return opsErrorLogDrained.Load() +} + +func stopOpsErrorLogWorkers() bool { + opsErrorLogMu.Lock() + opsErrorLogStopping = true + ch := opsErrorLogQueue + if ch != nil { + close(ch) + } + opsErrorLogQueue = nil + opsErrorLogMu.Unlock() + + if ch == nil { + opsErrorLogQueueLen.Store(0) + return true + } + + done := make(chan struct{}) + go func() { + opsErrorLogWorkersWg.Wait() + close(done) + }() + + select { + case <-done: + opsErrorLogQueueLen.Store(0) + return true + case <-time.After(opsErrorLogDrainTimeout): + return false + } +} + +func OpsErrorLogQueueLength() int64 { + return opsErrorLogQueueLen.Load() +} + +func OpsErrorLogQueueCapacity() int { + opsErrorLogMu.RLock() + ch := opsErrorLogQueue + opsErrorLogMu.RUnlock() + if ch == nil { + return 0 + } + return cap(ch) +} + +func OpsErrorLogDroppedTotal() int64 { + return opsErrorLogDropped.Load() +} + +func OpsErrorLogEnqueuedTotal() int64 { + return opsErrorLogEnqueued.Load() +} + +func OpsErrorLogProcessedTotal() int64 { + return opsErrorLogProcessed.Load() +} + +func maybeLogOpsErrorLogDrop() { + now := time.Now().Unix() + + for { + last := opsErrorLogLastDropLogAt.Load() + if last != 0 && now-last < 60 { + return + } + if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) { + break + } + } + + queued := opsErrorLogQueueLen.Load() + queueCap := OpsErrorLogQueueCapacity() + + log.Printf( + "[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)", + queued, + queueCap, + opsErrorLogEnqueued.Load(), + opsErrorLogDropped.Load(), + opsErrorLogProcessed.Load(), + ) +} + +func opsErrorLogConfig() (workerCount int, queueSize int) { + workerCount = runtime.GOMAXPROCS(0) * 2 + if workerCount < opsErrorLogMinWorkerCount { + workerCount = opsErrorLogMinWorkerCount + } + if workerCount > opsErrorLogMaxWorkerCount { + workerCount = opsErrorLogMaxWorkerCount + } + + queueSize = workerCount * opsErrorLogQueueSizePerWorker + if queueSize < opsErrorLogMinQueueSize { + queueSize = opsErrorLogMinQueueSize + } + if queueSize > opsErrorLogMaxQueueSize { + queueSize = opsErrorLogMaxQueueSize + } + + return workerCount, queueSize +} + +func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) { + if c == nil { + return + } + c.Set(opsModelKey, model) + c.Set(opsStreamKey, stream) + if len(requestBody) > 0 { + c.Set(opsRequestBodyKey, requestBody) + } +} + +func setOpsSelectedAccount(c *gin.Context, accountID int64) { + if c == nil || accountID <= 0 { + return + } + c.Set(opsAccountIDKey, accountID) +} + +type opsCaptureWriter struct { + gin.ResponseWriter + limit int + buf bytes.Buffer +} + +func (w *opsCaptureWriter) Write(b []byte) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(b) > remaining { + _, _ = w.buf.Write(b[:remaining]) + } else { + _, _ = w.buf.Write(b) + } + } + return w.ResponseWriter.Write(b) +} + +func (w *opsCaptureWriter) WriteString(s string) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(s) > remaining { + _, _ = w.buf.WriteString(s[:remaining]) + } else { + _, _ = w.buf.WriteString(s) + } + } + return 
w.ResponseWriter.WriteString(s) +} + +// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs. +// +// Notes: +// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic. +// - Streaming errors after the response has started (SSE) may still need explicit logging. +func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { + return func(c *gin.Context) { + w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024} + c.Writer = w + c.Next() + + if ops == nil { + return + } + if !ops.IsMonitoringEnabled(c.Request.Context()) { + return + } + + status := c.Writer.Status() + if status < 400 { + // Even when the client request succeeds, we still want to persist upstream error attempts + // (retries/failover) so ops can observe upstream instability that gets "covered" by retries. + var events []*service.OpsUpstreamErrorEvent + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 { + events = arr + } + } + // Also accept single upstream fields set by gateway services (rare for successful requests). + hasUpstreamContext := len(events) > 0 + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + hasUpstreamContext = t > 0 + case int64: + hasUpstreamContext = t > 0 + } + } + } + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + hasUpstreamContext = true + } + } + } + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + hasUpstreamContext = true + } + } + } + if !hasUpstreamContext { + return + } + + apiKey, _ := middleware2.GetAPIKeyFromContext(c) + clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) + + model, _ := c.Get(opsModelKey) + streamV, _ := c.Get(opsStreamKey) + accountIDV, _ := c.Get(opsAccountIDKey) + + var modelName string + if s, ok := model.(string); ok { + modelName = s + } + stream := false + if b, ok := streamV.(bool); ok { + stream = b + } + + // Prefer showing the account that experienced the upstream error (if we have events), + // otherwise fall back to the final selected account (best-effort). + var accountID *int64 + if len(events) > 0 { + if last := events[len(events)-1]; last != nil && last.AccountID > 0 { + v := last.AccountID + accountID = &v + } + } + if accountID == nil { + if v, ok := accountIDV.(int64); ok && v > 0 { + accountID = &v + } + } + + fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path) + platform := resolveOpsPlatform(apiKey, fallbackPlatform) + + requestID := c.Writer.Header().Get("X-Request-Id") + if requestID == "" { + requestID = c.Writer.Header().Get("x-request-id") + } + + // Best-effort backfill single upstream fields from the last event (if present). 
+ var upstreamStatusCode *int + var upstreamErrorMessage *string + var upstreamErrorDetail *string + if len(events) > 0 { + last := events[len(events)-1] + if last != nil { + if last.UpstreamStatusCode > 0 { + code := last.UpstreamStatusCode + upstreamStatusCode = &code + } + if msg := strings.TrimSpace(last.Message); msg != "" { + upstreamErrorMessage = &msg + } + if detail := strings.TrimSpace(last.Detail); detail != "" { + upstreamErrorDetail = &detail + } + } + } + + if upstreamStatusCode == nil { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + if t > 0 { + code := t + upstreamStatusCode = &code + } + case int64: + if t > 0 { + code := int(t) + upstreamStatusCode = &code + } + } + } + } + if upstreamErrorMessage == nil { + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + msg := strings.TrimSpace(s) + upstreamErrorMessage = &msg + } + } + } + if upstreamErrorDetail == nil { + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + detail := strings.TrimSpace(s) + upstreamErrorDetail = &detail + } + } + } + + // If we still have nothing meaningful, skip. + if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 { + return + } + + effectiveUpstreamStatus := 0 + if upstreamStatusCode != nil { + effectiveUpstreamStatus = *upstreamStatusCode + } + + recoveredMsg := "Recovered upstream error" + if effectiveUpstreamStatus > 0 { + recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus) + } + if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" { + recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage) + } + recoveredMsg = truncateString(recoveredMsg, 2048) + + entry := &service.OpsInsertErrorLogInput{ + RequestID: requestID, + ClientRequestID: clientRequestID, + + AccountID: accountID, + Platform: platform, + Model: modelName, + RequestPath: func() string { + if c.Request != nil && c.Request.URL != nil { + return c.Request.URL.Path + } + return "" + }(), + Stream: stream, + UserAgent: c.GetHeader("User-Agent"), + + ErrorPhase: "upstream", + ErrorType: "upstream_error", + // Severity/retryability should reflect the upstream failure, not the final client status (200). + Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus), + StatusCode: status, + IsBusinessLimited: false, + + ErrorMessage: recoveredMsg, + ErrorBody: "", + + ErrorSource: "upstream_http", + ErrorOwner: "provider", + + UpstreamStatusCode: upstreamStatusCode, + UpstreamErrorMessage: upstreamErrorMessage, + UpstreamErrorDetail: upstreamErrorDetail, + UpstreamErrors: events, + + IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus), + RetryCount: 0, + CreatedAt: time.Now(), + } + + if apiKey != nil { + entry.APIKeyID = &apiKey.ID + if apiKey.User != nil { + entry.UserID = &apiKey.User.ID + } + if apiKey.GroupID != nil { + entry.GroupID = apiKey.GroupID + } + // Prefer group platform if present (more stable than inferring from path). 
+ if apiKey.Group != nil && apiKey.Group.Platform != "" { + entry.Platform = apiKey.Group.Platform + } + } + + var clientIP string + if ip := strings.TrimSpace(c.ClientIP()); ip != "" { + clientIP = ip + entry.ClientIP = &clientIP + } + + var requestBody []byte + if v, ok := c.Get(opsRequestBodyKey); ok { + if b, ok := v.([]byte); ok && len(b) > 0 { + requestBody = b + } + } + // Store request headers/body only when an upstream error occurred to keep overhead minimal. + entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) + + enqueueOpsErrorLog(ops, entry, requestBody) + return + } + + body := w.buf.Bytes() + parsed := parseOpsErrorResponse(body) + + apiKey, _ := middleware2.GetAPIKeyFromContext(c) + + clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) + + model, _ := c.Get(opsModelKey) + streamV, _ := c.Get(opsStreamKey) + accountIDV, _ := c.Get(opsAccountIDKey) + + var modelName string + if s, ok := model.(string); ok { + modelName = s + } + stream := false + if b, ok := streamV.(bool); ok { + stream = b + } + var accountID *int64 + if v, ok := accountIDV.(int64); ok && v > 0 { + accountID = &v + } + + fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path) + platform := resolveOpsPlatform(apiKey, fallbackPlatform) + + requestID := c.Writer.Header().Get("X-Request-Id") + if requestID == "" { + requestID = c.Writer.Header().Get("x-request-id") + } + + phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code) + isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message) + + errorOwner := classifyOpsErrorOwner(phase, parsed.Message) + errorSource := classifyOpsErrorSource(phase, parsed.Message) + + entry := &service.OpsInsertErrorLogInput{ + RequestID: requestID, + ClientRequestID: clientRequestID, + + AccountID: accountID, + Platform: platform, + Model: modelName, + RequestPath: func() string { + if c.Request != nil && c.Request.URL != nil { + return c.Request.URL.Path + } + return "" + }(), + Stream: stream, + UserAgent: c.GetHeader("User-Agent"), + + ErrorPhase: phase, + ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code), + Severity: classifyOpsSeverity(parsed.ErrorType, status), + StatusCode: status, + IsBusinessLimited: isBusinessLimited, + + ErrorMessage: parsed.Message, + // Keep the full captured error body (capture is already capped at 64KB) so the + // service layer can sanitize JSON before truncating for storage. + ErrorBody: string(body), + ErrorSource: errorSource, + ErrorOwner: errorOwner, + + IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status), + RetryCount: 0, + CreatedAt: time.Now(), + } + + // Capture upstream error context set by gateway services (if present). + // This does NOT affect the client response; it enriches Ops troubleshooting data. 
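+ // The service-level context keys read below (OpsUpstreamStatusCodeKey, OpsUpstreamErrorMessageKey, OpsUpstreamErrorDetailKey, OpsUpstreamErrorsKey) are populated by the gateway services during forwarding; each lookup here is best-effort and optional.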
+ { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + if t > 0 { + code := t + entry.UpstreamStatusCode = &code + } + case int64: + if t > 0 { + code := int(t) + entry.UpstreamStatusCode = &code + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok { + if msg := strings.TrimSpace(s); msg != "" { + entry.UpstreamErrorMessage = &msg + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok { + if detail := strings.TrimSpace(s); detail != "" { + entry.UpstreamErrorDetail = &detail + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 { + entry.UpstreamErrors = events + // Best-effort backfill the single upstream fields from the last event when missing. + last := events[len(events)-1] + if last != nil { + if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 { + code := last.UpstreamStatusCode + entry.UpstreamStatusCode = &code + } + if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" { + msg := strings.TrimSpace(last.Message) + entry.UpstreamErrorMessage = &msg + } + if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" { + detail := strings.TrimSpace(last.Detail) + entry.UpstreamErrorDetail = &detail + } + } + } + } + } + + if apiKey != nil { + entry.APIKeyID = &apiKey.ID + if apiKey.User != nil { + entry.UserID = &apiKey.User.ID + } + if apiKey.GroupID != nil { + entry.GroupID = apiKey.GroupID + } + // Prefer group platform if present (more stable than inferring from path). + if apiKey.Group != nil && apiKey.Group.Platform != "" { + entry.Platform = apiKey.Group.Platform + } + } + + var clientIP string + if ip := strings.TrimSpace(c.ClientIP()); ip != "" { + clientIP = ip + entry.ClientIP = &clientIP + } + + var requestBody []byte + if v, ok := c.Get(opsRequestBodyKey); ok { + if b, ok := v.([]byte); ok && len(b) > 0 { + requestBody = b + } + } + // Persist only a minimal, whitelisted set of request headers to improve retry fidelity. + // Do NOT store Authorization/Cookie/etc. + entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) + + enqueueOpsErrorLog(ops, entry, requestBody) + } +} + +var opsRetryRequestHeaderAllowlist = []string{ + "anthropic-beta", + "anthropic-version", +} + +func extractOpsRetryRequestHeaders(c *gin.Context) *string { + if c == nil || c.Request == nil { + return nil + } + + headers := make(map[string]string, 4) + for _, key := range opsRetryRequestHeaderAllowlist { + v := strings.TrimSpace(c.GetHeader(key)) + if v == "" { + continue + } + // Keep headers small even if a client sends something unexpected. + headers[key] = truncateString(v, 512) + } + if len(headers) == 0 { + return nil + } + + raw, err := json.Marshal(headers) + if err != nil { + return nil + } + s := string(raw) + return &s +} + +type parsedOpsError struct { + ErrorType string + Message string + Code string +} + +func parseOpsErrorResponse(body []byte) parsedOpsError { + if len(body) == 0 { + return parsedOpsError{} + } + + // Fast path: attempt to decode into a generic map. 
+ var m map[string]any + if err := json.Unmarshal(body, &m); err != nil { + return parsedOpsError{Message: truncateString(string(body), 1024)} + } + + // Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } } + if errObj, ok := m["error"].(map[string]any); ok { + t, _ := errObj["type"].(string) + msg, _ := errObj["message"].(string) + // Gemini googleError also uses "error": { code, message, status } + if msg == "" { + if v, ok := errObj["message"]; ok { + msg, _ = v.(string) + } + } + if t == "" { + // Gemini error does not have "type" field. + t = "api_error" + } + // For gemini error, capture numeric code as string for business-limited mapping if needed. + var code string + if v, ok := errObj["code"]; ok { + switch n := v.(type) { + case float64: + code = strconvItoa(int(n)) + case int: + code = strconvItoa(n) + } + } + return parsedOpsError{ErrorType: t, Message: msg, Code: code} + } + + // APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." } + code, _ := m["code"].(string) + msg, _ := m["message"].(string) + if code != "" || msg != "" { + return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code} + } + + return parsedOpsError{Message: truncateString(string(body), 1024)} +} + +func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string { + if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" { + return apiKey.Group.Platform + } + return fallback +} + +func guessPlatformFromPath(path string) string { + p := strings.ToLower(path) + switch { + case strings.HasPrefix(p, "/antigravity/"): + return service.PlatformAntigravity + case strings.HasPrefix(p, "/v1beta/"): + return service.PlatformGemini + case strings.Contains(p, "/responses"): + return service.PlatformOpenAI + default: + return "" + } +} + +func normalizeOpsErrorType(errType string, code string) string { + if errType != "" { + return errType + } + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE": + return "billing_error" + case "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "subscription_error" + default: + return "api_error" + } +} + +func classifyOpsPhase(errType, message, code string) string { + msg := strings.ToLower(message) + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "billing" + } + + switch errType { + case "authentication_error": + return "auth" + case "billing_error", "subscription_error": + return "billing" + case "rate_limit_error": + if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { + return "concurrency" + } + return "upstream" + case "invalid_request_error": + return "response" + case "upstream_error", "overloaded_error": + return "upstream" + case "api_error": + if strings.Contains(msg, "no available accounts") { + return "scheduling" + } + return "internal" + default: + return "internal" + } +} + +func classifyOpsSeverity(errType string, status int) string { + switch errType { + case "invalid_request_error", "authentication_error", "billing_error", "subscription_error": + return "P3" + } + if status >= 500 { + return "P1" + } + if status == 429 { + return "P1" + } + if status >= 400 { + return "P2" + } + return "P3" +} + +func classifyOpsIsRetryable(errType string, statusCode int) bool { + switch errType { + case "authentication_error", "invalid_request_error": + return false + case "timeout_error": + return true + case 
"rate_limit_error": + // May be transient (upstream or queue); retry can help. + return true + case "billing_error", "subscription_error": + return false + case "upstream_error", "overloaded_error": + return statusCode >= 500 || statusCode == 429 || statusCode == 529 + default: + return statusCode >= 500 + } +} + +func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool { + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return true + } + if phase == "billing" || phase == "concurrency" { + // SLA/错误率排除“用户级业务限制” + return true + } + // Avoid treating upstream rate limits as business-limited. + if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") { + return false + } + _ = status + return false +} + +func classifyOpsErrorOwner(phase string, message string) string { + switch phase { + case "upstream", "network": + return "provider" + case "billing", "concurrency", "auth", "response": + return "client" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "provider" + } + return "sub2api" + } +} + +func classifyOpsErrorSource(phase string, message string) string { + switch phase { + case "upstream": + return "upstream_http" + case "network": + return "upstream_network" + case "billing": + return "billing" + case "concurrency": + return "concurrency" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "upstream_http" + } + return "internal" + } +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + // Ensure truncation does not split multi-byte characters. 
+ for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func strconvItoa(v int) string { + return strconv.Itoa(v) +} diff --git a/backend/internal/handler/wire.go b/backend/internal/handler/wire.go index a5e62d0a..2af7905e 100644 --- a/backend/internal/handler/wire.go +++ b/backend/internal/handler/wire.go @@ -21,6 +21,7 @@ func ProvideAdminHandlers( redeemHandler *admin.RedeemHandler, promoHandler *admin.PromoHandler, settingHandler *admin.SettingHandler, + opsHandler *admin.OpsHandler, systemHandler *admin.SystemHandler, subscriptionHandler *admin.SubscriptionHandler, usageHandler *admin.UsageHandler, @@ -39,6 +40,7 @@ func ProvideAdminHandlers( Redeem: redeemHandler, Promo: promoHandler, Setting: settingHandler, + Ops: opsHandler, System: systemHandler, Subscription: subscriptionHandler, Usage: usageHandler, @@ -109,6 +111,7 @@ var ProviderSet = wire.NewSet( admin.NewRedeemHandler, admin.NewPromoHandler, admin.NewSettingHandler, + admin.NewOpsHandler, ProvideSystemHandler, admin.NewSubscriptionHandler, admin.NewUsageHandler, diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go index bd10eae0..27bb5ac5 100644 --- a/backend/internal/pkg/ctxkey/ctxkey.go +++ b/backend/internal/pkg/ctxkey/ctxkey.go @@ -7,7 +7,14 @@ type Key string const ( // ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置 ForcePlatform Key = "ctx_force_platform" - // IsClaudeCodeClient 是否为 Claude Code 客户端,由中间件设置 + + // ClientRequestID uniquely identifies a client request and is used to trace its full lifecycle (for Ops monitoring and troubleshooting). + ClientRequestID Key = "ctx_client_request_id" + + // RetryCount is the number of gateway-level retries for the current request (for Ops records and troubleshooting). + RetryCount Key = "ctx_retry_count" + + // IsClaudeCodeClient indicates whether the current request comes from a Claude Code client IsClaudeCodeClient Key = "ctx_is_claude_code_client" // Group 认证后的分组信息,由 API Key 认证中间件设置 Group Key = "ctx_group" diff --git a/backend/internal/repository/concurrency_cache.go b/backend/internal/repository/concurrency_cache.go index 0831f5eb..b34961e1 100644 --- a/backend/internal/repository/concurrency_cache.go +++ b/backend/internal/repository/concurrency_cache.go @@ -93,7 +93,7 @@ var ( return redis.call('ZCARD', key) `) - // incrementWaitScript - only sets TTL on first creation to avoid refreshing + // incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate // KEYS[1] = wait queue key // ARGV[1] = maxWait // ARGV[2] = TTL in seconds @@ -111,15 +111,13 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. + redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) - // incrementAccountWaitScript - account-level wait queue count + // incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment) incrementAccountWaitScript = redis.NewScript(` local current = redis.call('GET', KEYS[1]) if current == false then @@ -134,10 +132,8 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. 
+ redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go new file mode 100644 index 00000000..8e157dbf --- /dev/null +++ b/backend/internal/repository/ops_repo.go @@ -0,0 +1,707 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/lib/pq" +) + +type opsRepository struct { + db *sql.DB +} + +func NewOpsRepository(db *sql.DB) service.OpsRepository { + return &opsRepository{db: db} +} + +func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + + q := ` +INSERT INTO ops_error_logs ( + request_id, + client_request_id, + user_id, + api_key_id, + account_id, + group_id, + client_ip, + platform, + model, + request_path, + stream, + user_agent, + error_phase, + error_type, + severity, + status_code, + is_business_limited, + error_message, + error_body, + error_source, + error_owner, + upstream_status_code, + upstream_error_message, + upstream_error_detail, + upstream_errors, + duration_ms, + time_to_first_token_ms, + request_body, + request_body_truncated, + request_body_bytes, + request_headers, + is_retryable, + retry_count, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullString(input.RequestID), + opsNullString(input.ClientRequestID), + opsNullInt64(input.UserID), + opsNullInt64(input.APIKeyID), + opsNullInt64(input.AccountID), + opsNullInt64(input.GroupID), + opsNullString(input.ClientIP), + opsNullString(input.Platform), + opsNullString(input.Model), + opsNullString(input.RequestPath), + input.Stream, + opsNullString(input.UserAgent), + input.ErrorPhase, + input.ErrorType, + opsNullString(input.Severity), + opsNullInt(input.StatusCode), + input.IsBusinessLimited, + opsNullString(input.ErrorMessage), + opsNullString(input.ErrorBody), + opsNullString(input.ErrorSource), + opsNullString(input.ErrorOwner), + opsNullInt(input.UpstreamStatusCode), + opsNullString(input.UpstreamErrorMessage), + opsNullString(input.UpstreamErrorDetail), + opsNullString(input.UpstreamErrorsJSON), + opsNullInt(input.DurationMs), + opsNullInt64(input.TimeToFirstTokenMs), + opsNullString(input.RequestBodyJSON), + input.RequestBodyTruncated, + opsNullInt(input.RequestBodyBytes), + opsNullString(input.RequestHeadersJSON), + input.IsRetryable, + input.RetryCount, + input.CreatedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsErrorLogFilter{} + } + + page := filter.Page + if page <= 0 { + page = 1 + } + pageSize := filter.PageSize + if pageSize <= 0 { + pageSize = 20 + } + if pageSize > 500 { + pageSize = 500 + } + + where, args := buildOpsErrorLogsWhere(filter) + countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where + + var total int + if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil { + return nil, err + } + + offset := (page - 
1) * pageSize + argsWithLimit := append(args, pageSize, offset) + selectSQL := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(upstream_status_code, status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream +FROM ops_error_logs +` + where + ` +ORDER BY created_at DESC +LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) + + rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsErrorLog, 0, pageSize) + for rows.Next() { + var item service.OpsErrorLog + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID sql.NullInt64 + var groupID sql.NullInt64 + if err := rows.Scan( + &item.ID, + &item.CreatedAt, + &item.Phase, + &item.Type, + &item.Severity, + &statusCode, + &item.Platform, + &item.Model, + &latency, + &item.ClientRequestID, + &item.RequestID, + &item.Message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &item.RequestPath, + &item.Stream, + ); err != nil { + return nil, err + } + if latency.Valid { + v := int(latency.Int64) + item.LatencyMs = &v + } + item.StatusCode = int(statusCode.Int64) + if clientIP.Valid { + s := clientIP.String + item.ClientIP = &s + } + if userID.Valid { + v := userID.Int64 + item.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + item.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + item.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + item.GroupID = &v + } + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorLogList{ + Errors: out, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} + +func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if id <= 0 { + return nil, fmt.Errorf("invalid id") + } + + q := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(upstream_status_code, status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + COALESCE(error_body, ''), + upstream_status_code, + COALESCE(upstream_error_message, ''), + COALESCE(upstream_error_detail, ''), + COALESCE(upstream_errors::text, ''), + is_business_limited, + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream, + COALESCE(user_agent, ''), + auth_latency_ms, + routing_latency_ms, + upstream_latency_ms, + response_latency_ms, + time_to_first_token_ms, + COALESCE(request_body::text, ''), + request_body_truncated, + request_body_bytes, + COALESCE(request_headers::text, '') +FROM ops_error_logs +WHERE id = $1 +LIMIT 1` + + var out service.OpsErrorLogDetail + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var upstreamStatusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID 
sql.NullInt64 + var groupID sql.NullInt64 + var authLatency sql.NullInt64 + var routingLatency sql.NullInt64 + var upstreamLatency sql.NullInt64 + var responseLatency sql.NullInt64 + var ttft sql.NullInt64 + var requestBodyBytes sql.NullInt64 + + err := r.db.QueryRowContext(ctx, q, id).Scan( + &out.ID, + &out.CreatedAt, + &out.Phase, + &out.Type, + &out.Severity, + &statusCode, + &out.Platform, + &out.Model, + &latency, + &out.ClientRequestID, + &out.RequestID, + &out.Message, + &out.ErrorBody, + &upstreamStatusCode, + &out.UpstreamErrorMessage, + &out.UpstreamErrorDetail, + &out.UpstreamErrors, + &out.IsBusinessLimited, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &out.RequestPath, + &out.Stream, + &out.UserAgent, + &authLatency, + &routingLatency, + &upstreamLatency, + &responseLatency, + &ttft, + &out.RequestBody, + &out.RequestBodyTruncated, + &requestBodyBytes, + &out.RequestHeaders, + ) + if err != nil { + return nil, err + } + + out.StatusCode = int(statusCode.Int64) + if latency.Valid { + v := int(latency.Int64) + out.LatencyMs = &v + } + if clientIP.Valid { + s := clientIP.String + out.ClientIP = &s + } + if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 { + v := int(upstreamStatusCode.Int64) + out.UpstreamStatusCode = &v + } + if userID.Valid { + v := userID.Int64 + out.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + out.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + out.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + out.GroupID = &v + } + if authLatency.Valid { + v := authLatency.Int64 + out.AuthLatencyMs = &v + } + if routingLatency.Valid { + v := routingLatency.Int64 + out.RoutingLatencyMs = &v + } + if upstreamLatency.Valid { + v := upstreamLatency.Int64 + out.UpstreamLatencyMs = &v + } + if responseLatency.Valid { + v := responseLatency.Int64 + out.ResponseLatencyMs = &v + } + if ttft.Valid { + v := ttft.Int64 + out.TimeToFirstTokenMs = &v + } + if requestBodyBytes.Valid { + v := int(requestBodyBytes.Int64) + out.RequestBodyBytes = &v + } + + // Normalize request_body to empty string when stored as JSON null. + out.RequestBody = strings.TrimSpace(out.RequestBody) + if out.RequestBody == "null" { + out.RequestBody = "" + } + // Normalize request_headers to empty string when stored as JSON null. + out.RequestHeaders = strings.TrimSpace(out.RequestHeaders) + if out.RequestHeaders == "null" { + out.RequestHeaders = "" + } + // Normalize upstream_errors to empty string when stored as JSON null. 
+ out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors) + if out.UpstreamErrors == "null" { + out.UpstreamErrors = "" + } + + return &out, nil +} + +func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + if input.SourceErrorID <= 0 { + return 0, fmt.Errorf("invalid source_error_id") + } + if strings.TrimSpace(input.Mode) == "" { + return 0, fmt.Errorf("invalid mode") + } + + q := ` +INSERT INTO ops_retry_attempts ( + requested_by_user_id, + source_error_id, + mode, + pinned_account_id, + status, + started_at +) VALUES ( + $1,$2,$3,$4,$5,$6 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&input.RequestedByUserID), + input.SourceErrorID, + strings.TrimSpace(input.Mode), + opsNullInt64(input.PinnedAccountID), + strings.TrimSpace(input.Status), + input.StartedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.ID <= 0 { + return fmt.Errorf("invalid id") + } + + q := ` +UPDATE ops_retry_attempts +SET + status = $2, + finished_at = $3, + duration_ms = $4, + result_request_id = $5, + result_error_id = $6, + error_message = $7 +WHERE id = $1` + + _, err := r.db.ExecContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Status), + nullTime(input.FinishedAt), + input.DurationMs, + opsNullString(input.ResultRequestID), + opsNullInt64(input.ResultErrorID), + opsNullString(input.ErrorMessage), + ) + return err +} + +func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if sourceErrorID <= 0 { + return nil, fmt.Errorf("invalid source_error_id") + } + + q := ` +SELECT + id, + created_at, + COALESCE(requested_by_user_id, 0), + source_error_id, + COALESCE(mode, ''), + pinned_account_id, + COALESCE(status, ''), + started_at, + finished_at, + duration_ms, + result_request_id, + result_error_id, + error_message +FROM ops_retry_attempts +WHERE source_error_id = $1 +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsRetryAttempt + var pinnedAccountID sql.NullInt64 + var requestedBy sql.NullInt64 + var startedAt sql.NullTime + var finishedAt sql.NullTime + var durationMs sql.NullInt64 + var resultRequestID sql.NullString + var resultErrorID sql.NullInt64 + var errorMessage sql.NullString + + err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan( + &out.ID, + &out.CreatedAt, + &requestedBy, + &out.SourceErrorID, + &out.Mode, + &pinnedAccountID, + &out.Status, + &startedAt, + &finishedAt, + &durationMs, + &resultRequestID, + &resultErrorID, + &errorMessage, + ) + if err != nil { + return nil, err + } + out.RequestedByUserID = requestedBy.Int64 + if pinnedAccountID.Valid { + v := pinnedAccountID.Int64 + out.PinnedAccountID = &v + } + if startedAt.Valid { + t := startedAt.Time + out.StartedAt = &t + } + if finishedAt.Valid { + t := finishedAt.Time + out.FinishedAt = &t + } + if durationMs.Valid { + v := durationMs.Int64 + out.DurationMs = &v + } + if resultRequestID.Valid { + s := resultRequestID.String + 
out.ResultRequestID = &s + } + if resultErrorID.Valid { + v := resultErrorID.Int64 + out.ResultErrorID = &v + } + if errorMessage.Valid { + s := errorMessage.String + out.ErrorMessage = &s + } + + return &out, nil +} + +func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} + +func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { + clauses := make([]string, 0, 8) + args := make([]any, 0, 8) + clauses = append(clauses, "1=1") + + phaseFilter := "" + if filter != nil { + phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase)) + } + // ops_error_logs primarily stores client-visible error requests (status>=400), + // but we also persist "recovered" upstream errors (status<400) for upstream health visibility. + // By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase. + if phaseFilter != "upstream" { + clauses = append(clauses, "COALESCE(status_code, 0) >= 400") + } + + if filter.StartTime != nil && !filter.StartTime.IsZero() { + args = append(args, filter.StartTime.UTC()) + clauses = append(clauses, "created_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, filter.EndTime.UTC()) + // Keep time-window semantics consistent with other ops queries: [start, end) + clauses = append(clauses, "created_at < $"+itoa(len(args))) + } + if p := strings.TrimSpace(filter.Platform); p != "" { + args = append(args, p) + clauses = append(clauses, "platform = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, *filter.GroupID) + clauses = append(clauses, "group_id = $"+itoa(len(args))) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + args = append(args, *filter.AccountID) + clauses = append(clauses, "account_id = $"+itoa(len(args))) + } + if phase := phaseFilter; phase != "" { + args = append(args, phase) + clauses = append(clauses, "error_phase = $"+itoa(len(args))) + } + if len(filter.StatusCodes) > 0 { + args = append(args, pq.Array(filter.StatusCodes)) + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + q + "%" + args = append(args, like) + n := itoa(len(args)) + clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")") + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +// Helpers for nullable args +func opsNullString(v any) any { + switch s := v.(type) { + case nil: + return sql.NullString{} + case *string: + if s == nil || strings.TrimSpace(*s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(*s), Valid: true} + case string: + if strings.TrimSpace(s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(s), Valid: true} + default: + return sql.NullString{} + } +} + +func opsNullInt64(v *int64) any { + if v == nil || *v == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *v, Valid: true} +} + +func opsNullInt(v any) any { + switch n := v.(type) { + case nil: + return sql.NullInt64{} + case *int: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(*n), Valid: true} + case *int64: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *n, Valid: true} + case int: + if n == 0 { + 
return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(n), Valid: true} + default: + return sql.NullInt64{} + } +} diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go new file mode 100644 index 00000000..f601c363 --- /dev/null +++ b/backend/internal/repository/ops_repo_alerts.go @@ -0,0 +1,689 @@ +package repository + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at +FROM ops_alert_rules +ORDER BY id DESC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := []*service.OpsAlertRule{} + for rows.Next() { + var rule service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + if err := rows.Scan( + &rule.ID, + &rule.Name, + &rule.Description, + &rule.Enabled, + &rule.Severity, + &rule.MetricType, + &rule.Operator, + &rule.Threshold, + &rule.WindowMinutes, + &rule.SustainedMinutes, + &rule.CooldownMinutes, + &rule.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &rule.CreatedAt, + &rule.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + rule.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + rule.Filters = decoded + } + } + out = append(out, &rule) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_rules ( + name, + description, + enabled, + severity, + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + notify_email, + filters, + created_at, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW() +) +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + 
&out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + if input.ID <= 0 { + return nil, fmt.Errorf("invalid id") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +UPDATE ops_alert_rules +SET + name = $2, + description = $3, + enabled = $4, + severity = $5, + metric_type = $6, + operator = $7, + threshold = $8, + window_minutes = $9, + sustained_minutes = $10, + cooldown_minutes = $11, + notify_email = $12, + filters = $13, + updated_at = NOW() +WHERE id = $1 +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + &out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if id <= 0 { + return fmt.Errorf("invalid id") + } + + res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + +func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsAlertEventFilter{} + } + + limit := filter.Limit + if limit <= 0 { + limit = 100 + } + if limit > 500 { + 
limit = 500 + } + + where, args := buildOpsAlertEventsWhere(filter) + args = append(args, limit) + limitArg := "$" + itoa(len(args)) + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +` + where + ` +ORDER BY fired_at DESC +LIMIT ` + limitArg + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := []*service.OpsAlertEvent{} + for rows.Next() { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + if err := rows.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + out = append(out, &ev) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 AND status = $2 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if event == nil { + return nil, fmt.Errorf("nil event") + } + + dimensionsArg, err := 
opsNullJSONMap(event.Dimensions) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_events ( + rule_id, + severity, + status, + title, + description, + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW() +) +RETURNING + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at` + + row := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&event.RuleID), + opsNullString(event.Severity), + opsNullString(event.Status), + opsNullString(event.Title), + opsNullString(event.Description), + opsNullFloat64(event.MetricValue), + opsNullFloat64(event.ThresholdValue), + dimensionsArg, + event.FiredAt, + opsNullTime(event.ResolvedAt), + event.EmailSent, + ) + return scanOpsAlertEvent(row) +} + +func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + if strings.TrimSpace(status) == "" { + return fmt.Errorf("invalid status") + } + + q := ` +UPDATE ops_alert_events +SET status = $2, + resolved_at = $3 +WHERE id = $1` + + _, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt)) + return err +} + +func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + + _, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent) + return err +} + +type opsAlertEventRow interface { + Scan(dest ...any) error +} + +func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + + if err := row.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + return &ev, nil +} + +func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) { + clauses := []string{"1=1"} + args := []any{} + + if filter == nil { + return "WHERE " + strings.Join(clauses, " AND "), args + } + + if status := strings.TrimSpace(filter.Status); status != "" { + args = append(args, status) + clauses = append(clauses, "status = $"+itoa(len(args))) + } + if severity := strings.TrimSpace(filter.Severity); severity != "" { + args = append(args, severity) + clauses = append(clauses, "severity = $"+itoa(len(args))) + } + if filter.StartTime != nil && 
!filter.StartTime.IsZero() { + args = append(args, *filter.StartTime) + clauses = append(clauses, "fired_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, *filter.EndTime) + clauses = append(clauses, "fired_at < $"+itoa(len(args))) + } + + // Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes. + if platform := strings.TrimSpace(filter.Platform); platform != "" { + args = append(args, platform) + clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, fmt.Sprintf("%d", *filter.GroupID)) + clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args))) + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +func opsNullJSONMap(v map[string]any) (any, error) { + if v == nil { + return sql.NullString{}, nil + } + b, err := json.Marshal(v) + if err != nil { + return nil, err + } + if len(b) == 0 { + return sql.NullString{}, nil + } + return sql.NullString{String: string(b), Valid: true}, nil +} diff --git a/backend/internal/repository/ops_repo_dashboard.go b/backend/internal/repository/ops_repo_dashboard.go new file mode 100644 index 00000000..194020bb --- /dev/null +++ b/backend/internal/repository/ops_repo_dashboard.go @@ -0,0 +1,1013 @@ +package repository + +import ( + "context" + "database/sql" + "errors" + "fmt" + "math" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetDashboardOverview(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + mode := filter.QueryMode + if !mode.IsValid() { + mode = service.OpsQueryModeRaw + } + + switch mode { + case service.OpsQueryModePreagg: + return r.getDashboardOverviewPreaggregated(ctx, filter) + case service.OpsQueryModeAuto: + out, err := r.getDashboardOverviewPreaggregated(ctx, filter) + if err != nil && errors.Is(err, service.ErrOpsPreaggregatedNotPopulated) { + return r.getDashboardOverviewRaw(ctx, filter) + } + return out, err + default: + return r.getDashboardOverviewRaw(ctx, filter) + } +} + +func (r *opsRepository) getDashboardOverviewRaw(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + qpsCurrent, 
tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsDashboardPartial struct { + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + duration service.OpsPercentiles + ttft service.OpsPercentiles +} + +func (r *opsRepository) getDashboardOverviewPreaggregated(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + // Stable full-hour range covered by pre-aggregation. + aggSafeEnd := preaggSafeEnd(end) + aggFullStart := utcCeilToHour(start) + aggFullEnd := utcFloorToHour(aggSafeEnd) + + // If there are no stable full-hour buckets, use raw directly (short windows). + if !aggFullStart.Before(aggFullEnd) { + return r.getDashboardOverviewRaw(ctx, filter) + } + + // 1) Pre-aggregated stable segment. + preaggRows, err := r.listHourlyMetricsRows(ctx, filter, aggFullStart, aggFullEnd) + if err != nil { + return nil, err + } + if len(preaggRows) == 0 { + // Distinguish "no data" vs "preagg not populated yet". + if exists, err := r.rawOpsDataExists(ctx, filter, aggFullStart, aggFullEnd); err == nil && exists { + return nil, service.ErrOpsPreaggregatedNotPopulated + } + } + preagg := aggregateHourlyRows(preaggRows) + + // 2) Raw head/tail fragments (at most ~1 hour each). + head := opsDashboardPartial{} + tail := opsDashboardPartial{} + + if start.Before(aggFullStart) { + part, err := r.queryRawPartial(ctx, filter, start, minTime(end, aggFullStart)) + if err != nil { + return nil, err + } + head = *part + } + if aggFullEnd.Before(end) { + part, err := r.queryRawPartial(ctx, filter, maxTime(start, aggFullEnd), end) + if err != nil { + return nil, err + } + tail = *part + } + + // Merge counts. 
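+ // The three segments cover disjoint time ranges, so the additive counters
+ // below (request/error counts, tokens) merge exactly; only the latency
+ // percentiles are approximated further down. Illustrative example with an
+ // assumed window of 09:20-13:40 UTC: head = raw [09:20, 10:00), body =
+ // pre-aggregated [10:00, 13:00) (3 hourly buckets), tail = raw [13:00, 13:40).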
+ successCount := preagg.successCount + head.successCount + tail.successCount + errorTotal := preagg.errorCountTotal + head.errorCountTotal + tail.errorCountTotal + businessLimited := preagg.businessLimitedCount + head.businessLimitedCount + tail.businessLimitedCount + errorCountSLA := preagg.errorCountSLA + head.errorCountSLA + tail.errorCountSLA + + upstreamExcl := preagg.upstreamErrorCountExcl429529 + head.upstreamErrorCountExcl429529 + tail.upstreamErrorCountExcl429529 + upstream429 := preagg.upstream429Count + head.upstream429Count + tail.upstream429Count + upstream529 := preagg.upstream529Count + head.upstream529Count + tail.upstream529Count + + tokenConsumed := preagg.tokenConsumed + head.tokenConsumed + tail.tokenConsumed + + // Approximate percentiles across segments: + // - p50/p90/avg: weighted average by success_count + // - p95/p99/max: max (conservative tail) + duration := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.duration}, + {weight: head.successCount, p: head.duration}, + {weight: tail.successCount, p: tail.duration}, + }) + ttft := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.ttft}, + {weight: head.successCount, p: head.ttft}, + {weight: tail.successCount, p: tail.ttft}, + }) + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + // Keep "current" rates as raw, to preserve realtime semantics. + qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + // NOTE: peak still uses raw logs (minute granularity). This is typically cheaper than percentile_cont + // and keeps semantics consistent across modes. 
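+ // Rate semantics, with assumed numbers for illustration: "current" divides the
+ // last 60s of raw logs by 60 (e.g. 540 requests -> QPS 9.0), "peak" divides the
+ // busiest raw minute by 60 (e.g. 1230 requests -> QPS 20.5), and "avg" divides
+ // the window totals by the window length in seconds. All three are rounded to
+ // one decimal place by roundTo1DP.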
+ qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsHourlyMetricsRow struct { + bucketStart time.Time + + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + durationP50 sql.NullInt64 + durationP90 sql.NullInt64 + durationP95 sql.NullInt64 + durationP99 sql.NullInt64 + durationAvg sql.NullFloat64 + durationMax sql.NullInt64 + + ttftP50 sql.NullInt64 + ttftP90 sql.NullInt64 + ttftP95 sql.NullInt64 + ttftP99 sql.NullInt64 + ttftAvg sql.NullFloat64 + ttftMax sql.NullInt64 +} + +func (r *opsRepository) listHourlyMetricsRows(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) ([]opsHourlyMetricsRow, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if start.IsZero() || end.IsZero() || !start.Before(end) { + return []opsHourlyMetricsRow{}, nil + } + + where := "bucket_start >= $1 AND bucket_start < $2" + args := []any{start.UTC(), end.UTC()} + idx := 3 + + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + switch { + case groupID != nil && *groupID > 0: + where += fmt.Sprintf(" AND group_id = $%d", idx) + args = append(args, *groupID) + idx++ + if platform != "" { + where += fmt.Sprintf(" AND platform = $%d", idx) + args = append(args, platform) + // idx++ removed - not used after this + } + case platform != "": + where += fmt.Sprintf(" AND platform = $%d AND group_id IS NULL", idx) + args = append(args, platform) + // idx++ removed - not used after this + default: + where += " AND platform IS NULL AND group_id IS NULL" + } + + q := ` +SELECT + bucket_start, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms +FROM ops_metrics_hourly +WHERE ` + where + ` +ORDER BY bucket_start ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]opsHourlyMetricsRow, 0, 64) + for rows.Next() { + var row opsHourlyMetricsRow + if err := rows.Scan( + &row.bucketStart, + &row.successCount, + &row.errorCountTotal, + &row.businessLimitedCount, + &row.errorCountSLA, + &row.upstreamErrorCountExcl429529, + &row.upstream429Count, + &row.upstream529Count, + &row.tokenConsumed, + &row.durationP50, + &row.durationP90, + &row.durationP95, + &row.durationP99, + &row.durationAvg, + &row.durationMax, + &row.ttftP50, + &row.ttftP90, + &row.ttftP95, + &row.ttftP99, + &row.ttftAvg, + &row.ttftMax, + ); err != nil { + return nil, err + } + out = append(out, row) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func aggregateHourlyRows(rows []opsHourlyMetricsRow) opsDashboardPartial { + out := opsDashboardPartial{} + if len(rows) == 0 { + return out + } + + var ( + p50Sum float64 + p50W int64 + p90Sum float64 + p90W int64 + avgSum float64 + avgW int64 + ) + var ( + ttftP50Sum float64 + ttftP50W int64 + ttftP90Sum float64 + ttftP90W int64 + ttftAvgSum float64 + ttftAvgW int64 + ) + + var ( + p95Max *int + p99Max *int + maxMax *int + + ttftP95Max *int + ttftP99Max *int + ttftMaxMax *int + ) + + for _, row := range rows { + out.successCount += row.successCount + out.errorCountTotal += row.errorCountTotal + out.businessLimitedCount += row.businessLimitedCount + out.errorCountSLA += row.errorCountSLA + + out.upstreamErrorCountExcl429529 += row.upstreamErrorCountExcl429529 + out.upstream429Count += row.upstream429Count + out.upstream529Count += row.upstream529Count + + out.tokenConsumed += row.tokenConsumed + + if row.successCount > 0 { + if row.durationP50.Valid { + p50Sum += float64(row.durationP50.Int64) * float64(row.successCount) + p50W += row.successCount + } + if row.durationP90.Valid { + p90Sum += float64(row.durationP90.Int64) * float64(row.successCount) + p90W += row.successCount + } + if row.durationAvg.Valid { + avgSum += row.durationAvg.Float64 * float64(row.successCount) + avgW += row.successCount + } + if row.ttftP50.Valid { + ttftP50Sum += float64(row.ttftP50.Int64) * float64(row.successCount) + ttftP50W += row.successCount + } + if row.ttftP90.Valid { + ttftP90Sum += float64(row.ttftP90.Int64) * float64(row.successCount) + ttftP90W += row.successCount + } + if row.ttftAvg.Valid { + ttftAvgSum += row.ttftAvg.Float64 * float64(row.successCount) + ttftAvgW += row.successCount + } + } + + if row.durationP95.Valid { + v := int(row.durationP95.Int64) + if p95Max == nil || v > *p95Max { + p95Max = &v + } + } + if row.durationP99.Valid { + v := int(row.durationP99.Int64) + if p99Max == nil || v > *p99Max { + p99Max = &v + } + } + if row.durationMax.Valid { + v := int(row.durationMax.Int64) + if maxMax == nil || v > *maxMax { + maxMax = &v + } + } + + if row.ttftP95.Valid { + v := int(row.ttftP95.Int64) + if ttftP95Max == nil || v > *ttftP95Max { + ttftP95Max = &v + } + } + if row.ttftP99.Valid { + v := int(row.ttftP99.Int64) + if ttftP99Max == nil || v > *ttftP99Max { + ttftP99Max = &v + } + } + if row.ttftMax.Valid { + v := int(row.ttftMax.Int64) + if ttftMaxMax == nil || v > *ttftMaxMax { + ttftMaxMax = &v + } + } + } + + // duration + if p50W > 0 { + v := int(math.Round(p50Sum / float64(p50W))) + out.duration.P50 = &v + } + if p90W > 0 { + v := int(math.Round(p90Sum / float64(p90W))) + out.duration.P90 = &v + } + out.duration.P95 = p95Max + out.duration.P99 = p99Max + if avgW > 0 { + v := int(math.Round(avgSum / 
float64(avgW))) + out.duration.Avg = &v + } + out.duration.Max = maxMax + + // ttft + if ttftP50W > 0 { + v := int(math.Round(ttftP50Sum / float64(ttftP50W))) + out.ttft.P50 = &v + } + if ttftP90W > 0 { + v := int(math.Round(ttftP90Sum / float64(ttftP90W))) + out.ttft.P90 = &v + } + out.ttft.P95 = ttftP95Max + out.ttft.P99 = ttftP99Max + if ttftAvgW > 0 { + v := int(math.Round(ttftAvgSum / float64(ttftAvgW))) + out.ttft.Avg = &v + } + out.ttft.Max = ttftMaxMax + + return out +} + +func (r *opsRepository) queryRawPartial(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (*opsDashboardPartial, error) { + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &opsDashboardPartial{ + successCount: successCount, + errorCountTotal: errorTotal, + businessLimitedCount: businessLimited, + errorCountSLA: errorCountSLA, + upstreamErrorCountExcl429529: upstreamExcl, + upstream429Count: upstream429, + upstream529Count: upstream529, + tokenConsumed: tokenConsumed, + duration: duration, + ttft: ttft, + }, nil +} + +func (r *opsRepository) rawOpsDataExists(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (bool, error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM usage_logs ul ` + join + ` ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + if exists { + return true, nil + } + } + + { + where, args, _ := buildErrorWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM ops_error_logs ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + return exists, nil + } +} + +type opsPercentileSegment struct { + weight int64 + p service.OpsPercentiles +} + +func combineApproxPercentiles(segments []opsPercentileSegment) service.OpsPercentiles { + weightedInt := func(get func(service.OpsPercentiles) *int) *int { + var sum float64 + var w int64 + for _, seg := range segments { + if seg.weight <= 0 { + continue + } + v := get(seg.p) + if v == nil { + continue + } + sum += float64(*v) * float64(seg.weight) + w += seg.weight + } + if w <= 0 { + return nil + } + out := int(math.Round(sum / float64(w))) + return &out + } + + maxInt := func(get func(service.OpsPercentiles) *int) *int { + var max *int + for _, seg := range segments { + v := get(seg.p) + if v == nil { + continue + } + if max == nil || *v > *max { + c := *v + max = &c + } + } + return max + } + + return service.OpsPercentiles{ + P50: weightedInt(func(p service.OpsPercentiles) *int { return p.P50 }), + P90: weightedInt(func(p service.OpsPercentiles) *int { return p.P90 }), + P95: maxInt(func(p service.OpsPercentiles) *int { return p.P95 }), + P99: maxInt(func(p service.OpsPercentiles) *int { return p.P99 }), + Avg: weightedInt(func(p service.OpsPercentiles) *int { return p.Avg }), + Max: maxInt(func(p service.OpsPercentiles) *int { return p.Max }), + } +} + +func preaggSafeEnd(endTime time.Time) time.Time { + now := time.Now().UTC() + cutoff := now.Add(-5 * time.Minute) + if 
endTime.After(cutoff) { + return cutoff + } + return endTime +} + +func utcCeilToHour(t time.Time) time.Time { + u := t.UTC() + f := u.Truncate(time.Hour) + if f.Equal(u) { + return f + } + return f.Add(time.Hour) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} + +func maxTime(a, b time.Time) time.Time { + if a.After(b) { + return a + } + return b +} + +func (r *opsRepository) queryUsageCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs ul +` + join + ` +` + where + + var tokens sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (r *opsRepository) queryUsageLatency(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (duration service.OpsPercentiles, ttft service.OpsPercentiles, err error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + duration.P50 = floatToIntPtr(p50) + duration.P90 = floatToIntPtr(p90) + duration.P95 = floatToIntPtr(p95) + duration.P99 = floatToIntPtr(p99) + duration.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + duration.Max = &v + } + } + + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + ttft.P50 = floatToIntPtr(p50) + ttft.P90 = floatToIntPtr(p90) + ttft.P95 = floatToIntPtr(p95) + ttft.P99 = floatToIntPtr(p99) + ttft.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + ttft.Max = &v + } + } + + return duration, ttft, nil +} + +func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, 
end time.Time) ( + errorTotal int64, + businessLimited int64, + errorCountSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +` + where + + if err := r.db.QueryRowContext(ctx, q, args...).Scan( + &errorTotal, + &businessLimited, + &errorCountSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorCountSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +func (r *opsRepository) queryCurrentRates(ctx context.Context, filter *service.OpsDashboardFilter, end time.Time) (qpsCurrent float64, tpsCurrent float64, err error) { + windowStart := end.Add(-1 * time.Minute) + + successCount1m, token1m, err := r.queryUsageCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + errorCount1m, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + + qpsCurrent = roundTo1DP(float64(successCount1m+errorCount1m) / 60.0) + tpsCurrent = roundTo1DP(float64(token1m) / 60.0) + return qpsCurrent, tpsCurrent, nil +} + +func (r *opsRepository) queryPeakQPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + q := ` +WITH usage_buckets AS ( + SELECT date_trunc('minute', ul.created_at) AS bucket, COUNT(*) AS cnt + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt + FROM ops_error_logs + ` + errorWhere + ` + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.cnt, 0) + COALESCE(e.cnt, 0) AS total + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT COALESCE(MAX(total), 0) FROM combined` + + args := append(usageArgs, errorArgs...) 
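+ // Placeholder bookkeeping: buildUsageWhere numbers its parameters from $1 and
+ // returns the next free index, and buildErrorWhere continues from there, so
+ // appending errorArgs after usageArgs keeps the positional parameters of the
+ // combined CTE aligned. For example, with only a platform filter the usage
+ // CTE uses $1..$3 and the error CTE uses $4..$6.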
+ + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func (r *opsRepository) queryPeakTPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT COALESCE(MAX(tokens_per_min), 0) +FROM ( + SELECT + date_trunc('minute', ul.created_at) AS bucket, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS tokens_per_min + FROM usage_logs ul + ` + join + ` + ` + where + ` + GROUP BY 1 +) t` + + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func buildUsageWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (join string, where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("ul.created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("ul.created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("ul.group_id = $%d", idx)) + idx++ + } + if platform != "" { + // Prefer group.platform when available; fall back to account.platform so we don't + // drop rows where group_id is NULL. 
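+ // For a hypothetical filter {platform: "claude", group_id: 42} with startIndex 1,
+ // the full builder output is:
+ //   join : LEFT JOIN groups g ON g.id = ul.group_id
+ //          LEFT JOIN accounts a ON a.id = ul.account_id
+ //   where: WHERE ul.created_at >= $1 AND ul.created_at < $2
+ //          AND ul.group_id = $3
+ //          AND COALESCE(NULLIF(g.platform,''), a.platform) = $4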
+ join = "LEFT JOIN groups g ON g.id = ul.group_id LEFT JOIN accounts a ON a.id = ul.account_id" + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("COALESCE(NULLIF(g.platform,''), a.platform) = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return join, where, args, idx +} + +func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx)) + idx++ + } + if platform != "" { + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("platform = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return where, args, idx +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func safeDivideFloat64(numerator float64, denominator float64) float64 { + if denominator == 0 { + return 0 + } + return numerator / denominator +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func roundTo4DP(v float64) float64 { + return math.Round(v*10000) / 10000 +} diff --git a/backend/internal/repository/ops_repo_histograms.go b/backend/internal/repository/ops_repo_histograms.go new file mode 100644 index 00000000..c2978798 --- /dev/null +++ b/backend/internal/repository/ops_repo_histograms.go @@ -0,0 +1,79 @@ +package repository + +import ( + "context" + "fmt" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms") + orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms") + + q := ` +SELECT + ` + rangeExpr + ` AS range, + COALESCE(COUNT(*), 0) AS count, + ` + orderExpr + ` AS ord +FROM usage_logs ul +` + join + ` +` + where + ` +AND ul.duration_ms IS NOT NULL +GROUP BY 1, 3 +ORDER BY 3 ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + counts := make(map[string]int64, len(latencyHistogramOrderedRanges)) + var total int64 + for rows.Next() { + var label string + var count int64 + var _ord int + if err := rows.Scan(&label, &count, &_ord); err != nil { + return nil, err + } + counts[label] = count + total += count + } + if err := rows.Err(); err != nil { + return nil, err + } + + buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges)) + for _, label := range latencyHistogramOrderedRanges { + buckets = append(buckets, &service.OpsLatencyHistogramBucket{ + Range: label, + Count: counts[label], + }) + } + + return &service.OpsLatencyHistogramResponse{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + TotalRequests: total, + Buckets: buckets, + }, nil +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets.go b/backend/internal/repository/ops_repo_latency_histogram_buckets.go new file mode 100644 index 00000000..cd5bed37 --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets.go @@ -0,0 +1,64 @@ +package repository + +import ( + "fmt" + "strings" +) + +type latencyHistogramBucket struct { + upperMs int + label string +} + +var latencyHistogramBuckets = []latencyHistogramBucket{ + {upperMs: 100, label: "0-100ms"}, + {upperMs: 200, label: "100-200ms"}, + {upperMs: 500, label: "200-500ms"}, + {upperMs: 1000, label: "500-1000ms"}, + {upperMs: 2000, label: "1000-2000ms"}, + {upperMs: 0, label: "2000ms+"}, // default bucket +} + +var latencyHistogramOrderedRanges = func() []string { + out := make([]string, 0, len(latencyHistogramBuckets)) + for _, b := range latencyHistogramBuckets { + out = append(out, b.label) + } + return out +}() + +func latencyHistogramRangeCaseExpr(column string) string { + var sb strings.Builder + _, _ = sb.WriteString("CASE\n") + + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + _, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label)) + } + + // Default bucket. 
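+ // (The final entry, upperMs == 0, becomes the ELSE arm below.) With the default
+ // buckets above, the expression generated for column "ul.duration_ms" is:
+ //
+ //   CASE
+ //       WHEN ul.duration_ms < 100 THEN '0-100ms'
+ //       WHEN ul.duration_ms < 200 THEN '100-200ms'
+ //       WHEN ul.duration_ms < 500 THEN '200-500ms'
+ //       WHEN ul.duration_ms < 1000 THEN '500-1000ms'
+ //       WHEN ul.duration_ms < 2000 THEN '1000-2000ms'
+ //       ELSE '2000ms+'
+ //   END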
+ last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1] + _, _ = sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label)) + _, _ = sb.WriteString("END") + return sb.String() +} + +func latencyHistogramRangeOrderCaseExpr(column string) string { + var sb strings.Builder + _, _ = sb.WriteString("CASE\n") + + order := 1 + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + _, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order)) + order++ + } + + _, _ = sb.WriteString(fmt.Sprintf("\tELSE %d\n", order)) + _, _ = sb.WriteString("END") + return sb.String() +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go new file mode 100644 index 00000000..dc79f6cc --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go @@ -0,0 +1,14 @@ +package repository + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) { + require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges)) + for i, b := range latencyHistogramBuckets { + require.Equal(t, b.label, latencyHistogramOrderedRanges[i]) + } +} diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go new file mode 100644 index 00000000..bc80ed6e --- /dev/null +++ b/backend/internal/repository/ops_repo_metrics.go @@ -0,0 +1,422 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + + window := input.WindowMinutes + if window <= 0 { + window = 1 + } + createdAt := input.CreatedAt + if createdAt.IsZero() { + createdAt = time.Now().UTC() + } + + q := ` +INSERT INTO ops_system_metrics ( + created_at, + window_minutes, + platform, + group_id, + + success_count, + error_count_total, + business_limited_count, + error_count_sla, + + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + + token_consumed, + qps, + tps, + + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + redis_conn_total, + redis_conn_idle, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +) VALUES ( + $1,$2,$3,$4, + $5,$6,$7,$8, + $9,$10,$11, + $12,$13,$14, + $15,$16,$17,$18,$19,$20, + $21,$22,$23,$24,$25,$26, + $27,$28,$29,$30, + $31,$32, + $33,$34, + $35,$36,$37, + $38,$39 +)` + + _, err := r.db.ExecContext( + ctx, + q, + createdAt, + window, + opsNullString(input.Platform), + opsNullInt64(input.GroupID), + + input.SuccessCount, + input.ErrorCountTotal, + input.BusinessLimitedCount, + input.ErrorCountSLA, + + input.UpstreamErrorCountExcl429529, + input.Upstream429Count, + input.Upstream529Count, + + input.TokenConsumed, + opsNullFloat64(input.QPS), + opsNullFloat64(input.TPS), + + opsNullInt(input.DurationP50Ms), + opsNullInt(input.DurationP90Ms), + 
opsNullInt(input.DurationP95Ms), + opsNullInt(input.DurationP99Ms), + opsNullFloat64(input.DurationAvgMs), + opsNullInt(input.DurationMaxMs), + + opsNullInt(input.TTFTP50Ms), + opsNullInt(input.TTFTP90Ms), + opsNullInt(input.TTFTP95Ms), + opsNullInt(input.TTFTP99Ms), + opsNullFloat64(input.TTFTAvgMs), + opsNullInt(input.TTFTMaxMs), + + opsNullFloat64(input.CPUUsagePercent), + opsNullInt(input.MemoryUsedMB), + opsNullInt(input.MemoryTotalMB), + opsNullFloat64(input.MemoryUsagePercent), + + opsNullBool(input.DBOK), + opsNullBool(input.RedisOK), + + opsNullInt(input.RedisConnTotal), + opsNullInt(input.RedisConnIdle), + + opsNullInt(input.DBConnActive), + opsNullInt(input.DBConnIdle), + opsNullInt(input.DBConnWaiting), + + opsNullInt(input.GoroutineCount), + opsNullInt(input.ConcurrencyQueueDepth), + ) + return err +} + +func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + + q := ` +SELECT + id, + created_at, + window_minutes, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + redis_conn_total, + redis_conn_idle, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +FROM ops_system_metrics +WHERE window_minutes = $1 + AND platform IS NULL + AND group_id IS NULL +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsSystemMetricsSnapshot + var cpu sql.NullFloat64 + var memUsed sql.NullInt64 + var memTotal sql.NullInt64 + var memPct sql.NullFloat64 + var dbOK sql.NullBool + var redisOK sql.NullBool + var redisTotal sql.NullInt64 + var redisIdle sql.NullInt64 + var dbActive sql.NullInt64 + var dbIdle sql.NullInt64 + var dbWaiting sql.NullInt64 + var goroutines sql.NullInt64 + var queueDepth sql.NullInt64 + + if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan( + &out.ID, + &out.CreatedAt, + &out.WindowMinutes, + &cpu, + &memUsed, + &memTotal, + &memPct, + &dbOK, + &redisOK, + &redisTotal, + &redisIdle, + &dbActive, + &dbIdle, + &dbWaiting, + &goroutines, + &queueDepth, + ); err != nil { + return nil, err + } + + if cpu.Valid { + v := cpu.Float64 + out.CPUUsagePercent = &v + } + if memUsed.Valid { + v := memUsed.Int64 + out.MemoryUsedMB = &v + } + if memTotal.Valid { + v := memTotal.Int64 + out.MemoryTotalMB = &v + } + if memPct.Valid { + v := memPct.Float64 + out.MemoryUsagePercent = &v + } + if dbOK.Valid { + v := dbOK.Bool + out.DBOK = &v + } + if redisOK.Valid { + v := redisOK.Bool + out.RedisOK = &v + } + if redisTotal.Valid { + v := int(redisTotal.Int64) + out.RedisConnTotal = &v + } + if redisIdle.Valid { + v := int(redisIdle.Int64) + out.RedisConnIdle = &v + } + if dbActive.Valid { + v := int(dbActive.Int64) + out.DBConnActive = &v + } + if dbIdle.Valid { + v := int(dbIdle.Int64) + out.DBConnIdle = &v + } + if dbWaiting.Valid { + v := int(dbWaiting.Int64) + out.DBConnWaiting = &v + } + if goroutines.Valid { + v := int(goroutines.Int64) + out.GoroutineCount = &v + } + if queueDepth.Valid { + v := int(queueDepth.Int64) + out.ConcurrencyQueueDepth = &v + } + + return &out, nil +} + +func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.JobName == "" { + return 
fmt.Errorf("job_name required") + } + + q := ` +INSERT INTO ops_job_heartbeats ( + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,NOW() +) +ON CONFLICT (job_name) DO UPDATE SET + last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at), + last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at), + last_error_at = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at) + END, + last_error = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error) + END, + last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms), + updated_at = NOW()` + + _, err := r.db.ExecContext( + ctx, + q, + input.JobName, + opsNullTime(input.LastRunAt), + opsNullTime(input.LastSuccessAt), + opsNullTime(input.LastErrorAt), + opsNullString(input.LastError), + opsNullInt(input.LastDurationMs), + ) + return err +} + +func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +FROM ops_job_heartbeats +ORDER BY job_name ASC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsJobHeartbeat, 0, 8) + for rows.Next() { + var item service.OpsJobHeartbeat + var lastRun sql.NullTime + var lastSuccess sql.NullTime + var lastErrorAt sql.NullTime + var lastError sql.NullString + var lastDuration sql.NullInt64 + + if err := rows.Scan( + &item.JobName, + &lastRun, + &lastSuccess, + &lastErrorAt, + &lastError, + &lastDuration, + &item.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastRun.Valid { + v := lastRun.Time + item.LastRunAt = &v + } + if lastSuccess.Valid { + v := lastSuccess.Time + item.LastSuccessAt = &v + } + if lastErrorAt.Valid { + v := lastErrorAt.Time + item.LastErrorAt = &v + } + if lastError.Valid { + v := lastError.String + item.LastError = &v + } + if lastDuration.Valid { + v := lastDuration.Int64 + item.LastDurationMs = &v + } + + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func opsNullBool(v *bool) any { + if v == nil { + return sql.NullBool{} + } + return sql.NullBool{Bool: *v, Valid: true} +} + +func opsNullFloat64(v *float64) any { + if v == nil { + return sql.NullFloat64{} + } + return sql.NullFloat64{Float64: *v, Valid: true} +} + +func opsNullTime(v *time.Time) any { + if v == nil || v.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: *v, Valid: true} +} diff --git a/backend/internal/repository/ops_repo_preagg.go b/backend/internal/repository/ops_repo_preagg.go new file mode 100644 index 00000000..fc74e4f6 --- /dev/null +++ b/backend/internal/repository/ops_repo_preagg.go @@ -0,0 +1,359 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start 
:= startTime.UTC() + end := endTime.UTC() + + // NOTE: + // - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly. + // - We emit three dimension granularities via GROUPING SETS: + // 1) overall: (bucket_start) + // 2) platform: (bucket_start, platform) + // 3) group: (bucket_start, platform, group_id) + // + // IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based + // unique index; our ON CONFLICT target must match that expression set. + q := ` +WITH usage_base AS ( + SELECT + date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + g.platform AS platform, + ul.group_id AS group_id, + ul.duration_ms AS duration_ms, + ul.first_token_ms AS first_token_ms, + (ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens + FROM usage_logs ul + JOIN groups g ON g.id = ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 +), +usage_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) AS success_count, + COALESCE(SUM(tokens), 0) AS token_consumed, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms, + AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms, + MAX(duration_ms) AS duration_max_ms, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms, + AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms, + MAX(first_token_ms) AS ttft_max_ms + FROM usage_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) +), +error_base AS ( + SELECT + date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + platform AS platform, + group_id AS group_id, + is_business_limited AS is_business_limited, + error_owner AS error_owner, + status_code AS client_status_code, + COALESCE(upstream_status_code, status_code, 0) AS effective_status_code + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 +), +error_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT 
is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count + FROM error_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) + HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL +), +combined AS ( + SELECT + COALESCE(u.bucket_start, e.bucket_start) AS bucket_start, + COALESCE(u.platform, e.platform) AS platform, + COALESCE(u.group_id, e.group_id) AS group_id, + + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count_total, 0) AS error_count_total, + COALESCE(e.business_limited_count, 0) AS business_limited_count, + COALESCE(e.error_count_sla, 0) AS error_count_sla, + COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529, + COALESCE(e.upstream_429_count, 0) AS upstream_429_count, + COALESCE(e.upstream_529_count, 0) AS upstream_529_count, + + COALESCE(u.token_consumed, 0) AS token_consumed, + + u.duration_p50_ms, + u.duration_p90_ms, + u.duration_p95_ms, + u.duration_p99_ms, + u.duration_avg_ms, + u.duration_max_ms, + + u.ttft_p50_ms, + u.ttft_p90_ms, + u.ttft_p95_ms, + u.ttft_p99_ms, + u.ttft_avg_ms, + u.ttft_max_ms + FROM usage_agg u + FULL OUTER JOIN error_agg e + ON u.bucket_start = e.bucket_start + AND COALESCE(u.platform, '') = COALESCE(e.platform, '') + AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0) +) +INSERT INTO ops_metrics_hourly ( + bucket_start, + platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + bucket_start, + NULLIF(platform, '') AS platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms::int, + duration_p90_ms::int, + duration_p95_ms::int, + duration_p99_ms::int, + duration_avg_ms, + duration_max_ms::int, + ttft_p50_ms::int, + ttft_p90_ms::int, + ttft_p95_ms::int, + ttft_p99_ms::int, + ttft_avg_ms, + ttft_max_ms::int, + NOW() +FROM combined +WHERE bucket_start IS NOT NULL + AND (platform IS NULL OR platform <> '') +ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = 
EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start := startTime.UTC() + end := endTime.UTC() + + q := ` +INSERT INTO ops_metrics_daily ( + bucket_date, + platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + (bucket_start AT TIME ZONE 'UTC')::date AS bucket_date, + platform, + group_id, + + COALESCE(SUM(success_count), 0) AS success_count, + COALESCE(SUM(error_count_total), 0) AS error_count_total, + COALESCE(SUM(business_limited_count), 0) AS business_limited_count, + COALESCE(SUM(error_count_sla), 0) AS error_count_sla, + COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529, + COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count, + COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count, + COALESCE(SUM(token_consumed), 0) AS token_consumed, + + -- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail). 
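+ -- Worked example with assumed numbers: two hourly rows with success_count
+ -- 900 and 100 and duration_p50_ms 800 and 1200 roll up to
+ -- (800*900 + 1200*100) / 1000 = 840, while duration_p95_ms values of
+ -- 2500 and 4000 roll up to MAX(...) = 4000.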
+ ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms, + ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms, + MAX(duration_p95_ms) AS duration_p95_ms, + MAX(duration_p99_ms) AS duration_p99_ms, + SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms, + MAX(duration_max_ms) AS duration_max_ms, + + ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms, + ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms, + MAX(ttft_p95_ms) AS ttft_p95_ms, + MAX(ttft_p99_ms) AS ttft_p99_ms, + SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms, + MAX(ttft_max_ms) AS ttft_max_ms, + + NOW() +FROM ops_metrics_hourly +WHERE bucket_start >= $1 AND bucket_start < $2 +GROUP BY 1, 2, 3 +ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + return value.Time.UTC(), true, nil +} + +func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + t := value.Time.UTC() + return 
time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil +} diff --git a/backend/internal/repository/ops_repo_request_details.go b/backend/internal/repository/ops_repo_request_details.go new file mode 100644 index 00000000..d8d5d111 --- /dev/null +++ b/backend/internal/repository/ops_repo_request_details.go @@ -0,0 +1,286 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) { + if r == nil || r.db == nil { + return nil, 0, fmt.Errorf("nil ops repository") + } + + page, pageSize, startTime, endTime := filter.Normalize() + offset := (page - 1) * pageSize + + conditions := make([]string, 0, 16) + args := make([]any, 0, 24) + + // Placeholders $1/$2 reserved for time window inside the CTE. + args = append(args, startTime.UTC(), endTime.UTC()) + + addCondition := func(condition string, values ...any) { + conditions = append(conditions, condition) + args = append(args, values...) + } + + if filter != nil { + if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" { + if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) { + return nil, 0, fmt.Errorf("invalid kind") + } + addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind) + } + + if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" { + addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID) + } + + if filter.UserID != nil && *filter.UserID > 0 { + addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID) + } + if filter.APIKeyID != nil && *filter.APIKeyID > 0 { + addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID) + } + + if model := strings.TrimSpace(filter.Model); model != "" { + addCondition(fmt.Sprintf("model = $%d", len(args)+1), model) + } + if requestID := strings.TrimSpace(filter.RequestID); requestID != "" { + addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID) + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + strings.ToLower(q) + "%" + startIdx := len(args) + 1 + addCondition( + fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)", + startIdx, startIdx+1, startIdx+2, + ), + like, like, like, + ) + } + + if filter.MinDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs) + } + if filter.MaxDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs) + } + } + + where := "" + if len(conditions) > 0 { + where = "WHERE " + strings.Join(conditions, " AND ") + } + + cte := ` +WITH combined AS ( + SELECT + 'success'::TEXT AS kind, + ul.created_at AS created_at, + ul.request_id AS request_id, + COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + ul.model AS model, + ul.duration_ms AS duration_ms, + NULL::INT AS status_code, + NULL::BIGINT AS error_id, + NULL::TEXT AS phase, + NULL::TEXT AS severity, + NULL::TEXT AS message, + 
ul.user_id AS user_id, + ul.api_key_id AS api_key_id, + ul.account_id AS account_id, + ul.group_id AS group_id, + ul.stream AS stream + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + + UNION ALL + + SELECT + 'error'::TEXT AS kind, + o.created_at AS created_at, + COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id, + COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + o.model AS model, + o.duration_ms AS duration_ms, + o.status_code AS status_code, + o.id AS error_id, + o.error_phase AS phase, + o.severity AS severity, + o.error_message AS message, + o.user_id AS user_id, + o.api_key_id AS api_key_id, + o.account_id AS account_id, + o.group_id AS group_id, + o.stream AS stream + FROM ops_error_logs o + LEFT JOIN groups g ON g.id = o.group_id + LEFT JOIN accounts a ON a.id = o.account_id + WHERE o.created_at >= $1 AND o.created_at < $2 + AND COALESCE(o.status_code, 0) >= 400 +) +` + + countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where) + var total int64 + if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil { + if err == sql.ErrNoRows { + total = 0 + } else { + return nil, 0, err + } + } + + sort := "ORDER BY created_at DESC" + if filter != nil { + switch strings.TrimSpace(strings.ToLower(filter.Sort)) { + case "", "created_at_desc": + // default + case "duration_desc": + sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC" + default: + return nil, 0, fmt.Errorf("invalid sort") + } + } + + listQuery := fmt.Sprintf(` +%s +SELECT + kind, + created_at, + request_id, + platform, + model, + duration_ms, + status_code, + error_id, + phase, + severity, + message, + user_id, + api_key_id, + account_id, + group_id, + stream +FROM combined +%s +%s +LIMIT $%d OFFSET $%d +`, cte, where, sort, len(args)+1, len(args)+2) + + listArgs := append(append([]any{}, args...), pageSize, offset) + rows, err := r.db.QueryContext(ctx, listQuery, listArgs...) 
+ if err != nil { + return nil, 0, err + } + defer func() { _ = rows.Close() }() + + toIntPtr := func(v sql.NullInt64) *int { + if !v.Valid { + return nil + } + i := int(v.Int64) + return &i + } + toInt64Ptr := func(v sql.NullInt64) *int64 { + if !v.Valid { + return nil + } + i := v.Int64 + return &i + } + + out := make([]*service.OpsRequestDetail, 0, pageSize) + for rows.Next() { + var ( + kind string + createdAt time.Time + requestID sql.NullString + platform sql.NullString + model sql.NullString + + durationMs sql.NullInt64 + statusCode sql.NullInt64 + errorID sql.NullInt64 + + phase sql.NullString + severity sql.NullString + message sql.NullString + + userID sql.NullInt64 + apiKeyID sql.NullInt64 + accountID sql.NullInt64 + groupID sql.NullInt64 + + stream bool + ) + + if err := rows.Scan( + &kind, + &createdAt, + &requestID, + &platform, + &model, + &durationMs, + &statusCode, + &errorID, + &phase, + &severity, + &message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &stream, + ); err != nil { + return nil, 0, err + } + + item := &service.OpsRequestDetail{ + Kind: service.OpsRequestKind(kind), + CreatedAt: createdAt, + RequestID: strings.TrimSpace(requestID.String), + Platform: strings.TrimSpace(platform.String), + Model: strings.TrimSpace(model.String), + + DurationMs: toIntPtr(durationMs), + StatusCode: toIntPtr(statusCode), + ErrorID: toInt64Ptr(errorID), + Phase: phase.String, + Severity: severity.String, + Message: message.String, + + UserID: toInt64Ptr(userID), + APIKeyID: toInt64Ptr(apiKeyID), + AccountID: toInt64Ptr(accountID), + GroupID: toInt64Ptr(groupID), + + Stream: stream, + } + + if item.Platform == "" { + item.Platform = "unknown" + } + + out = append(out, item) + } + if err := rows.Err(); err != nil { + return nil, 0, err + } + + return out, total, nil +} diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go new file mode 100644 index 00000000..e4ac96d3 --- /dev/null +++ b/backend/internal/repository/ops_repo_trends.go @@ -0,0 +1,571 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + // Keep a small, predictable set of supported buckets for now. 
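+ // Any other value (e.g. 120 or 900) is coerced to the 60s default below rather
+ // than being rejected with an error.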
+ bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + usageBucketExpr := opsBucketExprForUsage(bucketSeconds) + errorBucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +WITH usage_buckets AS ( + SELECT ` + usageBucketExpr + ` AS bucket, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT ` + errorBucketExpr + ` AS bucket, + COUNT(*) AS error_count + FROM ops_error_logs + ` + errorWhere + ` + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT + bucket, + (success_count + error_count) AS request_count, + token_consumed +FROM combined +ORDER BY bucket ASC` + + args := append(usageArgs, errorArgs...) + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + points := make([]*service.OpsThroughputTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&bucket, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + + denom := float64(bucketSeconds) + if denom <= 0 { + denom = 60 + } + qps := roundTo1DP(float64(requests) / denom) + tps := roundTo1DP(float64(tokenConsumed) / denom) + + points = append(points, &service.OpsThroughputTrendPoint{ + BucketStart: bucket.UTC(), + RequestCount: requests, + TokenConsumed: tokenConsumed, + QPS: qps, + TPS: tps, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Fill missing buckets with zeros so charts render continuous timelines. 
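+ // Illustration: a 10:00-10:10 UTC window at 60s resolution always yields ten points;
+ // minutes with no traffic come back with zero requests, tokens, QPS and TPS.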
+ points = fillOpsThroughputBuckets(start, end, bucketSeconds, points) + + var byPlatform []*service.OpsThroughputPlatformBreakdownItem + var topGroups []*service.OpsThroughputGroupBreakdownItem + + platform := "" + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + } + groupID := (*int64)(nil) + if filter != nil { + groupID = filter.GroupID + } + + // Drilldown helpers: + // - No platform/group: totals by platform + // - Platform selected but no group: top groups in that platform + if platform == "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputBreakdownByPlatform(ctx, start, end) + if err != nil { + return nil, err + } + byPlatform = items + } else if platform != "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10) + if err != nil { + return nil, err + } + topGroups = items + } + + return &service.OpsThroughputTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + + ByPlatform: byPlatform, + TopGroups: topGroups, + }, nil +} + +func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) { + q := ` +WITH usage_totals AS ( + SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + GROUP BY 1 +), +error_totals AS ( + SELECT platform, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.platform, e.platform) AS platform, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.platform = e.platform +) +SELECT platform, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE platform IS NOT NULL AND platform <> '' +ORDER BY request_count DESC` + + rows, err := r.db.QueryContext(ctx, q, start, end) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8) + for rows.Next() { + var platform string + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&platform, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + items = append(items, &service.OpsThroughputPlatformBreakdownItem{ + Platform: platform, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) { + if strings.TrimSpace(platform) == "" { + return nil, nil + } + if limit <= 0 || limit > 100 { + limit = 10 + } + + q := ` +WITH usage_totals AS ( + SELECT ul.group_id AS group_id, + g.name AS group_name, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS 
token_consumed + FROM usage_logs ul + JOIN groups g ON g.id = ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + AND g.platform = $3 + GROUP BY 1, 2 +), +error_totals AS ( + SELECT group_id, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + AND platform = $3 + AND group_id IS NOT NULL + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.group_id, e.group_id) AS group_id, + COALESCE(u.group_name, g2.name, '') AS group_name, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.group_id = e.group_id + LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id) +) +SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE group_id IS NOT NULL +ORDER BY request_count DESC +LIMIT $4` + + rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit) + for rows.Next() { + var groupID int64 + var groupName sql.NullString + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + name := "" + if groupName.Valid { + name = groupName.String + } + items = append(items, &service.OpsThroughputGroupBreakdownItem{ + GroupID: groupID, + GroupName: name, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func opsBucketExprForUsage(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', ul.created_at)" + case 300: + // 5-minute buckets in UTC. 
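+ // e.g. a created_at of 12:07:41Z lands in the 12:05:00Z bucket.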
+ return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)" + default: + return "date_trunc('minute', ul.created_at)" + } +} + +func opsBucketExprForError(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', created_at)" + case 300: + return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)" + default: + return "date_trunc('minute', created_at)" + } +} + +func opsBucketLabel(bucketSeconds int) string { + if bucketSeconds <= 0 { + return "1m" + } + if bucketSeconds%3600 == 0 { + h := bucketSeconds / 3600 + if h <= 0 { + h = 1 + } + return fmt.Sprintf("%dh", h) + } + m := bucketSeconds / 60 + if m <= 0 { + m = 1 + } + return fmt.Sprintf("%dm", m) +} + +func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time { + t = t.UTC() + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + secs := t.Unix() + floored := secs - (secs % int64(bucketSeconds)) + return time.Unix(floored, 0).UTC() +} + +func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsThroughputTrendPoint{ + BucketStart: cursor, + RequestCount: 0, + TokenConsumed: 0, + QPS: 0, + TPS: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + bucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +SELECT + ` + bucketExpr + ` AS bucket, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT 
is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529 +FROM ops_error_logs +` + where + ` +GROUP BY 1 +ORDER BY 1 ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + points := make([]*service.OpsErrorTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64 + if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil { + return nil, err + } + points = append(points, &service.OpsErrorTrendPoint{ + BucketStart: bucket.UTC(), + + ErrorCountTotal: total, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: sla, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points) + + return &service.OpsErrorTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + }, nil +} + +func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsErrorTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsErrorTrendPoint{ + BucketStart: cursor, + + ErrorCountTotal: 0, + BusinessLimitedCount: 0, + ErrorCountSLA: 0, + + UpstreamErrorCountExcl429529: 0, + Upstream429Count: 0, + Upstream529Count: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(upstream_status_code, status_code, 0) AS status_code, + COUNT(*) AS total, + COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla, + COUNT(*) FILTER (WHERE is_business_limited) AS business_limited +FROM ops_error_logs +` + where + ` + AND COALESCE(status_code, 0) >= 400 +GROUP BY 1 +ORDER BY total DESC +LIMIT 20` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsErrorDistributionItem, 0, 16) + var total int64 + for rows.Next() { + var statusCode int + var cntTotal, cntSLA, cntBiz int64 + if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil { + return nil, err + } + total += cntTotal + items = append(items, &service.OpsErrorDistributionItem{ + StatusCode: statusCode, + Total: cntTotal, + SLA: cntSLA, + BusinessLimited: cntBiz, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorDistributionResponse{ + Total: total, + Items: items, + }, nil +} diff --git a/backend/internal/repository/ops_repo_window_stats.go b/backend/internal/repository/ops_repo_window_stats.go new file mode 100644 index 00000000..8221c473 --- /dev/null +++ b/backend/internal/repository/ops_repo_window_stats.go @@ -0,0 +1,50 @@ +package repository + +import ( + "context" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + if start.After(end) { + return nil, fmt.Errorf("start_time must be <= end_time") + } + // Bound excessively large windows to prevent accidental heavy queries. + if end.Sub(start) > 24*time.Hour { + return nil, fmt.Errorf("window too large") + } + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &service.OpsWindowStats{ + StartTime: start, + EndTime: end, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + TokenConsumed: tokenConsumed, + }, nil +} diff --git a/backend/internal/repository/usage_log_repo_integration_test.go b/backend/internal/repository/usage_log_repo_integration_test.go index 51964782..3f90e49e 100644 --- a/backend/internal/repository/usage_log_repo_integration_test.go +++ b/backend/internal/repository/usage_log_repo_integration_test.go @@ -204,7 +204,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() { userToday := mustCreateUser(s.T(), s.client, &service.User{ Email: "today@example.com", - CreatedAt: maxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)), + CreatedAt: testMaxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)), UpdatedAt: now, }) userOld := mustCreateUser(s.T(), s.client, &service.User{ @@ -237,7 +237,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() { TotalCost: 1.5, ActualCost: 1.2, DurationMs: &d1, - CreatedAt: maxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)), + CreatedAt: testMaxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)), } _, err = s.repo.Create(s.ctx, logToday) s.Require().NoError(err, "Create logToday") @@ -413,9 +413,17 @@ func (s *UsageLogRepoSuite) TestGetAccountTodayStats() { func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() { now := time.Now().UTC().Truncate(time.Second) - hour1 := now.Add(-90 * time.Minute).Truncate(time.Hour) 
- hour2 := now.Add(-30 * time.Minute).Truncate(time.Hour) + // 使用固定的时间偏移确保 hour1 和 hour2 在同一天且都在过去 + // 选择当天 02:00 和 03:00 作为测试时间点(基于 now 的日期) dayStart := truncateToDayUTC(now) + hour1 := dayStart.Add(2 * time.Hour) // 当天 02:00 + hour2 := dayStart.Add(3 * time.Hour) // 当天 03:00 + // 如果当前时间早于 hour2,则使用昨天的时间 + if now.Before(hour2.Add(time.Hour)) { + dayStart = dayStart.Add(-24 * time.Hour) + hour1 = dayStart.Add(2 * time.Hour) + hour2 = dayStart.Add(3 * time.Hour) + } user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"}) user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"}) @@ -473,7 +481,7 @@ func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() { aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx) aggStart := hour1.Add(-5 * time.Minute) - aggEnd := now.Add(5 * time.Minute) + aggEnd := hour2.Add(time.Hour) // 确保覆盖 hour2 的所有数据 s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd)) type hourlyRow struct { @@ -621,7 +629,7 @@ func (s *UsageLogRepoSuite) TestGetGlobalStats() { s.Require().Equal(int64(45), stats.TotalOutputTokens) } -func maxTime(a, b time.Time) time.Time { +func testMaxTime(a, b time.Time) time.Time { if a.After(b) { return a } diff --git a/backend/internal/repository/wire.go b/backend/internal/repository/wire.go index 8cc937bb..e1c6c3d4 100644 --- a/backend/internal/repository/wire.go +++ b/backend/internal/repository/wire.go @@ -49,6 +49,7 @@ var ProviderSet = wire.NewSet( NewUsageLogRepository, NewDashboardAggregationRepository, NewSettingRepository, + NewOpsRepository, NewUserSubscriptionRepository, NewUserAttributeDefinitionRepository, NewUserAttributeValueRepository, diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go index ebb98a50..d96732bd 100644 --- a/backend/internal/server/api_contract_test.go +++ b/backend/internal/server/api_contract_test.go @@ -262,11 +262,11 @@ func TestAPIContracts(t *testing.T) { name: "GET /api/v1/admin/settings", setup: func(t *testing.T, deps *contractDeps) { t.Helper() - deps.settingRepo.SetAll(map[string]string{ - service.SettingKeyRegistrationEnabled: "true", - service.SettingKeyEmailVerifyEnabled: "false", + deps.settingRepo.SetAll(map[string]string{ + service.SettingKeyRegistrationEnabled: "true", + service.SettingKeyEmailVerifyEnabled: "false", - service.SettingKeySMTPHost: "smtp.example.com", + service.SettingKeySMTPHost: "smtp.example.com", service.SettingKeySMTPPort: "587", service.SettingKeySMTPUsername: "user", service.SettingKeySMTPPassword: "secret", @@ -285,10 +285,15 @@ func TestAPIContracts(t *testing.T) { service.SettingKeyContactInfo: "support", service.SettingKeyDocURL: "https://docs.example.com", - service.SettingKeyDefaultConcurrency: "5", - service.SettingKeyDefaultBalance: "1.25", - }) - }, + service.SettingKeyDefaultConcurrency: "5", + service.SettingKeyDefaultBalance: "1.25", + + service.SettingKeyOpsMonitoringEnabled: "false", + service.SettingKeyOpsRealtimeMonitoringEnabled: "true", + service.SettingKeyOpsQueryModeDefault: "auto", + service.SettingKeyOpsMetricsIntervalSeconds: "60", + }) + }, method: http.MethodGet, path: "/api/v1/admin/settings", wantStatus: http.StatusOK, @@ -309,13 +314,17 @@ func TestAPIContracts(t *testing.T) { "turnstile_site_key": "site-key", "turnstile_secret_key_configured": true, "linuxdo_connect_enabled": false, - "linuxdo_connect_client_id": "", - "linuxdo_connect_client_secret_configured": false, - "linuxdo_connect_redirect_url": "", - 
"site_name": "Sub2API", - "site_logo": "", - "site_subtitle": "Subtitle", - "api_base_url": "https://api.example.com", + "linuxdo_connect_client_id": "", + "linuxdo_connect_client_secret_configured": false, + "linuxdo_connect_redirect_url": "", + "ops_monitoring_enabled": false, + "ops_realtime_monitoring_enabled": true, + "ops_query_mode_default": "auto", + "ops_metrics_interval_seconds": 60, + "site_name": "Sub2API", + "site_logo": "", + "site_subtitle": "Subtitle", + "api_base_url": "https://api.example.com", "contact_info": "support", "doc_url": "https://docs.example.com", "default_concurrency": 5, @@ -430,7 +439,7 @@ func newContractDeps(t *testing.T) *contractDeps { authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil) apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService) usageHandler := handler.NewUsageHandler(usageService, apiKeyService) - adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil) + adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil, nil) adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil) jwtAuth := func(c *gin.Context) { diff --git a/backend/internal/server/http.go b/backend/internal/server/http.go index a7d1d3b5..52d5c926 100644 --- a/backend/internal/server/http.go +++ b/backend/internal/server/http.go @@ -31,6 +31,7 @@ func ProvideRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, settingService *service.SettingService, redisClient *redis.Client, ) *gin.Engine { @@ -50,7 +51,7 @@ func ProvideRouter( } } - return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, settingService, cfg, redisClient) + return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient) } // ProvideHTTPServer 提供 HTTP 服务器 diff --git a/backend/internal/server/middleware/admin_auth.go b/backend/internal/server/middleware/admin_auth.go index e02a7b0a..8f30107c 100644 --- a/backend/internal/server/middleware/admin_auth.go +++ b/backend/internal/server/middleware/admin_auth.go @@ -30,6 +30,20 @@ func adminAuth( settingService *service.SettingService, ) gin.HandlerFunc { return func(c *gin.Context) { + // WebSocket upgrade requests cannot set Authorization headers in browsers. + // For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via + // Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item: + // Sec-WebSocket-Protocol: sub2api-admin, jwt. 
+ if isWebSocketUpgradeRequest(c) { + if token := extractJWTFromWebSocketSubprotocol(c); token != "" { + if !validateJWTForAdmin(c, token, authService, userService) { + return + } + c.Next() + return + } + } + // 检查 x-api-key header(Admin API Key 认证) apiKey := c.GetHeader("x-api-key") if apiKey != "" { @@ -58,6 +72,44 @@ func adminAuth( } } +func isWebSocketUpgradeRequest(c *gin.Context) bool { + if c == nil || c.Request == nil { + return false + } + // RFC6455 handshake uses: + // Connection: Upgrade + // Upgrade: websocket + upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade"))) + if upgrade != "websocket" { + return false + } + connection := strings.ToLower(c.GetHeader("Connection")) + return strings.Contains(connection, "upgrade") +} + +func extractJWTFromWebSocketSubprotocol(c *gin.Context) string { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol")) + if raw == "" { + return "" + } + + // The header is a comma-separated list of tokens. We reserve the prefix "jwt." + // for carrying the admin JWT. + for _, part := range strings.Split(raw, ",") { + p := strings.TrimSpace(part) + if strings.HasPrefix(p, "jwt.") { + token := strings.TrimSpace(strings.TrimPrefix(p, "jwt.")) + if token != "" { + return token + } + } + } + return "" +} + // validateAdminAPIKey 验证管理员 API Key func validateAdminAPIKey( c *gin.Context, diff --git a/backend/internal/server/middleware/client_request_id.go b/backend/internal/server/middleware/client_request_id.go new file mode 100644 index 00000000..d22b6cc5 --- /dev/null +++ b/backend/internal/server/middleware/client_request_id.go @@ -0,0 +1,30 @@ +package middleware + +import ( + "context" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +// ClientRequestID ensures every request has a unique client_request_id in request.Context(). +// +// This is used by the Ops monitoring module for end-to-end request correlation. 
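+//
+// Downstream code can read the ID back from the request context, roughly (sketch):
+//
+//	id, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
+//
+// The string assertion matches the uuid value stored below; an ID set by an earlier
+// layer is left untouched and may use a different representation.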
+func ClientRequestID() gin.HandlerFunc { + return func(c *gin.Context) { + if c.Request == nil { + c.Next() + return + } + + if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil { + c.Next() + return + } + + id := uuid.New().String() + c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id)) + c.Next() + } +} diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 70f7da84..cf9015e4 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -23,6 +23,7 @@ func SetupRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, settingService *service.SettingService, cfg *config.Config, redisClient *redis.Client, @@ -46,7 +47,7 @@ func SetupRouter( } // 注册路由 - registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg, redisClient) + registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg, redisClient) return r } @@ -60,6 +61,7 @@ func registerRoutes( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, redisClient *redis.Client, ) { @@ -73,5 +75,5 @@ func registerRoutes( routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient) routes.RegisterUserRoutes(v1, h, jwtAuth) routes.RegisterAdminRoutes(v1, h, adminAuth) - routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg) + routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg) } diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index c9c5352c..a2f1b8c7 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -50,6 +50,9 @@ func RegisterAdminRoutes( // 系统设置 registerSettingsRoutes(admin, h) + // 运维监控(Ops) + registerOpsRoutes(admin, h) + // 系统管理 registerSystemRoutes(admin, h) @@ -64,6 +67,58 @@ func RegisterAdminRoutes( } } +func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { + ops := admin.Group("/ops") + { + // Realtime ops signals + ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats) + ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability) + + // Alerts (rules + events) + ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules) + ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule) + ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) + ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) + ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + + // Email notification config (DB-backed) + ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) + ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig) + + // Runtime settings (DB-backed) + runtime := ops.Group("/runtime") + { + runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings) + runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings) + } + + // Advanced settings (DB-backed) + ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings) + ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings) + + // WebSocket realtime (QPS/TPS) + ws := ops.Group("/ws") + { + ws.GET("/qps", h.Admin.Ops.QPSWSHandler) + } + + // Error logs (MVP-1) + ops.GET("/errors", 
h.Admin.Ops.GetErrorLogs) + ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + + // Request drilldown (success + error) + ops.GET("/requests", h.Admin.Ops.ListRequestDetails) + + // Dashboard (vNext - raw path for MVP) + ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview) + ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend) + ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram) + ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend) + ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution) + } +} + func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) { dashboard := admin.Group("/dashboard") { diff --git a/backend/internal/server/routes/gateway.go b/backend/internal/server/routes/gateway.go index 0b62185e..bf019ce3 100644 --- a/backend/internal/server/routes/gateway.go +++ b/backend/internal/server/routes/gateway.go @@ -16,13 +16,18 @@ func RegisterGatewayRoutes( apiKeyAuth middleware.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, ) { bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize) + clientRequestID := middleware.ClientRequestID() + opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService) // API网关(Claude API兼容) gateway := r.Group("/v1") gateway.Use(bodyLimit) + gateway.Use(clientRequestID) + gateway.Use(opsErrorLogger) gateway.Use(gin.HandlerFunc(apiKeyAuth)) { gateway.POST("/messages", h.Gateway.Messages) @@ -36,6 +41,8 @@ func RegisterGatewayRoutes( // Gemini 原生 API 兼容层(Gemini SDK/CLI 直连) gemini := r.Group("/v1beta") gemini.Use(bodyLimit) + gemini.Use(clientRequestID) + gemini.Use(opsErrorLogger) gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { gemini.GET("/models", h.Gateway.GeminiV1BetaListModels) @@ -45,7 +52,7 @@ func RegisterGatewayRoutes( } // OpenAI Responses API(不带v1前缀的别名) - r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) + r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) // Antigravity 模型列表 r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels) @@ -53,6 +60,8 @@ func RegisterGatewayRoutes( // Antigravity 专用路由(仅使用 antigravity 账户,不混合调度) antigravityV1 := r.Group("/antigravity/v1") antigravityV1.Use(bodyLimit) + antigravityV1.Use(clientRequestID) + antigravityV1.Use(opsErrorLogger) antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1.Use(gin.HandlerFunc(apiKeyAuth)) { @@ -64,6 +73,8 @@ func RegisterGatewayRoutes( antigravityV1Beta := r.Group("/antigravity/v1beta") antigravityV1Beta.Use(bodyLimit) + antigravityV1Beta.Use(clientRequestID) + antigravityV1Beta.Use(opsErrorLogger) antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { diff --git a/backend/internal/service/antigravity_gateway_service.go b/backend/internal/service/antigravity_gateway_service.go index 4fd55757..4dd4d303 100644 --- a/backend/internal/service/antigravity_gateway_service.go +++ b/backend/internal/service/antigravity_gateway_service.go @@ -564,6 +564,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, 
proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -579,6 +587,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries") } @@ -586,6 +595,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -596,6 +625,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < antigravityMaxRetries { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500)) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -628,6 +677,27 @@ urlFallbackLoop: // Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验, // 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。 if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg 
= sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + // Conservative two-stage fallback: // 1) Disable top-level thinking + thinking->text // 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text. @@ -661,6 +731,13 @@ urlFallbackLoop: } retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency) if retryErr != nil { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr.Error()), + }) log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr) continue } @@ -674,6 +751,25 @@ urlFallbackLoop: retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() + kind := "signature_retry" + if strings.TrimSpace(stage.name) != "" { + kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_") + } + retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody)) + retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg) + retryUpstreamDetail := "" + if logBody { + retryUpstreamDetail = truncateString(string(retryBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: kind, + Message: retryUpstreamMsg, + Detail: retryUpstreamDetail, + }) // If this stage fixed the signature issue, we stop; otherwise we may try the next stage. 
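+ // (Concretely: any non-400 response, or a 400 that is no longer signature-related,
+ // ends the staged fallback; a 400 that still looks signature-related advances to the
+ // next stage.)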
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) { @@ -701,10 +797,30 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) if s.shouldFailoverUpstreamError(resp.StatusCode) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody) + return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody) } } @@ -1108,6 +1224,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -1123,6 +1247,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries") } @@ -1130,6 +1255,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -1140,6 +1285,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < 
antigravityMaxRetries { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -1205,21 +1370,59 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) - if s.shouldFailoverUpstreamError(resp.StatusCode) { - return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} - } - - // 解包并返回错误 requestID := resp.Header.Get("x-request-id") if requestID != "" { c.Header("x-request-id", requestID) } - unwrapped, _ := s.unwrapV1InternalResponse(respBody) + + unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody) + unwrappedForOps := unwrapped + if unwrapErr != nil || len(unwrappedForOps) == 0 { + unwrappedForOps = respBody + } + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(unwrappedForOps), maxBytes) + } + + // Always record upstream context for Ops error logs, even when we will failover. 
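+ // (Inferred from usage: setOpsUpstreamError appears to keep only the latest upstream
+ // status/message on the request context, while appendOpsUpstreamError accumulates one
+ // event per attempt; both helpers are presumably defined elsewhere in this package.)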
+ setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.shouldFailoverUpstreamError(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} + } + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } - c.Data(resp.StatusCode, contentType, unwrapped) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + c.Data(resp.StatusCode, contentType, unwrappedForOps) return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode) } @@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int, return fmt.Errorf("%s", message) } -func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error { - // 记录上游错误详情便于调试 - log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body)) +func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + // 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端) + if logBody { + log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes)) + } var statusCode int var errType, errMsg string @@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error { diff --git a/backend/internal/service/auth_service.go b/backend/internal/service/auth_service.go index 61b15cd8..386b43fc 100644 --- a/backend/internal/service/auth_service.go +++ b/backend/internal/service/auth_service.go @@ -357,7 +357,7 @@ func (s *AuthService) Login(ctx context.Context, email, password string) (string // - 如果邮箱已存在:直接登录(不需要本地密码) // - 如果邮箱不存在:创建新用户并登录 // -// 
注意:该函数用于“终端用户登录 Sub2API 本身”的场景(不同于上游账号的 OAuth,例如 OpenAI/Gemini)。 +// 注意:该函数用于 LinuxDo OAuth 登录场景(不同于上游账号的 OAuth,例如 Claude/OpenAI/Gemini)。 // 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。 func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) { email = strings.TrimSpace(email) @@ -376,8 +376,8 @@ func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username user, err := s.userRepo.GetByEmail(ctx, email) if err != nil { if errors.Is(err, ErrUserNotFound) { - // OAuth 首次登录视为注册。 - if s.settingService != nil && !s.settingService.IsRegistrationEnabled(ctx) { + // OAuth 首次登录视为注册(fail-close:settingService 未配置时不允许注册) + if s.settingService == nil || !s.settingService.IsRegistrationEnabled(ctx) { return "", nil, ErrRegDisabled } diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go index 77709553..398d9fbd 100644 --- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -63,6 +63,9 @@ const ( SubscriptionStatusSuspended = "suspended" ) +// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。 +const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid" + // Setting keys const ( // 注册设置 @@ -83,6 +86,12 @@ const ( SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key + // LinuxDo Connect OAuth 登录设置 + SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled" + SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id" + SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret" + SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url" + // OEM设置 SettingKeySiteName = "site_name" // 网站名称 SettingKeySiteLogo = "site_logo" // 网站Logo (base64) @@ -113,16 +122,31 @@ const ( SettingKeyEnableIdentityPatch = "enable_identity_patch" SettingKeyIdentityPatchPrompt = "identity_patch_prompt" - // LinuxDo Connect OAuth 登录(终端用户 SSO) - SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled" - SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id" - SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret" - SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url" -) + // ========================= + // Ops Monitoring (vNext) + // ========================= -// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。 -// 目的:避免第三方登录返回的用户标识与本地真实邮箱发生碰撞,进而造成账号被接管的风险。 -const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid" + // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime. + SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled" + + // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push). + SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled" + + // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg). + SettingKeyOpsQueryModeDefault = "ops_query_mode_default" + + // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications. + SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config" + + // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings. + SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings" + + // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60). 
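+ // (The value is persisted as a string setting and surfaced as an integer by the admin
+ // settings API; consumers are expected to parse it and clamp values below 60, per the
+ // ">=60" note above.)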
+ SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds" + + // SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation). + SettingKeyOpsAdvancedSettings = "ops_advanced_settings" +) // AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys). const AdminAPIKeyPrefix = "admin-" diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index 31148b17..b48af7b0 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1399,7 +1399,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if resp != nil && resp.Body != nil { _ = resp.Body.Close() } - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) + c.JSON(http.StatusBadGateway, gin.H{ + "type": "error", + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } // 优先检测thinking block签名错误(400)并重试一次 @@ -1409,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A _ = resp.Body.Close() if s.isThinkingBlockSignatureError(respBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) + looksLikeToolSignatureError := func(msg string) bool { m := strings.ToLower(msg) return strings.Contains(m, "tool_use") || @@ -1445,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: "signature_retry_thinking", + Message: extractUpstreamErrorMessage(retryRespBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) msg2 := extractUpstreamErrorMessage(retryRespBody) if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed { log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID) @@ -1459,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if retryResp2 != nil && retryResp2.Body != nil { _ = retryResp2.Body.Close() } + appendOpsUpstreamError(c, 
OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_tools_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr2.Error()), + }) log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2) } else { log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2) @@ -1508,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A break } + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)", account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed) - _ = resp.Body.Close() if err := sleepWithContext(ctx, delay); err != nil { return nil, err } @@ -1538,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理重试耗尽的情况 if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted_failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } return s.handleRetryExhaustedError(ctx, resp, c, account) @@ -1546,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理可切换账号的错误 if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleFailoverSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -1563,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A resp.Body = io.NopCloser(bytes.NewReader(respBody)) if s.shouldFailoverOn400(respBody) { + upstreamMsg := 
strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover_on_400", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + if s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( "Account %d: 400 error, attempting failover: %s", @@ -1859,7 +1983,30 @@ func extractUpstreamErrorMessage(body []byte) string { } func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + // Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet. + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) // 处理上游错误,标记账号状态 shouldDisable := false @@ -1870,24 +2017,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } + // 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端) + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 根据状态码返回适当的自定义错误响应(不透传上游详细信息) var errType, errMsg string var statusCode int switch resp.StatusCode { case 400: - // 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开 - if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { - log.Printf( - "Upstream 400 error (account=%d platform=%s type=%s): %s", - account.ID, - account.Platform, - account.Type, - truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), - ) - } c.Data(http.StatusBadRequest, "application/json", body) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + summary := upstreamMsg + if summary == "" { + summary = truncateForLog(body, 512) + } + if summary == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary) case 401: statusCode = http.StatusBadGateway errType = "upstream_error" @@ -1923,11 +2079,14 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + 
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) statusCode := resp.StatusCode // OAuth/Setup Token 账号的 403:标记账号异常 @@ -1941,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re } func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -1949,8 +2108,45 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht // OAuth 403:标记账号异常 // API Key 未配置错误码:仅返回错误,不标记账号 func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { + // Capture upstream error body before side-effects consume the stream. + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 返回统一的重试耗尽错误响应 c.JSON(http.StatusBadGateway, gin.H{ "type": "error", @@ -1960,7 +2156,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg) } // streamingResult 流式响应结果 @@ -2490,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 发送请求 resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "") s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed") return fmt.Errorf("upstream request failed: %w", err) } @@ -2527,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 标记账号状态(429/529等) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) + upstreamMsg := 
strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + // 记录上游错误摘要便于排障(不回显请求内容) if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( @@ -2548,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, errMsg = "Service overloaded" } s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg) - return fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // 透传成功响应 diff --git a/backend/internal/service/gemini_messages_compat_service.go b/backend/internal/service/gemini_messages_compat_service.go index 78452b1e..d1b65b71 100644 --- a/backend/internal/service/gemini_messages_compat_service.go +++ b/backend/internal/service/gemini_messages_compat_service.go @@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) continue } - return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr) } // Special-case: signature/thought_signature validation errors are not transient, but may be fixed by @@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex _ = resp.Body.Close() if isGeminiSignatureRelatedError(respBody) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + var strippedClaudeBody []byte stageName := "" switch signatureRetryStage { @@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt 
< geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex } s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) if tempMatched { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody) + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody) } requestID := resp.Header.Get(requestIDHeader) @@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) @@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. FirstTokenMs: nil, }, nil } - return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr) } if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) { @@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt < geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
} if tempMatched { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } respBody = unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } c.Data(resp.StatusCode, contentType, respBody) - return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } var usage *ClaudeUsage @@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string { return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`) } -func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error { +func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := 
s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + var statusCode int var errType, errMsg string @@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } type claudeErrorMapping struct { diff --git a/backend/internal/service/openai_codex_transform.go b/backend/internal/service/openai_codex_transform.go index 965fb770..94e74f22 100644 --- a/backend/internal/service/openai_codex_transform.go +++ b/backend/internal/service/openai_codex_transform.go @@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { existingInstructions = strings.TrimSpace(existingInstructions) if instructions != "" { - if existingInstructions != "" && existingInstructions != instructions { - if input, ok := reqBody["input"].([]any); ok { - reqBody["input"] = prependSystemInstruction(input, existingInstructions) - result.Modified = true - } - } if existingInstructions != instructions { reqBody["instructions"] = instructions result.Modified = true @@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { if input, ok := reqBody["input"].([]any); ok { input = filterCodexInput(input) - input = normalizeOrphanedToolOutputs(input) reqBody["input"] = input result.Modified = true } @@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any { return filtered } -func prependSystemInstruction(input []any, instructions string) []any { - message := map[string]any{ - "role": "system", - "content": []any{ - map[string]any{ - "type": "input_text", - "text": instructions, - }, - }, - } - return append([]any{message}, input...) 
-} - func normalizeCodexTools(reqBody map[string]any) bool { rawTools, ok := reqBody["tools"] if !ok || rawTools == nil { @@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool { return modified } -func normalizeOrphanedToolOutputs(input []any) []any { - functionCallIDs := map[string]bool{} - localShellCallIDs := map[string]bool{} - customToolCallIDs := map[string]bool{} - - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - continue - } - callID := getCallID(m) - if callID == "" { - continue - } - switch m["type"] { - case "function_call": - functionCallIDs[callID] = true - case "local_shell_call": - localShellCallIDs[callID] = true - case "custom_tool_call": - customToolCallIDs[callID] = true - } - } - - output := make([]any, 0, len(input)) - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - output = append(output, item) - continue - } - switch m["type"] { - case "function_call_output": - callID := getCallID(m) - if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "custom_tool_call_output": - callID := getCallID(m) - if callID == "" || !customToolCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "local_shell_call_output": - callID := getCallID(m) - if callID == "" || !localShellCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - } - output = append(output, m) - } - return output -} - -func getCallID(item map[string]any) string { - raw, ok := item["call_id"] - if !ok { - return "" - } - callID, ok := raw.(string) - if !ok { - return "" - } - callID = strings.TrimSpace(callID) - if callID == "" { - return "" - } - return callID -} - -func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any { - toolName := "tool" - if name, ok := item["name"].(string); ok && name != "" { - toolName = name - } - labelID := callID - if labelID == "" { - labelID = "unknown" - } - text := stringifyOutput(item["output"]) - if len(text) > 16000 { - text = text[:16000] + "\n...[truncated]" - } - return map[string]any{ - "type": "message", - "role": "assistant", - "content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text), - } -} - -func stringifyOutput(output any) string { - switch v := output.(type) { - case string: - return v - default: - if data, err := json.Marshal(v); err == nil { - return string(data) - } - return fmt.Sprintf("%v", v) - } -} - func codexCachePath(filename string) string { home, err := os.UserHomeDir() if err != nil { diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 8b1f214b..b3ee469a 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -12,7 +12,6 @@ import ( "io" "log" "net/http" - "os" "regexp" "sort" "strconv" @@ -513,7 +512,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool } func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -594,13 +593,53 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c 
*gin.Context, acco // Send request resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) + c.JSON(http.StatusBadGateway, gin.H{ + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } defer func() { _ = resp.Body.Close() }() // Handle error response if resp.StatusCode >= 400 { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + s.handleFailoverSideEffects(ctx, resp, account) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -724,18 +763,52 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin. 
} func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) { - body, _ := io.ReadAll(resp.Body) - logUpstreamErrorBody(account.ID, resp.StatusCode, body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "OpenAI upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } // Check custom error codes if !account.ShouldHandleErrorCode(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) c.JSON(http.StatusInternalServerError, gin.H{ "error": gin.H{ "type": "upstream_error", "message": "Upstream gateway error", }, }) - return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg) } // Handle upstream error (mark account status) @@ -743,6 +816,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht if s.rateLimitService != nil { shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } + kind := "http_error" + if shouldDisable { + kind = "failover" + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: kind, + Message: upstreamMsg, + Detail: upstreamDetail, + }) if shouldDisable { return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -781,25 +867,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) -} - -func logUpstreamErrorBody(accountID int64, statusCode int, body []byte) { - if strings.ToLower(strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY"))) != "true" { - return + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) } - - maxBytes := 2048 - if rawMax := strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY_MAX_BYTES")); rawMax != "" { - if parsed, err := strconv.Atoi(rawMax); err == nil && parsed > 0 { - maxBytes = parsed - } - } - if len(body) > maxBytes { - body = body[:maxBytes] - } - - log.Printf("Upstream error body: account=%d status=%d body=%q", accountID, statusCode, string(body)) + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // openaiStreamingResult streaming response result diff 
--git a/backend/internal/service/ops_account_availability.go b/backend/internal/service/ops_account_availability.go new file mode 100644 index 00000000..da66ec4d --- /dev/null +++ b/backend/internal/service/ops_account_availability.go @@ -0,0 +1,194 @@ +package service + +import ( + "context" + "errors" + "time" +) + +// GetAccountAvailabilityStats returns current account availability stats. +// +// Query-level filtering is intentionally limited to platform/group to match the dashboard scope. +func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) ( + map[string]*PlatformAvailability, + map[int64]*GroupAvailability, + map[int64]*AccountAvailability, + *time.Time, + error, +) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + if groupIDFilter != nil && *groupIDFilter > 0 { + filtered := make([]Account, 0, len(accounts)) + for _, acc := range accounts { + for _, grp := range acc.Groups { + if grp != nil && grp.ID == *groupIDFilter { + filtered = append(filtered, acc) + break + } + } + } + accounts = filtered + } + + now := time.Now() + collectedAt := now + + platform := make(map[string]*PlatformAvailability) + group := make(map[int64]*GroupAvailability) + account := make(map[int64]*AccountAvailability) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + isTempUnsched := false + if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) { + isTempUnsched = true + } + + isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt) + isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil) + hasError := acc.Status == StatusError + + // Normalize exclusive status flags so the UI doesn't show conflicting badges. 
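A compact restatement of the availability predicate computed in this hunk, as a pure function over a minimal snapshot struct, so the flag interactions can be checked in isolation. The struct and the literal "active" status value are stand-ins for illustration; the real code works on *Account and the StatusActive constant.

package main

import (
	"fmt"
	"time"
)

// availabilitySnapshot mirrors only the Account fields the predicate consults.
type availabilitySnapshot struct {
	Status                 string
	Schedulable            bool
	RateLimitResetAt       *time.Time
	OverloadUntil          *time.Time
	TempUnschedulableUntil *time.Time
}

func isAvailable(a availabilitySnapshot, now time.Time) bool {
	rateLimited := a.RateLimitResetAt != nil && now.Before(*a.RateLimitResetAt)
	overloaded := a.OverloadUntil != nil && now.Before(*a.OverloadUntil)
	tempUnsched := a.TempUnschedulableUntil != nil && now.Before(*a.TempUnschedulableUntil)
	return a.Status == "active" && a.Schedulable && !rateLimited && !overloaded && !tempUnsched
}

func main() {
	now := time.Now()
	reset := now.Add(5 * time.Minute)
	fmt.Println(isAvailable(availabilitySnapshot{Status: "active", Schedulable: true}, now))                          // true
	fmt.Println(isAvailable(availabilitySnapshot{Status: "active", Schedulable: true, RateLimitResetAt: &reset}, now)) // false: still rate limited
}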
+ if hasError { + isRateLimited = false + isOverloaded = false + } + + isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched + + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformAvailability{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.TotalAccounts++ + if isAvailable { + p.AvailableCount++ + } + if isRateLimited { + p.RateLimitCount++ + } + if hasError { + p.ErrorCount++ + } + } + + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupAvailability{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + g.TotalAccounts++ + if isAvailable { + g.AvailableCount++ + } + if isRateLimited { + g.RateLimitCount++ + } + if hasError { + g.ErrorCount++ + } + } + + displayGroupID := int64(0) + displayGroupName := "" + if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + item := &AccountAvailability{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + Status: acc.Status, + + IsAvailable: isAvailable, + IsRateLimited: isRateLimited, + IsOverloaded: isOverloaded, + HasError: hasError, + + ErrorMessage: acc.ErrorMessage, + } + + if isRateLimited && acc.RateLimitResetAt != nil { + item.RateLimitResetAt = acc.RateLimitResetAt + remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds()) + if remainingSec > 0 { + item.RateLimitRemainingSec = &remainingSec + } + } + if isOverloaded && acc.OverloadUntil != nil { + item.OverloadUntil = acc.OverloadUntil + remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds()) + if remainingSec > 0 { + item.OverloadRemainingSec = &remainingSec + } + } + if isTempUnsched && acc.TempUnschedulableUntil != nil { + item.TempUnschedulableUntil = acc.TempUnschedulableUntil + } + + account[acc.ID] = item + } + + return platform, group, account, &collectedAt, nil +} + +type OpsAccountAvailability struct { + Group *GroupAvailability + Accounts map[int64]*AccountAvailability + CollectedAt *time.Time +} + +func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) { + if s == nil { + return nil, errors.New("ops service is nil") + } + + if s.getAccountAvailability != nil { + return s.getAccountAvailability(ctx, platformFilter, groupIDFilter) + } + + _, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter) + if err != nil { + return nil, err + } + + var group *GroupAvailability + if groupIDFilter != nil && *groupIDFilter > 0 { + group = groupStats[*groupIDFilter] + } + + if accountStats == nil { + accountStats = map[int64]*AccountAvailability{} + } + + return &OpsAccountAvailability{ + Group: group, + Accounts: accountStats, + CollectedAt: collectedAt, + }, nil +} diff --git a/backend/internal/service/ops_advisory_lock.go b/backend/internal/service/ops_advisory_lock.go new file mode 100644 index 00000000..f7ef4cee --- /dev/null +++ b/backend/internal/service/ops_advisory_lock.go @@ -0,0 +1,46 @@ +package service + +import ( + "context" + "database/sql" + "hash/fnv" + "time" +) + +func hashAdvisoryLockID(key string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(key)) + return int64(h.Sum64()) +} + +func 
tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) { + if db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID) + _ = conn.Close() + } + return release, true +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go new file mode 100644 index 00000000..2a6afbba --- /dev/null +++ b/backend/internal/service/ops_aggregation_service.go @@ -0,0 +1,443 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAggHourlyJobName = "ops_preaggregation_hourly" + opsAggDailyJobName = "ops_preaggregation_daily" + + opsAggHourlyInterval = 10 * time.Minute + opsAggDailyInterval = 1 * time.Hour + + // Keep in sync with ops retention target (vNext default 30d). + opsAggBackfillWindow = 30 * 24 * time.Hour + + // Recompute overlap to absorb late-arriving rows near boundaries. + opsAggHourlyOverlap = 2 * time.Hour + opsAggDailyOverlap = 48 * time.Hour + + opsAggHourlyChunk = 24 * time.Hour + opsAggDailyChunk = 7 * 24 * time.Hour + + // Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets + // that may still receive late inserts. + opsAggSafeDelay = 5 * time.Minute + + opsAggMaxQueryTimeout = 3 * time.Second + opsAggHourlyTimeout = 5 * time.Minute + opsAggDailyTimeout = 2 * time.Minute + + opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader" + opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader" + + opsAggHourlyLeaderLockTTL = 15 * time.Minute + opsAggDailyLeaderLockTTL = 10 * time.Minute +) + +// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily +// for stable long-window dashboard queries. +// +// It is safe to run in multi-replica deployments when Redis is available (leader lock). 
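A usage sketch for the two advisory-lock helpers above, showing how a periodic job can guard a single-writer section. The wrapper function and the lock key are invented for the example; besides the helpers it only needs context and database/sql.

// runSingleInstanceJob is an illustrative call pattern, not part of the patch.
func runSingleInstanceJob(ctx context.Context, db *sql.DB, body func(context.Context)) {
	lockID := hashAdvisoryLockID("ops:example:job:leader") // key name is illustrative
	release, ok := tryAcquireDBAdvisoryLock(ctx, db, lockID)
	if !ok {
		return // another instance holds the lock, or the DB is unavailable: skip this run
	}
	defer release() // pg_advisory_unlock plus closing the pinned connection
	body(ctx)
}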
+type OpsAggregationService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + hourlyMu sync.Mutex + dailyMu sync.Mutex + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + return &OpsAggregationService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (s *OpsAggregationService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.hourlyLoop() + go s.dailyLoop() + }) +} + +func (s *OpsAggregationService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) +} + +func (s *OpsAggregationService) hourlyLoop() { + // First run immediately. + s.aggregateHourly() + + ticker := time.NewTicker(opsAggHourlyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateHourly() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) dailyLoop() { + // First run immediately. + s.aggregateDaily() + + ticker := time.NewTicker(opsAggDailyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateDaily() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) aggregateHourly() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.hourlyMu.Lock() + defer s.hourlyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + // Aggregate stable full hours only. + end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay)) + start := end.Add(-opsAggBackfillWindow) + + // Resume from the latest bucket with overlap. 
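The hourly pass only aggregates fully elapsed hours and re-does a small overlap near the newest bucket. A self-contained restatement of that window arithmetic with the constants inlined; the timestamps are invented for the example.

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Date(2024, 5, 1, 10, 3, 0, 0, time.UTC)
	end := now.Add(-5 * time.Minute).Truncate(time.Hour) // safe delay, then full hours only -> 09:00
	start := end.Add(-30 * 24 * time.Hour)               // default 30d backfill window

	latest := time.Date(2024, 5, 1, 7, 0, 0, 0, time.UTC) // newest already-aggregated bucket
	if candidate := latest.Add(-2 * time.Hour); candidate.After(start) {
		start = candidate // resume near the newest bucket, re-doing 2h of overlap
	}

	// The [start, end) range is then upserted in 24h chunks.
	fmt.Println(start.Format(time.RFC3339), "->", end.Format(time.RFC3339))
	// 2024-05-01T05:00:00Z -> 2024-05-01T09:00:00Z
}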
+ { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggHourlyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToHour(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) { + chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end) + if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) aggregateDaily() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.dailyMu.Lock() + defer s.dailyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + end := utcFloorToDay(time.Now().UTC()) + start := end.Add(-opsAggBackfillWindow) + + { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggDailyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToDay(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) { + chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end) + if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := 
context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool { + if s == nil { + return false + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +var opsAggReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { + if s == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + // Prefer Redis leader lock when available (multi-instance), but avoid stampeding + // the DB when Redis is flaky by falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true + } + // Redis error: fall through to DB advisory lock. 
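The release script above deletes the lock key only while it still stores this instance's ID, so a slow holder cannot delete a lock that already expired and was re-acquired elsewhere. A standalone sketch of the same acquire/release pair; client wiring, key, and TTL are illustrative, and the production helper additionally falls back to the DB advisory lock on Redis errors.

package leaderlock

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9"
)

// Delete the key only if it still stores our owner ID (compare-and-delete).
var releaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
  return redis.call("DEL", KEYS[1])
end
return 0
`)

func tryLeaderLock(ctx context.Context, rdb *redis.Client, key, owner string, ttl time.Duration) (func(), bool, error) {
	ok, err := rdb.SetNX(ctx, key, owner, ttl).Result()
	if err != nil || !ok {
		return nil, false, err
	}
	release := func() {
		ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = releaseScript.Run(ctx2, rdb, []string{key}, owner).Result()
	}
	return release, true, nil
}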
+ } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + return release, true +} + +func (s *OpsAggregationService) maybeLogSkip(prefix string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute { + return + } + s.skipLogAt = now + if prefix == "" { + prefix = "[OpsAggregation]" + } + log.Printf("%s leader lock held by another instance; skipping", prefix) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func utcFloorToDay(t time.Time) time.Time { + u := t.UTC() + y, m, d := u.Date() + return time.Date(y, m, d, 0, 0, 0, 0, time.UTC) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go new file mode 100644 index 00000000..f376c246 --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -0,0 +1,913 @@ +package service + +import ( + "context" + "fmt" + "log" + "math" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAlertEvaluatorJobName = "ops_alert_evaluator" + + opsAlertEvaluatorTimeout = 45 * time.Second + opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTL = 90 * time.Second + opsAlertEvaluatorSkipLogInterval = 1 * time.Minute +) + +var opsAlertEvaluatorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +type OpsAlertEvaluatorService struct { + opsService *OpsService + opsRepo OpsRepository + emailService *EmailService + + redisClient *redis.Client + cfg *config.Config + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + wg sync.WaitGroup + + mu sync.Mutex + ruleStates map[int64]*opsAlertRuleState + + emailLimiter *slidingWindowLimiter + + skipLogMu sync.Mutex + skipLogAt time.Time + + warnNoRedisOnce sync.Once +} + +type opsAlertRuleState struct { + LastEvaluatedAt time.Time + ConsecutiveBreaches int +} + +func NewOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + return &OpsAlertEvaluatorService{ + opsService: opsService, + opsRepo: opsRepo, + emailService: emailService, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + ruleStates: map[int64]*opsAlertRuleState{}, + emailLimiter: newSlidingWindowLimiter(0, time.Hour), + } +} + +func (s *OpsAlertEvaluatorService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.run() + }) +} + +func (s *OpsAlertEvaluatorService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) + s.wg.Wait() +} + +func (s *OpsAlertEvaluatorService) run() { + s.wg.Add(1) + defer s.wg.Done() + + // Start immediately to produce early feedback in ops dashboard. 
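newSlidingWindowLimiter is constructed by the evaluator above but not defined in this hunk. Purely as an illustration of the shape such a limiter usually has (this is a guess, not the project's implementation; in this sketch a non-positive limit means "unlimited"):

package limiter

import (
	"sync"
	"time"
)

// slidingWindowLimiter counts events inside a rolling window.
type slidingWindowLimiter struct {
	mu     sync.Mutex
	limit  int
	window time.Duration
	events []time.Time
}

func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
	return &slidingWindowLimiter{limit: limit, window: window}
}

// Allow reports whether another event fits into the window and records it.
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	cutoff := now.Add(-l.window)
	kept := l.events[:0]
	for _, t := range l.events {
		if t.After(cutoff) {
			kept = append(kept, t)
		}
	}
	l.events = kept
	if l.limit > 0 && len(l.events) >= l.limit {
		return false
	}
	l.events = append(l.events, now)
	return true
}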
+ timer := time.NewTimer(0) + defer timer.Stop() + + for { + select { + case <-timer.C: + interval := s.getInterval() + s.evaluateOnce(interval) + timer.Reset(interval) + case <-s.stopCh: + return + } + } +} + +func (s *OpsAlertEvaluatorService) getInterval() time.Duration { + // Default. + interval := 60 * time.Second + + if s == nil || s.opsService == nil { + return interval + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx) + if err != nil || cfg == nil { + return interval + } + if cfg.EvaluationIntervalSeconds <= 0 { + return interval + } + if cfg.EvaluationIntervalSeconds < 1 { + return interval + } + if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) { + return interval + } + return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second +} + +func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout) + defer cancel() + + if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + runtimeCfg := defaultOpsAlertRuntimeSettings() + if s.opsService != nil { + if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil { + runtimeCfg = loaded + } + } + + release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + rules, err := s.opsRepo.ListAlertRules(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsAlertEvaluator] list rules failed: %v", err) + return + } + + now := time.Now().UTC() + safeEnd := now.Truncate(time.Minute) + if safeEnd.IsZero() { + safeEnd = now + } + + systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1) + + // Cleanup stale state for removed rules. 
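Each ops job run reports a heartbeat row with its run time, duration, and either a success or an error timestamp. A small wrapper showing that bookkeeping in one place; the wrapper itself is invented, while OpsRepository, OpsUpsertJobHeartbeatInput, and the 2048-byte error truncation are the ones used in this patch.

// reportHeartbeat is an illustrative wrapper, not part of the patch.
func reportHeartbeat(repo OpsRepository, jobName string, runAt time.Time, dur time.Duration, runErr error) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	ms := dur.Milliseconds()
	in := &OpsUpsertJobHeartbeatInput{JobName: jobName, LastRunAt: &runAt, LastDurationMs: &ms}
	now := time.Now().UTC()
	if runErr != nil {
		msg := truncateString(runErr.Error(), 2048) // same cap the jobs above apply
		in.LastErrorAt, in.LastError = &now, &msg
	} else {
		in.LastSuccessAt = &now
	}
	_ = repo.UpsertJobHeartbeat(ctx, in)
}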
+ s.pruneRuleStates(rules) + + for _, rule := range rules { + if rule == nil || !rule.Enabled || rule.ID <= 0 { + continue + } + + scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters) + + windowMinutes := rule.WindowMinutes + if windowMinutes <= 0 { + windowMinutes = 1 + } + windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute) + windowEnd := safeEnd + + metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID) + if !ok { + s.resetRuleState(rule.ID, now) + continue + } + + breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold) + required := requiredSustainedBreaches(rule.SustainedMinutes, interval) + consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow) + + activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err) + continue + } + + if breachedNow && consecutive >= required { + if activeEvent != nil { + continue + } + + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) + continue + } + if latestEvent != nil && rule.CooldownMinutes > 0 { + cooldown := time.Duration(rule.CooldownMinutes) * time.Minute + if now.Sub(latestEvent.FiredAt) < cooldown { + continue + } + } + + firedEvent := &OpsAlertEvent{ + RuleID: rule.ID, + Severity: strings.TrimSpace(rule.Severity), + Status: OpsAlertStatusFiring, + Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)), + Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID), + MetricValue: float64Ptr(metricValue), + ThresholdValue: float64Ptr(rule.Threshold), + Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID), + FiredAt: now, + CreatedAt: now, + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent) + if err != nil { + log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err) + continue + } + + if created != nil && created.ID > 0 { + s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) + } + continue + } + + // Not breached: resolve active event if present. 
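How many consecutive breaching evaluations are needed before an event fires depends on both SustainedMinutes and the evaluation interval; the cooldown gate then applies on top, since a rule may not fire again until CooldownMinutes have passed since the last FiredAt. A worked example mirroring requiredSustainedBreaches, which is defined just below (values invented):

package main

import (
	"fmt"
	"math"
	"time"
)

// required mirrors requiredSustainedBreaches from the hunk below.
func required(sustainedMinutes int, interval time.Duration) int {
	if sustainedMinutes <= 0 {
		return 1
	}
	if interval <= 0 {
		return sustainedMinutes
	}
	n := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
	if n < 1 {
		return 1
	}
	return n
}

func main() {
	fmt.Println(required(5, 60*time.Second)) // 5  -> five consecutive breaching evaluations
	fmt.Println(required(5, 15*time.Second)) // 20 -> shorter interval, more samples needed
	fmt.Println(required(0, 60*time.Second)) // 1  -> fire on the first breach
}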
+ if activeEvent != nil { + resolvedAt := now + if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { + log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err) + } + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) { + s.mu.Lock() + defer s.mu.Unlock() + + live := map[int64]struct{}{} + for _, r := range rules { + if r != nil && r.ID > 0 { + live[r.ID] = struct{}{} + } + } + for id := range s.ruleStates { + if _, ok := live[id]; !ok { + delete(s.ruleStates, id) + } + } +} + +func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) { + if ruleID <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + state.LastEvaluatedAt = now + state.ConsecutiveBreaches = 0 +} + +func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int { + if ruleID <= 0 { + return 0 + } + s.mu.Lock() + defer s.mu.Unlock() + + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + + if !state.LastEvaluatedAt.IsZero() && interval > 0 { + if now.Sub(state.LastEvaluatedAt) > interval*2 { + state.ConsecutiveBreaches = 0 + } + } + + state.LastEvaluatedAt = now + if breached { + state.ConsecutiveBreaches++ + } else { + state.ConsecutiveBreaches = 0 + } + return state.ConsecutiveBreaches +} + +func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int { + if sustainedMinutes <= 0 { + return 1 + } + if interval <= 0 { + return sustainedMinutes + } + required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds())) + if required < 1 { + return 1 + } + return required +} + +func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) { + if filters == nil { + return "", nil + } + if v, ok := filters["platform"]; ok { + if s, ok := v.(string); ok { + platform = strings.TrimSpace(s) + } + } + if v, ok := filters["group_id"]; ok { + switch t := v.(type) { + case float64: + if t > 0 { + id := int64(t) + groupID = &id + } + case int64: + if t > 0 { + id := t + groupID = &id + } + case int: + if t > 0 { + id := int64(t) + groupID = &id + } + case string: + n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64) + if err == nil && n > 0 { + groupID = &n + } + } + } + return platform, groupID +} + +func (s *OpsAlertEvaluatorService) computeRuleMetric( + ctx context.Context, + rule *OpsAlertRule, + systemMetrics *OpsSystemMetricsSnapshot, + start time.Time, + end time.Time, + platform string, + groupID *int64, +) (float64, bool) { + if rule == nil { + return 0, false + } + switch strings.TrimSpace(rule.MetricType) { + case "cpu_usage_percent": + if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil { + return *systemMetrics.CPUUsagePercent, true + } + return 0, false + case "memory_usage_percent": + if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil { + return *systemMetrics.MemoryUsagePercent, true + } + return 0, false + case "concurrency_queue_depth": + if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil { + return float64(*systemMetrics.ConcurrencyQueueDepth), true + } + return 0, false + case "group_available_accounts": + if groupID == nil || *groupID <= 0 { + return 0, false + } + if s 
== nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + if availability.Group == nil { + return 0, true + } + return float64(availability.Group.AvailableCount), true + case "group_available_ratio": + if groupID == nil || *groupID <= 0 { + return 0, false + } + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return computeGroupAvailableRatio(availability.Group), true + case "account_rate_limited_count": + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + })), true + case "account_error_count": + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool { + return acc.HasError && acc.TempUnschedulableUntil == nil + })), true + } + + overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: platform, + GroupID: groupID, + QueryMode: OpsQueryModeRaw, + }) + if err != nil { + return 0, false + } + if overview == nil { + return 0, false + } + + switch strings.TrimSpace(rule.MetricType) { + case "success_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.SLA * 100, true + case "error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.ErrorRate * 100, true + case "upstream_error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.UpstreamErrorRate * 100, true + case "p95_latency_ms": + if overview.Duration.P95 == nil { + return 0, false + } + return float64(*overview.Duration.P95), true + case "p99_latency_ms": + if overview.Duration.P99 == nil { + return 0, false + } + return float64(*overview.Duration.P99), true + default: + return 0, false + } +} + +func compareMetric(value float64, operator string, threshold float64) bool { + switch strings.TrimSpace(operator) { + case ">": + return value > threshold + case ">=": + return value >= threshold + case "<": + return value < threshold + case "<=": + return value <= threshold + case "==": + return value == threshold + case "!=": + return value != threshold + default: + return false + } +} + +func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any { + dims := map[string]any{} + if strings.TrimSpace(platform) != "" { + dims["platform"] = strings.TrimSpace(platform) + } + if groupID != nil && *groupID > 0 { + dims["group_id"] = *groupID + } + if len(dims) == 0 { + return nil + } + return dims +} + +func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string { + if rule == nil { + return "" + } + scope := "overall" + if strings.TrimSpace(platform) != "" { + scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform)) + } + if groupID != nil && *groupID > 0 { + scope = fmt.Sprintf("%s 
group_id=%d", scope, *groupID) + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)", + strings.TrimSpace(rule.MetricType), + strings.TrimSpace(rule.Operator), + rule.Threshold, + value, + windowMinutes, + strings.TrimSpace(scope), + ) +} + +func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) { + if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil { + return + } + if event.EmailSent { + return + } + if !rule.NotifyEmail { + return + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled { + return + } + + if len(emailCfg.Alert.Recipients) == 0 { + return + } + if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) { + return + } + + if runtimeCfg != nil && runtimeCfg.Silencing.Enabled { + if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) { + return + } + } + + // Apply/update rate limiter. + s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour) + + subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)) + body := buildOpsAlertEmailBody(rule, event) + + anySent := false + for _, to := range emailCfg.Alert.Recipients { + addr := strings.TrimSpace(to) + if addr == "" { + continue + } + if !s.emailLimiter.Allow(time.Now().UTC()) { + continue + } + if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil { + // Ignore per-recipient failures; continue best-effort. + continue + } + anySent = true + } + + if anySent { + _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true) + } +} + +func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string { + if rule == nil || event == nil { + return "" + } + metric := strings.TrimSpace(rule.MetricType) + value := "-" + threshold := fmt.Sprintf("%.2f", rule.Threshold) + if event.MetricValue != nil { + value = fmt.Sprintf("%.2f", *event.MetricValue) + } + if event.ThresholdValue != nil { + threshold = fmt.Sprintf("%.2f", *event.ThresholdValue) + } + return fmt.Sprintf(` +
+Ops Alert
+Rule: %s
+Severity: %s
+Status: %s
+Metric: %s %s %s
+Fired at: %s
+Description: %s
+`, + htmlEscape(rule.Name), + htmlEscape(rule.Severity), + htmlEscape(event.Status), + htmlEscape(metric), + htmlEscape(rule.Operator), + htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)), + event.FiredAt.Format(time.RFC3339), + htmlEscape(event.Description), + ) +} + +func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool { + minSeverity = strings.ToLower(strings.TrimSpace(minSeverity)) + if minSeverity == "" { + return true + } + + eventLevel := opsEmailSeverityForOps(ruleSeverity) + minLevel := strings.ToLower(minSeverity) + + rank := func(level string) int { + switch level { + case "critical": + return 3 + case "warning": + return 2 + case "info": + return 1 + default: + return 0 + } + } + return rank(eventLevel) >= rank(minLevel) +} + +func opsEmailSeverityForOps(severity string) string { + switch strings.ToUpper(strings.TrimSpace(severity)) { + case "P0": + return "critical" + case "P1": + return "warning" + default: + return "info" + } +} + +func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool { + if !silencing.Enabled { + return false + } + if now.IsZero() { + now = time.Now().UTC() + } + if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" { + if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil { + if now.Before(t) { + return true + } + } + } + + for _, entry := range silencing.Entries { + untilRaw := strings.TrimSpace(entry.UntilRFC3339) + if untilRaw == "" { + continue + } + until, err := time.Parse(time.RFC3339, untilRaw) + if err != nil { + continue + } + if now.After(until) { + continue + } + if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID { + continue + } + if len(entry.Severities) > 0 { + match := false + for _, s := range entry.Severities { + if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) { + match = true + break + } + } + if !match { + continue + } + } + return true + } + + return false +} + +func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) { + if !lock.Enabled { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock") + }) + return nil, true + } + key := strings.TrimSpace(lock.Key) + if key == "" { + key = opsAlertEvaluatorLeaderLockKey + } + ttl := time.Duration(lock.TTLSeconds) * time.Second + if ttl <= 0 { + ttl = opsAlertEvaluatorLeaderLockTTL + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky. + // Single-node deployments can disable the distributed lock via runtime settings. 
+ s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err) + }) + return nil, false + } + if !ok { + s.maybeLogSkip(key) + return nil, false + } + return func() { + _, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval { + return + } + s.skipLogAt = now + log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key) +} + +func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAlertEvaluatorJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAlertEvaluatorJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} + +func htmlEscape(s string) string { + replacer := strings.NewReplacer( + "&", "&amp;", + "<", "&lt;", + ">", "&gt;", + `"`, "&quot;", + "'", "&#39;", + ) + return replacer.Replace(s) +} + +type slidingWindowLimiter struct { + mu sync.Mutex + limit int + window time.Duration + sent []time.Time +} + +func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter { + if window <= 0 { + window = time.Hour + } + return &slidingWindowLimiter{ + limit: limit, + window: window, + sent: []time.Time{}, + } +} + +func (l *slidingWindowLimiter) SetLimit(limit int) { + l.mu.Lock() + defer l.mu.Unlock() + l.limit = limit +} + +func (l *slidingWindowLimiter) Allow(now time.Time) bool { + l.mu.Lock() + defer l.mu.Unlock() + + if l.limit <= 0 { + return true + } + cutoff := now.Add(-l.window) + keep := l.sent[:0] + for _, t := range l.sent { + if t.After(cutoff) { + keep = append(keep, t) + } + } + l.sent = keep + if len(l.sent) >= l.limit { + return false + } + l.sent = append(l.sent, now) + return true +} + +// computeGroupAvailableRatio returns the available percentage for a group. +// Formula: (AvailableCount / TotalAccounts) * 100. +// Returns 0 when TotalAccounts is 0. +func computeGroupAvailableRatio(group *GroupAvailability) float64 { + if group == nil || group.TotalAccounts <= 0 { + return 0 + } + return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100 +} + +// countAccountsByCondition counts accounts that satisfy the given condition.
+func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 { + if len(accounts) == 0 || condition == nil { + return 0 + } + var count int64 + for _, account := range accounts { + if account != nil && condition(account) { + count++ + } + } + return count +} diff --git a/backend/internal/service/ops_alert_evaluator_service_test.go b/backend/internal/service/ops_alert_evaluator_service_test.go new file mode 100644 index 00000000..068ab6bb --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service_test.go @@ -0,0 +1,210 @@ +//go:build unit + +package service + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +type stubOpsRepo struct { + OpsRepository + overview *OpsDashboardOverview + err error +} + +func (s *stubOpsRepo) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if s.err != nil { + return nil, s.err + } + if s.overview != nil { + return s.overview, nil + } + return &OpsDashboardOverview{}, nil +} + +func TestComputeGroupAvailableRatio(t *testing.T) { + t.Parallel() + + t.Run("normal case: 10 accounts, 8 available = 80%", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 10, + AvailableCount: 8, + }) + require.InDelta(t, 80.0, got, 0.0001) + }) + + t.Run("edge case: TotalAccounts = 0 should return 0", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 0, + AvailableCount: 8, + }) + require.Equal(t, 0.0, got) + }) + + t.Run("edge case: AvailableCount = 0 should return 0%", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 10, + AvailableCount: 0, + }) + require.Equal(t, 0.0, got) + }) +} + +func TestCountAccountsByCondition(t *testing.T) { + t.Parallel() + + t.Run("rate-limited account count: acc.IsRateLimited", func(t *testing.T) { + t.Parallel() + + accounts := map[int64]*AccountAvailability{ + 1: {IsRateLimited: true}, + 2: {IsRateLimited: false}, + 3: {IsRateLimited: true}, + } + + got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + }) + require.Equal(t, int64(2), got) + }) + + t.Run("error account count (excluding temporarily unschedulable): acc.HasError && acc.TempUnschedulableUntil == nil", func(t *testing.T) { + t.Parallel() + + until := time.Now().UTC().Add(5 * time.Minute) + accounts := map[int64]*AccountAvailability{ + 1: {HasError: true}, + 2: {HasError: true, TempUnschedulableUntil: &until}, + 3: {HasError: false}, + } + + got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool { + return acc.HasError && acc.TempUnschedulableUntil == nil + }) + require.Equal(t, int64(1), got) + }) + + t.Run("edge case: empty map should return 0", func(t *testing.T) { + t.Parallel() + + got := countAccountsByCondition(map[int64]*AccountAvailability{}, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + }) + require.Equal(t, int64(0), got) + }) +} + +func TestComputeRuleMetricNewIndicators(t *testing.T) { + t.Parallel() + + groupID := int64(101) + platform := "openai" + + availability := &OpsAccountAvailability{ + Group: &GroupAvailability{ + GroupID: groupID, + TotalAccounts: 10, + AvailableCount: 8, + }, + Accounts: map[int64]*AccountAvailability{ + 1: {IsRateLimited: true}, + 2: {IsRateLimited: true}, + 3: {HasError: true}, + 4: {HasError: true, TempUnschedulableUntil: timePtr(time.Now().UTC().Add(2 * time.Minute))}, + 5: {HasError: false, IsRateLimited: false}, + }, + }
+ opsService := &OpsService{ + getAccountAvailability: func(_ context.Context, _ string, _ *int64) (*OpsAccountAvailability, error) { + return availability, nil + }, + } + + svc := &OpsAlertEvaluatorService{ + opsService: opsService, + opsRepo: &stubOpsRepo{overview: &OpsDashboardOverview{}}, + } + + start := time.Now().UTC().Add(-5 * time.Minute) + end := time.Now().UTC() + ctx := context.Background() + + tests := []struct { + name string + metricType string + groupID *int64 + wantValue float64 + wantOK bool + }{ + { + name: "group_available_accounts", + metricType: "group_available_accounts", + groupID: &groupID, + wantValue: 8, + wantOK: true, + }, + { + name: "group_available_ratio", + metricType: "group_available_ratio", + groupID: &groupID, + wantValue: 80.0, + wantOK: true, + }, + { + name: "account_rate_limited_count", + metricType: "account_rate_limited_count", + groupID: nil, + wantValue: 2, + wantOK: true, + }, + { + name: "account_error_count", + metricType: "account_error_count", + groupID: nil, + wantValue: 1, + wantOK: true, + }, + { + name: "group_available_accounts without group_id returns false", + metricType: "group_available_accounts", + groupID: nil, + wantValue: 0, + wantOK: false, + }, + { + name: "group_available_ratio without group_id returns false", + metricType: "group_available_ratio", + groupID: nil, + wantValue: 0, + wantOK: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + rule := &OpsAlertRule{ + MetricType: tt.metricType, + } + gotValue, gotOK := svc.computeRuleMetric(ctx, rule, nil, start, end, platform, tt.groupID) + require.Equal(t, tt.wantOK, gotOK) + if !tt.wantOK { + return + } + require.InDelta(t, tt.wantValue, gotValue, 0.0001) + }) + } +} diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go new file mode 100644 index 00000000..0acf13ab --- /dev/null +++ b/backend/internal/service/ops_alert_models.go @@ -0,0 +1,74 @@ +package service + +import "time" + +// Ops alert rule/event models. +// +// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned +// with the existing ops dashboard frontend (backup style). 
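+//
+// Illustrative rule payload (hypothetical values; field names follow the struct tags below):
+//   {"id":1,"name":"High error rate","enabled":true,"severity":"P1","metric_type":"error_rate",
+//    "operator":">=","threshold":5,"window_minutes":5,"sustained_minutes":3,"cooldown_minutes":30,
+//    "notify_email":true,"filters":{"platform":"openai","group_id":42}}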
+ +const ( + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" +) + +type OpsAlertRule struct { + ID int64 `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + + Enabled bool `json:"enabled"` + Severity string `json:"severity"` + + MetricType string `json:"metric_type"` + Operator string `json:"operator"` + Threshold float64 `json:"threshold"` + + WindowMinutes int `json:"window_minutes"` + SustainedMinutes int `json:"sustained_minutes"` + CooldownMinutes int `json:"cooldown_minutes"` + + NotifyEmail bool `json:"notify_email"` + + Filters map[string]any `json:"filters,omitempty"` + + LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsAlertEvent struct { + ID int64 `json:"id"` + RuleID int64 `json:"rule_id"` + Severity string `json:"severity"` + Status string `json:"status"` + + Title string `json:"title"` + Description string `json:"description"` + + MetricValue *float64 `json:"metric_value,omitempty"` + ThresholdValue *float64 `json:"threshold_value,omitempty"` + + Dimensions map[string]any `json:"dimensions,omitempty"` + + FiredAt time.Time `json:"fired_at"` + ResolvedAt *time.Time `json:"resolved_at,omitempty"` + + EmailSent bool `json:"email_sent"` + CreatedAt time.Time `json:"created_at"` +} + +type OpsAlertEventFilter struct { + Limit int + + // Optional filters. + Status string + Severity string + + StartTime *time.Time + EndTime *time.Time + + // Dimensions filters (best-effort). + Platform string + GroupID *int64 +} diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go new file mode 100644 index 00000000..b6c3d1c3 --- /dev/null +++ b/backend/internal/service/ops_alerts.go @@ -0,0 +1,162 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertRule{}, nil + } + return s.opsRepo.ListAlertRules(ctx) +} + +func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + created, err := s.opsRepo.CreateAlertRule(ctx, rule) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil || rule.ID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + updated, err := s.opsRepo.UpdateAlertRule(ctx, rule) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return nil, err + } + return updated, nil +} + +func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error { + if err := 
s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if id <= 0 { + return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return err + } + return nil +} + +func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertEvent{}, nil + } + return s.opsRepo.ListAlertEvents(ctx, filter) +} + +func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) +} + +func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetLatestAlertEvent(ctx, ruleID) +} + +func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if event == nil { + return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event") + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, event) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + if strings.TrimSpace(status) == "" { + return infraerrors.BadRequest("INVALID_STATUS", "invalid status") + } + return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) +} + +func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent) +} diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go new file mode 100644 index 00000000..afd2d22c --- /dev/null +++ 
b/backend/internal/service/ops_cleanup_service.go @@ -0,0 +1,365 @@ +package service + +import ( + "context" + "database/sql" + "fmt" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsCleanupJobName = "ops_cleanup" + + opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader" + opsCleanupLeaderLockTTLDefault = 30 * time.Minute +) + +var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsCleanupReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth. +// +// - Scheduling: 5-field cron spec (minute hour dom month dow). +// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup. +// - Safety: deletes in batches to avoid long transactions. +type OpsCleanupService struct { + opsRepo OpsRepository + db *sql.DB + redisClient *redis.Client + cfg *config.Config + + instanceID string + + cron *cron.Cron + + startOnce sync.Once + stopOnce sync.Once + + warnNoRedisOnce sync.Once +} + +func NewOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + return &OpsCleanupService{ + opsRepo: opsRepo, + db: db, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + } +} + +func (s *OpsCleanupService) Start() { + if s == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled { + log.Printf("[OpsCleanup] not started (disabled)") + return + } + if s.opsRepo == nil || s.db == nil { + log.Printf("[OpsCleanup] not started (missing deps)") + return + } + + s.startOnce.Do(func() { + schedule := "0 2 * * *" + if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" { + schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) + } + + loc := time.Local + if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil { + loc = parsed + } + } + + c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc)) + _, err := c.AddFunc(schedule, func() { s.runScheduled() }) + if err != nil { + log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err) + return + } + s.cron = c + s.cron.Start() + log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String()) + }) +} + +func (s *OpsCleanupService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.cron != nil { + ctx := s.cron.Stop() + select { + case <-ctx.Done(): + case <-time.After(3 * time.Second): + log.Printf("[OpsCleanup] cron stop timed out") + } + } + }) +} + +func (s *OpsCleanupService) runScheduled() { + if s == nil || s.db == nil || s.opsRepo == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + counts, err := s.runCleanupOnce(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsCleanup] cleanup failed: %v", err) + return + } + s.recordHeartbeatSuccess(runAt, 
time.Since(startedAt)) + log.Printf("[OpsCleanup] cleanup complete: %s", counts) +} + +type opsCleanupDeletedCounts struct { + errorLogs int64 + retryAttempts int64 + alertEvents int64 + systemMetrics int64 + hourlyPreagg int64 + dailyPreagg int64 +} + +func (c opsCleanupDeletedCounts) String() string { + return fmt.Sprintf( + "error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d", + c.errorLogs, + c.retryAttempts, + c.alertEvents, + c.systemMetrics, + c.hourlyPreagg, + c.dailyPreagg, + ) +} + +func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) { + out := opsCleanupDeletedCounts{} + if s == nil || s.db == nil || s.cfg == nil { + return out, nil + } + + batchSize := 5000 + + now := time.Now().UTC() + + // Error-like tables: error logs / retry attempts / alert events. + if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.errorLogs = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.retryAttempts = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.alertEvents = n + } + + // Minute-level metrics snapshots. + if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.systemMetrics = n + } + + // Pre-aggregation tables (hourly/daily). + if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.hourlyPreagg = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true) + if err != nil { + return out, err + } + out.dailyPreagg = n + } + + return out, nil +} + +func deleteOldRowsByID( + ctx context.Context, + db *sql.DB, + table string, + timeColumn string, + cutoff time.Time, + batchSize int, + castCutoffToDate bool, +) (int64, error) { + if db == nil { + return 0, nil + } + if batchSize <= 0 { + batchSize = 5000 + } + + where := fmt.Sprintf("%s < $1", timeColumn) + if castCutoffToDate { + where = fmt.Sprintf("%s < $1::date", timeColumn) + } + + q := fmt.Sprintf(` +WITH batch AS ( + SELECT id FROM %s + WHERE %s + ORDER BY id + LIMIT $2 +) +DELETE FROM %s +WHERE id IN (SELECT id FROM batch) +`, table, where, table) + + var total int64 + for { + res, err := db.ExecContext(ctx, q, cutoff, batchSize) + if err != nil { + // If ops tables aren't present yet (partial deployments), treat as no-op. 
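+ // e.g. PostgreSQL reports `ERROR: relation "ops_error_logs" does not exist` (SQLSTATE 42P01);
+ // the substring check below treats that as "nothing to clean up" rather than a hard failure.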
+ if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") { + return total, nil + } + return total, err + } + affected, err := res.RowsAffected() + if err != nil { + return total, err + } + total += affected + if affected == 0 { + break + } + } + return total, nil +} + +func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil { + return nil, false + } + // In simple run mode, assume single instance. + if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple { + return nil, true + } + + key := opsCleanupLeaderLockKeyDefault + ttl := opsCleanupLeaderLockTTLDefault + + // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by + // falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + return nil, false + } + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true + } + // Redis error: fall back to DB advisory lock. + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err) + }) + } else { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; using DB advisory lock") + }) + } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) + if !ok { + return nil, false + } + return release, true +} + +func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} diff --git a/backend/internal/service/ops_concurrency.go b/backend/internal/service/ops_concurrency.go new file mode 100644 index 00000000..c3b7b853 --- /dev/null +++ b/backend/internal/service/ops_concurrency.go @@ -0,0 +1,257 @@ +package service + +import ( + "context" + "log" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/pagination" +) + +const ( + opsAccountsPageSize = 100 + opsConcurrencyBatchChunkSize = 200 +) + +func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) { + if s == nil || s.accountRepo == nil { + return []Account{}, nil + } + + out := make([]Account, 0, 128) + page := 1 + for { + accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{ + Page: page, + PageSize: opsAccountsPageSize, + }, platformFilter, "", "", "") + if err != nil { + return nil, err + } + if len(accounts) == 0 { + break + } + + out = append(out, accounts...) 
+ if pageInfo != nil && int64(len(out)) >= pageInfo.Total { + break + } + if len(accounts) < opsAccountsPageSize { + break + } + + page++ + if page > 10_000 { + log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter) + break + } + } + + return out, nil +} + +func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo { + if s == nil || s.concurrencyService == nil { + return map[int64]*AccountLoadInfo{} + } + if len(accounts) == 0 { + return map[int64]*AccountLoadInfo{} + } + + // De-duplicate IDs (and keep the max concurrency to avoid under-reporting). + unique := make(map[int64]int, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev { + unique[acc.ID] = acc.Concurrency + } + } + + batch := make([]AccountWithConcurrency, 0, len(unique)) + for id, maxConc := range unique { + batch = append(batch, AccountWithConcurrency{ + ID: id, + MaxConcurrency: maxConc, + }) + } + + out := make(map[int64]*AccountLoadInfo, len(batch)) + for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize { + end := i + opsConcurrencyBatchChunkSize + if end > len(batch) { + end = len(batch) + } + part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end]) + if err != nil { + // Best-effort: return zeros rather than failing the ops UI. + log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err) + continue + } + for k, v := range part { + out[k] = v + } + } + + return out +} + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// +// Optional filters: +// - platformFilter: only include accounts in that platform (best-effort reduces DB load) +// - groupIDFilter: only include accounts that belong to that group +func (s *OpsService) GetConcurrencyStats( + ctx context.Context, + platformFilter string, + groupIDFilter *int64, +) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + collectedAt := time.Now() + loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts) + + platform := make(map[string]*PlatformConcurrencyInfo) + group := make(map[int64]*GroupConcurrencyInfo) + account := make(map[int64]*AccountConcurrencyInfo) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + var matchedGroup *Group + if groupIDFilter != nil && *groupIDFilter > 0 { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if grp.ID == *groupIDFilter { + matchedGroup = grp + break + } + } + // Group filter provided: skip accounts not in that group. + if matchedGroup == nil { + continue + } + } + + load := loadMap[acc.ID] + currentInUse := int64(0) + waiting := int64(0) + if load != nil { + currentInUse = int64(load.CurrentConcurrency) + waiting = int64(load.WaitingCount) + } + + // Account-level view picks one display group (the first group). 
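+ // e.g. an account attached to groups [A, B] is listed once under A in the per-account view,
+ // while the group aggregation further below still counts its capacity toward both A and B.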
+ displayGroupID := int64(0) + displayGroupName := "" + if matchedGroup != nil { + displayGroupID = matchedGroup.ID + displayGroupName = matchedGroup.Name + } else if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + if _, ok := account[acc.ID]; !ok { + info := &AccountConcurrencyInfo{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + CurrentInUse: currentInUse, + MaxCapacity: int64(acc.Concurrency), + WaitingInQueue: waiting, + } + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + account[acc.ID] = info + } + + // Platform aggregation. + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformConcurrencyInfo{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.MaxCapacity += int64(acc.Concurrency) + p.CurrentInUse += currentInUse + p.WaitingInQueue += waiting + } + + // Group aggregation (one account may contribute to multiple groups). + if matchedGroup != nil { + grp := matchedGroup + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } else { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. 
+ g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } + } + } + + for _, info := range platform { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + for _, info := range group { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go new file mode 100644 index 00000000..31822ba8 --- /dev/null +++ b/backend/internal/service/ops_dashboard.go @@ -0,0 +1,90 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + + // Resolve query mode (requested via query param, or DB default). + filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode) + + overview, err := s.opsRepo.GetDashboardOverview(ctx, filter) + if err != nil { + if errors.Is(err, ErrOpsPreaggregatedNotPopulated) { + return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet") + } + return nil, err + } + + // Best-effort system health + jobs; dashboard metrics should still render if these are missing. + if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + // Attach config-derived limits so the UI can show "current / max" for connection pools. + // These are best-effort and should never block the dashboard rendering. + if s != nil && s.cfg != nil { + if s.cfg.Database.MaxOpenConns > 0 { + metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns) + } + if s.cfg.Redis.PoolSize > 0 { + metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize) + } + } + overview.SystemMetrics = metrics + } else if err != nil && !errors.Is(err, sql.ErrNoRows) { + log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) + } + + if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil { + overview.JobHeartbeats = heartbeats + } else { + log.Printf("[Ops] ListJobHeartbeats failed: %v", err) + } + + overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview) + + return overview, nil +} + +func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode { + if requested.IsValid() { + // Allow "auto" to be disabled via config until preagg is proven stable in production. + // Forced `preagg` via query param still works. 
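+ // Resolution summary (illustrative):
+ //   requested=raw                                  -> raw
+ //   requested=preagg                               -> preagg (explicit override always wins)
+ //   requested=auto + UsePreaggregatedTables=true   -> auto
+ //   requested=auto + UsePreaggregatedTables=false  -> raw
+ //   requested empty/invalid                        -> DB default (SettingKeyOpsQueryModeDefault), then the same auto check applies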
+ if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return requested + } + + mode := OpsQueryModeAuto + if s != nil && s.settingRepo != nil { + if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil { + mode = ParseOpsQueryMode(raw) + } + } + + if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return mode +} diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go new file mode 100644 index 00000000..f189031b --- /dev/null +++ b/backend/internal/service/ops_dashboard_models.go @@ -0,0 +1,87 @@ +package service + +import "time" + +type OpsDashboardFilter struct { + StartTime time.Time + EndTime time.Time + + Platform string + GroupID *int64 + + // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables. + // Expected values: auto/raw/preagg (see OpsQueryMode). + QueryMode OpsQueryMode +} + +type OpsRateSummary struct { + Current float64 `json:"current"` + Peak float64 `json:"peak"` + Avg float64 `json:"avg"` +} + +type OpsPercentiles struct { + P50 *int `json:"p50_ms"` + P90 *int `json:"p90_ms"` + P95 *int `json:"p95_ms"` + P99 *int `json:"p99_ms"` + Avg *int `json:"avg_ms"` + Max *int `json:"max_ms"` +} + +type OpsDashboardOverview struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + // HealthScore is a backend-computed overall health score (0-100). + // It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats. + HealthScore int `json:"health_score"` + + // Latest system-level snapshot (window=1m, global). + SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` + + // Background jobs health (heartbeats). + JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + + ErrorCountSLA int64 `json:"error_count_sla"` + RequestCountTotal int64 `json:"request_count_total"` + RequestCountSLA int64 `json:"request_count_sla"` + + TokenConsumed int64 `json:"token_consumed"` + + SLA float64 `json:"sla"` + ErrorRate float64 `json:"error_rate"` + UpstreamErrorRate float64 `json:"upstream_error_rate"` + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` + + QPS OpsRateSummary `json:"qps"` + TPS OpsRateSummary `json:"tps"` + + Duration OpsPercentiles `json:"duration"` + TTFT OpsPercentiles `json:"ttft"` +} + +type OpsLatencyHistogramBucket struct { + Range string `json:"range"` + Count int64 `json:"count"` +} + +// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only). +// It is used by the Ops dashboard to quickly identify tail latency regressions. 
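+//
+// Abridged example payload (bucket labels and counts are illustrative):
+//   {"total_requests": 1240, "buckets": [{"range": "<500ms", "count": 980}, {"range": "500ms-1s", "count": 200}, {"range": ">1s", "count": 60}]}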
+type OpsLatencyHistogramResponse struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + TotalRequests int64 `json:"total_requests"` + Buckets []*OpsLatencyHistogramBucket `json:"buckets"` +} diff --git a/backend/internal/service/ops_errors.go b/backend/internal/service/ops_errors.go new file mode 100644 index 00000000..76b5ce8b --- /dev/null +++ b/backend/internal/service/ops_errors.go @@ -0,0 +1,45 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds) +} + +func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorDistribution(ctx, filter) +} diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go new file mode 100644 index 00000000..feb0d843 --- /dev/null +++ b/backend/internal/service/ops_health_score.go @@ -0,0 +1,154 @@ +package service + +import ( + "math" + "time" +) + +// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview. +// +// Design goals: +// - Backend-owned scoring (UI only displays). +// - Layered scoring: Business Health (70%) + Infrastructure Health (30%) +// - Avoids double-counting (e.g., DB failure affects both infra and business metrics) +// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. +func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { + if overview == nil { + return 0 + } + + // Idle/no-data: avoid showing a "bad" score when there is no traffic. + // UI can still render a gray/idle state based on QPS + error rate. 
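+ // Worked example for a window with traffic (illustrative numbers): SLA 96% -> (96-95)/4.5*100 ~ 22.2,
+ // worst-case error rate 2% -> (5-2)/4.5*100 ~ 66.7, P99 3000ms -> (10000-3000)/9000*100 ~ 77.8,
+ // so business = 22.2*0.5 + 66.7*0.3 + 77.8*0.2 ~ 46.7; with a fully healthy infra score of 100
+ // the overall score is round(46.7*0.7 + 100*0.3) = 63.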
+ if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 { + return 100 + } + + businessHealth := computeBusinessHealth(overview) + infraHealth := computeInfraHealth(now, overview) + + // Weighted combination: 70% business + 30% infrastructure + score := businessHealth*0.7 + infraHealth*0.3 + return int(math.Round(clampFloat64(score, 0, 100))) +} + +// computeBusinessHealth calculates business health score (0-100) +// Components: SLA (50%) + Error Rate (30%) + Latency (20%) +func computeBusinessHealth(overview *OpsDashboardOverview) float64 { + // SLA score: 99.5% → 100, 95% → 0 (linear) + slaScore := 100.0 + slaPct := clampFloat64(overview.SLA*100, 0, 100) + if slaPct < 99.5 { + if slaPct >= 95 { + slaScore = (slaPct - 95) / 4.5 * 100 + } else { + slaScore = 0 + } + } + + // Error rate score: 0.5% → 100, 5% → 0 (linear) + // Combines request errors and upstream errors + errorScore := 100.0 + errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) + upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) + combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case + if combinedErrorPct > 0.5 { + if combinedErrorPct <= 5 { + errorScore = (5 - combinedErrorPct) / 4.5 * 100 + } else { + errorScore = 0 + } + } + + // Latency score: 1s → 100, 10s → 0 (linear) + // Uses P99 of duration (TTFT is less critical for overall health) + latencyScore := 100.0 + if overview.Duration.P99 != nil { + p99 := float64(*overview.Duration.P99) + if p99 > 1000 { + if p99 <= 10000 { + latencyScore = (10000 - p99) / 9000 * 100 + } else { + latencyScore = 0 + } + } + } + + // Weighted combination + return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2 +} + +// computeInfraHealth calculates infrastructure health score (0-100) +// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%) +func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 { + // Storage score: DB critical, Redis less critical + storageScore := 100.0 + if overview.SystemMetrics != nil { + if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK { + storageScore = 0 // DB failure is critical + } else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { + storageScore = 50 // Redis failure is degraded but not critical + } + } + + // Compute resources score: CPU + Memory + computeScore := 100.0 + if overview.SystemMetrics != nil { + cpuScore := 100.0 + if overview.SystemMetrics.CPUUsagePercent != nil { + cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) + if cpuPct > 80 { + if cpuPct <= 100 { + cpuScore = (100 - cpuPct) / 20 * 100 + } else { + cpuScore = 0 + } + } + } + + memScore := 100.0 + if overview.SystemMetrics.MemoryUsagePercent != nil { + memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) + if memPct > 85 { + if memPct <= 100 { + memScore = (100 - memPct) / 15 * 100 + } else { + memScore = 0 + } + } + } + + computeScore = (cpuScore + memScore) / 2 + } + + // Background jobs score + jobScore := 100.0 + failedJobs := 0 + totalJobs := 0 + for _, hb := range overview.JobHeartbeats { + if hb == nil { + continue + } + totalJobs++ + if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) { + failedJobs++ + } else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { + failedJobs++ + } + } + if totalJobs > 0 && failedJobs > 0 { + jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100 + 
} + + // Weighted combination + return storageScore*0.4 + computeScore*0.3 + jobScore*0.3 +} + +func clampFloat64(v float64, min float64, max float64) float64 { + if v < min { + return min + } + if v > max { + return max + } + return v +} diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go new file mode 100644 index 00000000..849ba146 --- /dev/null +++ b/backend/internal/service/ops_health_score_test.go @@ -0,0 +1,431 @@ +//go:build unit + +package service + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) { + t.Parallel() + + score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}) + require.Equal(t, 100, score) +} + +func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { + t.Parallel() + + ov := &OpsDashboardOverview{ + RequestCountTotal: 100, + RequestCountSLA: 100, + SuccessCount: 90, + ErrorCountTotal: 10, + ErrorCountSLA: 10, + + SLA: 0.90, + ErrorRate: 0.10, + UpstreamErrorRate: 0.08, + + Duration: OpsPercentiles{P99: intPtr(20_000)}, + TTFT: OpsPercentiles{P99: intPtr(2_000)}, + + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(98.0), + MemoryUsagePercent: float64Ptr(97.0), + DBConnWaiting: intPtr(3), + ConcurrencyQueueDepth: intPtr(10), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "job-a", + LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)), + LastError: stringPtr("boom"), + }, + }, + } + + score := computeDashboardHealthScore(time.Now().UTC(), ov) + require.Less(t, score, 80) + require.GreaterOrEqual(t, score, 0) +} + +func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin int + wantMax int + }{ + { + name: "nil overview returns 0", + overview: nil, + wantMin: 0, + wantMax: 0, + }, + { + name: "perfect health", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + TTFT: OpsPercentiles{P99: intPtr(100)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "good health - SLA 99.8%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.003, + UpstreamErrorRate: 0.001, + Duration: OpsPercentiles{P99: intPtr(800)}, + TTFT: OpsPercentiles{P99: intPtr(200)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(50), + MemoryUsagePercent: float64Ptr(60), + }, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "medium health - SLA 96%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.96, + ErrorRate: 0.02, + UpstreamErrorRate: 0.01, + Duration: OpsPercentiles{P99: intPtr(3000)}, + TTFT: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(70), + MemoryUsagePercent: float64Ptr(75), + }, + }, + wantMin: 60, + wantMax: 85, + }, + { + name: "DB failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 
1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 70, + wantMax: 90, + }, + { + name: "Redis failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "high CPU usage", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 100, + }, + { + name: "combined failures - business degraded + infra healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.90, + ErrorRate: 0.05, + UpstreamErrorRate: 0.02, + Duration: OpsPercentiles{P99: intPtr(10000)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(20), + MemoryUsagePercent: float64Ptr(30), + }, + }, + wantMin: 25, + wantMax: 50, + }, + { + name: "combined failures - business healthy + infra degraded", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.001, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(95), + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeDashboardHealthScore(time.Now().UTC(), tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax) + require.GreaterOrEqual(t, score, 0, "score must be >= 0") + require.LessOrEqual(t, score, 100, "score must be <= 100") + }) + } +} + +func TestComputeBusinessHealth(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "perfect metrics", + overview: &OpsDashboardOverview{ + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 99.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 95%", + overview: &OpsDashboardOverview{ + SLA: 0.95, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 50, + wantMax: 60, + }, + { + name: "error rate boundary 0.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.005, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: 
intPtr(500)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "latency boundary 1000ms", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(1000)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "upstream error dominates", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.001, + UpstreamErrorRate: 0.03, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 75, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeBusinessHealth(tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + +func TestComputeInfraHealth(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "all infrastructure healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "DB down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 50, + wantMax: 70, + }, + { + name: "Redis down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 80, + wantMax: 95, + }, + { + name: "CPU at 90%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(90), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "failed background job", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "test-job", + LastErrorAt: &now, + }, + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeInfraHealth(now, tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + +func timePtr(v time.Time) *time.Time { return &v } + +func stringPtr(v string) *string { return &v } diff --git a/backend/internal/service/ops_histograms.go b/backend/internal/service/ops_histograms.go new file mode 100644 index 00000000..9f5b514f --- /dev/null +++ b/backend/internal/service/ops_histograms.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors 
"github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetLatencyHistogram(ctx, filter) +} diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go new file mode 100644 index 00000000..edf32cf2 --- /dev/null +++ b/backend/internal/service/ops_metrics_collector.go @@ -0,0 +1,920 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "fmt" + "log" + "math" + "os" + "runtime" + "strconv" + "strings" + "sync" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" +) + +const ( + opsMetricsCollectorJobName = "ops_metrics_collector" + opsMetricsCollectorMinInterval = 60 * time.Second + opsMetricsCollectorMaxInterval = 1 * time.Hour + + opsMetricsCollectorTimeout = 10 * time.Second + + opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader" + opsMetricsCollectorLeaderLockTTL = 90 * time.Second + + opsMetricsCollectorHeartbeatTimeout = 2 * time.Second + + bytesPerMB = 1024 * 1024 +) + +var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey) + +type OpsMetricsCollector struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + accountRepo AccountRepository + concurrencyService *ConcurrencyService + + db *sql.DB + redisClient *redis.Client + instanceID string + + lastCgroupCPUUsageNanos uint64 + lastCgroupCPUSampleAt time.Time + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + return &OpsMetricsCollector{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + accountRepo: accountRepo, + concurrencyService: concurrencyService, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (c *OpsMetricsCollector) Start() { + if c == nil { + return + } + c.startOnce.Do(func() { + if c.stopCh == nil { + c.stopCh = make(chan struct{}) + } + go c.run() + }) +} + +func (c *OpsMetricsCollector) Stop() { + if c == nil { + return + } + c.stopOnce.Do(func() { + if c.stopCh != nil { + close(c.stopCh) + } + }) +} + +func (c *OpsMetricsCollector) run() { + // First run immediately so the dashboard has data soon after startup. 
+ c.collectOnce() + + for { + interval := c.getInterval() + timer := time.NewTimer(interval) + select { + case <-timer.C: + c.collectOnce() + case <-c.stopCh: + timer.Stop() + return + } + } +} + +func (c *OpsMetricsCollector) getInterval() time.Duration { + interval := opsMetricsCollectorMinInterval + + if c.settingRepo == nil { + return interval + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds) + if err != nil { + return interval + } + raw = strings.TrimSpace(raw) + if raw == "" { + return interval + } + + seconds, err := strconv.Atoi(raw) + if err != nil { + return interval + } + if seconds < int(opsMetricsCollectorMinInterval.Seconds()) { + seconds = int(opsMetricsCollectorMinInterval.Seconds()) + } + if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) { + seconds = int(opsMetricsCollectorMaxInterval.Seconds()) + } + return time.Duration(seconds) * time.Second +} + +func (c *OpsMetricsCollector) collectOnce() { + if c == nil { + return + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return + } + if c.opsRepo == nil { + return + } + if c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout) + defer cancel() + + if !c.isMonitoringEnabled(ctx) { + return + } + + release, ok := c.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + err := c.collectAndPersist(ctx) + finishedAt := time.Now().UTC() + + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + runAt := startedAt + + if err != nil { + msg := truncateString(err.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + log.Printf("[OpsMetricsCollector] collect failed: %v", err) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool { + if c == nil { + return false + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return false + } + if c.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + // Fail-open: collector should not become a hard dependency. + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { + if ctx == nil { + ctx = context.Background() + } + + // Align to stable minute boundaries to avoid partial buckets and to maximize cache hits. 
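+	// For example, a tick at 12:05:37 UTC yields windowEnd = 12:05:00 and
+	// windowStart = 12:04:00, i.e. the last fully completed minute [12:04, 12:05).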
+ now := time.Now().UTC() + windowEnd := now.Truncate(time.Minute) + windowStart := windowEnd.Add(-1 * time.Minute) + + sys, err := c.collectSystemStats(ctx) + if err != nil { + // Continue; system stats are best-effort. + log.Printf("[OpsMetricsCollector] system stats error: %v", err) + } + + dbOK := c.checkDB(ctx) + redisOK := c.checkRedis(ctx) + active, idle := c.dbPoolStats() + redisTotal, redisIdle, redisStatsOK := c.redisPoolStats() + + successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage counts: %w", err) + } + + duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage latency: %w", err) + } + + errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query error counts: %w", err) + } + + windowSeconds := windowEnd.Sub(windowStart).Seconds() + if windowSeconds <= 0 { + windowSeconds = 60 + } + requestTotal := successCount + errorTotal + qps := float64(requestTotal) / windowSeconds + tps := float64(tokenConsumed) / windowSeconds + + goroutines := runtime.NumGoroutine() + concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx) + + input := &OpsInsertSystemMetricsInput{ + CreatedAt: windowEnd, + WindowMinutes: 1, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorSLA, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + TokenConsumed: tokenConsumed, + QPS: float64Ptr(roundTo1DP(qps)), + TPS: float64Ptr(roundTo1DP(tps)), + + DurationP50Ms: duration.p50, + DurationP90Ms: duration.p90, + DurationP95Ms: duration.p95, + DurationP99Ms: duration.p99, + DurationAvgMs: duration.avg, + DurationMaxMs: duration.max, + + TTFTP50Ms: ttft.p50, + TTFTP90Ms: ttft.p90, + TTFTP95Ms: ttft.p95, + TTFTP99Ms: ttft.p99, + TTFTAvgMs: ttft.avg, + TTFTMaxMs: ttft.max, + + CPUUsagePercent: sys.cpuUsagePercent, + MemoryUsedMB: sys.memoryUsedMB, + MemoryTotalMB: sys.memoryTotalMB, + MemoryUsagePercent: sys.memoryUsagePercent, + + DBOK: boolPtr(dbOK), + RedisOK: boolPtr(redisOK), + + RedisConnTotal: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisTotal) + }(), + RedisConnIdle: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisIdle) + }(), + + DBConnActive: intPtr(active), + DBConnIdle: intPtr(idle), + GoroutineCount: intPtr(goroutines), + ConcurrencyQueueDepth: concurrencyQueueDepth, + } + + return c.opsRepo.InsertSystemMetrics(ctx, input) +} + +func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int { + if c == nil || c.accountRepo == nil || c.concurrencyService == nil { + return nil + } + if parentCtx == nil { + parentCtx = context.Background() + } + + // Best-effort: never let concurrency sampling break the metrics collector. 
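+	// Any error below returns nil, leaving the queue-depth sample unset for this
+	// window rather than recording a misleading zero.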
+ ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second) + defer cancel() + + accounts, err := c.accountRepo.ListSchedulable(ctx) + if err != nil { + return nil + } + if len(accounts) == 0 { + zero := 0 + return &zero + } + + batch := make([]AccountWithConcurrency, 0, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + maxConc := acc.Concurrency + if maxConc < 0 { + maxConc = 0 + } + batch = append(batch, AccountWithConcurrency{ + ID: acc.ID, + MaxConcurrency: maxConc, + }) + } + if len(batch) == 0 { + zero := 0 + return &zero + } + + loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch) + if err != nil { + return nil + } + + var total int64 + for _, info := range loadMap { + if info == nil || info.WaitingCount <= 0 { + continue + } + total += int64(info.WaitingCount) + } + if total < 0 { + total = 0 + } + + maxInt := int64(^uint(0) >> 1) + if total > maxInt { + total = maxInt + } + v := int(total) + return &v +} + +type opsCollectedPercentiles struct { + p50 *int + p90 *int + p95 *int + p99 *int + avg *float64 + max *int +} + +func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2` + + var tokens sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) { + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + duration.p50 = floatToIntPtr(p50) + duration.p90 = floatToIntPtr(p90) + duration.p95 = floatToIntPtr(p95) + duration.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + duration.avg = &v + } + if max.Valid { + v := int(max.Int64) + duration.max = &v + } + } + + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err 
!= nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + ttft.p50 = floatToIntPtr(p50) + ttft.p90 = floatToIntPtr(p90) + ttft.p95 = floatToIntPtr(p95) + ttft.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + ttft.avg = &v + } + if max.Valid { + v := int(max.Int64) + ttft.max = &v + } + } + + return duration, ttft, nil +} + +func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) ( + errorTotal int64, + businessLimited int64, + errorSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + q := ` +SELECT + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +WHERE created_at >= $1 AND created_at < $2` + + if err := c.db.QueryRowContext(ctx, q, start, end).Scan( + &errorTotal, + &businessLimited, + &errorSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +type opsCollectedSystemStats struct { + cpuUsagePercent *float64 + memoryUsedMB *int64 + memoryTotalMB *int64 + memoryUsagePercent *float64 +} + +func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) { + out := &opsCollectedSystemStats{} + if ctx == nil { + ctx = context.Background() + } + + sampleAt := time.Now().UTC() + + // Prefer cgroup (container) metrics when available. + if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil { + out.cpuUsagePercent = cpuPct + } + + cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes() + if cgroupOK { + usedMB := int64(cgroupUsed / bytesPerMB) + out.memoryUsedMB = &usedMB + if cgroupTotal > 0 { + totalMB := int64(cgroupTotal / bytesPerMB) + out.memoryTotalMB = &totalMB + pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100) + out.memoryUsagePercent = &pct + } + } + + // Fallback to host metrics if cgroup metrics are unavailable (or incomplete). + if out.cpuUsagePercent == nil { + if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 { + v := roundTo1DP(cpuPercents[0]) + out.cpuUsagePercent = &v + } + } + + // If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host. 
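+	// In the mixed case the percentage uses cgroup "used" against the host total,
+	// e.g. memory.current = 512 MiB on a 16 GiB host reports roughly 3.1%.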
+ if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil { + if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil { + if out.memoryUsedMB == nil { + usedMB := int64(vm.Used / bytesPerMB) + out.memoryUsedMB = &usedMB + } + if out.memoryTotalMB == nil { + totalMB := int64(vm.Total / bytesPerMB) + out.memoryTotalMB = &totalMB + } + if out.memoryUsagePercent == nil { + if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 { + pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100) + out.memoryUsagePercent = &pct + } else { + pct := roundTo1DP(vm.UsedPercent) + out.memoryUsagePercent = &pct + } + } + } + } + + return out, nil +} + +func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 { + usageNanos, ok := readCgroupCPUUsageNanos() + if !ok { + return nil + } + + // Initialize baseline sample. + if c.lastCgroupCPUSampleAt.IsZero() { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + elapsed := now.Sub(c.lastCgroupCPUSampleAt) + if elapsed <= 0 { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + prev := c.lastCgroupCPUUsageNanos + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + + if usageNanos < prev { + // Counter reset (container restarted). + return nil + } + + deltaUsageSec := float64(usageNanos-prev) / 1e9 + elapsedSec := elapsed.Seconds() + if elapsedSec <= 0 { + return nil + } + + cores := readCgroupCPULimitCores() + if cores <= 0 { + // Can't reliably normalize; skip and fall back to gopsutil. + return nil + } + + pct := (deltaUsageSec / (elapsedSec * cores)) * 100 + if pct < 0 { + pct = 0 + } + // Clamp to avoid noise/jitter showing impossible values. + if pct > 100 { + pct = 100 + } + v := roundTo1DP(pct) + return &v +} + +func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) { + // cgroup v2 (most common in modern containers) + if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 { + usedBytes = used + rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max") + if err == nil { + s := strings.TrimSpace(string(rawMax)) + if s != "" && s != "max" { + if v, err := strconv.ParseUint(s, 10, 64); err == nil { + totalBytes = v + } + } + } + return usedBytes, totalBytes, true + } + + // cgroup v1 fallback + if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 { + usedBytes = used + if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 { + // Some environments report a very large number when unlimited. 
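+			// e.g. an unlimited cgroup v1 limit commonly reads as 9223372036854771712
+			// (max int64 rounded down to a page boundary); the 1<<60 guard treats such
+			// values as "no limit" and leaves totalBytes at zero.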
+ if limit > 0 && limit < (1<<60) { + totalBytes = limit + } + } + return usedBytes, totalBytes, true + } + + return 0, 0, false +} + +func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) { + // cgroup v2: cpu.stat has usage_usec + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil { + lines := strings.Split(string(raw), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) != 2 { + continue + } + if fields[0] != "usage_usec" { + continue + } + v, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + continue + } + return v * 1000, true + } + } + + // cgroup v1: cpuacct.usage is in nanoseconds + if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok { + return v, true + } + + return 0, false +} + +func readCgroupCPULimitCores() float64 { + // cgroup v2: cpu.max => " " or "max " + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil { + fields := strings.Fields(string(raw)) + if len(fields) >= 2 && fields[0] != "max" { + quota, err1 := strconv.ParseFloat(fields[0], 64) + period, err2 := strconv.ParseFloat(fields[1], 64) + if err1 == nil && err2 == nil && quota > 0 && period > 0 { + return quota / period + } + } + } + + // cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us + quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + if okQuota && okPeriod && quota > 0 && period > 0 { + return float64(quota) / float64(period) + } + + return 0 +} + +func readUintFile(path string) (uint64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func readIntFile(path string) (int64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool { + if c == nil || c.db == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + var one int + if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil { + return false + } + return one == 1 +} + +func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { + if c == nil || c.redisClient == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + return c.redisClient.Ping(ctx).Err() == nil +} + +func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) { + if c == nil || c.redisClient == nil { + return 0, 0, false + } + stats := c.redisClient.PoolStats() + if stats == nil { + return 0, 0, false + } + return int(stats.TotalConns), int(stats.IdleConns), true +} + +func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { + if c == nil || c.db == nil { + return 0, 0 + } + stats := c.db.Stats() + return stats.InUse, stats.Idle +} + +var opsMetricsCollectorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if c == nil || c.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := c.redisClient.SetNX(ctx, 
opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result() + if err != nil { + // Prefer fail-closed to avoid stampeding the database when Redis is flaky. + // Fallback to a DB advisory lock when Redis is present but unavailable. + release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID) + if !ok { + c.maybeLogSkip() + return nil, false + } + return release, true + } + if !ok { + c.maybeLogSkip() + return nil, false + } + + release := func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result() + } + return release, true +} + +func (c *OpsMetricsCollector) maybeLogSkip() { + c.skipLogMu.Lock() + defer c.skipLogMu.Unlock() + + now := time.Now() + if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute { + return + } + c.skipLogAt = now + log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping") +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func boolPtr(v bool) *bool { + out := v + return &out +} + +func intPtr(v int) *int { + out := v + return &out +} + +func float64Ptr(v float64) *float64 { + out := v + return &out +} diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go new file mode 100644 index 00000000..996267fd --- /dev/null +++ b/backend/internal/service/ops_models.go @@ -0,0 +1,124 @@ +package service + +import "time" + +type OpsErrorLog struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + Phase string `json:"phase"` + Type string `json:"type"` + Severity string `json:"severity"` + + StatusCode int `json:"status_code"` + Platform string `json:"platform"` + Model string `json:"model"` + + LatencyMs *int `json:"latency_ms"` + + ClientRequestID string `json:"client_request_id"` + RequestID string `json:"request_id"` + Message string `json:"message"` + + UserID *int64 `json:"user_id"` + APIKeyID *int64 `json:"api_key_id"` + AccountID *int64 `json:"account_id"` + GroupID *int64 `json:"group_id"` + + ClientIP *string `json:"client_ip"` + RequestPath string `json:"request_path"` + Stream bool `json:"stream"` +} + +type OpsErrorLogDetail struct { + OpsErrorLog + + ErrorBody string `json:"error_body"` + UserAgent string `json:"user_agent"` + + // Upstream context (optional) + UpstreamStatusCode *int `json:"upstream_status_code,omitempty"` + UpstreamErrorMessage string `json:"upstream_error_message,omitempty"` + UpstreamErrorDetail string `json:"upstream_error_detail,omitempty"` + UpstreamErrors string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing + + // Timings (optional) + AuthLatencyMs *int64 `json:"auth_latency_ms"` + RoutingLatencyMs *int64 `json:"routing_latency_ms"` + UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` + ResponseLatencyMs *int64 `json:"response_latency_ms"` + TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` + + // Retry context + RequestBody string `json:"request_body"` + RequestBodyTruncated bool 
`json:"request_body_truncated"` + RequestBodyBytes *int `json:"request_body_bytes"` + RequestHeaders string `json:"request_headers,omitempty"` + + // vNext metric semantics + IsBusinessLimited bool `json:"is_business_limited"` +} + +type OpsErrorLogFilter struct { + StartTime *time.Time + EndTime *time.Time + + Platform string + GroupID *int64 + AccountID *int64 + + StatusCodes []int + Phase string + Query string + + Page int + PageSize int +} + +type OpsErrorLogList struct { + Errors []*OpsErrorLog `json:"errors"` + Total int `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +type OpsRetryAttempt struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + RequestedByUserID int64 `json:"requested_by_user_id"` + SourceErrorID int64 `json:"source_error_id"` + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` + + Status string `json:"status"` + StartedAt *time.Time `json:"started_at"` + FinishedAt *time.Time `json:"finished_at"` + DurationMs *int64 `json:"duration_ms"` + + ResultRequestID *string `json:"result_request_id"` + ResultErrorID *int64 `json:"result_error_id"` + + ErrorMessage *string `json:"error_message"` +} + +type OpsRetryResult struct { + AttemptID int64 `json:"attempt_id"` + Mode string `json:"mode"` + Status string `json:"status"` + + PinnedAccountID *int64 `json:"pinned_account_id"` + UsedAccountID *int64 `json:"used_account_id"` + + HTTPStatusCode int `json:"http_status_code"` + UpstreamRequestID string `json:"upstream_request_id"` + + ResponsePreview string `json:"response_preview"` + ResponseTruncated bool `json:"response_truncated"` + + ErrorMessage string `json:"error_message"` + + StartedAt time.Time `json:"started_at"` + FinishedAt time.Time `json:"finished_at"` + DurationMs int64 `json:"duration_ms"` +} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go new file mode 100644 index 00000000..39f3aaf2 --- /dev/null +++ b/backend/internal/service/ops_port.go @@ -0,0 +1,242 @@ +package service + +import ( + "context" + "time" +) + +type OpsRepository interface { + InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error) + ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) + GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) + ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error) + + InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) + UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error + GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error) + + // Lightweight window stats (for realtime WS / quick sampling). 
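+	// A minimal caller sketch (illustrative; the one-minute window and poll cadence
+	// are assumptions, not part of this change):
+	//
+	//	end := time.Now().UTC()
+	//	stats, err := repo.GetWindowStats(ctx, &OpsDashboardFilter{
+	//		StartTime: end.Add(-time.Minute),
+	//		EndTime:   end,
+	//	})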
+ GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) + + GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) + GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) + GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) + GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) + GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) + + InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error + GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error) + + UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error + ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error) + + // Alerts (rules + events) + ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) + CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + DeleteAlertRule(ctx context.Context, id int64) error + + ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) + UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error + UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. + UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error + UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error + GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) + GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) +} + +type OpsInsertErrorLogInput struct { + RequestID string + ClientRequestID string + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + GroupID *int64 + ClientIP *string + + Platform string + Model string + RequestPath string + Stream bool + UserAgent string + + ErrorPhase string + ErrorType string + Severity string + StatusCode int + IsBusinessLimited bool + + ErrorMessage string + ErrorBody string + + ErrorSource string + ErrorOwner string + + UpstreamStatusCode *int + UpstreamErrorMessage *string + UpstreamErrorDetail *string + // UpstreamErrors captures all upstream error attempts observed during handling this request. + // It is populated during request processing (gin context) and sanitized+serialized by OpsService. + UpstreamErrors []*OpsUpstreamErrorEvent + // UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors. + // It is set by OpsService.RecordError before persisting. 
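+	// The persisted string surfaces on reads as OpsErrorLogDetail.UpstreamErrors
+	// (a JSON array encoded as a string).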
+ UpstreamErrorsJSON *string + + DurationMs *int + TimeToFirstTokenMs *int64 + + RequestBodyJSON *string // sanitized json string (not raw bytes) + RequestBodyTruncated bool + RequestBodyBytes *int + RequestHeadersJSON *string // optional json string + + IsRetryable bool + RetryCount int + + CreatedAt time.Time +} + +type OpsInsertRetryAttemptInput struct { + RequestedByUserID int64 + SourceErrorID int64 + Mode string + PinnedAccountID *int64 + + // running|queued etc. + Status string + StartedAt time.Time +} + +type OpsUpdateRetryAttemptInput struct { + ID int64 + + // succeeded|failed + Status string + FinishedAt time.Time + DurationMs int64 + + // Optional correlation + ResultRequestID *string + ResultErrorID *int64 + + ErrorMessage *string +} + +type OpsInsertSystemMetricsInput struct { + CreatedAt time.Time + WindowMinutes int + + Platform *string + GroupID *int64 + + SuccessCount int64 + ErrorCountTotal int64 + BusinessLimitedCount int64 + ErrorCountSLA int64 + + UpstreamErrorCountExcl429529 int64 + Upstream429Count int64 + Upstream529Count int64 + + TokenConsumed int64 + + QPS *float64 + TPS *float64 + + DurationP50Ms *int + DurationP90Ms *int + DurationP95Ms *int + DurationP99Ms *int + DurationAvgMs *float64 + DurationMaxMs *int + + TTFTP50Ms *int + TTFTP90Ms *int + TTFTP95Ms *int + TTFTP99Ms *int + TTFTAvgMs *float64 + TTFTMaxMs *int + + CPUUsagePercent *float64 + MemoryUsedMB *int64 + MemoryTotalMB *int64 + MemoryUsagePercent *float64 + + DBOK *bool + RedisOK *bool + + RedisConnTotal *int + RedisConnIdle *int + + DBConnActive *int + DBConnIdle *int + DBConnWaiting *int + + GoroutineCount *int + ConcurrencyQueueDepth *int +} + +type OpsSystemMetricsSnapshot struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + WindowMinutes int `json:"window_minutes"` + + CPUUsagePercent *float64 `json:"cpu_usage_percent"` + MemoryUsedMB *int64 `json:"memory_used_mb"` + MemoryTotalMB *int64 `json:"memory_total_mb"` + MemoryUsagePercent *float64 `json:"memory_usage_percent"` + + DBOK *bool `json:"db_ok"` + RedisOK *bool `json:"redis_ok"` + + // Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max". 
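+	// Purely illustrative: db_conn_active = 38 with db_max_open_conns = 40 lets
+	// the dashboard render the pool as "38 / 40".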
+ DBMaxOpenConns *int `json:"db_max_open_conns"` + RedisPoolSize *int `json:"redis_pool_size"` + + RedisConnTotal *int `json:"redis_conn_total"` + RedisConnIdle *int `json:"redis_conn_idle"` + + DBConnActive *int `json:"db_conn_active"` + DBConnIdle *int `json:"db_conn_idle"` + DBConnWaiting *int `json:"db_conn_waiting"` + + GoroutineCount *int `json:"goroutine_count"` + ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"` +} + +type OpsUpsertJobHeartbeatInput struct { + JobName string + + LastRunAt *time.Time + LastSuccessAt *time.Time + LastErrorAt *time.Time + LastError *string + LastDurationMs *int64 +} + +type OpsJobHeartbeat struct { + JobName string `json:"job_name"` + + LastRunAt *time.Time `json:"last_run_at"` + LastSuccessAt *time.Time `json:"last_success_at"` + LastErrorAt *time.Time `json:"last_error_at"` + LastError *string `json:"last_error"` + LastDurationMs *int64 `json:"last_duration_ms"` + + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsWindowStats struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + TokenConsumed int64 `json:"token_consumed"` +} diff --git a/backend/internal/service/ops_query_mode.go b/backend/internal/service/ops_query_mode.go new file mode 100644 index 00000000..e6fa9c1e --- /dev/null +++ b/backend/internal/service/ops_query_mode.go @@ -0,0 +1,40 @@ +package service + +import ( + "errors" + "strings" +) + +type OpsQueryMode string + +const ( + OpsQueryModeAuto OpsQueryMode = "auto" + OpsQueryModeRaw OpsQueryMode = "raw" + OpsQueryModePreagg OpsQueryMode = "preagg" +) + +// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the +// pre-aggregation tables are not populated yet. This is primarily used to implement +// the forced `preagg` mode UX. +var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated") + +func ParseOpsQueryMode(raw string) OpsQueryMode { + v := strings.ToLower(strings.TrimSpace(raw)) + switch v { + case string(OpsQueryModeRaw): + return OpsQueryModeRaw + case string(OpsQueryModePreagg): + return OpsQueryModePreagg + default: + return OpsQueryModeAuto + } +} + +func (m OpsQueryMode) IsValid() bool { + switch m { + case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg: + return true + default: + return false + } +} diff --git a/backend/internal/service/ops_realtime.go b/backend/internal/service/ops_realtime.go new file mode 100644 index 00000000..479b9482 --- /dev/null +++ b/backend/internal/service/ops_realtime.go @@ -0,0 +1,36 @@ +package service + +import ( + "context" + "errors" + "strings" +) + +// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled. +// +// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`, +// and it is also gated by the hard switch/soft switch of overall ops monitoring. +func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool { + if !s.IsMonitoringEnabled(ctx) { + return false + } + if s.settingRepo == nil { + return true + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled) + if err != nil { + // Default enabled when key is missing; fail-open on transient errors. 
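+		// Same fail-open behaviour as OpsMetricsCollector.isMonitoringEnabled: read
+		// failures never disable the feature; only an explicit
+		// "false"/"0"/"off"/"disabled" value does.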
+ if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} diff --git a/backend/internal/service/ops_realtime_models.go b/backend/internal/service/ops_realtime_models.go new file mode 100644 index 00000000..f7514a24 --- /dev/null +++ b/backend/internal/service/ops_realtime_models.go @@ -0,0 +1,81 @@ +package service + +import "time" + +// PlatformConcurrencyInfo aggregates concurrency usage by platform. +type PlatformConcurrencyInfo struct { + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// GroupConcurrencyInfo aggregates concurrency usage by group. +// +// Note: one account can belong to multiple groups; group totals are therefore not additive across groups. +type GroupConcurrencyInfo struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// AccountConcurrencyInfo represents real-time concurrency usage for a single account. +type AccountConcurrencyInfo struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// PlatformAvailability aggregates account availability by platform. +type PlatformAvailability struct { + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// GroupAvailability aggregates account availability by group. +type GroupAvailability struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// AccountAvailability represents current availability for a single account. 
+type AccountAvailability struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + + Status string `json:"status"` + + IsAvailable bool `json:"is_available"` + IsRateLimited bool `json:"is_rate_limited"` + IsOverloaded bool `json:"is_overloaded"` + HasError bool `json:"has_error"` + + RateLimitResetAt *time.Time `json:"rate_limit_reset_at"` + RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"` + OverloadUntil *time.Time `json:"overload_until"` + OverloadRemainingSec *int64 `json:"overload_remaining_sec"` + ErrorMessage string `json:"error_message"` + TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"` +} diff --git a/backend/internal/service/ops_request_details.go b/backend/internal/service/ops_request_details.go new file mode 100644 index 00000000..12b9aa1b --- /dev/null +++ b/backend/internal/service/ops_request_details.go @@ -0,0 +1,151 @@ +package service + +import ( + "context" + "time" +) + +type OpsRequestKind string + +const ( + OpsRequestKindSuccess OpsRequestKind = "success" + OpsRequestKindError OpsRequestKind = "error" +) + +// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs). +// It powers "request drilldown" UIs without exposing full request bodies for successful requests. +type OpsRequestDetail struct { + Kind OpsRequestKind `json:"kind"` + CreatedAt time.Time `json:"created_at"` + RequestID string `json:"request_id"` + + Platform string `json:"platform,omitempty"` + Model string `json:"model,omitempty"` + + DurationMs *int `json:"duration_ms,omitempty"` + StatusCode *int `json:"status_code,omitempty"` + + // When Kind == "error", ErrorID links to /admin/ops/errors/:id. + ErrorID *int64 `json:"error_id,omitempty"` + + Phase string `json:"phase,omitempty"` + Severity string `json:"severity,omitempty"` + Message string `json:"message,omitempty"` + + UserID *int64 `json:"user_id,omitempty"` + APIKeyID *int64 `json:"api_key_id,omitempty"` + AccountID *int64 `json:"account_id,omitempty"` + GroupID *int64 `json:"group_id,omitempty"` + + Stream bool `json:"stream"` +} + +type OpsRequestDetailFilter struct { + StartTime *time.Time + EndTime *time.Time + + // kind: success|error|all + Kind string + + Platform string + GroupID *int64 + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + + Model string + RequestID string + Query string + + MinDurationMs *int + MaxDurationMs *int + + // Sort: created_at_desc (default) or duration_desc. 
+ Sort string + + Page int + PageSize int +} + +func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) { + page = 1 + pageSize = 50 + endTime = time.Now() + startTime = endTime.Add(-1 * time.Hour) + + if f == nil { + return page, pageSize, startTime, endTime + } + + if f.Page > 0 { + page = f.Page + } + if f.PageSize > 0 { + pageSize = f.PageSize + } + if pageSize > 100 { + pageSize = 100 + } + + if f.EndTime != nil { + endTime = *f.EndTime + } + if f.StartTime != nil { + startTime = *f.StartTime + } else if f.EndTime != nil { + startTime = endTime.Add(-1 * time.Hour) + } + + if startTime.After(endTime) { + startTime, endTime = endTime, startTime + } + + return page, pageSize, startTime, endTime +} + +type OpsRequestDetailList struct { + Items []*OpsRequestDetail `json:"items"` + Total int64 `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsRequestDetailList{ + Items: []*OpsRequestDetail{}, + Total: 0, + Page: 1, + PageSize: 50, + }, nil + } + + page, pageSize, startTime, endTime := filter.Normalize() + filterCopy := &OpsRequestDetailFilter{} + if filter != nil { + *filterCopy = *filter + } + filterCopy.Page = page + filterCopy.PageSize = pageSize + filterCopy.StartTime = &startTime + filterCopy.EndTime = &endTime + + items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy) + if err != nil { + return nil, err + } + if items == nil { + items = []*OpsRequestDetail{} + } + + return &OpsRequestDetailList{ + Items: items, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go new file mode 100644 index 00000000..747aa3b8 --- /dev/null +++ b/backend/internal/service/ops_retry.go @@ -0,0 +1,632 @@ +package service + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" + "github.com/gin-gonic/gin" + "github.com/lib/pq" +) + +const ( + OpsRetryModeClient = "client" + OpsRetryModeUpstream = "upstream" +) + +const ( + opsRetryStatusRunning = "running" + opsRetryStatusSucceeded = "succeeded" + opsRetryStatusFailed = "failed" +) + +const ( + opsRetryTimeout = 60 * time.Second + opsRetryCaptureBytesLimit = 64 * 1024 + opsRetryResponsePreviewMax = 8 * 1024 + opsRetryMinIntervalPerError = 10 * time.Second + opsRetryMaxAccountSwitches = 3 +) + +var opsRetryRequestHeaderAllowlist = map[string]bool{ + "anthropic-beta": true, + "anthropic-version": true, +} + +type opsRetryRequestType string + +const ( + opsRetryTypeMessages opsRetryRequestType = "messages" + opsRetryTypeOpenAI opsRetryRequestType = "openai_responses" + opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta" +) + +type limitedResponseWriter struct { + header http.Header + wroteHeader bool + + limit int + totalWritten int64 + buf bytes.Buffer +} + +func newLimitedResponseWriter(limit int) *limitedResponseWriter { + if limit <= 0 { + limit = 1 + } + return &limitedResponseWriter{ + header: make(http.Header), + limit: limit, + } +} + +func (w *limitedResponseWriter) Header() http.Header { + return w.header +} + +func (w *limitedResponseWriter) WriteHeader(statusCode int) { + if 
w.wroteHeader { + return + } + w.wroteHeader = true +} + +func (w *limitedResponseWriter) Write(p []byte) (int, error) { + if !w.wroteHeader { + w.WriteHeader(http.StatusOK) + } + w.totalWritten += int64(len(p)) + + if w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(p) > remaining { + _, _ = w.buf.Write(p[:remaining]) + } else { + _, _ = w.buf.Write(p) + } + } + + // Pretend we wrote everything to avoid upstream/client code treating it as an error. + return len(p), nil +} + +func (w *limitedResponseWriter) Flush() {} + +func (w *limitedResponseWriter) bodyBytes() []byte { + return w.buf.Bytes() +} + +func (w *limitedResponseWriter) truncated() bool { + return w.totalWritten > int64(w.limit) +} + +func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + + mode = strings.ToLower(strings.TrimSpace(mode)) + switch mode { + case OpsRetryModeClient, OpsRetryModeUpstream: + default: + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") + } + + latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) + } + if latest != nil { + if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + + lastAttemptAt := latest.CreatedAt + if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() { + lastAttemptAt = *latest.FinishedAt + } else if latest.StartedAt != nil && !latest.StartedAt.IsZero() { + lastAttemptAt = *latest.StartedAt + } + + if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError { + return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again") + } + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + startedAt := time.Now() + attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{ + RequestedByUserID: requestedByUserID, + SourceErrorID: errorID, + Mode: mode, + PinnedAccountID: pinned, + Status: opsRetryStatusRunning, + StartedAt: startedAt, + }) + if err != nil { + var pqErr *pq.Error + if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err) + } + + result := &OpsRetryResult{ + AttemptID: attemptID, 
+ Mode: mode, + Status: opsRetryStatusFailed, + PinnedAccountID: pinned, + HTTPStatusCode: 0, + UpstreamRequestID: "", + ResponsePreview: "", + ResponseTruncated: false, + ErrorMessage: "", + StartedAt: startedAt, + } + + execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) + defer cancel() + + execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + + finishedAt := time.Now() + result.FinishedAt = finishedAt + result.DurationMs = finishedAt.Sub(startedAt).Milliseconds() + + if execRes != nil { + result.Status = execRes.status + result.UsedAccountID = execRes.usedAccountID + result.HTTPStatusCode = execRes.httpStatusCode + result.UpstreamRequestID = execRes.upstreamRequestID + result.ResponsePreview = execRes.responsePreview + result.ResponseTruncated = execRes.responseTruncated + result.ErrorMessage = execRes.errorMessage + } + + updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second) + defer updateCancel() + + var updateErrMsg *string + if strings.TrimSpace(result.ErrorMessage) != "" { + msg := result.ErrorMessage + updateErrMsg = &msg + } + var resultRequestID *string + if strings.TrimSpace(result.UpstreamRequestID) != "" { + v := result.UpstreamRequestID + resultRequestID = &v + } + + finalStatus := result.Status + if strings.TrimSpace(finalStatus) == "" { + finalStatus = opsRetryStatusFailed + } + + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, + }); err != nil { + // Best-effort: retry itself already executed; do not fail the API response. + log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } + + return result, nil +} + +type opsRetryExecution struct { + status string + + usedAccountID *int64 + httpStatusCode int + upstreamRequestID string + + responsePreview string + responseTruncated bool + + errorMessage string +} + +func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution { + if errorLog == nil { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "missing error log", + } + } + + reqType := detectOpsRetryType(errorLog.RequestPath) + bodyBytes := []byte(errorLog.RequestBody) + + switch reqType { + case opsRetryTypeMessages: + bodyBytes = FilterThinkingBlocksForRetry(bodyBytes) + case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B: + // No-op + } + + switch strings.ToLower(strings.TrimSpace(mode)) { + case OpsRetryModeUpstream: + if pinnedAccountID == nil || *pinnedAccountID <= 0 { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "pinned_account_id required for upstream retry", + } + } + return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID) + case OpsRetryModeClient: + return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes) + default: + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "invalid retry mode", + } + } +} + +func detectOpsRetryType(path string) opsRetryRequestType { + p := strings.ToLower(strings.TrimSpace(path)) + switch { + case strings.Contains(p, "/responses"): + return opsRetryTypeOpenAI + case strings.Contains(p, "/v1beta/"): + return opsRetryTypeGeminiV1B + default: + return opsRetryTypeMessages + } +} + +func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, 
pinnedAccountID int64) *opsRetryExecution { + if s.accountRepo == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"} + } + + account, err := s.accountRepo.GetByID(ctx, pinnedAccountID) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)} + } + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"} + } + if !account.IsSchedulable() { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"} + } + if errorLog.GroupID != nil && *errorLog.GroupID > 0 { + if !containsInt64(account.GroupIDs, *errorLog.GroupID) { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"} + } + } + + var release func() + if s.concurrencyService != nil { + acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)} + } + if acq == nil || !acq.Acquired { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"} + } + release = acq.ReleaseFunc + } + if release != nil { + defer release() + } + + usedID := account.ID + exec := s.executeWithAccount(ctx, reqType, errorLog, body, account) + exec.usedAccountID = &usedID + if exec.status == "" { + exec.status = opsRetryStatusFailed + } + return exec +} + +func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution { + groupID := errorLog.GroupID + if groupID == nil || *groupID <= 0 { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"} + } + + model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body) + if parsedErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()} + } + _ = stream + + excluded := make(map[int64]struct{}) + switches := 0 + + for { + if switches >= opsRetryMaxAccountSwitches { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"} + } + + selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded) + if selErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()} + } + if selection == nil || selection.Account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"} + } + + account := selection.Account + if !selection.Acquired || selection.ReleaseFunc == nil { + excluded[account.ID] = struct{}{} + switches++ + continue + } + + exec := func() *opsRetryExecution { + defer selection.ReleaseFunc() + return s.executeWithAccount(ctx, reqType, errorLog, body, account) + }() + + if exec != nil { + if exec.status == opsRetryStatusSucceeded { + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + // If the gateway services ask for failover, try another account. 
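// Illustrative note (the message text below is hypothetical): isFailoverError is a substring
// check, so an error such as
//
//	upstream error: status 529 from upstream, failover to next account
//
// sends the loop back to selectAccountForRetry with this account excluded, while any other
// failure is returned immediately; the failover path gives up after
// opsRetryMaxAccountSwitches (3) switches.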
+ if s.isFailoverError(exec.errorMessage) { + excluded[account.ID] = struct{}{} + switches++ + continue + } + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + + excluded[account.ID] = struct{}{} + switches++ + } +} + +func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) { + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return nil, fmt.Errorf("openai gateway service not available") + } + return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + case opsRetryTypeGeminiV1B, opsRetryTypeMessages: + if s.gatewayService == nil { + return nil, fmt.Errorf("gateway service not available") + } + return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + default: + return nil, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) { + switch reqType { + case opsRetryTypeMessages: + parsed, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr) + } + return parsed.Model, parsed.Stream, nil + case opsRetryTypeOpenAI: + var v struct { + Model string `json:"model"` + Stream bool `json:"stream"` + } + if err := json.Unmarshal(body, &v); err != nil { + return "", false, fmt.Errorf("failed to parse openai request body: %w", err) + } + return strings.TrimSpace(v.Model), v.Stream, nil + case opsRetryTypeGeminiV1B: + if strings.TrimSpace(errorLog.Model) == "" { + return "", false, fmt.Errorf("missing model for gemini v1beta retry") + } + return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil + default: + return "", false, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution { + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"} + } + + c, w := newOpsRetryContext(ctx, errorLog) + + var err error + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"} + } + _, err = s.openAIGatewayService.Forward(ctx, c, account, body) + case opsRetryTypeGeminiV1B: + if s.geminiCompatService == nil || s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"} + } + modelName := strings.TrimSpace(errorLog.Model) + action := "generateContent" + if errorLog.Stream { + action = "streamGenerateContent" + } + if account.Platform == PlatformAntigravity { + _, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body) + } else { + _, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body) + } + case opsRetryTypeMessages: + switch account.Platform { + case PlatformAntigravity: + if s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"} + } + _, err = 
s.antigravityGatewayService.Forward(ctx, c, account, body) + case PlatformGemini: + if s.geminiCompatService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"} + } + _, err = s.geminiCompatService.Forward(ctx, c, account, body) + default: + if s.gatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"} + } + parsedReq, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"} + } + _, err = s.gatewayService.Forward(ctx, c, account, parsedReq) + } + default: + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"} + } + + statusCode := http.StatusOK + if c != nil && c.Writer != nil { + statusCode = c.Writer.Status() + } + + upstreamReqID := extractUpstreamRequestID(c) + preview, truncated := extractResponsePreview(w) + + exec := &opsRetryExecution{ + status: opsRetryStatusFailed, + httpStatusCode: statusCode, + upstreamRequestID: upstreamReqID, + responsePreview: preview, + responseTruncated: truncated, + errorMessage: "", + } + + if err == nil && statusCode < 400 { + exec.status = opsRetryStatusSucceeded + return exec + } + + if err != nil { + exec.errorMessage = err.Error() + } else { + exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode) + } + + return exec +} + +func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) { + w := newLimitedResponseWriter(opsRetryCaptureBytesLimit) + c, _ := gin.CreateTestContext(w) + + path := "/" + if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" { + path = errorLog.RequestPath + } + + req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil)) + req.Header.Set("content-type", "application/json") + if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" { + req.Header.Set("user-agent", errorLog.UserAgent) + } + // Restore a minimal, whitelisted subset of request headers to improve retry fidelity + // (e.g. anthropic-beta / anthropic-version). Never replay auth credentials. 
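// A sketch of the intended filtering, assuming stored headers such as
//
//	{"Authorization": "Bearer sk-...", "X-Api-Key": "...", "anthropic-version": "2023-06-01"}
//
// only "anthropic-version" (and "anthropic-beta") survive the allowlist check below; credentials
// are dropped here and are expected to be attached by the gateway service for the selected account.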
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" { + var stored map[string]string + if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil { + for k, v := range stored { + key := strings.TrimSpace(k) + if key == "" { + continue + } + if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] { + continue + } + val := strings.TrimSpace(v) + if val == "" { + continue + } + req.Header.Set(key, val) + } + } + } + + c.Request = req + return c, w +} + +func extractUpstreamRequestID(c *gin.Context) string { + if c == nil || c.Writer == nil { + return "" + } + h := c.Writer.Header() + if h == nil { + return "" + } + for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} { + if v := strings.TrimSpace(h.Get(key)); v != "" { + return v + } + } + return "" +} + +func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) { + if w == nil { + return "", false + } + b := bytes.TrimSpace(w.bodyBytes()) + if len(b) == 0 { + return "", w.truncated() + } + if len(b) > opsRetryResponsePreviewMax { + return string(b[:opsRetryResponsePreviewMax]), true + } + return string(b), w.truncated() +} + +func containsInt64(items []int64, needle int64) bool { + for _, v := range items { + if v == needle { + return true + } + } + return false +} + +func (s *OpsService) isFailoverError(message string) bool { + msg := strings.ToLower(strings.TrimSpace(message)) + if msg == "" { + return false + } + return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover") +} diff --git a/backend/internal/service/ops_scheduled_report_service.go b/backend/internal/service/ops_scheduled_report_service.go new file mode 100644 index 00000000..28902cbc --- /dev/null +++ b/backend/internal/service/ops_scheduled_report_service.go @@ -0,0 +1,705 @@ +package service + +import ( + "context" + "fmt" + "log" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsScheduledReportJobName = "ops_scheduled_reports" + + opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader" + opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute + + opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:" + + opsScheduledReportTickInterval = 1 * time.Minute +) + +var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsScheduledReportReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +type OpsScheduledReportService struct { + opsService *OpsService + userService *UserService + emailService *EmailService + redisClient *redis.Client + cfg *config.Config + + instanceID string + loc *time.Location + + distributedLockOn bool + warnNoRedisOnce sync.Once + + startOnce sync.Once + stopOnce sync.Once + stopCtx context.Context + stop context.CancelFunc + wg sync.WaitGroup +} + +func NewOpsScheduledReportService( + opsService *OpsService, + userService *UserService, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsScheduledReportService { + lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple + + loc := time.Local + if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil { + loc = 
parsed + } + } + return &OpsScheduledReportService{ + opsService: opsService, + userService: userService, + emailService: emailService, + redisClient: redisClient, + cfg: cfg, + + instanceID: uuid.NewString(), + loc: loc, + distributedLockOn: lockOn, + warnNoRedisOnce: sync.Once{}, + startOnce: sync.Once{}, + stopOnce: sync.Once{}, + stopCtx: nil, + stop: nil, + wg: sync.WaitGroup{}, + } +} + +func (s *OpsScheduledReportService) Start() { + s.StartWithContext(context.Background()) +} + +func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) { + if s == nil { + return + } + if ctx == nil { + ctx = context.Background() + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.opsService == nil || s.emailService == nil { + return + } + + s.startOnce.Do(func() { + s.stopCtx, s.stop = context.WithCancel(ctx) + s.wg.Add(1) + go s.run() + }) +} + +func (s *OpsScheduledReportService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stop != nil { + s.stop() + } + }) + s.wg.Wait() +} + +func (s *OpsScheduledReportService) run() { + defer s.wg.Done() + + ticker := time.NewTicker(opsScheduledReportTickInterval) + defer ticker.Stop() + + s.runOnce() + for { + select { + case <-ticker.C: + s.runOnce() + case <-s.stopCtx.Done(): + return + } + } +} + +func (s *OpsScheduledReportService) runOnce() { + if s == nil || s.opsService == nil || s.emailService == nil { + return + } + + startedAt := time.Now().UTC() + runAt := startedAt + + ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second) + defer cancel() + + // Respect ops monitoring enabled switch. + if !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + now := time.Now() + if s.loc != nil { + now = now.In(s.loc) + } + + reports := s.listScheduledReports(ctx, now) + if len(reports) == 0 { + return + } + + for _, report := range reports { + if report == nil || !report.Enabled { + continue + } + if report.NextRunAt.After(now) { + continue + } + + if err := s.runReport(ctx, report, now); err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + return + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +type opsScheduledReport struct { + Name string + ReportType string + Schedule string + Enabled bool + + TimeRange time.Duration + + Recipients []string + + ErrorDigestMinCount int + AccountHealthErrorRateThreshold float64 + + LastRunAt *time.Time + NextRunAt time.Time +} + +func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport { + if s == nil || s.opsService == nil { + return nil + } + if ctx == nil { + ctx = context.Background() + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil { + return nil + } + if !emailCfg.Report.Enabled { + return nil + } + + recipients := normalizeEmails(emailCfg.Report.Recipients) + + type reportDef struct { + enabled bool + name string + kind string + timeRange time.Duration + schedule string + } + + defs := []reportDef{ + {enabled: emailCfg.Report.DailySummaryEnabled, name: "日报", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule}, + {enabled: emailCfg.Report.WeeklySummaryEnabled, name: "周报", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule}, + {enabled: emailCfg.Report.ErrorDigestEnabled, name: "错误摘要", kind: 
"error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule}, + {enabled: emailCfg.Report.AccountHealthEnabled, name: "账号健康", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule}, + } + + out := make([]*opsScheduledReport, 0, len(defs)) + for _, d := range defs { + if !d.enabled { + continue + } + spec := strings.TrimSpace(d.schedule) + if spec == "" { + continue + } + sched, err := opsScheduledReportCronParser.Parse(spec) + if err != nil { + log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err) + continue + } + + lastRun := s.getLastRunAt(ctx, d.kind) + base := lastRun + if base.IsZero() { + // Allow a schedule matching the current minute to trigger right after startup. + base = now.Add(-1 * time.Minute) + } + next := sched.Next(base) + if next.IsZero() { + continue + } + + var lastRunPtr *time.Time + if !lastRun.IsZero() { + lastCopy := lastRun + lastRunPtr = &lastCopy + } + + out = append(out, &opsScheduledReport{ + Name: d.name, + ReportType: d.kind, + Schedule: spec, + Enabled: true, + + TimeRange: d.timeRange, + + Recipients: recipients, + + ErrorDigestMinCount: emailCfg.Report.ErrorDigestMinCount, + AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold, + + LastRunAt: lastRunPtr, + NextRunAt: next, + }) + } + + return out +} + +func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error { + if s == nil || s.opsService == nil || s.emailService == nil || report == nil { + return nil + } + if ctx == nil { + ctx = context.Background() + } + + // Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute. + s.setLastRunAt(ctx, report.ReportType, now) + + content, err := s.generateReportHTML(ctx, report, now) + if err != nil { + return err + } + if strings.TrimSpace(content) == "" { + // Skip sending when the report decides not to emit content (e.g., digest below min count). + return nil + } + + recipients := report.Recipients + if len(recipients) == 0 && s.userService != nil { + admin, err := s.userService.GetFirstAdmin(ctx) + if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" { + recipients = []string{strings.TrimSpace(admin.Email)} + } + } + if len(recipients) == 0 { + return nil + } + + subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name)) + + for _, to := range recipients { + addr := strings.TrimSpace(to) + if addr == "" { + continue + } + if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil { + // Ignore per-recipient failures; continue best-effort. + continue + } + } + return nil +} + +func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) { + if s == nil || s.opsService == nil || report == nil { + return "", fmt.Errorf("service not initialized") + } + if report.TimeRange <= 0 { + return "", fmt.Errorf("invalid time range") + } + + end := now.UTC() + start := end.Add(-report.TimeRange) + + switch strings.TrimSpace(report.ReportType) { + case "daily_summary", "weekly_summary": + overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: "", + GroupID: nil, + QueryMode: OpsQueryModeAuto, + }) + if err != nil { + // If pre-aggregation isn't ready but the report is requested, fall back to raw. 
+ if strings.TrimSpace(report.ReportType) == "daily_summary" || strings.TrimSpace(report.ReportType) == "weekly_summary" { + overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: "", + GroupID: nil, + QueryMode: OpsQueryModeRaw, + }) + } + if err != nil { + return "", err + } + } + return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil + case "error_digest": + // Lightweight digest: list recent errors (status>=400) and breakdown by type. + startTime := start + endTime := end + filter := &OpsErrorLogFilter{ + StartTime: &startTime, + EndTime: &endTime, + Page: 1, + PageSize: 100, + } + out, err := s.opsService.GetErrorLogs(ctx, filter) + if err != nil { + return "", err + } + if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount { + return "", nil + } + return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil + case "account_health": + // Best-effort: use account availability (not error rate yet). + avail, err := s.opsService.GetAccountAvailability(ctx, "", nil) + if err != nil { + return "", err + } + _ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report + return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil + default: + return "", fmt.Errorf("unknown report type: %s", report.ReportType) + } +} + +func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string { + if overview == nil { + return fmt.Sprintf("

<h2>%s</h2><p>No data.</p>
", htmlEscape(title)) + } + + latP50 := "-" + latP99 := "-" + if overview.Duration.P50 != nil { + latP50 = fmt.Sprintf("%dms", *overview.Duration.P50) + } + if overview.Duration.P99 != nil { + latP99 = fmt.Sprintf("%dms", *overview.Duration.P99) + } + + ttftP50 := "-" + ttftP99 := "-" + if overview.TTFT.P50 != nil { + ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50) + } + if overview.TTFT.P99 != nil { + ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99) + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<ul>
+  <li>Total Requests: %d</li>
+  <li>Success: %d</li>
+  <li>Errors (SLA): %d</li>
+  <li>Business Limited: %d</li>
+  <li>SLA: %.2f%%</li>
+  <li>Error Rate: %.2f%%</li>
+  <li>Upstream Error Rate (excl 429/529): %.2f%%</li>
+  <li>Upstream Errors: excl429/529=%d, 429=%d, 529=%d</li>
+  <li>Latency: p50=%s, p99=%s</li>
+  <li>TTFT: p50=%s, p99=%s</li>
+  <li>Tokens: %d</li>
+  <li>QPS: current=%.1f, peak=%.1f, avg=%.1f</li>
+  <li>TPS: current=%.1f, peak=%.1f, avg=%.1f</li>
+</ul>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + overview.RequestCountTotal, + overview.SuccessCount, + overview.ErrorCountSLA, + overview.BusinessLimitedCount, + overview.SLA*100, + overview.ErrorRate*100, + overview.UpstreamErrorRate*100, + overview.UpstreamErrorCountExcl429529, + overview.Upstream429Count, + overview.Upstream529Count, + htmlEscape(latP50), + htmlEscape(latP99), + htmlEscape(ttftP50), + htmlEscape(ttftP99), + overview.TokenConsumed, + overview.QPS.Current, + overview.QPS.Peak, + overview.QPS.Avg, + overview.TPS.Current, + overview.TPS.Peak, + overview.TPS.Avg, + ) +} + +func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string { + total := 0 + recent := []*OpsErrorLog{} + if list != nil { + total = list.Total + recent = list.Errors + } + if len(recent) > 10 { + recent = recent[:10] + } + + rows := "" + for _, item := range recent { + if item == nil { + continue + } + rows += fmt.Sprintf( + "%s%s%d%s", + htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)), + htmlEscape(item.Platform), + item.StatusCode, + htmlEscape(truncateString(item.Message, 180)), + ) + } + if rows == "" { + rows = "No recent errors." + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<p>Total Errors: %d</p>
+<h3>Recent</h3>
+<table>
+  <tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr>
+  %s
+</table>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + total, + rows, + ) +} + +func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string { + total := 0 + available := 0 + rateLimited := 0 + hasError := 0 + + if avail != nil && avail.Accounts != nil { + for _, a := range avail.Accounts { + if a == nil { + continue + } + total++ + if a.IsAvailable { + available++ + } + if a.IsRateLimited { + rateLimited++ + } + if a.HasError { + hasError++ + } + } + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<ul>
+  <li>Total Accounts: %d</li>
+  <li>Available: %d</li>
+  <li>Rate Limited: %d</li>
+  <li>Error: %d</li>
+</ul>
+<p>Note: This report currently reflects account availability status only.</p>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + total, + available, + rateLimited, + hasError, + ) +} + +func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil || !s.distributedLockOn { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock") + }) + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + key := opsScheduledReportLeaderLockKeyDefault + ttl := opsScheduledReportLeaderLockTTLDefault + if strings.TrimSpace(key) == "" { + key = "ops:scheduled_reports:leader" + } + if ttl <= 0 { + ttl = 5 * time.Minute + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Prefer fail-closed to avoid duplicate report sends when Redis is flaky. + log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err) + return nil, false + } + if !ok { + return nil, false + } + return func() { + _, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time { + if s == nil || s.redisClient == nil { + return time.Time{} + } + kind := strings.TrimSpace(reportType) + if kind == "" { + return time.Time{} + } + key := opsScheduledReportLastRunKeyPrefix + kind + + raw, err := s.redisClient.Get(ctx, key).Result() + if err != nil || strings.TrimSpace(raw) == "" { + return time.Time{} + } + sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64) + if err != nil || sec <= 0 { + return time.Time{} + } + last := time.Unix(sec, 0) + // Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time + // passed into cron.Next() uses the same location; otherwise the job will drift by timezone + // offset (e.g. Asia/Shanghai default would run 8h later after the first execution). 
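// Concrete illustration of that drift (dates hypothetical), assuming spec "0 9 * * *" and
// cfg.Timezone = "Asia/Shanghai" (UTC+8):
//
//	sched, _ := opsScheduledReportCronParser.Parse("0 9 * * *")
//	base := time.Date(2024, 1, 1, 1, 0, 0, 0, time.UTC) // 09:00 local, persisted as UTC seconds
//	sched.Next(base)           // 09:00 UTC the same day -> 17:00 local (8h late)
//	sched.Next(base.In(s.loc)) // 09:00 local the next day (intended)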
+ if s.loc != nil { + return last.In(s.loc) + } + return last.UTC() +} + +func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) { + if s == nil || s.redisClient == nil { + return + } + kind := strings.TrimSpace(reportType) + if kind == "" { + return + } + if t.IsZero() { + t = time.Now().UTC() + } + key := opsScheduledReportLastRunKeyPrefix + kind + _ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err() +} + +func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsService == nil || s.opsService.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsScheduledReportJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsScheduledReportJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} + +func normalizeEmails(in []string) []string { + if len(in) == 0 { + return nil + } + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, raw := range in { + addr := strings.ToLower(strings.TrimSpace(raw)) + if addr == "" { + continue + } + if _, ok := seen[addr]; ok { + continue + } + seen[addr] = struct{}{} + out = append(out, addr) + } + return out +} diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go new file mode 100644 index 00000000..426d46f1 --- /dev/null +++ b/backend/internal/service/ops_service.go @@ -0,0 +1,537 @@ +package service + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "log" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled") + +const ( + opsMaxStoredRequestBodyBytes = 10 * 1024 + opsMaxStoredErrorBodyBytes = 20 * 1024 +) + +// OpsService provides ingestion and query APIs for the Ops monitoring module. +type OpsService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + accountRepo AccountRepository + + // getAccountAvailability is a unit-test hook for overriding account availability lookup. 
+ getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) + + concurrencyService *ConcurrencyService + gatewayService *GatewayService + openAIGatewayService *OpenAIGatewayService + geminiCompatService *GeminiMessagesCompatService + antigravityGatewayService *AntigravityGatewayService +} + +func NewOpsService( + opsRepo OpsRepository, + settingRepo SettingRepository, + cfg *config.Config, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + gatewayService *GatewayService, + openAIGatewayService *OpenAIGatewayService, + geminiCompatService *GeminiMessagesCompatService, + antigravityGatewayService *AntigravityGatewayService, +) *OpsService { + return &OpsService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + + accountRepo: accountRepo, + + concurrencyService: concurrencyService, + gatewayService: gatewayService, + openAIGatewayService: openAIGatewayService, + geminiCompatService: geminiCompatService, + antigravityGatewayService: antigravityGatewayService, + } +} + +func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error { + if s.IsMonitoringEnabled(ctx) { + return nil + } + return ErrOpsDisabled +} + +func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool { + // Hard switch: disable ops entirely. + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + // Default enabled when key is missing, and fail-open on transient errors + // (ops should never block gateway traffic). + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error { + if entry == nil { + return nil + } + if !s.IsMonitoringEnabled(ctx) { + return nil + } + if s.opsRepo == nil { + return nil + } + + // Ensure timestamps are always populated. + if entry.CreatedAt.IsZero() { + entry.CreatedAt = time.Now() + } + + // Ensure required fields exist (DB has NOT NULL constraints). + entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase) + entry.ErrorType = strings.TrimSpace(entry.ErrorType) + if entry.ErrorPhase == "" { + entry.ErrorPhase = "internal" + } + if entry.ErrorType == "" { + entry.ErrorType = "api_error" + } + + // Sanitize + trim request body (errors only). + if len(rawRequestBody) > 0 { + sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes) + if sanitized != "" { + entry.RequestBodyJSON = &sanitized + } + entry.RequestBodyTruncated = truncated + entry.RequestBodyBytes = &bytesLen + } + + // Sanitize + truncate error_body to avoid storing sensitive data. + if strings.TrimSpace(entry.ErrorBody) != "" { + sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes) + entry.ErrorBody = sanitized + } + + // Sanitize upstream error context if provided by gateway services. 
+ if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 { + entry.UpstreamStatusCode = nil + } + if entry.UpstreamErrorMessage != nil { + msg := strings.TrimSpace(*entry.UpstreamErrorMessage) + msg = sanitizeUpstreamErrorMessage(msg) + msg = truncateString(msg, 2048) + if strings.TrimSpace(msg) == "" { + entry.UpstreamErrorMessage = nil + } else { + entry.UpstreamErrorMessage = &msg + } + } + if entry.UpstreamErrorDetail != nil { + detail := strings.TrimSpace(*entry.UpstreamErrorDetail) + if detail == "" { + entry.UpstreamErrorDetail = nil + } else { + sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + if strings.TrimSpace(sanitized) == "" { + entry.UpstreamErrorDetail = nil + } else { + entry.UpstreamErrorDetail = &sanitized + } + } + } + + // Sanitize + serialize upstream error events list. + if len(entry.UpstreamErrors) > 0 { + const maxEvents = 32 + events := entry.UpstreamErrors + if len(events) > maxEvents { + events = events[len(events)-maxEvents:] + } + + sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events)) + for _, ev := range events { + if ev == nil { + continue + } + out := *ev + + out.Platform = strings.TrimSpace(out.Platform) + out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128) + out.Kind = truncateString(strings.TrimSpace(out.Kind), 64) + + if out.AccountID < 0 { + out.AccountID = 0 + } + if out.UpstreamStatusCode < 0 { + out.UpstreamStatusCode = 0 + } + if out.AtUnixMs < 0 { + out.AtUnixMs = 0 + } + + msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message)) + msg = truncateString(msg, 2048) + out.Message = msg + + detail := strings.TrimSpace(out.Detail) + if detail != "" { + // Keep upstream detail small; request bodies are not stored here, only upstream error payloads. + sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + out.Detail = sanitizedDetail + } else { + out.Detail = "" + } + + // Drop fully-empty events (can happen if only status code was known). + if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { + continue + } + + evCopy := out + sanitized = append(sanitized, &evCopy) + } + + entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized) + entry.UpstreamErrors = nil + } + + if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil { + // Never bubble up to gateway; best-effort logging. 
+ log.Printf("[Ops] RecordError failed: %v", err) + return err + } + return nil +} + +func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil + } + return s.opsRepo.ListErrorLogs(ctx, filter) +} + +func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + detail, err := s.opsRepo.GetErrorLogByID(ctx, id) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return detail, nil +} + +func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { + bytesLen = len(raw) + if len(raw) == 0 { + return "", false, 0 + } + + var decoded any + if err := json.Unmarshal(raw, &decoded); err != nil { + // If it's not valid JSON, don't store (retry would not be reliable anyway). + return "", false, bytesLen + } + + decoded = redactSensitiveJSON(decoded) + + encoded, err := json.Marshal(decoded) + if err != nil { + return "", false, bytesLen + } + if len(encoded) <= maxBytes { + return string(encoded), false, bytesLen + } + + // Trim conversation history to keep the most recent context. + if root, ok := decoded.(map[string]any); ok { + if trimmed, ok := trimConversationArrays(root, maxBytes); ok { + encoded2, err2 := json.Marshal(trimmed) + if err2 == nil && len(encoded2) <= maxBytes { + return string(encoded2), true, bytesLen + } + // Fallthrough: keep shrinking. + decoded = trimmed + } + + essential := shrinkToEssentials(root) + encoded3, err3 := json.Marshal(essential) + if err3 == nil && len(encoded3) <= maxBytes { + return string(encoded3), true, bytesLen + } + } + + // Last resort: store a minimal placeholder (still valid JSON). + placeholder := map[string]any{ + "request_body_truncated": true, + } + if model := extractString(decoded, "model"); model != "" { + placeholder["model"] = model + } + encoded4, err4 := json.Marshal(placeholder) + if err4 != nil { + return "", true, bytesLen + } + return string(encoded4), true, bytesLen +} + +func redactSensitiveJSON(v any) any { + switch t := v.(type) { + case map[string]any: + out := make(map[string]any, len(t)) + for k, vv := range t { + if isSensitiveKey(k) { + out[k] = "[REDACTED]" + continue + } + out[k] = redactSensitiveJSON(vv) + } + return out + case []any: + out := make([]any, 0, len(t)) + for _, vv := range t { + out = append(out, redactSensitiveJSON(vv)) + } + return out + default: + return v + } +} + +func isSensitiveKey(key string) bool { + k := strings.ToLower(strings.TrimSpace(key)) + if k == "" { + return false + } + + // Exact matches (common credential fields). 
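// Net effect on typical payload keys: "x-api-key", "client_secret" and "refresh_token" hit the
// exact-match list; "session-token" is caught by the substring rules further down; plain fields
// such as "model" or "messages" pass through untouched. The substring rules are deliberately
// broad: even "max_tokens" contains "token", so its value is redacted.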
+ switch k { + case "authorization", + "proxy-authorization", + "x-api-key", + "api_key", + "apikey", + "access_token", + "refresh_token", + "id_token", + "session_token", + "token", + "password", + "passwd", + "passphrase", + "secret", + "client_secret", + "private_key", + "jwt", + "signature", + "accesskeyid", + "secretaccesskey": + return true + } + + // Suffix matches. + for _, suffix := range []string{ + "_secret", + "_token", + "_id_token", + "_session_token", + "_password", + "_passwd", + "_passphrase", + "_key", + "secret_key", + "private_key", + } { + if strings.HasSuffix(k, suffix) { + return true + } + } + + // Substring matches (conservative, but errs on the side of privacy). + for _, sub := range []string{ + "secret", + "token", + "password", + "passwd", + "passphrase", + "privatekey", + "private_key", + "apikey", + "api_key", + "accesskeyid", + "secretaccesskey", + "bearer", + "cookie", + "credential", + "session", + "jwt", + "signature", + } { + if strings.Contains(k, sub) { + return true + } + } + + return false +} + +func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) { + // Supported: anthropic/openai: messages; gemini: contents. + if out, ok := trimArrayField(root, "messages", maxBytes); ok { + return out, true + } + if out, ok := trimArrayField(root, "contents", maxBytes); ok { + return out, true + } + return root, false +} + +func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) { + raw, ok := root[field] + if !ok { + return nil, false + } + arr, ok := raw.([]any) + if !ok || len(arr) == 0 { + return nil, false + } + + // Keep at least the last message/content. Use binary search so we don't marshal O(n) times. + // We are dropping from the *front* of the array (oldest context first). + lo := 0 + hi := len(arr) - 1 // inclusive; hi ensures at least one item remains + + var best map[string]any + found := false + + for lo <= hi { + mid := (lo + hi) / 2 + candidateArr := arr[mid:] + if len(candidateArr) == 0 { + lo = mid + 1 + continue + } + + next := shallowCopyMap(root) + next[field] = candidateArr + encoded, err := json.Marshal(next) + if err != nil { + // If marshal fails, try dropping more. + lo = mid + 1 + continue + } + + if len(encoded) <= maxBytes { + best = next + found = true + // Try to keep more context by dropping fewer items. + hi = mid - 1 + continue + } + + // Need to drop more. + lo = mid + 1 + } + + if found { + return best, true + } + + // Nothing fit (even with only one element); return the smallest slice and let the + // caller fall back to shrinkToEssentials(). + next := shallowCopyMap(root) + next[field] = arr[len(arr)-1:] + return next, true +} + +func shrinkToEssentials(root map[string]any) map[string]any { + out := make(map[string]any) + for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} { + if v, ok := root[key]; ok { + out[key] = v + } + } + + // Keep only the last element of the conversation array. 
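// Illustrative input/output (field values hypothetical):
//
//	in:  {"model": "claude-x", "temperature": 0.7, "metadata": {...}, "messages": [m1, m2, m3]}
//	out: {"model": "claude-x", "temperature": 0.7, "messages": [m3]}
//
// Only the scalar knobs listed above survive, plus the newest messages/contents element.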
+ if v, ok := root["messages"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["messages"] = []any{arr[len(arr)-1]} + } + } + if v, ok := root["contents"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["contents"] = []any{arr[len(arr)-1]} + } + } + return out +} + +func shallowCopyMap(m map[string]any) map[string]any { + out := make(map[string]any, len(m)) + for k, v := range m { + out[k] = v + } + return out +} + +func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", false + } + + // Prefer JSON-safe sanitization when possible. + if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" { + return out, trunc + } + + // Non-JSON: best-effort truncate. + if maxBytes > 0 && len(raw) > maxBytes { + return truncateString(raw, maxBytes), true + } + return raw, false +} + +func extractString(v any, key string) string { + root, ok := v.(map[string]any) + if !ok { + return "" + } + s, _ := root[key].(string) + return strings.TrimSpace(s) +} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go new file mode 100644 index 00000000..fbf8f069 --- /dev/null +++ b/backend/internal/service/ops_settings.go @@ -0,0 +1,465 @@ +package service + +import ( + "context" + "encoding/json" + "errors" + "strings" + "time" +) + +const ( + opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second +) + +// ========================= +// Email notification config +// ========================= + +func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) { + defaultCfg := defaultOpsEmailNotificationConfig() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + // Initialize defaults on first read (best-effort). + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsEmailNotificationConfig{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + // Corrupted JSON should not break ops UI; fall back to defaults. 
+ return defaultCfg, nil + } + normalizeOpsEmailNotificationConfig(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if req == nil { + return nil, errors.New("invalid request") + } + + cfg, err := s.GetEmailNotificationConfig(ctx) + if err != nil { + return nil, err + } + + if req.Alert != nil { + cfg.Alert.Enabled = req.Alert.Enabled + if req.Alert.Recipients != nil { + cfg.Alert.Recipients = req.Alert.Recipients + } + cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity) + cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour + cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds + cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts + } + + if req.Report != nil { + cfg.Report.Enabled = req.Report.Enabled + if req.Report.Recipients != nil { + cfg.Report.Recipients = req.Report.Recipients + } + cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled + cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule) + cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled + cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule) + cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount + cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled + cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule) + cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold + } + + if err := validateOpsEmailNotificationConfig(cfg); err != nil { + return nil, err + } + + normalizeOpsEmailNotificationConfig(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil { + return nil, err + } + return cfg, nil +} + +func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig { + return &OpsEmailNotificationConfig{ + Alert: OpsEmailAlertConfig{ + Enabled: true, + Recipients: []string{}, + MinSeverity: "", + RateLimitPerHour: 0, + BatchingWindowSeconds: 0, + IncludeResolvedAlerts: false, + }, + Report: OpsEmailReportConfig{ + Enabled: false, + Recipients: []string{}, + DailySummaryEnabled: false, + DailySummarySchedule: "0 9 * * *", + WeeklySummaryEnabled: false, + WeeklySummarySchedule: "0 9 * * 1", + ErrorDigestEnabled: false, + ErrorDigestSchedule: "0 9 * * *", + ErrorDigestMinCount: 10, + AccountHealthEnabled: false, + AccountHealthSchedule: "0 9 * * *", + AccountHealthErrorRateThreshold: 10.0, + }, + } +} + +func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) { + if cfg == nil { + return + } + if cfg.Alert.Recipients == nil { + cfg.Alert.Recipients = []string{} + } + if cfg.Report.Recipients == nil { + cfg.Report.Recipients = []string{} + } + + cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity) + cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule) + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestSchedule = 
strings.TrimSpace(cfg.Report.ErrorDigestSchedule) + cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule) + + // Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings. + if cfg.Report.DailySummarySchedule == "" { + cfg.Report.DailySummarySchedule = "0 9 * * *" + } + if cfg.Report.WeeklySummarySchedule == "" { + cfg.Report.WeeklySummarySchedule = "0 9 * * 1" + } + if cfg.Report.ErrorDigestSchedule == "" { + cfg.Report.ErrorDigestSchedule = "0 9 * * *" + } + if cfg.Report.AccountHealthSchedule == "" { + cfg.Report.AccountHealthSchedule = "0 9 * * *" + } +} + +func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error { + if cfg == nil { + return errors.New("invalid config") + } + + if cfg.Alert.RateLimitPerHour < 0 { + return errors.New("alert.rate_limit_per_hour must be >= 0") + } + if cfg.Alert.BatchingWindowSeconds < 0 { + return errors.New("alert.batching_window_seconds must be >= 0") + } + switch strings.TrimSpace(cfg.Alert.MinSeverity) { + case "", "critical", "warning", "info": + default: + return errors.New("alert.min_severity must be one of: critical, warning, info, or empty") + } + + if cfg.Report.ErrorDigestMinCount < 0 { + return errors.New("report.error_digest_min_count must be >= 0") + } + if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 { + return errors.New("report.account_health_error_rate_threshold must be between 0 and 100") + } + return nil +} + +// ========================= +// Alert runtime settings +// ========================= + +func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings { + return &OpsAlertRuntimeSettings{ + EvaluationIntervalSeconds: 60, + DistributedLock: OpsDistributedLockSettings{ + Enabled: true, + Key: opsAlertEvaluatorLeaderLockKeyDefault, + TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()), + }, + Silencing: OpsAlertSilencingSettings{ + Enabled: false, + GlobalUntilRFC3339: "", + GlobalReason: "", + Entries: []OpsAlertSilenceEntry{}, + }, + } +} + +func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) { + if s == nil { + return + } + s.Key = strings.TrimSpace(s.Key) + if s.Key == "" { + s.Key = defaultKey + } + if s.TTLSeconds <= 0 { + s.TTLSeconds = defaultTTLSeconds + } +} + +func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) { + if s == nil { + return + } + s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339) + s.GlobalReason = strings.TrimSpace(s.GlobalReason) + if s.Entries == nil { + s.Entries = []OpsAlertSilenceEntry{} + } + for i := range s.Entries { + s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339) + s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason) + } +} + +func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error { + if strings.TrimSpace(s.Key) == "" { + return errors.New("distributed_lock.key is required") + } + if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) { + return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400") + } + return nil +} + +func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error { + parse := func(raw string) error { + if strings.TrimSpace(raw) == "" { + return nil + } + if _, err := time.Parse(time.RFC3339, raw); err != nil { + return errors.New("silencing time must be RFC3339") + } + return nil + } + + if err := parse(s.GlobalUntilRFC3339); err != 
nil { + return err + } + for _, entry := range s.Entries { + if strings.TrimSpace(entry.UntilRFC3339) == "" { + return errors.New("silencing.entries.until_rfc3339 is required") + } + if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil { + return errors.New("silencing.entries.until_rfc3339 must be RFC3339") + } + } + return nil +} + +func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) { + defaultCfg := defaultOpsAlertRuntimeSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAlertRuntimeSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + if cfg.EvaluationIntervalSeconds <= 0 { + cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds + } + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + return cfg, nil +} + +func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) { + return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400") + } + if cfg.DistributedLock.Enabled { + if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil { + return nil, err + } + } + if cfg.Silencing.Enabled { + if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil { + return nil, err + } + } + + defaultCfg := defaultOpsAlertRuntimeSettings() + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil { + return nil, err + } + + // Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated). 
+ updated := &OpsAlertRuntimeSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} + +// ========================= +// Advanced settings +// ========================= + +func defaultOpsAdvancedSettings() *OpsAdvancedSettings { + return &OpsAdvancedSettings{ + DataRetention: OpsDataRetentionSettings{ + CleanupEnabled: false, + CleanupSchedule: "0 2 * * *", + ErrorLogRetentionDays: 30, + MinuteMetricsRetentionDays: 30, + HourlyMetricsRetentionDays: 30, + }, + Aggregation: OpsAggregationSettings{ + AggregationEnabled: false, + }, + } +} + +func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) { + if cfg == nil { + return + } + cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule) + if cfg.DataRetention.CleanupSchedule == "" { + cfg.DataRetention.CleanupSchedule = "0 2 * * *" + } + if cfg.DataRetention.ErrorLogRetentionDays <= 0 { + cfg.DataRetention.ErrorLogRetentionDays = 30 + } + if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 { + cfg.DataRetention.MinuteMetricsRetentionDays = 30 + } + if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 { + cfg.DataRetention.HourlyMetricsRetentionDays = 30 + } +} + +func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error { + if cfg == nil { + return errors.New("invalid config") + } + if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 { + return errors.New("error_log_retention_days must be between 1 and 365") + } + if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 { + return errors.New("minute_metrics_retention_days must be between 1 and 365") + } + if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 { + return errors.New("hourly_metrics_retention_days must be between 1 and 365") + } + return nil +} + +func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) { + defaultCfg := defaultOpsAdvancedSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAdvancedSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + normalizeOpsAdvancedSettings(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if err := validateOpsAdvancedSettings(cfg); err != nil { + return nil, err + } + + normalizeOpsAdvancedSettings(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil { + return nil, err + } + + updated := &OpsAdvancedSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go new file mode 100644 index 
00000000..7d9a823c --- /dev/null +++ b/backend/internal/service/ops_settings_models.go @@ -0,0 +1,87 @@ +package service + +// Ops settings models stored in DB `settings` table (JSON blobs). + +type OpsEmailNotificationConfig struct { + Alert OpsEmailAlertConfig `json:"alert"` + Report OpsEmailReportConfig `json:"report"` +} + +type OpsEmailAlertConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + MinSeverity string `json:"min_severity"` + RateLimitPerHour int `json:"rate_limit_per_hour"` + BatchingWindowSeconds int `json:"batching_window_seconds"` + IncludeResolvedAlerts bool `json:"include_resolved_alerts"` +} + +type OpsEmailReportConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + DailySummaryEnabled bool `json:"daily_summary_enabled"` + DailySummarySchedule string `json:"daily_summary_schedule"` + WeeklySummaryEnabled bool `json:"weekly_summary_enabled"` + WeeklySummarySchedule string `json:"weekly_summary_schedule"` + ErrorDigestEnabled bool `json:"error_digest_enabled"` + ErrorDigestSchedule string `json:"error_digest_schedule"` + ErrorDigestMinCount int `json:"error_digest_min_count"` + AccountHealthEnabled bool `json:"account_health_enabled"` + AccountHealthSchedule string `json:"account_health_schedule"` + AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"` +} + +// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the +// frontend can still send the full config shape. +type OpsEmailNotificationConfigUpdateRequest struct { + Alert *OpsEmailAlertConfig `json:"alert"` + Report *OpsEmailReportConfig `json:"report"` +} + +type OpsDistributedLockSettings struct { + Enabled bool `json:"enabled"` + Key string `json:"key"` + TTLSeconds int `json:"ttl_seconds"` +} + +type OpsAlertSilenceEntry struct { + RuleID *int64 `json:"rule_id,omitempty"` + Severities []string `json:"severities,omitempty"` + + UntilRFC3339 string `json:"until_rfc3339"` + Reason string `json:"reason"` +} + +type OpsAlertSilencingSettings struct { + Enabled bool `json:"enabled"` + + GlobalUntilRFC3339 string `json:"global_until_rfc3339"` + GlobalReason string `json:"global_reason"` + + Entries []OpsAlertSilenceEntry `json:"entries,omitempty"` +} + +type OpsAlertRuntimeSettings struct { + EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"` + + DistributedLock OpsDistributedLockSettings `json:"distributed_lock"` + Silencing OpsAlertSilencingSettings `json:"silencing"` +} + +// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation). 
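For orientation, the OpsAdvancedSettings struct defined just below is persisted as a single JSON blob in the settings table; here is a minimal sketch of the default payload (values taken from defaultOpsAdvancedSettings above, keys from the struct's json tags — illustrative only, not the canonical serializer output):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative only: the JSON shape stored for the ops advanced settings,
// mirroring defaultOpsAdvancedSettings (cleanup disabled, 02:00 daily schedule,
// 30-day retention everywhere, aggregation disabled).
func main() {
	def := map[string]any{
		"data_retention": map[string]any{
			"cleanup_enabled":               false,
			"cleanup_schedule":              "0 2 * * *",
			"error_log_retention_days":      30,
			"minute_metrics_retention_days": 30,
			"hourly_metrics_retention_days": 30,
		},
		"aggregation": map[string]any{
			"aggregation_enabled": false,
		},
	}
	b, _ := json.MarshalIndent(def, "", "  ")
	fmt.Println(string(b))
}
```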
+type OpsAdvancedSettings struct { + DataRetention OpsDataRetentionSettings `json:"data_retention"` + Aggregation OpsAggregationSettings `json:"aggregation"` +} + +type OpsDataRetentionSettings struct { + CleanupEnabled bool `json:"cleanup_enabled"` + CleanupSchedule string `json:"cleanup_schedule"` + ErrorLogRetentionDays int `json:"error_log_retention_days"` + MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"` +} + +type OpsAggregationSettings struct { + AggregationEnabled bool `json:"aggregation_enabled"` +} diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go new file mode 100644 index 00000000..f6d07c14 --- /dev/null +++ b/backend/internal/service/ops_trend_models.go @@ -0,0 +1,65 @@ +package service + +import "time" + +type OpsThroughputTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` + QPS float64 `json:"qps"` + TPS float64 `json:"tps"` +} + +type OpsThroughputPlatformBreakdownItem struct { + Platform string `json:"platform"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputGroupBreakdownItem struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputTrendResponse struct { + Bucket string `json:"bucket"` + + Points []*OpsThroughputTrendPoint `json:"points"` + + // Optional drilldown helpers: + // - When no platform/group is selected: returns totals by platform. + // - When platform is selected but group is not: returns top groups in that platform. 
+ ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"` + TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"` +} + +type OpsErrorTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + ErrorCountSLA int64 `json:"error_count_sla"` + + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` +} + +type OpsErrorTrendResponse struct { + Bucket string `json:"bucket"` + Points []*OpsErrorTrendPoint `json:"points"` +} + +type OpsErrorDistributionItem struct { + StatusCode int `json:"status_code"` + Total int64 `json:"total"` + SLA int64 `json:"sla"` + BusinessLimited int64 `json:"business_limited"` +} + +type OpsErrorDistributionResponse struct { + Total int64 `json:"total"` + Items []*OpsErrorDistributionItem `json:"items"` +} diff --git a/backend/internal/service/ops_trends.go b/backend/internal/service/ops_trends.go new file mode 100644 index 00000000..ec55c6ce --- /dev/null +++ b/backend/internal/service/ops_trends.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds) +} diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go new file mode 100644 index 00000000..615ae6a1 --- /dev/null +++ b/backend/internal/service/ops_upstream_context.go @@ -0,0 +1,94 @@ +package service + +import ( + "encoding/json" + "strings" + "time" + + "github.com/gin-gonic/gin" +) + +// Gin context keys used by Ops error logger for capturing upstream error details. +// These keys are set by gateway services and consumed by handler/ops_error_logger.go. +const ( + OpsUpstreamStatusCodeKey = "ops_upstream_status_code" + OpsUpstreamErrorMessageKey = "ops_upstream_error_message" + OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" + OpsUpstreamErrorsKey = "ops_upstream_errors" +) + +func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { + if c == nil { + return + } + if upstreamStatusCode > 0 { + c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode) + } + if msg := strings.TrimSpace(upstreamMessage); msg != "" { + c.Set(OpsUpstreamErrorMessageKey, msg) + } + if detail := strings.TrimSpace(upstreamDetail); detail != "" { + c.Set(OpsUpstreamErrorDetailKey, detail) + } +} + +// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request. 
+// It is stored in ops_error_logs.upstream_errors as a JSON array. +type OpsUpstreamErrorEvent struct { + AtUnixMs int64 `json:"at_unix_ms,omitempty"` + + // Context + Platform string `json:"platform,omitempty"` + AccountID int64 `json:"account_id,omitempty"` + + // Outcome + UpstreamStatusCode int `json:"upstream_status_code,omitempty"` + UpstreamRequestID string `json:"upstream_request_id,omitempty"` + + // Kind: http_error | request_error | retry_exhausted | failover + Kind string `json:"kind,omitempty"` + + Message string `json:"message,omitempty"` + Detail string `json:"detail,omitempty"` +} + +func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { + if c == nil { + return + } + if ev.AtUnixMs <= 0 { + ev.AtUnixMs = time.Now().UnixMilli() + } + ev.Platform = strings.TrimSpace(ev.Platform) + ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.Kind = strings.TrimSpace(ev.Kind) + ev.Message = strings.TrimSpace(ev.Message) + ev.Detail = strings.TrimSpace(ev.Detail) + if ev.Message != "" { + ev.Message = sanitizeUpstreamErrorMessage(ev.Message) + } + + var existing []*OpsUpstreamErrorEvent + if v, ok := c.Get(OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { + existing = arr + } + } + + evCopy := ev + existing = append(existing, &evCopy) + c.Set(OpsUpstreamErrorsKey, existing) +} + +func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { + if len(events) == 0 { + return nil + } + // Ensure we always store a valid JSON value. + raw, err := json.Marshal(events) + if err != nil || len(raw) == 0 { + return nil + } + s := string(raw) + return &s +} diff --git a/backend/internal/service/ops_window_stats.go b/backend/internal/service/ops_window_stats.go new file mode 100644 index 00000000..71021d15 --- /dev/null +++ b/backend/internal/service/ops_window_stats.go @@ -0,0 +1,24 @@ +package service + +import ( + "context" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +// GetWindowStats returns lightweight request/token counts for the provided window. +// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks. 
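As a usage illustration of the realtime-sampling intent described in the comment above, here is a hedged sketch of a poller that calls GetWindowStats on a short interval and hands the result to a push callback (the interval, window size, and pushFn are assumptions for the example, not the project's actual WebSocket wiring; it is assumed to live in the service package next to OpsService and OpsWindowStats):

```go
// Sketch only: sample a short trailing window periodically and forward it.
func runWindowStatsSampler(ctx context.Context, ops *OpsService, pushFn func(*OpsWindowStats)) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			end := time.Now()
			start := end.Add(-time.Minute) // lightweight 1-minute window, no percentiles/peaks
			stats, err := ops.GetWindowStats(ctx, start, end)
			if err != nil {
				continue // e.g. monitoring disabled or ops repo unavailable
			}
			pushFn(stats)
		}
	}
}
```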
+func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + filter := &OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + } + return s.opsRepo.GetWindowStats(ctx, filter) +} diff --git a/backend/internal/service/ratelimit_service.go b/backend/internal/service/ratelimit_service.go index f1362646..d570b92e 100644 --- a/backend/internal/service/ratelimit_service.go +++ b/backend/internal/service/ratelimit_service.go @@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc } tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + if upstreamMsg != "" { + upstreamMsg = truncateForLog([]byte(upstreamMsg), 512) + } switch statusCode { case 401: // 认证失败:停止调度,记录错误 - s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") + msg := "Authentication failed (401): invalid or expired credentials" + if upstreamMsg != "" { + msg = "Authentication failed (401): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 402: // 支付要求:余额不足或计费问题,停止调度 - s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") + msg := "Payment required (402): insufficient balance or billing issue" + if upstreamMsg != "" { + msg = "Payment required (402): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 403: // 禁止访问:停止调度,记录错误 - s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") + msg := "Access forbidden (403): account may be suspended or lack permissions" + if upstreamMsg != "" { + msg = "Access forbidden (403): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 429: s.handle429(ctx, account, headers) diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 3e47d9d4..863d8a57 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey } - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled) updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL @@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt + // Ops monitoring (vNext) + updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) + updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) + updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) + if 
settings.OpsMetricsIntervalSeconds > 0 { + updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds) + } + err := s.settingRepo.SetMultiple(ctx, updates) if err == nil && s.onUpdate != nil { s.onUpdate() // Invalidate cache after settings update @@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { // Identity patch defaults SettingKeyEnableIdentityPatch: "true", SettingKeyIdentityPatchPrompt: "", + + // Ops monitoring defaults (vNext) + SettingKeyOpsMonitoringEnabled: "true", + SettingKeyOpsRealtimeMonitoringEnabled: "true", + SettingKeyOpsQueryModeDefault: "auto", + SettingKeyOpsMetricsIntervalSeconds: "60", } return s.settingRepo.SetMultiple(ctx, defaults) @@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin } result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] + // Ops monitoring settings (default: enabled, fail-open) + result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) + result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) + result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) + result.OpsMetricsIntervalSeconds = 60 + if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" { + if v, err := strconv.Atoi(raw); err == nil { + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + result.OpsMetricsIntervalSeconds = v + } + } + return result } -// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 -// -// 优先级: -// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 -// - 否则回退到 config.yaml/env 的值 -func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { - if s == nil || s.cfg == nil { - return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") - } - - effective := s.cfg.LinuxDo - - keys := []string{ - SettingKeyLinuxDoConnectEnabled, - SettingKeyLinuxDoConnectClientID, - SettingKeyLinuxDoConnectClientSecret, - SettingKeyLinuxDoConnectRedirectURL, - } - settings, err := s.settingRepo.GetMultiple(ctx, keys) - if err != nil { - return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err) - } - - if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok { - effective.Enabled = raw == "true" - } - if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" { - effective.ClientID = strings.TrimSpace(v) - } - if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" { - effective.ClientSecret = strings.TrimSpace(v) - } - if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" { - effective.RedirectURL = strings.TrimSpace(v) - } - - if !effective.Enabled { - return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled") - } - - // 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。 - if strings.TrimSpace(effective.ClientID) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured") - } - if strings.TrimSpace(effective.AuthorizeURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured") - } - if strings.TrimSpace(effective.TokenURL) == "" { - return 
config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured") - } - if strings.TrimSpace(effective.UserInfoURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured") - } - if strings.TrimSpace(effective.RedirectURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured") - } - if strings.TrimSpace(effective.FrontendRedirectURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured") - } - - if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid") - } - if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid") - } - - method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) - switch method { - case "", "client_secret_post", "client_secret_basic": - if strings.TrimSpace(effective.ClientSecret) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured") - } - case "none": - if !effective.UsePKCE { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none") - } +func isFalseSettingValue(value string) bool { + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return true default: - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") + return false } - - return effective, nil } // getStringOrDefault 获取字符串值或默认值 @@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) } return value } + +// GetLinuxDoConnectOAuthConfig 返回用于登录的"最终生效" LinuxDo Connect 配置。 +// +// 优先级: +// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 +// - 否则回退到 config.yaml/env 的值 +func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { + if s == nil || s.cfg == nil { + return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") + } + + effective := s.cfg.LinuxDo + + keys := []string{ + SettingKeyLinuxDoConnectEnabled, + SettingKeyLinuxDoConnectClientID, + SettingKeyLinuxDoConnectClientSecret, + SettingKeyLinuxDoConnectRedirectURL, + } + settings, err := s.settingRepo.GetMultiple(ctx, keys) + if err != nil { + return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err) + } + + 
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok { + effective.Enabled = raw == "true" + } + if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" { + effective.ClientID = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" { + effective.ClientSecret = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" { + effective.RedirectURL = strings.TrimSpace(v) + } + + if !effective.Enabled { + return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled") + } + + // 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。 + if strings.TrimSpace(effective.ClientID) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured") + } + if strings.TrimSpace(effective.AuthorizeURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured") + } + if strings.TrimSpace(effective.TokenURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured") + } + if strings.TrimSpace(effective.UserInfoURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured") + } + if strings.TrimSpace(effective.RedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured") + } + if strings.TrimSpace(effective.FrontendRedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured") + } + + if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid") + } + if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid") + } + + method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) + switch method { + case "", "client_secret_post", "client_secret_basic": + if strings.TrimSpace(effective.ClientSecret) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured") + } + case "none": + if !effective.UsePKCE { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none") + } + default: + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method 
invalid") + } + + return effective, nil +} diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go index 325b7f8f..e20a230a 100644 --- a/backend/internal/service/settings_view.go +++ b/backend/internal/service/settings_view.go @@ -18,7 +18,7 @@ type SystemSettings struct { TurnstileSecretKey string TurnstileSecretKeyConfigured bool - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 LinuxDoConnectEnabled bool LinuxDoConnectClientID string LinuxDoConnectClientSecret string @@ -46,6 +46,12 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool + OpsRealtimeMonitoringEnabled bool + OpsQueryModeDefault string + OpsMetricsIntervalSeconds int } type PublicSettings struct { diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go index f1074e9d..f2cb9c44 100644 --- a/backend/internal/service/wire.go +++ b/backend/internal/service/wire.go @@ -1,10 +1,12 @@ package service import ( + "database/sql" "time" "github.com/Wei-Shaw/sub2api/internal/config" "github.com/google/wire" + "github.com/redis/go-redis/v9" ) // BuildInfo contains build information @@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi return svc } +// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector. +func ProvideOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg) + collector.Start() + return collector +} + +// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation). +func ProvideOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService. +func ProvideOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled). +func ProvideOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsScheduledReportService creates and starts OpsScheduledReportService. 
+func ProvideOpsScheduledReportService( + opsService *OpsService, + userService *UserService, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsScheduledReportService { + svc := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg) + svc.Start() + return svc +} + // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { return apiKeyService @@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( NewAccountUsageService, NewAccountTestService, NewSettingService, + NewOpsService, + ProvideOpsMetricsCollector, + ProvideOpsAggregationService, + ProvideOpsAlertEvaluatorService, + ProvideOpsCleanupService, + ProvideOpsScheduledReportService, NewEmailService, ProvideEmailQueueService, NewTurnstileService, diff --git a/backend/migrations/033_ops_monitoring_vnext.sql b/backend/migrations/033_ops_monitoring_vnext.sql new file mode 100644 index 00000000..a18c061d --- /dev/null +++ b/backend/migrations/033_ops_monitoring_vnext.sql @@ -0,0 +1,717 @@ +-- Ops Monitoring (vNext): squashed migration (030) +-- +-- This repository originally planned Ops vNext as migrations 030-036: +-- 030 drop legacy ops tables +-- 031 core schema +-- 032 pre-aggregation tables +-- 033 indexes + optional extensions +-- 034 add avg/max to preagg +-- 035 add notify_email to alert rules +-- 036 seed default alert rules +-- +-- Since these migrations have NOT been applied to any environment yet, we squash them +-- into a single 030 migration for easier review and a cleaner migration history. +-- +-- Notes: +-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts). +-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run. + +-- ===================================================================== +-- 030_ops_drop_legacy_ops_tables.sql +-- ===================================================================== + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Legacy pre-aggregation tables (from 026 and/or previous branches) +DROP TABLE IF EXISTS ops_metrics_daily CASCADE; +DROP TABLE IF EXISTS ops_metrics_hourly CASCADE; + +-- Core ops tables that may exist in some deployments / branches +DROP TABLE IF EXISTS ops_system_metrics CASCADE; +DROP TABLE IF EXISTS ops_error_logs CASCADE; +DROP TABLE IF EXISTS ops_alert_events CASCADE; +DROP TABLE IF EXISTS ops_alert_rules CASCADE; +DROP TABLE IF EXISTS ops_job_heartbeats CASCADE; +DROP TABLE IF EXISTS ops_retry_attempts CASCADE; + +-- Optional legacy tables (best-effort cleanup) +DROP TABLE IF EXISTS ops_scheduled_reports CASCADE; +DROP TABLE IF EXISTS ops_group_availability_configs CASCADE; +DROP TABLE IF EXISTS ops_group_availability_events CASCADE; + +-- Optional legacy views/indexes +DROP VIEW IF EXISTS ops_latest_metrics CASCADE; + +-- ===================================================================== +-- 031_ops_core_schema.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts) +-- +-- Design goals: +-- - Support global filtering (time/platform/group) across all ops modules. +-- - Persist enough context for two retry modes (client retry / pinned upstream retry). +-- - Make ops background jobs observable via job heartbeats. +-- - Keep schema stable and indexes targeted (high-write tables). 
+-- +-- Notes: +-- - This migration is idempotent. +-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: error log details (high-write) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_error_logs ( + id BIGSERIAL PRIMARY KEY, + + -- Correlation / identities + request_id VARCHAR(64), + client_request_id VARCHAR(64), + user_id BIGINT, + api_key_id BIGINT, + account_id BIGINT, + group_id BIGINT, + client_ip inet, + + -- Dimensions for global filtering + platform VARCHAR(32), + + -- Request metadata + model VARCHAR(100), + request_path VARCHAR(256), + stream BOOLEAN NOT NULL DEFAULT false, + user_agent TEXT, + + -- Core error classification + error_phase VARCHAR(32) NOT NULL, + error_type VARCHAR(64) NOT NULL, + severity VARCHAR(8) NOT NULL DEFAULT 'P2', + status_code INT, + + -- vNext metric semantics + is_business_limited BOOLEAN NOT NULL DEFAULT false, + + -- Error details (sanitized/truncated at ingest time) + error_message TEXT, + error_body TEXT, + + -- Provider/upstream details (optional; useful for trends & account health) + error_source VARCHAR(64), + error_owner VARCHAR(32), + account_status VARCHAR(50), + upstream_status_code INT, + upstream_error_message TEXT, + upstream_error_detail TEXT, + provider_error_code VARCHAR(64), + provider_error_type VARCHAR(64), + network_error_type VARCHAR(50), + retry_after_seconds INT, + + -- Timings (ms) - optional + duration_ms INT, + time_to_first_token_ms BIGINT, + auth_latency_ms BIGINT, + routing_latency_ms BIGINT, + upstream_latency_ms BIGINT, + response_latency_ms BIGINT, + + -- Retry context (only stored for error requests) + request_body JSONB, + request_headers JSONB, + request_body_truncated BOOLEAN NOT NULL DEFAULT false, + request_body_bytes INT, + + -- Retryability flags (best-effort classification) + is_retryable BOOLEAN NOT NULL DEFAULT false, + retry_count INT NOT NULL DEFAULT 0, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). 
Stores sanitized error details and request_body for retries (errors only).'; + +-- ============================================ +-- 2) ops_retry_attempts: audit log for retries +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_retry_attempts ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + requested_by_user_id BIGINT, + source_error_id BIGINT, + + -- client|upstream + mode VARCHAR(16) NOT NULL, + pinned_account_id BIGINT, + + -- queued|running|succeeded|failed + status VARCHAR(16) NOT NULL DEFAULT 'queued', + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + duration_ms BIGINT, + + -- Optional result correlation + result_request_id VARCHAR(64), + result_error_id BIGINT, + result_usage_request_id VARCHAR(64), + + error_message TEXT +); + +COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).'; + +-- ============================================ +-- 3) ops_system_metrics: system + request window snapshots +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_system_metrics ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + window_minutes INT NOT NULL DEFAULT 1, + + -- Optional dimensions (only if collector chooses to write per-dimension snapshots) + platform VARCHAR(32), + group_id BIGINT, + + -- Core counts + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Rates + qps DOUBLE PRECISION, + tps DOUBLE PRECISION, + + -- Duration percentiles (ms) - success requests + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + duration_avg_ms DOUBLE PRECISION, + duration_max_ms INT, + + -- TTFT percentiles (ms) - success requests (streaming) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + ttft_avg_ms DOUBLE PRECISION, + ttft_max_ms INT, + + -- System resources + cpu_usage_percent DOUBLE PRECISION, + memory_used_mb BIGINT, + memory_total_mb BIGINT, + memory_usage_percent DOUBLE PRECISION, + + -- Dependency health (best-effort) + db_ok BOOLEAN, + redis_ok BOOLEAN, + + -- DB pool & runtime + db_conn_active INT, + db_conn_idle INT, + db_conn_waiting INT, + goroutine_count INT, + + -- Queue / concurrency + concurrency_queue_depth INT +); + +COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). 
Used for dashboard overview and realtime rates.'; + +-- ============================================ +-- 4) ops_job_heartbeats: background jobs health +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_job_heartbeats ( + job_name VARCHAR(64) PRIMARY KEY, + + last_run_at TIMESTAMPTZ, + last_success_at TIMESTAMPTZ, + last_error_at TIMESTAMPTZ, + last_error TEXT, + last_duration_ms BIGINT, + + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).'; + +-- ============================================ +-- 5) ops_alert_rules / ops_alert_events +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_alert_rules ( + id BIGSERIAL PRIMARY KEY, + + name VARCHAR(128) NOT NULL, + description TEXT, + enabled BOOLEAN NOT NULL DEFAULT true, + + severity VARCHAR(16) NOT NULL DEFAULT 'warning', + + -- Metric definition + metric_type VARCHAR(64) NOT NULL, + operator VARCHAR(8) NOT NULL, + threshold DOUBLE PRECISION NOT NULL, + + window_minutes INT NOT NULL DEFAULT 5, + sustained_minutes INT NOT NULL DEFAULT 5, + cooldown_minutes INT NOT NULL DEFAULT 10, + + -- Optional scoping: platform/group filters etc. + filters JSONB, + + last_triggered_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique + ON ops_alert_rules (name); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled + ON ops_alert_rules (enabled); + +CREATE TABLE IF NOT EXISTS ops_alert_events ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT, + severity VARCHAR(16) NOT NULL, + status VARCHAR(16) NOT NULL DEFAULT 'firing', + + title VARCHAR(200), + description TEXT, + + metric_value DOUBLE PRECISION, + threshold_value DOUBLE PRECISION, + dimensions JSONB, + + fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + resolved_at TIMESTAMPTZ, + + email_sent BOOLEAN NOT NULL DEFAULT false, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status + ON ops_alert_events (rule_id, status); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at + ON ops_alert_events (fired_at DESC); + +-- ===================================================================== +-- 032_ops_preaggregation_tables.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): pre-aggregation tables +-- +-- Purpose: +-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive +-- percentile_cont scans on raw logs for every dashboard refresh. +-- - Support global filter dimensions: overall / platform / group. +-- +-- Design note: +-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a +-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
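To make the COALESCE-based uniqueness concrete: because the unique constraint lives in an expression index, an idempotent writer must name the same expressions as its conflict target. A hedged Go sketch of such an upsert (column list abbreviated, *sql.DB handle assumed; this is not the aggregation service's actual statement):

```go
package service

import (
	"context"
	"database/sql"
	"time"
)

// Sketch: upsert one hourly bucket keyed by (bucket_start, platform, group_id).
// NULL platform/group_id collapse onto ''/0 via COALESCE so the expression-based
// unique index can serve as the ON CONFLICT target.
const upsertHourlySQL = `
INSERT INTO ops_metrics_hourly (bucket_start, platform, group_id, success_count, error_count_total)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
DO UPDATE SET
    success_count     = EXCLUDED.success_count,
    error_count_total = EXCLUDED.error_count_total,
    computed_at       = NOW()`

func upsertHourlyBucket(ctx context.Context, db *sql.DB, bucket time.Time, platform *string, groupID *int64, successCount, errorCountTotal int64) error {
	_, err := db.ExecContext(ctx, upsertHourlySQL, bucket, platform, groupID, successCount, errorCountTotal)
	return err
}
```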
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_metrics_hourly +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_hourly ( + id BIGSERIAL PRIMARY KEY, + + bucket_start TIMESTAMPTZ NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Duration percentiles (ms) + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + -- TTFT percentiles (ms) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Uniqueness across three “dimension modes” (overall / platform / group). +-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE. +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim + ON ops_metrics_hourly ( + bucket_start, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket + ON ops_metrics_hourly (bucket_start DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket + ON ops_metrics_hourly (platform, bucket_start DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket + ON ops_metrics_hourly (group_id, bucket_start DESC) + WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).'; + +-- ============================================ +-- 2) ops_metrics_daily (optional; for longer windows) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_daily ( + id BIGSERIAL PRIMARY KEY, + + bucket_date DATE NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim + ON ops_metrics_daily ( + bucket_date, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket + ON ops_metrics_daily (bucket_date DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket + ON ops_metrics_daily (platform, bucket_date DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket + ON ops_metrics_daily (group_id, bucket_date DESC) + 
WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).'; + +-- ===================================================================== +-- 033_ops_indexes_and_extensions.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): indexes and optional extensions +-- +-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort, +-- so environments without extension privileges won't fail the whole migration chain. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) Core btree indexes (always safe) +-- ============================================ + +-- ops_error_logs +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at + ON ops_error_logs (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time + ON ops_error_logs (platform, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time + ON ops_error_logs (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time + ON ops_error_logs (account_id, created_at DESC) + WHERE account_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time + ON ops_error_logs (status_code, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time + ON ops_error_logs (error_phase, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time + ON ops_error_logs (error_type, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id + ON ops_error_logs (request_id); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id + ON ops_error_logs (client_request_id); + +-- ops_system_metrics +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at + ON ops_system_metrics (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time + ON ops_system_metrics (window_minutes, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time + ON ops_system_metrics (platform, created_at DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time + ON ops_system_metrics (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +-- ops_retry_attempts +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at + ON ops_retry_attempts (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error + ON ops_retry_attempts (source_error_id, created_at DESC) + WHERE source_error_id IS NOT NULL; + +-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe). +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active + ON ops_retry_attempts (source_error_id) + WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running'); + +-- ============================================ +-- 2) Optional: pg_trgm + trigram indexes for fuzzy search +-- ============================================ + +DO $$ +BEGIN + BEGIN + CREATE EXTENSION IF NOT EXISTS pg_trgm; + EXCEPTION WHEN OTHERS THEN + -- Missing privileges or extension package should not block migrations. 
+ RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM; + END; + + IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN + -- request_id / client_request_id fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm + ON ops_error_logs USING gin (request_id gin_trgm_ops)'; + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm + ON ops_error_logs USING gin (client_request_id gin_trgm_ops)'; + + -- error_message fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm + ON ops_error_logs USING gin (error_message gin_trgm_ops)'; + END IF; +END $$; + +-- ===================================================================== +-- 034_ops_preaggregation_add_avg_max.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields +-- +-- Why: +-- - The dashboard overview returns avg/max for duration/TTFT. +-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes +-- it impossible to answer avg/max in preagg mode without falling back to raw scans. +-- +-- This migration is idempotent and safe to run multiple times. +-- +-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for +-- approximate long-window summaries. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Hourly table +ALTER TABLE ops_metrics_hourly + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- Daily table +ALTER TABLE ops_metrics_daily + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- ===================================================================== +-- 035_ops_alert_rules_notify_email.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): alert rule notify settings +-- +-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard. +-- Migration is idempotent. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +ALTER TABLE ops_alert_rules + ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true; + +-- ===================================================================== +-- 036_ops_seed_default_alert_rules.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): seed default alert rules (idempotent) +-- +-- Goal: +-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events. +-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING. +-- +-- Notes: +-- - Thresholds are intentionally conservative defaults and should be tuned per deployment. +-- - Metric semantics follow vNext: +-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited). +-- - upstream_error_rate excludes 429/529. 
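Since the seeded rules lean on these vNext metric semantics, here is a small hedged sketch of how the rates could be derived from the counters defined earlier in this migration (the denominator choice is an assumption for illustration; the evaluator's real formulas are not part of this file):

```go
package service

// Sketch only: SLA-scope error rate and upstream error rate (excluding 429/529)
// for one metrics window. The denominator (success + SLA errors) is an assumption.
type opsWindowCounts struct {
	SuccessCount                 int64
	ErrorCountSLA                int64 // excludes is_business_limited errors
	UpstreamErrorCountExcl429529 int64
}

func slaErrorRatePercent(w opsWindowCounts) float64 {
	total := w.SuccessCount + w.ErrorCountSLA
	if total == 0 {
		return 0
	}
	return float64(w.ErrorCountSLA) / float64(total) * 100
}

func upstreamErrorRatePercent(w opsWindowCounts) float64 {
	total := w.SuccessCount + w.ErrorCountSLA
	if total == 0 {
		return 0
	}
	return float64(w.UpstreamErrorCountExcl429529) / float64(total) * 100
}
```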
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- 1) High error rate (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率过高', + '当错误率超过 5% 且持续 5 分钟时触发告警', + true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 2) Low success rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '成功率过低', + '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)', + true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 3) P99 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P99延迟过高', + '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警', + true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 4) P95 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P95延迟过高', + '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警', + true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 5) CPU usage too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'CPU使用率过高', + '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警', + true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 6) Memory usage too high (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '内存使用率过高', + '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)', + true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 7) Concurrency queue buildup (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '并发队列积压', + '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)', + true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 8) Extremely high error rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率极高', + '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)', + true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots. +-- This migration is intentionally idempotent. 
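For reference, a hedged sketch of how a collector might read these values from go-redis before writing them into the columns added below (the surrounding collector wiring is assumed):

```go
package service

import "github.com/redis/go-redis/v9"

// Sketch: read go-redis connection pool stats; TotalConns / IdleConns map onto
// the redis_conn_total / redis_conn_idle columns added below.
func redisPoolCounts(rdb *redis.Client) (total, idle int) {
	if rdb == nil {
		return 0, 0
	}
	stats := rdb.PoolStats()
	return int(stats.TotalConns), int(stats.IdleConns)
}
```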
+ +ALTER TABLE ops_system_metrics + ADD COLUMN IF NOT EXISTS redis_conn_total INT, + ADD COLUMN IF NOT EXISTS redis_conn_idle INT; + +COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).'; +COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).'; diff --git a/backend/migrations/034_ops_upstream_error_events.sql b/backend/migrations/034_ops_upstream_error_events.sql new file mode 100644 index 00000000..f8bfa5e2 --- /dev/null +++ b/backend/migrations/034_ops_upstream_error_events.sql @@ -0,0 +1,9 @@ +-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation. +-- +-- This is intentionally idempotent. + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS upstream_errors JSONB; + +COMMENT ON COLUMN ops_error_logs.upstream_errors IS + 'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.'; diff --git a/config.yaml b/config.yaml index b5272aac..424ce9eb 100644 --- a/config.yaml +++ b/config.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 @@ -302,6 +302,41 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Hard switch: disable all ops background jobs and APIs when false + # 硬开关:为 false 时禁用所有 Ops 后台任务与接口 + enabled: true + + # Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries. + # 优先使用预聚合表(用于长时间窗口查询性能) + use_preaggregated_tables: false + + # Data cleanup configuration + # 数据清理配置(vNext 默认统一保留 30 天) + cleanup: + enabled: true + # Cron expression (minute hour dom month dow), e.g. 
"0 2 * * *" = daily at 2 AM + # Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点 + schedule: "0 2 * * *" + error_log_retention_days: 30 + minute_metrics_retention_days: 30 + hourly_metrics_retention_days: 30 + + # Pre-aggregation configuration + # 预聚合任务配置 + aggregation: + enabled: true + + # OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments) + # 指标采集 Redis 缓存(多副本部署时减少重复计算) + metrics_collector_cache: + enabled: true + ttl: 65s + # ============================================================================= # JWT Configuration # JWT 配置 diff --git a/deploy/.env.example b/deploy/.env.example index 83e58a50..27618284 100644 --- a/deploy/.env.example +++ b/deploy/.env.example @@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} GEMINI_QUOTA_POLICY= +# ----------------------------------------------------------------------------- +# Ops Monitoring Configuration (运维监控配置) +# ----------------------------------------------------------------------------- +# Enable ops monitoring features (background jobs and APIs) +# 是否启用运维监控功能(后台任务和接口) +# Set to false to hide ops menu in sidebar and disable all ops features +# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能 +OPS_ENABLED=true + # ----------------------------------------------------------------------------- # Update Configuration (在线更新配置) # ----------------------------------------------------------------------------- diff --git a/deploy/config.example.yaml b/deploy/config.example.yaml index 57239f8e..b1fc9bbd 100644 --- a/deploy/config.example.yaml +++ b/deploy/config.example.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 @@ -302,6 +302,19 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Enable ops monitoring features (background jobs and APIs) + # 是否启用运维监控功能(后台任务和接口) + # Set to false to hide ops menu in sidebar and disable all ops features + # 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能 + # Other detailed settings (cleanup, aggregation, etc.) 
are configured in ops settings dialog + # 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置 + enabled: true + # ============================================================================= # JWT Configuration # JWT 配置 diff --git a/frontend/src/api/admin/index.ts b/frontend/src/api/admin/index.ts index c90017a8..e86f6348 100644 --- a/frontend/src/api/admin/index.ts +++ b/frontend/src/api/admin/index.ts @@ -17,6 +17,7 @@ import usageAPI from './usage' import geminiAPI from './gemini' import antigravityAPI from './antigravity' import userAttributesAPI from './userAttributes' +import opsAPI from './ops' /** * Unified admin API object for convenient access @@ -35,7 +36,8 @@ export const adminAPI = { usage: usageAPI, gemini: geminiAPI, antigravity: antigravityAPI, - userAttributes: userAttributesAPI + userAttributes: userAttributesAPI, + ops: opsAPI } export { @@ -52,7 +54,8 @@ export { usageAPI, geminiAPI, antigravityAPI, - userAttributesAPI + userAttributesAPI, + opsAPI } export default adminAPI diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts new file mode 100644 index 00000000..1d1453f5 --- /dev/null +++ b/frontend/src/api/admin/ops.ts @@ -0,0 +1,958 @@ +/** + * Admin Ops API endpoints (vNext) + * - Error logs list/detail + retry (client/upstream) + * - Dashboard overview (raw path) + */ + +import { apiClient } from '../client' +import type { PaginatedResponse } from '@/types' + +export type OpsRetryMode = 'client' | 'upstream' +export type OpsQueryMode = 'auto' | 'raw' | 'preagg' + +export interface OpsRequestOptions { + signal?: AbortSignal +} + +export interface OpsRetryRequest { + mode: OpsRetryMode + pinned_account_id?: number +} + +export interface OpsRetryResult { + attempt_id: number + mode: OpsRetryMode + status: 'running' | 'succeeded' | 'failed' | string + + pinned_account_id?: number | null + used_account_id?: number | null + + http_status_code: number + upstream_request_id: string + + response_preview: string + response_truncated: boolean + + error_message: string + + started_at: string + finished_at: string + duration_ms: number +} + +export interface OpsDashboardOverview { + start_time: string + end_time: string + platform: string + group_id?: number | null + + health_score?: number + + system_metrics?: OpsSystemMetricsSnapshot | null + job_heartbeats?: OpsJobHeartbeat[] | null + + success_count: number + error_count_total: number + business_limited_count: number + error_count_sla: number + request_count_total: number + request_count_sla: number + + token_consumed: number + + sla: number + error_rate: number + upstream_error_rate: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number + + qps: { + current: number + peak: number + avg: number + } + tps: { + current: number + peak: number + avg: number + } + + duration: OpsPercentiles + ttft: OpsPercentiles +} + +export interface OpsPercentiles { + p50_ms?: number | null + p90_ms?: number | null + p95_ms?: number | null + p99_ms?: number | null + avg_ms?: number | null + max_ms?: number | null +} + +export interface OpsThroughputTrendPoint { + bucket_start: string + request_count: number + token_consumed: number + qps: number + tps: number +} + +export interface OpsThroughputPlatformBreakdownItem { + platform: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputGroupBreakdownItem { + group_id: number + group_name: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputTrendResponse { + 
bucket: string + points: OpsThroughputTrendPoint[] + by_platform?: OpsThroughputPlatformBreakdownItem[] + top_groups?: OpsThroughputGroupBreakdownItem[] +} + +export type OpsRequestKind = 'success' | 'error' +export type OpsRequestDetailsKind = OpsRequestKind | 'all' +export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc' + +export interface OpsRequestDetail { + kind: OpsRequestKind + created_at: string + request_id: string + + platform?: string + model?: string + duration_ms?: number | null + status_code?: number | null + + error_id?: number | null + phase?: string + severity?: string + message?: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + stream?: boolean +} + +export interface OpsRequestDetailsParams { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + + kind?: OpsRequestDetailsKind + + platform?: string + group_id?: number | null + + user_id?: number + api_key_id?: number + account_id?: number + + model?: string + request_id?: string + q?: string + + min_duration_ms?: number + max_duration_ms?: number + + sort?: OpsRequestDetailsSort + + page?: number + page_size?: number +} + +export type OpsRequestDetailsResponse = PaginatedResponse + +export interface OpsLatencyHistogramBucket { + range: string + count: number +} + +export interface OpsLatencyHistogramResponse { + start_time: string + end_time: string + platform: string + group_id?: number | null + + total_requests: number + buckets: OpsLatencyHistogramBucket[] +} + +export interface OpsErrorTrendPoint { + bucket_start: string + error_count_total: number + business_limited_count: number + error_count_sla: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number +} + +export interface OpsErrorTrendResponse { + bucket: string + points: OpsErrorTrendPoint[] +} + +export interface OpsErrorDistributionItem { + status_code: number + total: number + sla: number + business_limited: number +} + +export interface OpsErrorDistributionResponse { + total: number + items: OpsErrorDistributionItem[] +} + +export interface OpsSystemMetricsSnapshot { + id: number + created_at: string + window_minutes: number + + cpu_usage_percent?: number | null + memory_used_mb?: number | null + memory_total_mb?: number | null + memory_usage_percent?: number | null + + db_ok?: boolean | null + redis_ok?: boolean | null + + // Config-derived limits (best-effort) for rendering "current vs max". 
+ db_max_open_conns?: number | null + redis_pool_size?: number | null + + redis_conn_total?: number | null + redis_conn_idle?: number | null + + db_conn_active?: number | null + db_conn_idle?: number | null + db_conn_waiting?: number | null + + goroutine_count?: number | null + concurrency_queue_depth?: number | null +} + +export interface OpsJobHeartbeat { + job_name: string + last_run_at?: string | null + last_success_at?: string | null + last_error_at?: string | null + last_error?: string | null + last_duration_ms?: number | null + updated_at: string +} + +export interface PlatformConcurrencyInfo { + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface GroupConcurrencyInfo { + group_id: number + group_name: string + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface AccountConcurrencyInfo { + account_id: number + account_name?: string + platform: string + group_id: number + group_name: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface OpsConcurrencyStatsResponse { + enabled: boolean + platform: Record<string, PlatformConcurrencyInfo> + group: Record<string, GroupConcurrencyInfo> + account: Record<string, AccountConcurrencyInfo> + timestamp?: string +} + +export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> { + const params: Record<string, string | number> = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + + const { data } = await apiClient.get('/admin/ops/concurrency', { params }) + return data +} + +export interface PlatformAvailability { + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface GroupAvailability { + group_id: number + group_name: string + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface AccountAvailability { + account_id: number + account_name: string + platform: string + group_id: number + group_name: string + status: string + is_available: boolean + is_rate_limited: boolean + rate_limit_reset_at?: string + rate_limit_remaining_sec?: number + is_overloaded: boolean + overload_until?: string + overload_remaining_sec?: number + has_error: boolean + error_message?: string +} + +export interface OpsAccountAvailabilityStatsResponse { + enabled: boolean + platform: Record<string, PlatformAvailability> + group: Record<string, GroupAvailability> + account: Record<string, AccountAvailability> + timestamp?: string +} + +export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> { + const params: Record<string, string | number> = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + const { data } = await apiClient.get('/admin/ops/account-availability', { params }) + return data +} + +/** + * Subscribe to realtime QPS updates via WebSocket. + * + * Note: browsers cannot set Authorization headers for WebSockets. + * We authenticate via Sec-WebSocket-Protocol using a prefixed token item: + * ["sub2api-admin", "jwt.<token>"] + */ +export interface SubscribeQPSOptions { + token?: string | null + onOpen?: () => void + onClose?: (event: CloseEvent) => void + onError?: (event: Event) => void + /** + * Called when the server closes with an application close code that indicates + * reconnecting is not useful (e.g.
feature flag disabled). + */ + onFatalClose?: (event: CloseEvent) => void + /** + * More granular status updates for UI (connecting/reconnecting/offline/etc). + */ + onStatusChange?: (status: OpsWSStatus) => void + /** + * Called when a reconnect is scheduled (helps display "retry in Xs"). + */ + onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void + wsBaseUrl?: string + /** + * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live. + * Set to 0 to disable reconnect. + */ + maxReconnectAttempts?: number + reconnectBaseDelayMs?: number + reconnectMaxDelayMs?: number + /** + * Stale connection detection (heartbeat-by-observation). + * If no messages are received within this window, the socket is closed to trigger a reconnect. + * Set to 0 to disable. + */ + staleTimeoutMs?: number + /** + * How often to check staleness. Only used when `staleTimeoutMs > 0`. + */ + staleCheckIntervalMs?: number +} + +export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed' + +export const OPS_WS_CLOSE_CODES = { + REALTIME_DISABLED: 4001 +} as const + +const OPS_WS_BASE_PROTOCOL = 'sub2api-admin' + +export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void { + let ws: WebSocket | null = null + let reconnectAttempts = 0 + const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number) + ? (options.maxReconnectAttempts as number) + : Infinity + const baseDelayMs = options.reconnectBaseDelayMs ?? 1000 + const maxDelayMs = options.reconnectMaxDelayMs ?? 30000 + let reconnectTimer: ReturnType<typeof setTimeout> | null = null + let shouldReconnect = true + let isConnecting = false + let hasConnectedOnce = false + let lastMessageAt = 0 + const staleTimeoutMs = options.staleTimeoutMs ?? 120_000 + const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000 + let staleTimer: ReturnType<typeof setInterval> | null = null + + const setStatus = (status: OpsWSStatus) => { + options.onStatusChange?.(status) + } + + const clearReconnectTimer = () => { + if (reconnectTimer) { + clearTimeout(reconnectTimer) + reconnectTimer = null + } + } + + const clearStaleTimer = () => { + if (staleTimer) { + clearInterval(staleTimer) + staleTimer = null + } + } + + const startStaleTimer = () => { + clearStaleTimer() + if (!staleTimeoutMs || staleTimeoutMs <= 0) return + staleTimer = setInterval(() => { + if (!shouldReconnect) return + if (!ws || ws.readyState !== WebSocket.OPEN) return + if (!lastMessageAt) return + const ageMs = Date.now() - lastMessageAt + if (ageMs > staleTimeoutMs) { + // Treat as a half-open connection; closing triggers the normal reconnect path. + ws.close() + } + }, staleCheckIntervalMs) + } + + const scheduleReconnect = () => { + if (!shouldReconnect) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + // If we're offline, wait for the browser to come back online.
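+ // (Reconnecting resumes via the 'online' listener registered at subscribe time, which calls connect() again.)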
+ if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) { + setStatus('offline') + return + } + + const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts) + const delay = Math.min(expDelay, maxDelayMs) + const jitter = Math.floor(Math.random() * 250) + clearReconnectTimer() + reconnectTimer = setTimeout(() => { + reconnectAttempts++ + connect() + }, delay + jitter) + options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter }) + } + + const handleOnline = () => { + if (!shouldReconnect) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + connect() + } + + const handleOffline = () => { + setStatus('offline') + } + + const connect = () => { + if (!shouldReconnect) return + if (isConnecting) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + isConnecting = true + setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting') + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host + const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`) + + // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc). + // Browsers cannot set Authorization headers for WebSockets, so we pass the token via + // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt."]. + const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim() + const protocols: string[] = [OPS_WS_BASE_PROTOCOL] + if (rawToken) protocols.push(`jwt.${rawToken}`) + + ws = new WebSocket(wsURL.toString(), protocols) + + ws.onopen = () => { + reconnectAttempts = 0 + isConnecting = false + hasConnectedOnce = true + clearReconnectTimer() + lastMessageAt = Date.now() + startStaleTimer() + setStatus('connected') + options.onOpen?.() + } + + ws.onmessage = (e) => { + try { + const data = JSON.parse(e.data) + lastMessageAt = Date.now() + onMessage(data) + } catch (err) { + console.warn('[OpsWS] Failed to parse message:', err) + } + } + + ws.onerror = (error) => { + console.error('[OpsWS] Connection error:', error) + options.onError?.(error) + } + + ws.onclose = (event) => { + isConnecting = false + options.onClose?.(event) + clearStaleTimer() + ws = null + + // If the server explicitly tells us to stop reconnecting, honor it. 
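+ // In practice this is close code 4001 (OPS_WS_CLOSE_CODES.REALTIME_DISABLED), sent e.g. when realtime monitoring is turned off.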
+ if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) { + shouldReconnect = false + clearReconnectTimer() + setStatus('closed') + options.onFatalClose?.(event) + return + } + + scheduleReconnect() + } + } + + window.addEventListener('online', handleOnline) + window.addEventListener('offline', handleOffline) + connect() + + return () => { + shouldReconnect = false + window.removeEventListener('online', handleOnline) + window.removeEventListener('offline', handleOffline) + clearReconnectTimer() + clearStaleTimer() + if (ws) ws.close() + ws = null + setStatus('closed') + } +} + +export type OpsSeverity = string +export type OpsPhase = string + +export type AlertSeverity = 'critical' | 'warning' | 'info' +export type ThresholdMode = 'count' | 'percentage' | 'both' +export type MetricType = + | 'success_rate' + | 'error_rate' + | 'upstream_error_rate' + | 'p95_latency_ms' + | 'p99_latency_ms' + | 'cpu_usage_percent' + | 'memory_usage_percent' + | 'concurrency_queue_depth' + | 'group_available_accounts' + | 'group_available_ratio' + | 'group_rate_limit_ratio' + | 'account_rate_limited_count' + | 'account_error_count' + | 'account_error_ratio' + | 'overload_account_count' +export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!=' + +export interface AlertRule { + id?: number + name: string + description?: string + enabled: boolean + metric_type: MetricType + operator: Operator + threshold: number + window_minutes: number + sustained_minutes: number + severity: OpsSeverity + cooldown_minutes: number + notify_email: boolean + filters?: Record + created_at?: string + updated_at?: string + last_triggered_at?: string | null +} + +export interface AlertEvent { + id: number + rule_id: number + severity: OpsSeverity | string + status: 'firing' | 'resolved' | string + title?: string + description?: string + metric_value?: number + threshold_value?: number + dimensions?: Record + fired_at: string + resolved_at?: string | null + email_sent: boolean + created_at: string +} + +export interface EmailNotificationConfig { + alert: { + enabled: boolean + recipients: string[] + min_severity: AlertSeverity | '' + rate_limit_per_hour: number + batching_window_seconds: number + include_resolved_alerts: boolean + } + report: { + enabled: boolean + recipients: string[] + daily_summary_enabled: boolean + daily_summary_schedule: string + weekly_summary_enabled: boolean + weekly_summary_schedule: string + error_digest_enabled: boolean + error_digest_schedule: string + error_digest_min_count: number + account_health_enabled: boolean + account_health_schedule: string + account_health_error_rate_threshold: number + } +} + +export interface OpsDistributedLockSettings { + enabled: boolean + key: string + ttl_seconds: number +} + +export interface OpsAlertRuntimeSettings { + evaluation_interval_seconds: number + distributed_lock: OpsDistributedLockSettings + silencing: { + enabled: boolean + global_until_rfc3339: string + global_reason: string + entries?: Array<{ + rule_id?: number + severities?: Array + until_rfc3339: string + reason: string + }> + } +} + +export interface OpsAdvancedSettings { + data_retention: OpsDataRetentionSettings + aggregation: OpsAggregationSettings +} + +export interface OpsDataRetentionSettings { + cleanup_enabled: boolean + cleanup_schedule: string + error_log_retention_days: number + minute_metrics_retention_days: number + hourly_metrics_retention_days: number +} + +export interface OpsAggregationSettings { + aggregation_enabled: boolean +} + 
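+// Illustrative usage sketch for subscribeQPS above (assumes a Vue component holding qps/wsStatus refs; not part of this module): +//   let stop: (() => void) | null = null +//   onMounted(() => { stop = subscribeQPS((msg) => { qps.value = msg?.qps }, { onStatusChange: (s) => { wsStatus.value = s }, onFatalClose: () => { realtimeDisabled.value = true } }) }) +//   onUnmounted(() => stop?.()) +// The payload field (msg.qps) is an assumption; rely on whatever shape the server actually pushes. +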
+export interface OpsErrorLog { + id: number + created_at: string + phase: OpsPhase + type: string + severity: OpsSeverity + status_code: number + platform: string + model: string + latency_ms?: number | null + client_request_id: string + request_id: string + message: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + client_ip?: string | null + request_path?: string + stream?: boolean +} + +export interface OpsErrorDetail extends OpsErrorLog { + error_body: string + user_agent: string + + // Upstream context (optional; enriched by gateway services) + upstream_status_code?: number | null + upstream_error_message?: string + upstream_error_detail?: string + upstream_errors?: string + + auth_latency_ms?: number | null + routing_latency_ms?: number | null + upstream_latency_ms?: number | null + response_latency_ms?: number | null + time_to_first_token_ms?: number | null + + request_body: string + request_body_truncated: boolean + request_body_bytes?: number | null + + is_business_limited: boolean +} + +export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog> + +export async function getDashboardOverview( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsDashboardOverview> { + const { data } = await apiClient.get('/admin/ops/dashboard/overview', { + params, + signal: options.signal + }) + return data +} + +export async function getThroughputTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsThroughputTrendResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/throughput-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getLatencyHistogram( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsLatencyHistogramResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/latency-histogram', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsErrorTrendResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/error-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorDistribution( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsErrorDistributionResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/error-distribution', { + params, + signal: options.signal + }) + return data +} + +export async function listErrorLogs(params: { + page?: number + page_size?: number + time_range?: string + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + account_id?: number | null + phase?: string + q?: string + status_codes?: string +}): Promise<OpsErrorLogsResponse> { + const { data } = await apiClient.get('/admin/ops/errors', { params }) + 
return data +} + +export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> { + const { data } = await apiClient.get(`/admin/ops/errors/${id}`) + return data +} + +export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> { + const { data } = await apiClient.post(`/admin/ops/errors/${id}/retry`, req) + return data +} + +export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> { + const { data } = await apiClient.get('/admin/ops/requests', { params }) + return data +} + +// Alert rules +export async function listAlertRules(): Promise<AlertRule[]> { + const { data } = await apiClient.get('/admin/ops/alert-rules') + return data +} + +export async function createAlertRule(rule: AlertRule): Promise<AlertRule> { + const { data } = await apiClient.post('/admin/ops/alert-rules', rule) + return data +} + +export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> { + const { data } = await apiClient.put(`/admin/ops/alert-rules/${id}`, rule) + return data +} + +export async function deleteAlertRule(id: number): Promise<void> { + await apiClient.delete(`/admin/ops/alert-rules/${id}`) +} + +export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> { + const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } }) + return data +} + +// Email notification config +export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> { + const { data } = await apiClient.get('/admin/ops/email-notification/config') + return data +} + +export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> { + const { data } = await apiClient.put('/admin/ops/email-notification/config', config) + return data +} + +// Runtime settings (DB-backed) +export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> { + const { data } = await apiClient.get('/admin/ops/runtime/alert') + return data +} + +export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> { + const { data } = await apiClient.put('/admin/ops/runtime/alert', config) + return data +} + +// Advanced settings (DB-backed) +export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> { + const { data } = await apiClient.get('/admin/ops/advanced-settings') + return data +} + +export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> { + const { data } = await apiClient.put('/admin/ops/advanced-settings', config) + return data +} + +export const opsAPI = { + getDashboardOverview, + getThroughputTrend, + getLatencyHistogram, + getErrorTrend, + getErrorDistribution, + getConcurrencyStats, + getAccountAvailabilityStats, + subscribeQPS, + listErrorLogs, + getErrorLogDetail, + retryErrorRequest, + listRequestDetails, + listAlertRules, + createAlertRule, + updateAlertRule, + deleteAlertRule, + listAlertEvents, + getEmailNotificationConfig, + updateEmailNotificationConfig, + getAlertRuntimeSettings, + updateAlertRuntimeSettings, + getAdvancedSettings, + updateAdvancedSettings +} + +export default opsAPI diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts index fc68eee4..913c9652 100644 --- a/frontend/src/api/admin/settings.ts +++ b/frontend/src/api/admin/settings.ts @@ -35,14 +35,29 @@ export interface SystemSettings { turnstile_enabled: boolean turnstile_site_key: string turnstile_secret_key_configured: boolean - // LinuxDo Connect OAuth 登录(终端用户 SSO) + + // LinuxDo Connect OAuth settings linuxdo_connect_enabled: boolean linuxdo_connect_client_id: string linuxdo_connect_client_secret_configured: boolean
linuxdo_connect_redirect_url: string + + // Model fallback configuration + enable_model_fallback: boolean + fallback_model_anthropic: string + fallback_model_openai: string + fallback_model_gemini: string + fallback_model_antigravity: string + // Identity patch configuration (Claude -> Gemini) enable_identity_patch: boolean identity_patch_prompt: string + + // Ops Monitoring (vNext) + ops_monitoring_enabled: boolean + ops_realtime_monitoring_enabled: boolean + ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds: number } export interface UpdateSettingsRequest { @@ -71,8 +86,17 @@ export interface UpdateSettingsRequest { linuxdo_connect_client_id?: string linuxdo_connect_client_secret?: string linuxdo_connect_redirect_url?: string + enable_model_fallback?: boolean + fallback_model_anthropic?: string + fallback_model_openai?: string + fallback_model_gemini?: string + fallback_model_antigravity?: string enable_identity_patch?: boolean identity_patch_prompt?: string + ops_monitoring_enabled?: boolean + ops_realtime_monitoring_enabled?: boolean + ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds?: number } /** diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 4e53069a..3827498b 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -80,9 +80,45 @@ apiClient.interceptors.response.use( return response }, (error: AxiosError>) => { + // Request cancellation: keep the original axios cancellation error so callers can ignore it. + // Otherwise we'd misclassify it as a generic "network error". + if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) { + return Promise.reject(error) + } + // Handle common errors if (error.response) { const { status, data } = error.response + const url = String(error.config?.url || '') + + // Validate `data` shape to avoid HTML error pages breaking our error handling. + const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record + + // Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away + // from ops pages to avoid broken UI states. 
+ if (status === 404 && apiData.message === 'Ops monitoring is disabled') { + try { + localStorage.setItem('ops_monitoring_enabled_cached', 'false') + } catch { + // ignore localStorage failures + } + try { + window.dispatchEvent(new CustomEvent('ops-monitoring-disabled')) + } catch { + // ignore event failures + } + + if (window.location.pathname.startsWith('/admin/ops')) { + window.location.href = '/admin/settings' + } + + return Promise.reject({ + status, + code: 'OPS_DISABLED', + message: apiData.message || error.message, + url + }) + } // 401: Unauthorized - clear token and redirect to login if (status === 401) { @@ -113,8 +149,8 @@ apiClient.interceptors.response.use( // Return structured error return Promise.reject({ status, - code: data?.code, - message: data?.message || error.message + code: apiData.code, + message: apiData.message || apiData.detail || error.message }) } diff --git a/frontend/src/components/common/HelpTooltip.vue b/frontend/src/components/common/HelpTooltip.vue new file mode 100644 index 00000000..7679ced4 --- /dev/null +++ b/frontend/src/components/common/HelpTooltip.vue @@ -0,0 +1,44 @@ + + + + diff --git a/frontend/src/components/common/Select.vue b/frontend/src/components/common/Select.vue index 74f564f5..c90d0201 100644 --- a/frontend/src/components/common/Select.vue +++ b/frontend/src/components/common/Select.vue @@ -67,12 +67,13 @@ :aria-selected="isSelected(option)" :aria-disabled="isOptionDisabled(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)" - @mouseenter="focusedIndex = index" + @mouseenter="handleOptionMouseEnter(option, index)" :class="[ 'select-option', + isGroupHeaderOption(option) && 'select-option-group', isSelected(option) && 'select-option-selected', - isOptionDisabled(option) && 'select-option-disabled', - focusedIndex === index && 'select-option-focused' + isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled', + focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused' ]" > @@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { return false } +const isGroupHeaderOption = (option: any): boolean => { + if (typeof option === 'object' && option !== null) { + return option.kind === 'group' + } + return false +} + const selectedOption = computed(() => { return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null }) @@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { return getOptionValue(option) === props.modelValue } +const findNextEnabledIndex = (startIndex: number): number => { + const opts = filteredOptions.value + if (opts.length === 0) return -1 + for (let offset = 0; offset < opts.length; offset++) { + const idx = (startIndex + offset) % opts.length + if (!isOptionDisabled(opts[idx])) return idx + } + return -1 +} + +const findPrevEnabledIndex = (startIndex: number): number => { + const opts = filteredOptions.value + if (opts.length === 0) return -1 + for (let offset = 0; offset < opts.length; offset++) { + const idx = (startIndex - offset + opts.length) % opts.length + if (!isOptionDisabled(opts[idx])) return idx + } + return -1 +} + +const handleOptionMouseEnter = (option: any, index: number) => { + if (isOptionDisabled(option) || isGroupHeaderOption(option)) return + focusedIndex.value = index +} + // Update trigger rect periodically while open to follow scroll/resize const updateTriggerRect = () => { if (containerRef.value) { @@ -259,8 +292,15 @@ watch(isOpen, (open) => { if (open) { 
calculateDropdownPosition() // Reset focused index to current selection or first item - const selectedIdx = filteredOptions.value.findIndex(isSelected) - focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 + if (filteredOptions.value.length === 0) { + focusedIndex.value = -1 + } else { + const selectedIdx = filteredOptions.value.findIndex(isSelected) + const initialIdx = selectedIdx >= 0 ? selectedIdx : 0 + focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx]) + ? findNextEnabledIndex(initialIdx + 1) + : initialIdx + } if (props.searchable) { nextTick(() => searchInputRef.value?.focus()) @@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { switch (e.key) { case 'ArrowDown': e.preventDefault() - focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length - scrollToFocused() + focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1) + if (focusedIndex.value >= 0) scrollToFocused() break case 'ArrowUp': e.preventDefault() - focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length - scrollToFocused() + focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1) + if (focusedIndex.value >= 0) scrollToFocused() break case 'Enter': e.preventDefault() @@ -441,6 +481,17 @@ onUnmounted(() => { @apply cursor-not-allowed opacity-40; } +.select-dropdown-portal .select-option-group { + @apply cursor-default select-none; + @apply bg-gray-50 dark:bg-dark-900; + @apply text-[11px] font-bold uppercase tracking-wider; + @apply text-gray-500 dark:text-gray-400; +} + +.select-dropdown-portal .select-option-group:hover { + @apply bg-gray-50 dark:bg-dark-900; +} + .select-dropdown-portal .select-option-label { @apply flex-1 min-w-0 truncate text-left; } diff --git a/frontend/src/components/keys/UseKeyModal.vue b/frontend/src/components/keys/UseKeyModal.vue index 546a53ab..58f42ae6 100644 --- a/frontend/src/components/keys/UseKeyModal.vue +++ b/frontend/src/components/keys/UseKeyModal.vue @@ -28,8 +28,8 @@ {{ platformDescription }}

- -
+ +