diff --git a/.gitignore b/.gitignore index a50f3ecc..fe715240 100644 --- a/.gitignore +++ b/.gitignore @@ -126,6 +126,4 @@ backend/cmd/server/server deploy/docker-compose.override.yml .gocache/ vite.config.js -!docs/ docs/* -!docs/dependency-security.md diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 00000000..c1c2a854 --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,2 @@ +.cache/ +.DS_Store diff --git a/backend/.golangci.yml b/backend/.golangci.yml index 52072b16..3ec692a8 100644 --- a/backend/.golangci.yml +++ b/backend/.golangci.yml @@ -18,6 +18,12 @@ linters: list-mode: original files: - "**/internal/service/**" + - "!**/internal/service/ops_aggregation_service.go" + - "!**/internal/service/ops_alert_evaluator_service.go" + - "!**/internal/service/ops_cleanup_service.go" + - "!**/internal/service/ops_metrics_collector.go" + - "!**/internal/service/ops_scheduled_report_service.go" + - "!**/internal/service/wire.go" deny: - pkg: github.com/Wei-Shaw/sub2api/internal/repository desc: "service must not import repository" diff --git a/backend/cmd/server/wire.go b/backend/cmd/server/wire.go index 9447de45..85a9b785 100644 --- a/backend/cmd/server/wire.go +++ b/backend/cmd/server/wire.go @@ -62,6 +62,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, + opsScheduledReport *service.OpsScheduledReportService, tokenRefresh *service.TokenRefreshService, accountExpiry *service.AccountExpiryService, pricing *service.PricingService, @@ -81,6 +86,36 @@ func provideCleanup( name string fn func() error }{ + {"OpsScheduledReportService", func() error { + if opsScheduledReport != nil { + opsScheduledReport.Stop() + } + return nil + }}, + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go index bf015117..e66e0e05 100644 --- a/backend/cmd/server/wire_gen.go +++ b/backend/cmd/server/wire_gen.go @@ -120,7 +120,22 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { proxyHandler := admin.NewProxyHandler(adminService) adminRedeemHandler := admin.NewRedeemHandler(adminService) promoHandler := admin.NewPromoHandler(promoService) - settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService) + opsRepository := repository.NewOpsRepository(db) + pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig) + pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) + if err != nil { + return nil, err + } + billingService := service.NewBillingService(configConfig, pricingService) + identityCache := repository.NewIdentityCache(redisClient) + identityService := service.NewIdentityService(identityCache) + deferredService := 
service.ProvideDeferredService(accountRepository, timingWheelService) + gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService) + openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService) + geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) + opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService) + settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService, opsService) + opsHandler := admin.NewOpsHandler(opsService) updateCache := repository.NewUpdateCache(redisClient) gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig) serviceBuildInfo := provideServiceBuildInfo(buildInfo) @@ -132,31 +147,24 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { userAttributeValueRepository := repository.NewUserAttributeValueRepository(client) userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository) userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService) - adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) - pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig) - pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) - if err != nil { - return nil, err - } - billingService := service.NewBillingService(configConfig, pricingService) - identityCache := repository.NewIdentityCache(redisClient) - identityService := service.NewIdentityService(identityCache) - deferredService := service.ProvideDeferredService(accountRepository, timingWheelService) - gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService) - geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) + adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) gatewayHandler := handler.NewGatewayHandler(gatewayService, 
geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig) - openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService) openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig) handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo) handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler) jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService) adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService) apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig) - engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, settingService, redisClient) + engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient) httpServer := server.ProvideHTTPServer(configConfig, engine) + opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig) + opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig) + opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig) + opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig) + opsScheduledReportService := service.ProvideOpsScheduledReportService(opsService, userService, emailService, redisClient, configConfig) tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig) accountExpiryService := service.ProvideAccountExpiryService(accountRepository) - v := provideCleanup(client, redisClient, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) + v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, opsScheduledReportService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) application := &Application{ Server: httpServer, Cleanup: v, @@ -181,6 +189,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, + opsScheduledReport *service.OpsScheduledReportService, tokenRefresh *service.TokenRefreshService, 
accountExpiry *service.AccountExpiryService, pricing *service.PricingService, @@ -199,6 +212,36 @@ func provideCleanup( name string fn func() error }{ + {"OpsScheduledReportService", func() error { + if opsScheduledReport != nil { + opsScheduledReport.Stop() + } + return nil + }}, + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil diff --git a/backend/go.mod b/backend/go.mod index 82a8e88e..4ac6ba14 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -8,9 +8,11 @@ require ( github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 github.com/google/wire v0.7.0 + github.com/gorilla/websocket v1.5.3 github.com/imroc/req/v3 v3.57.0 github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.17.2 + github.com/shirou/gopsutil/v4 v4.25.6 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0 @@ -106,9 +108,9 @@ require ( github.com/quic-go/quic-go v0.57.1 // indirect github.com/refraction-networking/utls v1.8.1 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/shirou/gopsutil/v4 v4.25.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 0fd47498..415e73a7 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -117,6 +117,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4= github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -224,6 +226,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 7bdd8fe3..ffca51df 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -43,6 +43,7 @@ type Config struct { Turnstile TurnstileConfig `mapstructure:"turnstile"` Database DatabaseConfig `mapstructure:"database"` Redis RedisConfig `mapstructure:"redis"` + Ops OpsConfig `mapstructure:"ops"` JWT JWTConfig `mapstructure:"jwt"` LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"` Default DefaultConfig `mapstructure:"default"` @@ -60,14 +61,6 @@ type Config struct { Update UpdateConfig `mapstructure:"update"` } -// UpdateConfig 在线更新相关配置 -type UpdateConfig struct { - // ProxyURL 用于访问 GitHub 的代理地址 - // 支持 http/https/socks5/socks5h 协议 - // 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080" - ProxyURL string `mapstructure:"proxy_url"` -} - type GeminiConfig struct { OAuth GeminiOAuthConfig `mapstructure:"oauth"` Quota GeminiQuotaConfig `mapstructure:"quota"` @@ -90,6 +83,33 @@ type GeminiTierQuotaConfig struct { CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"` } +type UpdateConfig struct { + // ProxyURL 用于访问 GitHub 的代理地址 + // 支持 http/https/socks5/socks5h 协议 + // 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080" + ProxyURL string `mapstructure:"proxy_url"` +} + +type LinuxDoConnectConfig struct { + Enabled bool `mapstructure:"enabled"` + ClientID string `mapstructure:"client_id"` + ClientSecret string `mapstructure:"client_secret"` + AuthorizeURL string `mapstructure:"authorize_url"` + TokenURL string `mapstructure:"token_url"` + UserInfoURL string `mapstructure:"userinfo_url"` + Scopes string `mapstructure:"scopes"` + RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记) + FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback) + TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none + UsePKCE bool `mapstructure:"use_pkce"` + + // 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。 + // 为空时,服务端会尝试一组常见字段名。 + UserInfoEmailPath string `mapstructure:"userinfo_email_path"` + UserInfoIDPath string `mapstructure:"userinfo_id_path"` + UserInfoUsernamePath string `mapstructure:"userinfo_username_path"` +} + // TokenRefreshConfig OAuth token自动刷新配置 type TokenRefreshConfig struct { // 是否启用自动刷新 @@ -332,6 +352,47 @@ func (r *RedisConfig) Address() string { return fmt.Sprintf("%s:%d", r.Host, r.Port) } +type OpsConfig struct { + // Enabled controls whether ops features should run. + // + // NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off. + // This config flag is the "hard switch" for deployments that want to disable ops completely. + Enabled bool `mapstructure:"enabled"` + + // UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries. + UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"` + + // Cleanup controls periodic deletion of old ops data to prevent unbounded growth. + Cleanup OpsCleanupConfig `mapstructure:"cleanup"` + + // MetricsCollectorCache controls Redis caching for expensive per-window collector queries. + MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"` + + // Pre-aggregation configuration. 
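+ // Toggles the background OpsAggregationService wired in cmd/server, which produces the pre-aggregated rollups.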
+ Aggregation OpsAggregationConfig `mapstructure:"aggregation"` +} + +type OpsCleanupConfig struct { + Enabled bool `mapstructure:"enabled"` + Schedule string `mapstructure:"schedule"` + + // Retention days (0 disables that cleanup target). + // + // vNext requirement: default 30 days across ops datasets. + ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"` + MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"` +} + +type OpsAggregationConfig struct { + Enabled bool `mapstructure:"enabled"` +} + +type OpsMetricsCollectorCacheConfig struct { + Enabled bool `mapstructure:"enabled"` + TTL time.Duration `mapstructure:"ttl"` +} + type JWTConfig struct { Secret string `mapstructure:"secret"` ExpireHour int `mapstructure:"expire_hour"` @@ -341,30 +402,6 @@ type TurnstileConfig struct { Required bool `mapstructure:"required"` } -// LinuxDoConnectConfig 用于 LinuxDo Connect OAuth 登录(终端用户 SSO)。 -// -// 注意:这与上游账号的 OAuth(例如 OpenAI/Gemini 账号接入)不是一回事。 -// 这里是用于登录 Sub2API 本身的用户体系。 -type LinuxDoConnectConfig struct { - Enabled bool `mapstructure:"enabled"` - ClientID string `mapstructure:"client_id"` - ClientSecret string `mapstructure:"client_secret"` - AuthorizeURL string `mapstructure:"authorize_url"` - TokenURL string `mapstructure:"token_url"` - UserInfoURL string `mapstructure:"userinfo_url"` - Scopes string `mapstructure:"scopes"` - RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记) - FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback) - TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none - UsePKCE bool `mapstructure:"use_pkce"` - - // 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。 - // 为空时,服务端会尝试一组常见字段名。 - UserInfoEmailPath string `mapstructure:"userinfo_email_path"` - UserInfoIDPath string `mapstructure:"userinfo_id_path"` - UserInfoUsernamePath string `mapstructure:"userinfo_username_path"` -} - type DefaultConfig struct { AdminEmail string `mapstructure:"admin_email"` AdminPassword string `mapstructure:"admin_password"` @@ -531,81 +568,6 @@ func Load() (*Config, error) { return &cfg, nil } -// ValidateAbsoluteHTTPURL 校验一个绝对 http(s) URL(禁止 fragment)。 -func ValidateAbsoluteHTTPURL(raw string) error { - raw = strings.TrimSpace(raw) - if raw == "" { - return fmt.Errorf("empty url") - } - u, err := url.Parse(raw) - if err != nil { - return err - } - if !u.IsAbs() { - return fmt.Errorf("must be absolute") - } - if !isHTTPScheme(u.Scheme) { - return fmt.Errorf("unsupported scheme: %s", u.Scheme) - } - if strings.TrimSpace(u.Host) == "" { - return fmt.Errorf("missing host") - } - if u.Fragment != "" { - return fmt.Errorf("must not include fragment") - } - return nil -} - -// ValidateFrontendRedirectURL 校验前端回调地址: -// - 允许同源相对路径(以 / 开头) -// - 或绝对 http(s) URL(禁止 fragment) -func ValidateFrontendRedirectURL(raw string) error { - raw = strings.TrimSpace(raw) - if raw == "" { - return fmt.Errorf("empty url") - } - if strings.ContainsAny(raw, "\r\n") { - return fmt.Errorf("contains invalid characters") - } - if strings.HasPrefix(raw, "/") { - if strings.HasPrefix(raw, "//") { - return fmt.Errorf("must not start with //") - } - return nil - } - u, err := url.Parse(raw) - if err != nil { - return err - } - if !u.IsAbs() { - return fmt.Errorf("must be absolute http(s) url or relative path") - } - if !isHTTPScheme(u.Scheme) { - return fmt.Errorf("unsupported scheme: %s", 
u.Scheme) - } - if strings.TrimSpace(u.Host) == "" { - return fmt.Errorf("missing host") - } - if u.Fragment != "" { - return fmt.Errorf("must not include fragment") - } - return nil -} - -func isHTTPScheme(scheme string) bool { - return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https") -} - -func warnIfInsecureURL(field, raw string) { - u, err := url.Parse(strings.TrimSpace(raw)) - if err != nil { - return - } - if strings.EqualFold(u.Scheme, "http") { - log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field) - } -} - func setDefaults() { viper.SetDefault("run_mode", RunModeStandard) @@ -655,7 +617,7 @@ func setDefaults() { // Turnstile viper.SetDefault("turnstile.required", false) - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 viper.SetDefault("linuxdo_connect.enabled", false) viper.SetDefault("linuxdo_connect.client_id", "") viper.SetDefault("linuxdo_connect.client_secret", "") @@ -694,6 +656,20 @@ func setDefaults() { viper.SetDefault("redis.pool_size", 128) viper.SetDefault("redis.min_idle_conns", 10) + // Ops (vNext) + viper.SetDefault("ops.enabled", true) + viper.SetDefault("ops.use_preaggregated_tables", false) + viper.SetDefault("ops.cleanup.enabled", true) + viper.SetDefault("ops.cleanup.schedule", "0 2 * * *") + // Retention days: vNext defaults to 30 days across ops datasets. + viper.SetDefault("ops.cleanup.error_log_retention_days", 30) + viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30) + viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30) + viper.SetDefault("ops.aggregation.enabled", true) + viper.SetDefault("ops.metrics_collector_cache.enabled", true) + // TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits. 
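+ // 65s = the 1m collection interval plus a 5s safety margin.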
+ viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second) + // JWT viper.SetDefault("jwt.secret", "") viper.SetDefault("jwt.expire_hour", 24) @@ -750,7 +726,7 @@ func setDefaults() { // Gateway viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久 - viper.SetDefault("gateway.log_upstream_error_body", false) + viper.SetDefault("gateway.log_upstream_error_body", true) viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048) viper.SetDefault("gateway.inject_beta_for_apikey", false) viper.SetDefault("gateway.failover_on_400", false) @@ -766,7 +742,7 @@ func setDefaults() { viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求) viper.SetDefault("gateway.stream_data_interval_timeout", 180) viper.SetDefault("gateway.stream_keepalive_interval", 10) - viper.SetDefault("gateway.max_line_size", 40*1024*1024) + viper.SetDefault("gateway.max_line_size", 10*1024*1024) viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3) viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second) viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second) @@ -789,10 +765,6 @@ func setDefaults() { viper.SetDefault("gemini.oauth.client_secret", "") viper.SetDefault("gemini.oauth.scopes", "") viper.SetDefault("gemini.quota.policy", "") - - // Update - 在线更新配置 - // 代理地址为空表示直连 GitHub(适用于海外服务器) - viper.SetDefault("update.proxy_url", "") } func (c *Config) Validate() error { @@ -833,7 +805,8 @@ func (c *Config) Validate() error { if method == "none" && !c.LinuxDo.UsePKCE { return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none") } - if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && strings.TrimSpace(c.LinuxDo.ClientSecret) == "" { + if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && + strings.TrimSpace(c.LinuxDo.ClientSecret) == "" { return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic") } if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" { @@ -1048,6 +1021,21 @@ func (c *Config) Validate() error { if c.Gateway.Scheduling.SlotCleanupInterval < 0 { return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative") } + if c.Ops.MetricsCollectorCache.TTL < 0 { + return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative") + } + if c.Ops.Cleanup.ErrorLogRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative") + } + if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" { + return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true") + } if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 { return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds") } @@ -1124,3 +1112,77 @@ func GetServerAddress() string { port := v.GetInt("server.port") return fmt.Sprintf("%s:%d", host, port) } + +// ValidateAbsoluteHTTPURL 验证是否为有效的绝对 HTTP(S) URL +func ValidateAbsoluteHTTPURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + 
return fmt.Errorf("empty url") + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// ValidateFrontendRedirectURL 验证前端重定向 URL(可以是绝对 URL 或相对路径) +func ValidateFrontendRedirectURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + return fmt.Errorf("empty url") + } + if strings.ContainsAny(raw, "\r\n") { + return fmt.Errorf("contains invalid characters") + } + if strings.HasPrefix(raw, "/") { + if strings.HasPrefix(raw, "//") { + return fmt.Errorf("must not start with //") + } + return nil + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute http(s) url or relative path") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// isHTTPScheme 检查是否为 HTTP 或 HTTPS 协议 +func isHTTPScheme(scheme string) bool { + return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https") +} + +func warnIfInsecureURL(field, raw string) { + u, err := url.Parse(strings.TrimSpace(raw)) + if err != nil { + return + } + if strings.EqualFold(u.Scheme, "http") { + log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field) + } +} diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go new file mode 100644 index 00000000..1e33ddd5 --- /dev/null +++ b/backend/internal/handler/admin/ops_alerts_handler.go @@ -0,0 +1,432 @@ +package admin + +import ( + "encoding/json" + "fmt" + "math" + "net/http" + "strconv" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gin-gonic/gin/binding" +) + +var validOpsAlertMetricTypes = []string{ + "success_rate", + "error_rate", + "upstream_error_rate", + "p95_latency_ms", + "p99_latency_ms", + "cpu_usage_percent", + "memory_usage_percent", + "concurrency_queue_depth", +} + +var validOpsAlertMetricTypeSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertMetricTypes)) + for _, v := range validOpsAlertMetricTypes { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="} + +var validOpsAlertOperatorSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertOperators)) + for _, v := range validOpsAlertOperators { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"} + +var validOpsAlertSeveritySet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertSeverities)) + for _, v := range validOpsAlertSeverities { + set[v] = struct{}{} + } + return set +}() + +type opsAlertRuleValidatedInput struct { + Name string + MetricType string + Operator string + Threshold float64 + + Severity string + + WindowMinutes int + SustainedMinutes int + CooldownMinutes int + + Enabled bool + NotifyEmail bool + + WindowProvided bool + SustainedProvided bool 
+ CooldownProvided bool + SeverityProvided bool + EnabledProvided bool + NotifyProvided bool +} + +func isPercentOrRateMetric(metricType string) bool { + switch metricType { + case "success_rate", + "error_rate", + "upstream_error_rate", + "cpu_usage_percent", + "memory_usage_percent": + return true + default: + return false + } +} + +func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) { + if raw == nil { + return nil, fmt.Errorf("invalid request body") + } + + requiredFields := []string{"name", "metric_type", "operator", "threshold"} + for _, field := range requiredFields { + if _, ok := raw[field]; !ok { + return nil, fmt.Errorf("%s is required", field) + } + } + + var name string + if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" { + return nil, fmt.Errorf("name is required") + } + name = strings.TrimSpace(name) + + var metricType string + if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" { + return nil, fmt.Errorf("metric_type is required") + } + metricType = strings.TrimSpace(metricType) + if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok { + return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", ")) + } + + var operator string + if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" { + return nil, fmt.Errorf("operator is required") + } + operator = strings.TrimSpace(operator) + if _, ok := validOpsAlertOperatorSet[operator]; !ok { + return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", ")) + } + + var threshold float64 + if err := json.Unmarshal(raw["threshold"], &threshold); err != nil { + return nil, fmt.Errorf("threshold must be a number") + } + if math.IsNaN(threshold) || math.IsInf(threshold, 0) { + return nil, fmt.Errorf("threshold must be a finite number") + } + if isPercentOrRateMetric(metricType) { + if threshold < 0 || threshold > 100 { + return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType) + } + } else if threshold < 0 { + return nil, fmt.Errorf("threshold must be >= 0") + } + + validated := &opsAlertRuleValidatedInput{ + Name: name, + MetricType: metricType, + Operator: operator, + Threshold: threshold, + } + + if v, ok := raw["severity"]; ok { + validated.SeverityProvided = true + var sev string + if err := json.Unmarshal(v, &sev); err != nil { + return nil, fmt.Errorf("severity must be a string") + } + sev = strings.ToUpper(strings.TrimSpace(sev)) + if sev != "" { + if _, ok := validOpsAlertSeveritySet[sev]; !ok { + return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", ")) + } + validated.Severity = sev + } + } + if validated.Severity == "" { + validated.Severity = "P2" + } + + if v, ok := raw["enabled"]; ok { + validated.EnabledProvided = true + if err := json.Unmarshal(v, &validated.Enabled); err != nil { + return nil, fmt.Errorf("enabled must be a boolean") + } + } else { + validated.Enabled = true + } + + if v, ok := raw["notify_email"]; ok { + validated.NotifyProvided = true + if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil { + return nil, fmt.Errorf("notify_email must be a boolean") + } + } else { + validated.NotifyEmail = true + } + + if v, ok := raw["window_minutes"]; ok { + validated.WindowProvided = true + if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil { + return 
nil, fmt.Errorf("window_minutes must be an integer") + } + switch validated.WindowMinutes { + case 1, 5, 60: + default: + return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60") + } + } else { + validated.WindowMinutes = 1 + } + + if v, ok := raw["sustained_minutes"]; ok { + validated.SustainedProvided = true + if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil { + return nil, fmt.Errorf("sustained_minutes must be an integer") + } + if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 { + return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440") + } + } else { + validated.SustainedMinutes = 1 + } + + if v, ok := raw["cooldown_minutes"]; ok { + validated.CooldownProvided = true + if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil { + return nil, fmt.Errorf("cooldown_minutes must be an integer") + } + if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 { + return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440") + } + } else { + validated.CooldownMinutes = 0 + } + + return validated, nil +} + +// ListAlertRules returns all ops alert rules. +// GET /api/v1/admin/ops/alert-rules +func (h *OpsHandler) ListAlertRules(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + rules, err := h.opsService.ListAlertRules(c.Request.Context()) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, rules) +} + +// CreateAlertRule creates an ops alert rule. +// POST /api/v1/admin/ops/alert-rules +func (h *OpsHandler) CreateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, created) +} + +// UpdateAlertRule updates an existing ops alert rule. 
+// PUT /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) UpdateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.ID = id + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, updated) +} + +// DeleteAlertRule deletes an ops alert rule. +// DELETE /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) DeleteAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"deleted": true}) +} + +// ListAlertEvents lists recent ops alert events. +// GET /api/v1/admin/ops/alert-events +func (h *OpsHandler) ListAlertEvents(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + limit := 100 + if raw := strings.TrimSpace(c.Query("limit")); raw != "" { + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + filter := &service.OpsAlertEventFilter{ + Limit: limit, + Status: strings.TrimSpace(c.Query("status")), + Severity: strings.TrimSpace(c.Query("severity")), + } + + // Optional global filter support (platform/group/time range). 
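+ // Time filters below are applied only when start_time/end_time/time_range is explicitly provided.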
+ if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil { + // Only apply when explicitly provided to avoid surprising default narrowing. + if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" { + filter.StartTime = &startTime + filter.EndTime = &endTime + } + } else { + response.BadRequest(c, err.Error()) + return + } + + events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, events) +} diff --git a/backend/internal/handler/admin/ops_dashboard_handler.go b/backend/internal/handler/admin/ops_dashboard_handler.go new file mode 100644 index 00000000..2c87f734 --- /dev/null +++ b/backend/internal/handler/admin/ops_dashboard_handler.go @@ -0,0 +1,243 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetDashboardOverview returns vNext ops dashboard overview (raw path). +// GET /api/v1/admin/ops/dashboard/overview +func (h *OpsHandler) GetDashboardOverview(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardThroughputTrend returns throughput time series (raw path). 
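+// Bucket size is derived from the requested window via pickThroughputBucketSeconds (60s / 5m / 1h).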
+// GET /api/v1/admin/ops/dashboard/throughput-trend +func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests). +// GET /api/v1/admin/ops/dashboard/latency-histogram +func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorTrend returns error counts time series (raw path). 
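+// Shares the same filters and bucket sizing as the throughput trend.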
+// GET /api/v1/admin/ops/dashboard/error-trend +func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorDistribution returns error distribution by status code (raw path). +// GET /api/v1/admin/ops/dashboard/error-distribution +func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +func pickThroughputBucketSeconds(window time.Duration) int { + // Keep buckets predictable and avoid huge responses. + switch { + case window <= 2*time.Hour: + return 60 + case window <= 24*time.Hour: + return 300 + default: + return 3600 + } +} + +func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.Query("mode")) + if raw == "" { + // Empty means "use server default" (DB setting ops_query_mode_default). 
+ return "" + } + return service.ParseOpsQueryMode(raw) +} diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go new file mode 100644 index 00000000..bff7426a --- /dev/null +++ b/backend/internal/handler/admin/ops_handler.go @@ -0,0 +1,364 @@ +package admin + +import ( + "errors" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +type OpsHandler struct { + opsService *service.OpsService +} + +func NewOpsHandler(opsService *service.OpsService) *OpsHandler { + return &OpsHandler{opsService: opsService} +} + +// GetErrorLogs lists ops error logs. +// GET /api/v1/admin/ops/errors +func (h *OpsHandler) GetErrorLogs(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + // Ops list can be larger than standard admin tables. + if pageSize > 500 { + pageSize = 500 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsErrorLogFilter{ + Page: page, + PageSize: pageSize, + } + if !startTime.IsZero() { + filter.StartTime = &startTime + } + if !endTime.IsZero() { + filter.EndTime = &endTime + } + + if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if phase := strings.TrimSpace(c.Query("phase")); phase != "" { + filter.Phase = phase + } + if q := strings.TrimSpace(c.Query("q")); q != "" { + filter.Query = q + } + if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" { + parts := strings.Split(statusCodesStr, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts { + p := strings.TrimSpace(part) + if p == "" { + continue + } + n, err := strconv.Atoi(p) + if err != nil || n < 0 { + response.BadRequest(c, "Invalid status_codes") + return + } + out = append(out, n) + } + filter.StatusCodes = out + } + + result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize) +} + +// GetErrorLogByID returns a single error log detail. 
+// GET /api/v1/admin/ops/errors/:id +func (h *OpsHandler) GetErrorLogByID(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, detail) +} + +// ListRequestDetails returns a request-level list (success + error) for drill-down. +// GET /api/v1/admin/ops/requests +func (h *OpsHandler) ListRequestDetails(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + if pageSize > 100 { + pageSize = 100 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsRequestDetailFilter{ + Page: page, + PageSize: pageSize, + StartTime: &startTime, + EndTime: &endTime, + } + + filter.Kind = strings.TrimSpace(c.Query("kind")) + filter.Platform = strings.TrimSpace(c.Query("platform")) + filter.Model = strings.TrimSpace(c.Query("model")) + filter.RequestID = strings.TrimSpace(c.Query("request_id")) + filter.Query = strings.TrimSpace(c.Query("q")) + filter.Sort = strings.TrimSpace(c.Query("sort")) + + if v := strings.TrimSpace(c.Query("user_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid user_id") + return + } + filter.UserID = &id + } + if v := strings.TrimSpace(c.Query("api_key_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid api_key_id") + return + } + filter.APIKeyID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid min_duration_ms") + return + } + filter.MinDurationMs = &parsed + } + if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid max_duration_ms") + return + } + filter.MaxDurationMs = &parsed + } + + out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter) + if err != nil { + // Invalid sort/kind/platform etc should be a bad request; keep it simple. 
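+ // NOTE: relies on the service layer including "invalid" in its validation error messages.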
+ if strings.Contains(strings.ToLower(err.Error()), "invalid") { + response.BadRequest(c, err.Error()) + return + } + response.Error(c, http.StatusInternalServerError, "Failed to list request details") + return + } + + response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize) +} + +type opsRetryRequest struct { + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` +} + +// RetryErrorRequest retries a failed request using stored request_body. +// POST /api/v1/admin/ops/errors/:id/retry +func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + req := opsRetryRequest{Mode: service.OpsRetryModeClient} + if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + if strings.TrimSpace(req.Mode) == "" { + req.Mode = service.OpsRetryModeClient + } + + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, result) +} + +func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { + startStr := strings.TrimSpace(c.Query("start_time")) + endStr := strings.TrimSpace(c.Query("end_time")) + + parseTS := func(s string) (time.Time, error) { + if s == "" { + return time.Time{}, nil + } + if t, err := time.Parse(time.RFC3339Nano, s); err == nil { + return t, nil + } + return time.Parse(time.RFC3339, s) + } + + start, err := parseTS(startStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + end, err := parseTS(endStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + + // start/end explicitly provided (even partially) + if startStr != "" || endStr != "" { + if end.IsZero() { + end = time.Now() + } + if start.IsZero() { + dur, _ := parseOpsDuration(defaultRange) + start = end.Add(-dur) + } + if start.After(end) { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time") + } + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil + } + + // time_range fallback + tr := strings.TrimSpace(c.Query("time_range")) + if tr == "" { + tr = defaultRange + } + dur, ok := parseOpsDuration(tr) + if !ok { + dur, _ = parseOpsDuration(defaultRange) + } + + end = time.Now() + start = end.Add(-dur) + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil +} + +func parseOpsDuration(v string) (time.Duration, bool) { + switch strings.TrimSpace(v) { + case "5m": + return 5 * time.Minute, true + case "30m": + return 30 * time.Minute, true + case "1h": + return time.Hour, true + case "6h": + return 6 * time.Hour, true + case "24h": + return 24 * time.Hour, true + default: + return 0, false 
+ } +} diff --git a/backend/internal/handler/admin/ops_realtime_handler.go b/backend/internal/handler/admin/ops_realtime_handler.go new file mode 100644 index 00000000..0c23c13b --- /dev/null +++ b/backend/internal/handler/admin/ops_realtime_handler.go @@ -0,0 +1,120 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// GET /api/v1/admin/ops/concurrency +func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformConcurrencyInfo{}, + "group": map[int64]*service.GroupConcurrencyInfo{}, + "account": map[int64]*service.AccountConcurrencyInfo{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platformFilter := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platform, + "group": group, + "account": account, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} + +// GetAccountAvailability returns account availability statistics. 
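// The JSON written by this handler mirrors GetConcurrencyStats: an "enabled" flag, three maps
// keyed by platform name / group id / account id, and a "timestamp" that is omitted on the
// enabled path when no collection time is known. A hypothetical client-side mirror of the
// payload (the envelope added by response.Success is not shown; the concurrency endpoint is
// analogous with *ConcurrencyInfo value types):
//
//	type availabilityPayload struct {
//		Enabled   bool                                     `json:"enabled"`
//		Platform  map[string]*service.PlatformAvailability `json:"platform"`
//		Group     map[int64]*service.GroupAvailability     `json:"group"`
//		Account   map[int64]*service.AccountAvailability   `json:"account"`
//		Timestamp *time.Time                               `json:"timestamp,omitempty"`
//	}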
+// GET /api/v1/admin/ops/account-availability +// +// Query params: +// - platform: optional +// - group_id: optional +func (h *OpsHandler) GetAccountAvailability(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformAvailability{}, + "group": map[int64]*service.GroupAvailability{}, + "account": map[int64]*service.AccountAvailability{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platform := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platformStats, + "group": groupStats, + "account": accountStats, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go new file mode 100644 index 00000000..0e0ecb72 --- /dev/null +++ b/backend/internal/handler/admin/ops_settings_handler.go @@ -0,0 +1,148 @@ +package admin + +import ( + "net/http" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetEmailNotificationConfig returns Ops email notification config (DB-backed). +// GET /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get email notification config") + return + } + response.Success(c, cfg) +} + +// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed). +// PUT /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsEmailNotificationConfigUpdateRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req) + if err != nil { + // Most failures here are validation errors from request payload; treat as 400. 
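// Malformed JSON has already been rejected by ShouldBindJSON above, so err here comes from the
// service layer; returning it verbatim with a 400 keeps validation messages visible to the admin
// UI, at the cost of also labelling the rare persistence failure as a client error.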
+ response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + +// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed). +// GET /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings") + return + } + response.Success(c, cfg) +} + +// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed). +// PUT /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAlertRuntimeSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + +// GetAdvancedSettings returns Ops advanced settings (DB-backed). +// GET /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings") + return + } + response.Success(c, cfg) +} + +// UpdateAdvancedSettings updates Ops advanced settings (DB-backed). 
+// PUT /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAdvancedSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} diff --git a/backend/internal/handler/admin/ops_ws_handler.go b/backend/internal/handler/admin/ops_ws_handler.go new file mode 100644 index 00000000..db7442e5 --- /dev/null +++ b/backend/internal/handler/admin/ops_ws_handler.go @@ -0,0 +1,771 @@ +package admin + +import ( + "context" + "encoding/json" + "log" + "math" + "net" + "net/http" + "net/netip" + "net/url" + "os" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gorilla/websocket" +) + +type OpsWSProxyConfig struct { + TrustProxy bool + TrustedProxies []netip.Prefix + OriginPolicy string +} + +const ( + envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY" + envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES" + envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY" + envOpsWSMaxConns = "OPS_WS_MAX_CONNS" + envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP" +) + +const ( + OriginPolicyStrict = "strict" + OriginPolicyPermissive = "permissive" +) + +var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv() + +var upgrader = websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return isAllowedOpsWSOrigin(r) + }, + // Subprotocol negotiation: + // - The frontend passes ["sub2api-admin", "jwt."]. + // - We always select "sub2api-admin" so the token is never echoed back in the handshake response. + Subprotocols: []string{"sub2api-admin"}, +} + +const ( + qpsWSPushInterval = 2 * time.Second + qpsWSRefreshInterval = 5 * time.Second + qpsWSRequestCountWindow = 1 * time.Minute + + defaultMaxWSConns = 100 + defaultMaxWSConnsPerIP = 20 +) + +var wsConnCount atomic.Int32 +var wsConnCountByIP sync.Map // map[string]*atomic.Int32 + +const qpsWSIdleStopDelay = 30 * time.Second + +const ( + opsWSCloseRealtimeDisabled = 4001 +) + +var qpsWSIdleStopMu sync.Mutex +var qpsWSIdleStopTimer *time.Timer + +func cancelQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopTimer.Stop() + qpsWSIdleStopTimer = nil + } + qpsWSIdleStopMu.Unlock() +} + +func scheduleQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopMu.Unlock() + return + } + qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() { + // Only stop if truly idle at fire time. + if wsConnCount.Load() == 0 { + qpsWSCache.Stop() + } + qpsWSIdleStopMu.Lock() + qpsWSIdleStopTimer = nil + qpsWSIdleStopMu.Unlock() + }) + qpsWSIdleStopMu.Unlock() +} + +type opsWSRuntimeLimits struct { + MaxConns int32 + MaxConnsPerIP int32 +} + +var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv() + +const ( + qpsWSWriteTimeout = 10 * time.Second + qpsWSPongWait = 60 * time.Second + qpsWSPingInterval = 30 * time.Second + + // We don't expect clients to send application messages; we only read to process control frames (Pong/Close). 
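// For reference, a minimal admin client for the endpoint below (GET /api/v1/admin/ops/ws/qps),
// written against gorilla/websocket and assuming the JWT rides in the second offered
// subprotocol with the "jwt." prefix, as the negotiation comment above suggests. The server
// pushes a "qps_update" message roughly every 2s, computed over a 1-minute window that is
// refreshed at most every 5s:
//
//	d := websocket.Dialer{Subprotocols: []string{"sub2api-admin", "jwt." + adminToken}}
//	conn, _, err := d.Dial("wss://ops.example.invalid/api/v1/admin/ops/ws/qps", nil)
//	if err != nil {
//		log.Fatalf("dial: %v", err)
//	}
//	defer conn.Close()
//	for {
//		_, msg, err := conn.ReadMessage()
//		if err != nil {
//			// Close code 4001 means realtime monitoring is disabled server-side.
//			log.Printf("closed: %v", err)
//			return
//		}
//		log.Printf("payload: %s", msg) // {"type":"qps_update","timestamp":...,"data":{"qps":...}}
//	}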
+ qpsWSMaxReadBytes = 1024 +) + +type opsWSQPSCache struct { + refreshInterval time.Duration + requestCountWindow time.Duration + + lastUpdatedUnixNano atomic.Int64 + payload atomic.Value // []byte + + opsService *service.OpsService + cancel context.CancelFunc + done chan struct{} + + mu sync.Mutex + running bool +} + +var qpsWSCache = &opsWSQPSCache{ + refreshInterval: qpsWSRefreshInterval, + requestCountWindow: qpsWSRequestCountWindow, +} + +func (c *opsWSQPSCache) start(opsService *service.OpsService) { + if c == nil || opsService == nil { + return + } + + for { + c.mu.Lock() + if c.running { + c.mu.Unlock() + return + } + + // If a previous refresh loop is currently stopping, wait for it to fully exit. + done := c.done + if done != nil { + c.mu.Unlock() + <-done + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() + continue + } + + c.opsService = opsService + ctx, cancel := context.WithCancel(context.Background()) + c.cancel = cancel + c.done = make(chan struct{}) + done = c.done + c.running = true + c.mu.Unlock() + + go func() { + defer close(done) + c.refreshLoop(ctx) + }() + return + } +} + +// Stop stops the background refresh loop. +// It is safe to call multiple times. +func (c *opsWSQPSCache) Stop() { + if c == nil { + return + } + + c.mu.Lock() + if !c.running { + done := c.done + c.mu.Unlock() + if done != nil { + <-done + } + return + } + cancel := c.cancel + c.cancel = nil + c.running = false + c.opsService = nil + done := c.done + c.mu.Unlock() + + if cancel != nil { + cancel() + } + if done != nil { + <-done + } + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() +} + +func (c *opsWSQPSCache) refreshLoop(ctx context.Context) { + ticker := time.NewTicker(c.refreshInterval) + defer ticker.Stop() + + c.refresh(ctx) + for { + select { + case <-ticker.C: + c.refresh(ctx) + case <-ctx.Done(): + return + } + } +} + +func (c *opsWSQPSCache) refresh(parentCtx context.Context) { + if c == nil { + return + } + + c.mu.Lock() + opsService := c.opsService + c.mu.Unlock() + if opsService == nil { + return + } + + if parentCtx == nil { + parentCtx = context.Background() + } + ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second) + defer cancel() + + now := time.Now().UTC() + stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now) + if err != nil || stats == nil { + if err != nil { + log.Printf("[OpsWS] refresh: get window stats failed: %v", err) + } + return + } + + requestCount := stats.SuccessCount + stats.ErrorCountTotal + qps := 0.0 + tps := 0.0 + if c.requestCountWindow > 0 { + seconds := c.requestCountWindow.Seconds() + qps = roundTo1DP(float64(requestCount) / seconds) + tps = roundTo1DP(float64(stats.TokenConsumed) / seconds) + } + + payload := gin.H{ + "type": "qps_update", + "timestamp": now.Format(time.RFC3339), + "data": gin.H{ + "qps": qps, + "tps": tps, + "request_count": requestCount, + }, + } + + msg, err := json.Marshal(payload) + if err != nil { + log.Printf("[OpsWS] refresh: marshal payload failed: %v", err) + return + } + + c.payload.Store(msg) + c.lastUpdatedUnixNano.Store(now.UnixNano()) +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func (c *opsWSQPSCache) getPayload() []byte { + if c == nil { + return nil + } + if cached, ok := c.payload.Load().([]byte); ok && cached != nil { + return cached + } + return nil +} + +func closeWS(conn *websocket.Conn, code int, reason string) { + if conn == nil { + return + } + msg := 
websocket.FormatCloseMessage(code, reason) + _ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout)) + _ = conn.Close() +} + +// QPSWSHandler handles realtime QPS push via WebSocket. +// GET /api/v1/admin/ops/ws/qps +func (h *OpsHandler) QPSWSHandler(c *gin.Context) { + clientIP := requestClientIP(c.Request) + + if h == nil || h.opsService == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"}) + return + } + + // If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close + // with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops. + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"}) + return + } + closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled") + return + } + + cancelQPSWSIdleStop() + // Lazily start the background refresh loop so unit tests that never hit the + // websocket route don't spawn goroutines that depend on DB/Redis stubs. + qpsWSCache.start(h.opsService) + + // Reserve a global slot before upgrading the connection to keep the limit strict. + if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) { + log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer func() { + if wsConnCount.Add(-1) == 0 { + scheduleQPSWSIdleStop() + } + }() + + if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" { + if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) { + log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer releaseOpsWSIPSlot(clientIP) + } + + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + log.Printf("[OpsWS] upgrade failed: %v", err) + return + } + + defer func() { + _ = conn.Close() + }() + + handleQPSWebSocket(c.Request.Context(), conn) +} + +func tryAcquireOpsWSTotalSlot(limit int32) bool { + if limit <= 0 { + return true + } + for { + current := wsConnCount.Load() + if current >= limit { + return false + } + if wsConnCount.CompareAndSwap(current, current+1) { + return true + } + } +} + +func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool { + if strings.TrimSpace(clientIP) == "" || limit <= 0 { + return true + } + + v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{}) + counter, ok := v.(*atomic.Int32) + if !ok { + return false + } + + for { + current := counter.Load() + if current >= limit { + return false + } + if counter.CompareAndSwap(current, current+1) { + return true + } + } +} + +func releaseOpsWSIPSlot(clientIP string) { + if strings.TrimSpace(clientIP) == "" { + return + } + + v, ok := wsConnCountByIP.Load(clientIP) + if !ok { + return + } + counter, ok := v.(*atomic.Int32) + if !ok { + return + } + next := counter.Add(-1) + if next <= 0 { + // Best-effort cleanup; safe even if a new slot was acquired concurrently. 
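// "Best effort" concretely: a concurrent acquire may have bumped this same counter between the
// Add(-1) above and the Delete below, in which case that connection ends up tracked by a fresh
// counter created on the next LoadOrStore. The per-IP cap can therefore be exceeded briefly,
// but counts re-converge as those connections close, and the global wsConnCount limit still holds.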
+ wsConnCountByIP.Delete(clientIP) + } +} + +func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) { + if conn == nil { + return + } + + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + var closeOnce sync.Once + closeConn := func() { + closeOnce.Do(func() { + _ = conn.Close() + }) + } + + closeFrameCh := make(chan []byte, 1) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + defer cancel() + + conn.SetReadLimit(qpsWSMaxReadBytes) + if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil { + log.Printf("[OpsWS] set read deadline failed: %v", err) + return + } + conn.SetPongHandler(func(string) error { + return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)) + }) + conn.SetCloseHandler(func(code int, text string) error { + select { + case closeFrameCh <- websocket.FormatCloseMessage(code, text): + default: + } + cancel() + return nil + }) + + for { + _, _, err := conn.ReadMessage() + if err != nil { + if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) { + log.Printf("[OpsWS] read failed: %v", err) + } + return + } + } + }() + + // Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval). + pushTicker := time.NewTicker(qpsWSPushInterval) + defer pushTicker.Stop() + + // Heartbeat ping every 30 seconds. + pingTicker := time.NewTicker(qpsWSPingInterval) + defer pingTicker.Stop() + + writeWithTimeout := func(messageType int, data []byte) error { + if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil { + return err + } + return conn.WriteMessage(messageType, data) + } + + sendClose := func(closeFrame []byte) { + if closeFrame == nil { + closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "") + } + _ = writeWithTimeout(websocket.CloseMessage, closeFrame) + } + + for { + select { + case <-pushTicker.C: + msg := qpsWSCache.getPayload() + if msg == nil { + continue + } + if err := writeWithTimeout(websocket.TextMessage, msg); err != nil { + log.Printf("[OpsWS] write failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case <-pingTicker.C: + if err := writeWithTimeout(websocket.PingMessage, nil); err != nil { + log.Printf("[OpsWS] ping failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case closeFrame := <-closeFrameCh: + sendClose(closeFrame) + closeConn() + wg.Wait() + return + + case <-ctx.Done(): + var closeFrame []byte + select { + case closeFrame = <-closeFrameCh: + default: + } + sendClose(closeFrame) + + closeConn() + wg.Wait() + return + } + } +} + +func isAllowedOpsWSOrigin(r *http.Request) bool { + if r == nil { + return false + } + origin := strings.TrimSpace(r.Header.Get("Origin")) + if origin == "" { + switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) { + case OriginPolicyStrict: + return false + case OriginPolicyPermissive, "": + return true + default: + return true + } + } + parsed, err := url.Parse(origin) + if err != nil || parsed.Hostname() == "" { + return false + } + originHost := strings.ToLower(parsed.Hostname()) + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + reqHost := hostWithoutPort(r.Host) + if trustProxyHeaders { + xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host")) + if xfHost != "" { + xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0]) + if xfHost != "" { + reqHost = hostWithoutPort(xfHost) + } + } + } + reqHost = 
strings.ToLower(reqHost) + if reqHost == "" { + return false + } + return originHost == reqHost +} + +func shouldTrustOpsWSProxyHeaders(r *http.Request) bool { + if r == nil { + return false + } + if !opsWSProxyConfig.TrustProxy { + return false + } + peerIP, ok := requestPeerIP(r) + if !ok { + return false + } + return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies) +} + +func requestPeerIP(r *http.Request) (netip.Addr, bool) { + if r == nil { + return netip.Addr{}, false + } + host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)) + if err != nil { + host = strings.TrimSpace(r.RemoteAddr) + } + host = strings.TrimPrefix(host, "[") + host = strings.TrimSuffix(host, "]") + if host == "" { + return netip.Addr{}, false + } + addr, err := netip.ParseAddr(host) + if err != nil { + return netip.Addr{}, false + } + return addr.Unmap(), true +} + +func requestClientIP(r *http.Request) string { + if r == nil { + return "" + } + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + if trustProxyHeaders { + xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For")) + if xff != "" { + // Use the left-most entry (original client). If multiple proxies add values, they are comma-separated. + xff = strings.TrimSpace(strings.Split(xff, ",")[0]) + xff = strings.TrimPrefix(xff, "[") + xff = strings.TrimSuffix(xff, "]") + if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() { + return addr.Unmap().String() + } + } + } + + if peer, ok := requestPeerIP(r); ok && peer.IsValid() { + return peer.String() + } + return "" +} + +func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool { + if !addr.IsValid() { + return false + } + for _, p := range trusted { + if p.Contains(addr) { + return true + } + } + return false +} + +func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig { + cfg := OpsWSProxyConfig{ + TrustProxy: true, + TrustedProxies: defaultTrustedProxies(), + OriginPolicy: OriginPolicyPermissive, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" { + if parsed, err := strconv.ParseBool(v); err == nil { + cfg.TrustProxy = parsed + } else { + log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy) + } + } + + if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" { + prefixes, invalid := parseTrustedProxyList(raw) + if len(invalid) > 0 { + log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", ")) + } + cfg.TrustedProxies = prefixes + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" { + normalized := strings.ToLower(v) + switch normalized { + case OriginPolicyStrict, OriginPolicyPermissive: + cfg.OriginPolicy = normalized + default: + log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy) + } + } + + return cfg +} + +func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits { + cfg := opsWSRuntimeLimits{ + MaxConns: defaultMaxWSConns, + MaxConnsPerIP: defaultMaxWSConnsPerIP, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 { + cfg.MaxConns = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns) + } + } + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil 
&& parsed >= 0 { + cfg.MaxConnsPerIP = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP) + } + } + return cfg +} + +func defaultTrustedProxies() []netip.Prefix { + prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128") + return prefixes +} + +func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) { + for _, token := range strings.Split(raw, ",") { + item := strings.TrimSpace(token) + if item == "" { + continue + } + + var ( + p netip.Prefix + err error + ) + if strings.Contains(item, "/") { + p, err = netip.ParsePrefix(item) + } else { + var addr netip.Addr + addr, err = netip.ParseAddr(item) + if err == nil { + addr = addr.Unmap() + bits := 128 + if addr.Is4() { + bits = 32 + } + p = netip.PrefixFrom(addr, bits) + } + } + + if err != nil || !p.IsValid() { + invalid = append(invalid, item) + continue + } + + prefixes = append(prefixes, p.Masked()) + } + return prefixes, invalid +} + +func hostWithoutPort(hostport string) string { + hostport = strings.TrimSpace(hostport) + if hostport == "" { + return "" + } + if host, _, err := net.SplitHostPort(hostport); err == nil { + return host + } + if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") { + return strings.Trim(hostport, "[]") + } + parts := strings.Split(hostport, ":") + return parts[0] +} diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go index e1584acb..2f9785ee 100644 --- a/backend/internal/handler/admin/setting_handler.go +++ b/backend/internal/handler/admin/setting_handler.go @@ -19,14 +19,16 @@ type SettingHandler struct { settingService *service.SettingService emailService *service.EmailService turnstileService *service.TurnstileService + opsService *service.OpsService } // NewSettingHandler 创建系统设置处理器 -func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService) *SettingHandler { +func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService, opsService *service.OpsService) *SettingHandler { return &SettingHandler{ settingService: settingService, emailService: emailService, turnstileService: turnstileService, + opsService: opsService, } } @@ -39,6 +41,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { return } + // Check if ops monitoring is enabled (respects config.ops.enabled) + opsEnabled := h.opsService != nil && h.opsService.IsMonitoringEnabled(c.Request.Context()) + response.Success(c, dto.SystemSettings{ RegistrationEnabled: settings.RegistrationEnabled, EmailVerifyEnabled: settings.EmailVerifyEnabled, @@ -72,6 +77,10 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { FallbackModelAntigravity: settings.FallbackModelAntigravity, EnableIdentityPatch: settings.EnableIdentityPatch, IdentityPatchPrompt: settings.IdentityPatchPrompt, + OpsMonitoringEnabled: opsEnabled && settings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: settings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds, }) } @@ -95,7 +104,7 @@ type UpdateSettingsRequest struct { TurnstileSiteKey string `json:"turnstile_site_key"` TurnstileSecretKey string `json:"turnstile_secret_key"` - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 LinuxDoConnectEnabled bool 
`json:"linuxdo_connect_enabled"` LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"` LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"` @@ -124,6 +133,12 @@ type UpdateSettingsRequest struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault *string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"` } // UpdateSettings 更新系统设置 @@ -208,6 +223,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { } } + // Ops metrics collector interval validation (seconds). + if req.OpsMetricsIntervalSeconds != nil { + v := *req.OpsMetricsIntervalSeconds + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + req.OpsMetricsIntervalSeconds = &v + } + settings := &service.SystemSettings{ RegistrationEnabled: req.RegistrationEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled, @@ -241,6 +268,30 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: req.FallbackModelAntigravity, EnableIdentityPatch: req.EnableIdentityPatch, IdentityPatchPrompt: req.IdentityPatchPrompt, + OpsMonitoringEnabled: func() bool { + if req.OpsMonitoringEnabled != nil { + return *req.OpsMonitoringEnabled + } + return previousSettings.OpsMonitoringEnabled + }(), + OpsRealtimeMonitoringEnabled: func() bool { + if req.OpsRealtimeMonitoringEnabled != nil { + return *req.OpsRealtimeMonitoringEnabled + } + return previousSettings.OpsRealtimeMonitoringEnabled + }(), + OpsQueryModeDefault: func() string { + if req.OpsQueryModeDefault != nil { + return *req.OpsQueryModeDefault + } + return previousSettings.OpsQueryModeDefault + }(), + OpsMetricsIntervalSeconds: func() int { + if req.OpsMetricsIntervalSeconds != nil { + return *req.OpsMetricsIntervalSeconds + } + return previousSettings.OpsMetricsIntervalSeconds + }(), } if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { @@ -290,6 +341,10 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity, EnableIdentityPatch: updatedSettings.EnableIdentityPatch, IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt, + OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds, }) } @@ -411,6 +466,18 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings, if before.IdentityPatchPrompt != after.IdentityPatchPrompt { changed = append(changed, "identity_patch_prompt") } + if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled { + changed = append(changed, "ops_monitoring_enabled") + } + if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled { + changed = append(changed, "ops_realtime_monitoring_enabled") + } + if before.OpsQueryModeDefault != after.OpsQueryModeDefault { + changed = append(changed, "ops_query_mode_default") + } + if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds { + changed = append(changed, "ops_metrics_interval_seconds") + } return changed } diff --git 
a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go index c95bb6e5..d95fb121 100644 --- a/backend/internal/handler/dto/settings.go +++ b/backend/internal/handler/dto/settings.go @@ -43,6 +43,12 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"` } type PublicSettings struct { diff --git a/backend/internal/handler/gateway_handler.go b/backend/internal/handler/gateway_handler.go index 0d38db17..284a4f8f 100644 --- a/backend/internal/handler/gateway_handler.go +++ b/backend/internal/handler/gateway_handler.go @@ -15,7 +15,6 @@ import ( "github.com/Wei-Shaw/sub2api/internal/pkg/antigravity" "github.com/Wei-Shaw/sub2api/internal/pkg/claude" pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" "github.com/Wei-Shaw/sub2api/internal/pkg/openai" middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -89,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") @@ -97,8 +98,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { reqModel := parsedReq.Model reqStream := parsedReq.Stream - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) + setOpsRequestContext(c, reqModel, reqStream, body) // 验证 model 必填 if reqModel == "" { @@ -112,15 +112,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 获取订阅信息(可能为nil)- 提前获取用于后续检查 subscription, _ := middleware2.GetSubscriptionFromContext(c) - // 获取 User-Agent - userAgent := c.Request.UserAgent() - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - // 0. 检查wait队列是否已满 maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -128,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // 确保在函数退出时减少wait计数 - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + // Ensure we decrement if we exit before acquiring the user slot. + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. 首先获取用户并发槽位 userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -138,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting in the queue. 
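// Invariant behind the waitCounted flag: the wait counter only tracks requests that are queued
// but not yet holding a user slot. It is decremented here as soon as the slot is acquired
// (instead of via an unconditional defer, which previously kept the request counted as
// "waiting" for its whole lifetime), while the defer above still covers early-error exits.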
+ if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -184,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -200,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -213,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + // Ensure the wait counter is decremented if we exit before acquiring the slot. + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -229,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + // Slot acquired: no longer waiting in queue. 
+ if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -254,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -277,7 +287,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { } // 异步记录使用量(subscription已在函数开头获取) - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -286,12 +296,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } @@ -313,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -329,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 
获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -342,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -358,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -383,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -406,7 +415,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { } // 异步记录使用量(subscription已在函数开头获取) - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -415,12 +424,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } @@ -686,21 +693,22 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") return } - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) - // 验证 model 必填 if parsedReq.Model == "" { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is 
required") return } + setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body) + // 获取订阅信息(可能为nil) subscription, _ := middleware2.GetSubscriptionFromContext(c) @@ -721,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error()) return } + setOpsSelectedAccount(c, account.ID) // 转发请求(不记录使用量) if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil { diff --git a/backend/internal/handler/gemini_v1beta_handler.go b/backend/internal/handler/gemini_v1beta_handler.go index 986b174b..d639beb3 100644 --- a/backend/internal/handler/gemini_v1beta_handler.go +++ b/backend/internal/handler/gemini_v1beta_handler.go @@ -12,7 +12,6 @@ import ( "github.com/Wei-Shaw/sub2api/internal/pkg/antigravity" "github.com/Wei-Shaw/sub2api/internal/pkg/gemini" "github.com/Wei-Shaw/sub2api/internal/pkg/googleapi" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -162,28 +161,32 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } + setOpsRequestContext(c, modelName, stream, body) + // Get subscription (may be nil) subscription, _ := middleware.GetSubscriptionFromContext(c) - // 获取 User-Agent - userAgent := c.Request.UserAgent() - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - // For Gemini native API, do not send Claude-style ping frames. geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0) // 0) wait queue check maxWait := service.CalculateMaxWait(authSubject.Concurrency) canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) } else if !canWait { googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return } - defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + } + }() // 1) user concurrency slot streamStarted := false @@ -192,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { googleError(c, http.StatusTooManyRequests, err.Error()) return } + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -207,10 +214,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { // 3) select account (sticky session based on request body) parsedReq, _ := service.ParseGatewayRequest(body) - - // 设置 Claude Code 客户端标识到 context(用于分组限制检查) - SetClaudeCodeClientContext(c, body) - sessionHash := h.gatewayService.GenerateSessionHash(parsedReq) sessionKey := sessionHash if sessionHash != "" { @@ -232,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 4) account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts") return } + 
accountWaitCounted := false canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -248,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout( c, @@ -264,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } googleError(c, http.StatusTooManyRequests, err.Error()) return } + if accountWaitCounted { + geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 5) forward (根据平台分流) var result *service.ForwardResult @@ -288,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -311,7 +315,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { } // 6) record usage async - go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.ForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{ @@ -320,12 +324,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } diff --git a/backend/internal/handler/handler.go b/backend/internal/handler/handler.go index 047703f3..5b1b317d 100644 --- a/backend/internal/handler/handler.go +++ b/backend/internal/handler/handler.go @@ -18,6 +18,7 @@ type AdminHandlers struct { Redeem *admin.RedeemHandler Promo *admin.PromoHandler Setting *admin.SettingHandler + Ops *admin.OpsHandler System *admin.SystemHandler Subscription *admin.SubscriptionHandler Usage *admin.UsageHandler diff --git a/backend/internal/handler/openai_gateway_handler.go b/backend/internal/handler/openai_gateway_handler.go index 8c7d7d52..5f3474b0 100644 --- a/backend/internal/handler/openai_gateway_handler.go +++ b/backend/internal/handler/openai_gateway_handler.go @@ -12,7 +12,6 @@ import ( "time" "github.com/Wei-Shaw/sub2api/internal/config" - "github.com/Wei-Shaw/sub2api/internal/pkg/ip" 
"github.com/Wei-Shaw/sub2api/internal/pkg/openai" middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" @@ -77,6 +76,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + // Parse request body to map for potential modification var reqBody map[string]any if err := json.Unmarshal(body, &reqBody); err != nil { @@ -95,10 +96,6 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } userAgent := c.GetHeader("User-Agent") - - // 获取客户端 IP - clientIP := ip.GetClientIP(c) - if !openai.IsCodexCLIRequest(userAgent) { existingInstructions, _ := reqBody["instructions"].(string) if strings.TrimSpace(existingInstructions) == "" { @@ -114,6 +111,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } } + setOpsRequestContext(c, reqModel, reqStream, body) + // Track if we've started streaming (for error handling) streamStarted := false @@ -123,6 +122,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { // 0. Check if wait queue is full maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -130,8 +130,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // Ensure wait count is decremented when function exits - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. First acquire user concurrency slot userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -140,6 +146,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting. + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -177,15 +188,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } account := selection.Account log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name) + setOpsSelectedAccount(c, account.ID) // 3. 
Acquire account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -193,12 +205,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -209,29 +224,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // Forward request result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body) if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -252,7 +264,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } // Async record usage - go func(result *service.OpenAIForwardResult, usedAccount *service.Account, ua string, cip string) { + go func(result *service.OpenAIForwardResult, usedAccount *service.Account) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{ @@ -261,12 +273,10 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { User: apiKey.User, Account: usedAccount, Subscription: subscription, - UserAgent: ua, - IPAddress: cip, }); err != nil { log.Printf("Record usage failed: %v", err) } - }(result, account, userAgent, clientIP) + }(result, account) return } } diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go new file mode 100644 index 00000000..7115059a --- /dev/null +++ b/backend/internal/handler/ops_error_logger.go @@ -0,0 +1,954 @@ +package handler + +import ( + "bytes" + "context" + "encoding/json" + "log" + "runtime" + "runtime/debug" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + 
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +const ( + opsModelKey = "ops_model" + opsStreamKey = "ops_stream" + opsRequestBodyKey = "ops_request_body" + opsAccountIDKey = "ops_account_id" +) + +const ( + opsErrorLogTimeout = 5 * time.Second + opsErrorLogDrainTimeout = 10 * time.Second + + opsErrorLogMinWorkerCount = 4 + opsErrorLogMaxWorkerCount = 32 + + opsErrorLogQueueSizePerWorker = 128 + opsErrorLogMinQueueSize = 256 + opsErrorLogMaxQueueSize = 8192 +) + +type opsErrorLogJob struct { + ops *service.OpsService + entry *service.OpsInsertErrorLogInput + requestBody []byte +} + +var ( + opsErrorLogOnce sync.Once + opsErrorLogQueue chan opsErrorLogJob + + opsErrorLogStopOnce sync.Once + opsErrorLogWorkersWg sync.WaitGroup + opsErrorLogMu sync.RWMutex + opsErrorLogStopping bool + opsErrorLogQueueLen atomic.Int64 + opsErrorLogEnqueued atomic.Int64 + opsErrorLogDropped atomic.Int64 + opsErrorLogProcessed atomic.Int64 + + opsErrorLogLastDropLogAt atomic.Int64 + + opsErrorLogShutdownCh = make(chan struct{}) + opsErrorLogShutdownOnce sync.Once + opsErrorLogDrained atomic.Bool +) + +func startOpsErrorLogWorkers() { + opsErrorLogMu.Lock() + defer opsErrorLogMu.Unlock() + + if opsErrorLogStopping { + return + } + + workerCount, queueSize := opsErrorLogConfig() + opsErrorLogQueue = make(chan opsErrorLogJob, queueSize) + opsErrorLogQueueLen.Store(0) + + opsErrorLogWorkersWg.Add(workerCount) + for i := 0; i < workerCount; i++ { + go func() { + defer opsErrorLogWorkersWg.Done() + for job := range opsErrorLogQueue { + opsErrorLogQueueLen.Add(-1) + if job.ops == nil || job.entry == nil { + continue + } + func() { + defer func() { + if r := recover(); r != nil { + log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack()) + } + }() + ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout) + _ = job.ops.RecordError(ctx, job.entry, job.requestBody) + cancel() + opsErrorLogProcessed.Add(1) + }() + } + }() + } +} + +func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) { + if ops == nil || entry == nil { + return + } + select { + case <-opsErrorLogShutdownCh: + return + default: + } + + opsErrorLogMu.RLock() + stopping := opsErrorLogStopping + opsErrorLogMu.RUnlock() + if stopping { + return + } + + opsErrorLogOnce.Do(startOpsErrorLogWorkers) + + opsErrorLogMu.RLock() + defer opsErrorLogMu.RUnlock() + if opsErrorLogStopping || opsErrorLogQueue == nil { + return + } + + select { + case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}: + opsErrorLogQueueLen.Add(1) + opsErrorLogEnqueued.Add(1) + default: + // Queue is full; drop to avoid blocking request handling. 
+ opsErrorLogDropped.Add(1) + maybeLogOpsErrorLogDrop() + } +} + +func StopOpsErrorLogWorkers() bool { + opsErrorLogStopOnce.Do(func() { + opsErrorLogShutdownOnce.Do(func() { + close(opsErrorLogShutdownCh) + }) + opsErrorLogDrained.Store(stopOpsErrorLogWorkers()) + }) + return opsErrorLogDrained.Load() +} + +func stopOpsErrorLogWorkers() bool { + opsErrorLogMu.Lock() + opsErrorLogStopping = true + ch := opsErrorLogQueue + if ch != nil { + close(ch) + } + opsErrorLogQueue = nil + opsErrorLogMu.Unlock() + + if ch == nil { + opsErrorLogQueueLen.Store(0) + return true + } + + done := make(chan struct{}) + go func() { + opsErrorLogWorkersWg.Wait() + close(done) + }() + + select { + case <-done: + opsErrorLogQueueLen.Store(0) + return true + case <-time.After(opsErrorLogDrainTimeout): + return false + } +} + +func OpsErrorLogQueueLength() int64 { + return opsErrorLogQueueLen.Load() +} + +func OpsErrorLogQueueCapacity() int { + opsErrorLogMu.RLock() + ch := opsErrorLogQueue + opsErrorLogMu.RUnlock() + if ch == nil { + return 0 + } + return cap(ch) +} + +func OpsErrorLogDroppedTotal() int64 { + return opsErrorLogDropped.Load() +} + +func OpsErrorLogEnqueuedTotal() int64 { + return opsErrorLogEnqueued.Load() +} + +func OpsErrorLogProcessedTotal() int64 { + return opsErrorLogProcessed.Load() +} + +func maybeLogOpsErrorLogDrop() { + now := time.Now().Unix() + + for { + last := opsErrorLogLastDropLogAt.Load() + if last != 0 && now-last < 60 { + return + } + if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) { + break + } + } + + queued := opsErrorLogQueueLen.Load() + queueCap := OpsErrorLogQueueCapacity() + + log.Printf( + "[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)", + queued, + queueCap, + opsErrorLogEnqueued.Load(), + opsErrorLogDropped.Load(), + opsErrorLogProcessed.Load(), + ) +} + +func opsErrorLogConfig() (workerCount int, queueSize int) { + workerCount = runtime.GOMAXPROCS(0) * 2 + if workerCount < opsErrorLogMinWorkerCount { + workerCount = opsErrorLogMinWorkerCount + } + if workerCount > opsErrorLogMaxWorkerCount { + workerCount = opsErrorLogMaxWorkerCount + } + + queueSize = workerCount * opsErrorLogQueueSizePerWorker + if queueSize < opsErrorLogMinQueueSize { + queueSize = opsErrorLogMinQueueSize + } + if queueSize > opsErrorLogMaxQueueSize { + queueSize = opsErrorLogMaxQueueSize + } + + return workerCount, queueSize +} + +func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) { + if c == nil { + return + } + c.Set(opsModelKey, model) + c.Set(opsStreamKey, stream) + if len(requestBody) > 0 { + c.Set(opsRequestBodyKey, requestBody) + } +} + +func setOpsSelectedAccount(c *gin.Context, accountID int64) { + if c == nil || accountID <= 0 { + return + } + c.Set(opsAccountIDKey, accountID) +} + +type opsCaptureWriter struct { + gin.ResponseWriter + limit int + buf bytes.Buffer +} + +func (w *opsCaptureWriter) Write(b []byte) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(b) > remaining { + _, _ = w.buf.Write(b[:remaining]) + } else { + _, _ = w.buf.Write(b) + } + } + return w.ResponseWriter.Write(b) +} + +func (w *opsCaptureWriter) WriteString(s string) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(s) > remaining { + _, _ = w.buf.WriteString(s[:remaining]) + } else { + _, _ = w.buf.WriteString(s) + } + } + return 
w.ResponseWriter.WriteString(s) +} + +// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs. +// +// Notes: +// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic. +// - Streaming errors after the response has started (SSE) may still need explicit logging. +func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { + return func(c *gin.Context) { + w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024} + c.Writer = w + c.Next() + + if ops == nil { + return + } + if !ops.IsMonitoringEnabled(c.Request.Context()) { + return + } + + status := c.Writer.Status() + if status < 400 { + // Even when the client request succeeds, we still want to persist upstream error attempts + // (retries/failover) so ops can observe upstream instability that gets "covered" by retries. + var events []*service.OpsUpstreamErrorEvent + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 { + events = arr + } + } + // Also accept single upstream fields set by gateway services (rare for successful requests). + hasUpstreamContext := len(events) > 0 + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + hasUpstreamContext = t > 0 + case int64: + hasUpstreamContext = t > 0 + } + } + } + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + hasUpstreamContext = true + } + } + } + if !hasUpstreamContext { + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + hasUpstreamContext = true + } + } + } + if !hasUpstreamContext { + return + } + + apiKey, _ := middleware2.GetAPIKeyFromContext(c) + clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) + + model, _ := c.Get(opsModelKey) + streamV, _ := c.Get(opsStreamKey) + accountIDV, _ := c.Get(opsAccountIDKey) + + var modelName string + if s, ok := model.(string); ok { + modelName = s + } + stream := false + if b, ok := streamV.(bool); ok { + stream = b + } + + // Prefer showing the account that experienced the upstream error (if we have events), + // otherwise fall back to the final selected account (best-effort). + var accountID *int64 + if len(events) > 0 { + if last := events[len(events)-1]; last != nil && last.AccountID > 0 { + v := last.AccountID + accountID = &v + } + } + if accountID == nil { + if v, ok := accountIDV.(int64); ok && v > 0 { + accountID = &v + } + } + + fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path) + platform := resolveOpsPlatform(apiKey, fallbackPlatform) + + requestID := c.Writer.Header().Get("X-Request-Id") + if requestID == "" { + requestID = c.Writer.Header().Get("x-request-id") + } + + // Best-effort backfill single upstream fields from the last event (if present). 
+ var upstreamStatusCode *int + var upstreamErrorMessage *string + var upstreamErrorDetail *string + if len(events) > 0 { + last := events[len(events)-1] + if last != nil { + if last.UpstreamStatusCode > 0 { + code := last.UpstreamStatusCode + upstreamStatusCode = &code + } + if msg := strings.TrimSpace(last.Message); msg != "" { + upstreamErrorMessage = &msg + } + if detail := strings.TrimSpace(last.Detail); detail != "" { + upstreamErrorDetail = &detail + } + } + } + + if upstreamStatusCode == nil { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + if t > 0 { + code := t + upstreamStatusCode = &code + } + case int64: + if t > 0 { + code := int(t) + upstreamStatusCode = &code + } + } + } + } + if upstreamErrorMessage == nil { + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + msg := strings.TrimSpace(s) + upstreamErrorMessage = &msg + } + } + } + if upstreamErrorDetail == nil { + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok && strings.TrimSpace(s) != "" { + detail := strings.TrimSpace(s) + upstreamErrorDetail = &detail + } + } + } + + // If we still have nothing meaningful, skip. + if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 { + return + } + + effectiveUpstreamStatus := 0 + if upstreamStatusCode != nil { + effectiveUpstreamStatus = *upstreamStatusCode + } + + recoveredMsg := "Recovered upstream error" + if effectiveUpstreamStatus > 0 { + recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus) + } + if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" { + recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage) + } + recoveredMsg = truncateString(recoveredMsg, 2048) + + entry := &service.OpsInsertErrorLogInput{ + RequestID: requestID, + ClientRequestID: clientRequestID, + + AccountID: accountID, + Platform: platform, + Model: modelName, + RequestPath: func() string { + if c.Request != nil && c.Request.URL != nil { + return c.Request.URL.Path + } + return "" + }(), + Stream: stream, + UserAgent: c.GetHeader("User-Agent"), + + ErrorPhase: "upstream", + ErrorType: "upstream_error", + // Severity/retryability should reflect the upstream failure, not the final client status (200). + Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus), + StatusCode: status, + IsBusinessLimited: false, + + ErrorMessage: recoveredMsg, + ErrorBody: "", + + ErrorSource: "upstream_http", + ErrorOwner: "provider", + + UpstreamStatusCode: upstreamStatusCode, + UpstreamErrorMessage: upstreamErrorMessage, + UpstreamErrorDetail: upstreamErrorDetail, + UpstreamErrors: events, + + IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus), + RetryCount: 0, + CreatedAt: time.Now(), + } + + if apiKey != nil { + entry.APIKeyID = &apiKey.ID + if apiKey.User != nil { + entry.UserID = &apiKey.User.ID + } + if apiKey.GroupID != nil { + entry.GroupID = apiKey.GroupID + } + // Prefer group platform if present (more stable than inferring from path). 
+ if apiKey.Group != nil && apiKey.Group.Platform != "" { + entry.Platform = apiKey.Group.Platform + } + } + + var clientIP string + if ip := strings.TrimSpace(c.ClientIP()); ip != "" { + clientIP = ip + entry.ClientIP = &clientIP + } + + var requestBody []byte + if v, ok := c.Get(opsRequestBodyKey); ok { + if b, ok := v.([]byte); ok && len(b) > 0 { + requestBody = b + } + } + // Store request headers/body only when an upstream error occurred to keep overhead minimal. + entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) + + enqueueOpsErrorLog(ops, entry, requestBody) + return + } + + body := w.buf.Bytes() + parsed := parseOpsErrorResponse(body) + + apiKey, _ := middleware2.GetAPIKeyFromContext(c) + + clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) + + model, _ := c.Get(opsModelKey) + streamV, _ := c.Get(opsStreamKey) + accountIDV, _ := c.Get(opsAccountIDKey) + + var modelName string + if s, ok := model.(string); ok { + modelName = s + } + stream := false + if b, ok := streamV.(bool); ok { + stream = b + } + var accountID *int64 + if v, ok := accountIDV.(int64); ok && v > 0 { + accountID = &v + } + + fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path) + platform := resolveOpsPlatform(apiKey, fallbackPlatform) + + requestID := c.Writer.Header().Get("X-Request-Id") + if requestID == "" { + requestID = c.Writer.Header().Get("x-request-id") + } + + phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code) + isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message) + + errorOwner := classifyOpsErrorOwner(phase, parsed.Message) + errorSource := classifyOpsErrorSource(phase, parsed.Message) + + entry := &service.OpsInsertErrorLogInput{ + RequestID: requestID, + ClientRequestID: clientRequestID, + + AccountID: accountID, + Platform: platform, + Model: modelName, + RequestPath: func() string { + if c.Request != nil && c.Request.URL != nil { + return c.Request.URL.Path + } + return "" + }(), + Stream: stream, + UserAgent: c.GetHeader("User-Agent"), + + ErrorPhase: phase, + ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code), + Severity: classifyOpsSeverity(parsed.ErrorType, status), + StatusCode: status, + IsBusinessLimited: isBusinessLimited, + + ErrorMessage: parsed.Message, + // Keep the full captured error body (capture is already capped at 64KB) so the + // service layer can sanitize JSON before truncating for storage. + ErrorBody: string(body), + ErrorSource: errorSource, + ErrorOwner: errorOwner, + + IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status), + RetryCount: 0, + CreatedAt: time.Now(), + } + + // Capture upstream error context set by gateway services (if present). + // This does NOT affect the client response; it enriches Ops troubleshooting data. 
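+ // The service-level context keys read below (OpsUpstreamStatusCodeKey, OpsUpstreamErrorMessageKey, OpsUpstreamErrorDetailKey, OpsUpstreamErrorsKey) are populated by the gateway services during forwarding; each lookup here is best-effort and optional.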
+ { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + if t > 0 { + code := t + entry.UpstreamStatusCode = &code + } + case int64: + if t > 0 { + code := int(t) + entry.UpstreamStatusCode = &code + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok { + if msg := strings.TrimSpace(s); msg != "" { + entry.UpstreamErrorMessage = &msg + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok { + if detail := strings.TrimSpace(s); detail != "" { + entry.UpstreamErrorDetail = &detail + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 { + entry.UpstreamErrors = events + // Best-effort backfill the single upstream fields from the last event when missing. + last := events[len(events)-1] + if last != nil { + if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 { + code := last.UpstreamStatusCode + entry.UpstreamStatusCode = &code + } + if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" { + msg := strings.TrimSpace(last.Message) + entry.UpstreamErrorMessage = &msg + } + if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" { + detail := strings.TrimSpace(last.Detail) + entry.UpstreamErrorDetail = &detail + } + } + } + } + } + + if apiKey != nil { + entry.APIKeyID = &apiKey.ID + if apiKey.User != nil { + entry.UserID = &apiKey.User.ID + } + if apiKey.GroupID != nil { + entry.GroupID = apiKey.GroupID + } + // Prefer group platform if present (more stable than inferring from path). + if apiKey.Group != nil && apiKey.Group.Platform != "" { + entry.Platform = apiKey.Group.Platform + } + } + + var clientIP string + if ip := strings.TrimSpace(c.ClientIP()); ip != "" { + clientIP = ip + entry.ClientIP = &clientIP + } + + var requestBody []byte + if v, ok := c.Get(opsRequestBodyKey); ok { + if b, ok := v.([]byte); ok && len(b) > 0 { + requestBody = b + } + } + // Persist only a minimal, whitelisted set of request headers to improve retry fidelity. + // Do NOT store Authorization/Cookie/etc. + entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) + + enqueueOpsErrorLog(ops, entry, requestBody) + } +} + +var opsRetryRequestHeaderAllowlist = []string{ + "anthropic-beta", + "anthropic-version", +} + +func extractOpsRetryRequestHeaders(c *gin.Context) *string { + if c == nil || c.Request == nil { + return nil + } + + headers := make(map[string]string, 4) + for _, key := range opsRetryRequestHeaderAllowlist { + v := strings.TrimSpace(c.GetHeader(key)) + if v == "" { + continue + } + // Keep headers small even if a client sends something unexpected. + headers[key] = truncateString(v, 512) + } + if len(headers) == 0 { + return nil + } + + raw, err := json.Marshal(headers) + if err != nil { + return nil + } + s := string(raw) + return &s +} + +type parsedOpsError struct { + ErrorType string + Message string + Code string +} + +func parseOpsErrorResponse(body []byte) parsedOpsError { + if len(body) == 0 { + return parsedOpsError{} + } + + // Fast path: attempt to decode into a generic map. 
+ var m map[string]any + if err := json.Unmarshal(body, &m); err != nil { + return parsedOpsError{Message: truncateString(string(body), 1024)} + } + + // Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } } + if errObj, ok := m["error"].(map[string]any); ok { + t, _ := errObj["type"].(string) + msg, _ := errObj["message"].(string) + // Gemini googleError also uses "error": { code, message, status } + if msg == "" { + if v, ok := errObj["message"]; ok { + msg, _ = v.(string) + } + } + if t == "" { + // Gemini error does not have "type" field. + t = "api_error" + } + // For gemini error, capture numeric code as string for business-limited mapping if needed. + var code string + if v, ok := errObj["code"]; ok { + switch n := v.(type) { + case float64: + code = strconvItoa(int(n)) + case int: + code = strconvItoa(n) + } + } + return parsedOpsError{ErrorType: t, Message: msg, Code: code} + } + + // APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." } + code, _ := m["code"].(string) + msg, _ := m["message"].(string) + if code != "" || msg != "" { + return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code} + } + + return parsedOpsError{Message: truncateString(string(body), 1024)} +} + +func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string { + if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" { + return apiKey.Group.Platform + } + return fallback +} + +func guessPlatformFromPath(path string) string { + p := strings.ToLower(path) + switch { + case strings.HasPrefix(p, "/antigravity/"): + return service.PlatformAntigravity + case strings.HasPrefix(p, "/v1beta/"): + return service.PlatformGemini + case strings.Contains(p, "/responses"): + return service.PlatformOpenAI + default: + return "" + } +} + +func normalizeOpsErrorType(errType string, code string) string { + if errType != "" { + return errType + } + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE": + return "billing_error" + case "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "subscription_error" + default: + return "api_error" + } +} + +func classifyOpsPhase(errType, message, code string) string { + msg := strings.ToLower(message) + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "billing" + } + + switch errType { + case "authentication_error": + return "auth" + case "billing_error", "subscription_error": + return "billing" + case "rate_limit_error": + if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { + return "concurrency" + } + return "upstream" + case "invalid_request_error": + return "response" + case "upstream_error", "overloaded_error": + return "upstream" + case "api_error": + if strings.Contains(msg, "no available accounts") { + return "scheduling" + } + return "internal" + default: + return "internal" + } +} + +func classifyOpsSeverity(errType string, status int) string { + switch errType { + case "invalid_request_error", "authentication_error", "billing_error", "subscription_error": + return "P3" + } + if status >= 500 { + return "P1" + } + if status == 429 { + return "P1" + } + if status >= 400 { + return "P2" + } + return "P3" +} + +func classifyOpsIsRetryable(errType string, statusCode int) bool { + switch errType { + case "authentication_error", "invalid_request_error": + return false + case "timeout_error": + return true + case 
"rate_limit_error": + // May be transient (upstream or queue); retry can help. + return true + case "billing_error", "subscription_error": + return false + case "upstream_error", "overloaded_error": + return statusCode >= 500 || statusCode == 429 || statusCode == 529 + default: + return statusCode >= 500 + } +} + +func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool { + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return true + } + if phase == "billing" || phase == "concurrency" { + // SLA/错误率排除“用户级业务限制” + return true + } + // Avoid treating upstream rate limits as business-limited. + if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") { + return false + } + _ = status + return false +} + +func classifyOpsErrorOwner(phase string, message string) string { + switch phase { + case "upstream", "network": + return "provider" + case "billing", "concurrency", "auth", "response": + return "client" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "provider" + } + return "sub2api" + } +} + +func classifyOpsErrorSource(phase string, message string) string { + switch phase { + case "upstream": + return "upstream_http" + case "network": + return "upstream_network" + case "billing": + return "billing" + case "concurrency": + return "concurrency" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "upstream_http" + } + return "internal" + } +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + // Ensure truncation does not split multi-byte characters. 
+ for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func strconvItoa(v int) string { + return strconv.Itoa(v) +} diff --git a/backend/internal/handler/wire.go b/backend/internal/handler/wire.go index a5e62d0a..2af7905e 100644 --- a/backend/internal/handler/wire.go +++ b/backend/internal/handler/wire.go @@ -21,6 +21,7 @@ func ProvideAdminHandlers( redeemHandler *admin.RedeemHandler, promoHandler *admin.PromoHandler, settingHandler *admin.SettingHandler, + opsHandler *admin.OpsHandler, systemHandler *admin.SystemHandler, subscriptionHandler *admin.SubscriptionHandler, usageHandler *admin.UsageHandler, @@ -39,6 +40,7 @@ func ProvideAdminHandlers( Redeem: redeemHandler, Promo: promoHandler, Setting: settingHandler, + Ops: opsHandler, System: systemHandler, Subscription: subscriptionHandler, Usage: usageHandler, @@ -109,6 +111,7 @@ var ProviderSet = wire.NewSet( admin.NewRedeemHandler, admin.NewPromoHandler, admin.NewSettingHandler, + admin.NewOpsHandler, ProvideSystemHandler, admin.NewSubscriptionHandler, admin.NewUsageHandler, diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go index bd10eae0..27bb5ac5 100644 --- a/backend/internal/pkg/ctxkey/ctxkey.go +++ b/backend/internal/pkg/ctxkey/ctxkey.go @@ -7,7 +7,14 @@ type Key string const ( // ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置 ForcePlatform Key = "ctx_force_platform" - // IsClaudeCodeClient 是否为 Claude Code 客户端,由中间件设置 + + // ClientRequestID uniquely identifies a client request and is used to trace its full lifecycle (for Ops monitoring and troubleshooting). + ClientRequestID Key = "ctx_client_request_id" + + // RetryCount is the number of gateway-level retries for the current request (for Ops records and troubleshooting). + RetryCount Key = "ctx_retry_count" + + // IsClaudeCodeClient indicates whether the current request comes from a Claude Code client IsClaudeCodeClient Key = "ctx_is_claude_code_client" // Group 认证后的分组信息,由 API Key 认证中间件设置 Group Key = "ctx_group" diff --git a/backend/internal/repository/concurrency_cache.go b/backend/internal/repository/concurrency_cache.go index 0831f5eb..b34961e1 100644 --- a/backend/internal/repository/concurrency_cache.go +++ b/backend/internal/repository/concurrency_cache.go @@ -93,7 +93,7 @@ var ( return redis.call('ZCARD', key) `) - // incrementWaitScript - only sets TTL on first creation to avoid refreshing + // incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate // KEYS[1] = wait queue key // ARGV[1] = maxWait // ARGV[2] = TTL in seconds @@ -111,15 +111,13 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. + redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) - // incrementAccountWaitScript - account-level wait queue count + // incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment) incrementAccountWaitScript = redis.NewScript(` local current = redis.call('GET', KEYS[1]) if current == false then @@ -134,10 +132,8 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. 
+ redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go new file mode 100644 index 00000000..8e157dbf --- /dev/null +++ b/backend/internal/repository/ops_repo.go @@ -0,0 +1,707 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/lib/pq" +) + +type opsRepository struct { + db *sql.DB +} + +func NewOpsRepository(db *sql.DB) service.OpsRepository { + return &opsRepository{db: db} +} + +func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + + q := ` +INSERT INTO ops_error_logs ( + request_id, + client_request_id, + user_id, + api_key_id, + account_id, + group_id, + client_ip, + platform, + model, + request_path, + stream, + user_agent, + error_phase, + error_type, + severity, + status_code, + is_business_limited, + error_message, + error_body, + error_source, + error_owner, + upstream_status_code, + upstream_error_message, + upstream_error_detail, + upstream_errors, + duration_ms, + time_to_first_token_ms, + request_body, + request_body_truncated, + request_body_bytes, + request_headers, + is_retryable, + retry_count, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullString(input.RequestID), + opsNullString(input.ClientRequestID), + opsNullInt64(input.UserID), + opsNullInt64(input.APIKeyID), + opsNullInt64(input.AccountID), + opsNullInt64(input.GroupID), + opsNullString(input.ClientIP), + opsNullString(input.Platform), + opsNullString(input.Model), + opsNullString(input.RequestPath), + input.Stream, + opsNullString(input.UserAgent), + input.ErrorPhase, + input.ErrorType, + opsNullString(input.Severity), + opsNullInt(input.StatusCode), + input.IsBusinessLimited, + opsNullString(input.ErrorMessage), + opsNullString(input.ErrorBody), + opsNullString(input.ErrorSource), + opsNullString(input.ErrorOwner), + opsNullInt(input.UpstreamStatusCode), + opsNullString(input.UpstreamErrorMessage), + opsNullString(input.UpstreamErrorDetail), + opsNullString(input.UpstreamErrorsJSON), + opsNullInt(input.DurationMs), + opsNullInt64(input.TimeToFirstTokenMs), + opsNullString(input.RequestBodyJSON), + input.RequestBodyTruncated, + opsNullInt(input.RequestBodyBytes), + opsNullString(input.RequestHeadersJSON), + input.IsRetryable, + input.RetryCount, + input.CreatedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsErrorLogFilter{} + } + + page := filter.Page + if page <= 0 { + page = 1 + } + pageSize := filter.PageSize + if pageSize <= 0 { + pageSize = 20 + } + if pageSize > 500 { + pageSize = 500 + } + + where, args := buildOpsErrorLogsWhere(filter) + countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where + + var total int + if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil { + return nil, err + } + + offset := (page - 
1) * pageSize + argsWithLimit := append(args, pageSize, offset) + selectSQL := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(upstream_status_code, status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream +FROM ops_error_logs +` + where + ` +ORDER BY created_at DESC +LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) + + rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsErrorLog, 0, pageSize) + for rows.Next() { + var item service.OpsErrorLog + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID sql.NullInt64 + var groupID sql.NullInt64 + if err := rows.Scan( + &item.ID, + &item.CreatedAt, + &item.Phase, + &item.Type, + &item.Severity, + &statusCode, + &item.Platform, + &item.Model, + &latency, + &item.ClientRequestID, + &item.RequestID, + &item.Message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &item.RequestPath, + &item.Stream, + ); err != nil { + return nil, err + } + if latency.Valid { + v := int(latency.Int64) + item.LatencyMs = &v + } + item.StatusCode = int(statusCode.Int64) + if clientIP.Valid { + s := clientIP.String + item.ClientIP = &s + } + if userID.Valid { + v := userID.Int64 + item.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + item.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + item.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + item.GroupID = &v + } + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorLogList{ + Errors: out, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} + +func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if id <= 0 { + return nil, fmt.Errorf("invalid id") + } + + q := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(upstream_status_code, status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + COALESCE(error_body, ''), + upstream_status_code, + COALESCE(upstream_error_message, ''), + COALESCE(upstream_error_detail, ''), + COALESCE(upstream_errors::text, ''), + is_business_limited, + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream, + COALESCE(user_agent, ''), + auth_latency_ms, + routing_latency_ms, + upstream_latency_ms, + response_latency_ms, + time_to_first_token_ms, + COALESCE(request_body::text, ''), + request_body_truncated, + request_body_bytes, + COALESCE(request_headers::text, '') +FROM ops_error_logs +WHERE id = $1 +LIMIT 1` + + var out service.OpsErrorLogDetail + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var upstreamStatusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID 
sql.NullInt64 + var groupID sql.NullInt64 + var authLatency sql.NullInt64 + var routingLatency sql.NullInt64 + var upstreamLatency sql.NullInt64 + var responseLatency sql.NullInt64 + var ttft sql.NullInt64 + var requestBodyBytes sql.NullInt64 + + err := r.db.QueryRowContext(ctx, q, id).Scan( + &out.ID, + &out.CreatedAt, + &out.Phase, + &out.Type, + &out.Severity, + &statusCode, + &out.Platform, + &out.Model, + &latency, + &out.ClientRequestID, + &out.RequestID, + &out.Message, + &out.ErrorBody, + &upstreamStatusCode, + &out.UpstreamErrorMessage, + &out.UpstreamErrorDetail, + &out.UpstreamErrors, + &out.IsBusinessLimited, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &out.RequestPath, + &out.Stream, + &out.UserAgent, + &authLatency, + &routingLatency, + &upstreamLatency, + &responseLatency, + &ttft, + &out.RequestBody, + &out.RequestBodyTruncated, + &requestBodyBytes, + &out.RequestHeaders, + ) + if err != nil { + return nil, err + } + + out.StatusCode = int(statusCode.Int64) + if latency.Valid { + v := int(latency.Int64) + out.LatencyMs = &v + } + if clientIP.Valid { + s := clientIP.String + out.ClientIP = &s + } + if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 { + v := int(upstreamStatusCode.Int64) + out.UpstreamStatusCode = &v + } + if userID.Valid { + v := userID.Int64 + out.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + out.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + out.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + out.GroupID = &v + } + if authLatency.Valid { + v := authLatency.Int64 + out.AuthLatencyMs = &v + } + if routingLatency.Valid { + v := routingLatency.Int64 + out.RoutingLatencyMs = &v + } + if upstreamLatency.Valid { + v := upstreamLatency.Int64 + out.UpstreamLatencyMs = &v + } + if responseLatency.Valid { + v := responseLatency.Int64 + out.ResponseLatencyMs = &v + } + if ttft.Valid { + v := ttft.Int64 + out.TimeToFirstTokenMs = &v + } + if requestBodyBytes.Valid { + v := int(requestBodyBytes.Int64) + out.RequestBodyBytes = &v + } + + // Normalize request_body to empty string when stored as JSON null. + out.RequestBody = strings.TrimSpace(out.RequestBody) + if out.RequestBody == "null" { + out.RequestBody = "" + } + // Normalize request_headers to empty string when stored as JSON null. + out.RequestHeaders = strings.TrimSpace(out.RequestHeaders) + if out.RequestHeaders == "null" { + out.RequestHeaders = "" + } + // Normalize upstream_errors to empty string when stored as JSON null. 
+ out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors) + if out.UpstreamErrors == "null" { + out.UpstreamErrors = "" + } + + return &out, nil +} + +func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + if input.SourceErrorID <= 0 { + return 0, fmt.Errorf("invalid source_error_id") + } + if strings.TrimSpace(input.Mode) == "" { + return 0, fmt.Errorf("invalid mode") + } + + q := ` +INSERT INTO ops_retry_attempts ( + requested_by_user_id, + source_error_id, + mode, + pinned_account_id, + status, + started_at +) VALUES ( + $1,$2,$3,$4,$5,$6 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&input.RequestedByUserID), + input.SourceErrorID, + strings.TrimSpace(input.Mode), + opsNullInt64(input.PinnedAccountID), + strings.TrimSpace(input.Status), + input.StartedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.ID <= 0 { + return fmt.Errorf("invalid id") + } + + q := ` +UPDATE ops_retry_attempts +SET + status = $2, + finished_at = $3, + duration_ms = $4, + result_request_id = $5, + result_error_id = $6, + error_message = $7 +WHERE id = $1` + + _, err := r.db.ExecContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Status), + nullTime(input.FinishedAt), + input.DurationMs, + opsNullString(input.ResultRequestID), + opsNullInt64(input.ResultErrorID), + opsNullString(input.ErrorMessage), + ) + return err +} + +func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if sourceErrorID <= 0 { + return nil, fmt.Errorf("invalid source_error_id") + } + + q := ` +SELECT + id, + created_at, + COALESCE(requested_by_user_id, 0), + source_error_id, + COALESCE(mode, ''), + pinned_account_id, + COALESCE(status, ''), + started_at, + finished_at, + duration_ms, + result_request_id, + result_error_id, + error_message +FROM ops_retry_attempts +WHERE source_error_id = $1 +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsRetryAttempt + var pinnedAccountID sql.NullInt64 + var requestedBy sql.NullInt64 + var startedAt sql.NullTime + var finishedAt sql.NullTime + var durationMs sql.NullInt64 + var resultRequestID sql.NullString + var resultErrorID sql.NullInt64 + var errorMessage sql.NullString + + err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan( + &out.ID, + &out.CreatedAt, + &requestedBy, + &out.SourceErrorID, + &out.Mode, + &pinnedAccountID, + &out.Status, + &startedAt, + &finishedAt, + &durationMs, + &resultRequestID, + &resultErrorID, + &errorMessage, + ) + if err != nil { + return nil, err + } + out.RequestedByUserID = requestedBy.Int64 + if pinnedAccountID.Valid { + v := pinnedAccountID.Int64 + out.PinnedAccountID = &v + } + if startedAt.Valid { + t := startedAt.Time + out.StartedAt = &t + } + if finishedAt.Valid { + t := finishedAt.Time + out.FinishedAt = &t + } + if durationMs.Valid { + v := durationMs.Int64 + out.DurationMs = &v + } + if resultRequestID.Valid { + s := resultRequestID.String + 
out.ResultRequestID = &s + } + if resultErrorID.Valid { + v := resultErrorID.Int64 + out.ResultErrorID = &v + } + if errorMessage.Valid { + s := errorMessage.String + out.ErrorMessage = &s + } + + return &out, nil +} + +func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} + +func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { + clauses := make([]string, 0, 8) + args := make([]any, 0, 8) + clauses = append(clauses, "1=1") + + phaseFilter := "" + if filter != nil { + phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase)) + } + // ops_error_logs primarily stores client-visible error requests (status>=400), + // but we also persist "recovered" upstream errors (status<400) for upstream health visibility. + // By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase. + if phaseFilter != "upstream" { + clauses = append(clauses, "COALESCE(status_code, 0) >= 400") + } + + if filter.StartTime != nil && !filter.StartTime.IsZero() { + args = append(args, filter.StartTime.UTC()) + clauses = append(clauses, "created_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, filter.EndTime.UTC()) + // Keep time-window semantics consistent with other ops queries: [start, end) + clauses = append(clauses, "created_at < $"+itoa(len(args))) + } + if p := strings.TrimSpace(filter.Platform); p != "" { + args = append(args, p) + clauses = append(clauses, "platform = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, *filter.GroupID) + clauses = append(clauses, "group_id = $"+itoa(len(args))) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + args = append(args, *filter.AccountID) + clauses = append(clauses, "account_id = $"+itoa(len(args))) + } + if phase := phaseFilter; phase != "" { + args = append(args, phase) + clauses = append(clauses, "error_phase = $"+itoa(len(args))) + } + if len(filter.StatusCodes) > 0 { + args = append(args, pq.Array(filter.StatusCodes)) + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + q + "%" + args = append(args, like) + n := itoa(len(args)) + clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")") + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +// Helpers for nullable args +func opsNullString(v any) any { + switch s := v.(type) { + case nil: + return sql.NullString{} + case *string: + if s == nil || strings.TrimSpace(*s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(*s), Valid: true} + case string: + if strings.TrimSpace(s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(s), Valid: true} + default: + return sql.NullString{} + } +} + +func opsNullInt64(v *int64) any { + if v == nil || *v == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *v, Valid: true} +} + +func opsNullInt(v any) any { + switch n := v.(type) { + case nil: + return sql.NullInt64{} + case *int: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(*n), Valid: true} + case *int64: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *n, Valid: true} + case int: + if n == 0 { + 
return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(n), Valid: true} + default: + return sql.NullInt64{} + } +} diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go new file mode 100644 index 00000000..f601c363 --- /dev/null +++ b/backend/internal/repository/ops_repo_alerts.go @@ -0,0 +1,689 @@ +package repository + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at +FROM ops_alert_rules +ORDER BY id DESC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := []*service.OpsAlertRule{} + for rows.Next() { + var rule service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + if err := rows.Scan( + &rule.ID, + &rule.Name, + &rule.Description, + &rule.Enabled, + &rule.Severity, + &rule.MetricType, + &rule.Operator, + &rule.Threshold, + &rule.WindowMinutes, + &rule.SustainedMinutes, + &rule.CooldownMinutes, + &rule.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &rule.CreatedAt, + &rule.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + rule.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + rule.Filters = decoded + } + } + out = append(out, &rule) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_rules ( + name, + description, + enabled, + severity, + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + notify_email, + filters, + created_at, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW() +) +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + 
&out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + if input.ID <= 0 { + return nil, fmt.Errorf("invalid id") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +UPDATE ops_alert_rules +SET + name = $2, + description = $3, + enabled = $4, + severity = $5, + metric_type = $6, + operator = $7, + threshold = $8, + window_minutes = $9, + sustained_minutes = $10, + cooldown_minutes = $11, + notify_email = $12, + filters = $13, + updated_at = NOW() +WHERE id = $1 +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + &out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if id <= 0 { + return fmt.Errorf("invalid id") + } + + res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + +func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsAlertEventFilter{} + } + + limit := filter.Limit + if limit <= 0 { + limit = 100 + } + if limit > 500 { + 
limit = 500 + } + + where, args := buildOpsAlertEventsWhere(filter) + args = append(args, limit) + limitArg := "$" + itoa(len(args)) + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +` + where + ` +ORDER BY fired_at DESC +LIMIT ` + limitArg + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := []*service.OpsAlertEvent{} + for rows.Next() { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + if err := rows.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + out = append(out, &ev) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 AND status = $2 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if event == nil { + return nil, fmt.Errorf("nil event") + } + + dimensionsArg, err := 
opsNullJSONMap(event.Dimensions) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_events ( + rule_id, + severity, + status, + title, + description, + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW() +) +RETURNING + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at` + + row := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&event.RuleID), + opsNullString(event.Severity), + opsNullString(event.Status), + opsNullString(event.Title), + opsNullString(event.Description), + opsNullFloat64(event.MetricValue), + opsNullFloat64(event.ThresholdValue), + dimensionsArg, + event.FiredAt, + opsNullTime(event.ResolvedAt), + event.EmailSent, + ) + return scanOpsAlertEvent(row) +} + +func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + if strings.TrimSpace(status) == "" { + return fmt.Errorf("invalid status") + } + + q := ` +UPDATE ops_alert_events +SET status = $2, + resolved_at = $3 +WHERE id = $1` + + _, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt)) + return err +} + +func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + + _, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent) + return err +} + +type opsAlertEventRow interface { + Scan(dest ...any) error +} + +func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + + if err := row.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + return &ev, nil +} + +func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) { + clauses := []string{"1=1"} + args := []any{} + + if filter == nil { + return "WHERE " + strings.Join(clauses, " AND "), args + } + + if status := strings.TrimSpace(filter.Status); status != "" { + args = append(args, status) + clauses = append(clauses, "status = $"+itoa(len(args))) + } + if severity := strings.TrimSpace(filter.Severity); severity != "" { + args = append(args, severity) + clauses = append(clauses, "severity = $"+itoa(len(args))) + } + if filter.StartTime != nil && 
!filter.StartTime.IsZero() { + args = append(args, *filter.StartTime) + clauses = append(clauses, "fired_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, *filter.EndTime) + clauses = append(clauses, "fired_at < $"+itoa(len(args))) + } + + // Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes. + if platform := strings.TrimSpace(filter.Platform); platform != "" { + args = append(args, platform) + clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, fmt.Sprintf("%d", *filter.GroupID)) + clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args))) + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +func opsNullJSONMap(v map[string]any) (any, error) { + if v == nil { + return sql.NullString{}, nil + } + b, err := json.Marshal(v) + if err != nil { + return nil, err + } + if len(b) == 0 { + return sql.NullString{}, nil + } + return sql.NullString{String: string(b), Valid: true}, nil +} diff --git a/backend/internal/repository/ops_repo_dashboard.go b/backend/internal/repository/ops_repo_dashboard.go new file mode 100644 index 00000000..194020bb --- /dev/null +++ b/backend/internal/repository/ops_repo_dashboard.go @@ -0,0 +1,1013 @@ +package repository + +import ( + "context" + "database/sql" + "errors" + "fmt" + "math" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetDashboardOverview(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + mode := filter.QueryMode + if !mode.IsValid() { + mode = service.OpsQueryModeRaw + } + + switch mode { + case service.OpsQueryModePreagg: + return r.getDashboardOverviewPreaggregated(ctx, filter) + case service.OpsQueryModeAuto: + out, err := r.getDashboardOverviewPreaggregated(ctx, filter) + if err != nil && errors.Is(err, service.ErrOpsPreaggregatedNotPopulated) { + return r.getDashboardOverviewRaw(ctx, filter) + } + return out, err + default: + return r.getDashboardOverviewRaw(ctx, filter) + } +} + +func (r *opsRepository) getDashboardOverviewRaw(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + qpsCurrent, 
tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsDashboardPartial struct { + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + duration service.OpsPercentiles + ttft service.OpsPercentiles +} + +func (r *opsRepository) getDashboardOverviewPreaggregated(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + // Stable full-hour range covered by pre-aggregation. + aggSafeEnd := preaggSafeEnd(end) + aggFullStart := utcCeilToHour(start) + aggFullEnd := utcFloorToHour(aggSafeEnd) + + // If there are no stable full-hour buckets, use raw directly (short windows). + if !aggFullStart.Before(aggFullEnd) { + return r.getDashboardOverviewRaw(ctx, filter) + } + + // 1) Pre-aggregated stable segment. + preaggRows, err := r.listHourlyMetricsRows(ctx, filter, aggFullStart, aggFullEnd) + if err != nil { + return nil, err + } + if len(preaggRows) == 0 { + // Distinguish "no data" vs "preagg not populated yet". + if exists, err := r.rawOpsDataExists(ctx, filter, aggFullStart, aggFullEnd); err == nil && exists { + return nil, service.ErrOpsPreaggregatedNotPopulated + } + } + preagg := aggregateHourlyRows(preaggRows) + + // 2) Raw head/tail fragments (at most ~1 hour each). + head := opsDashboardPartial{} + tail := opsDashboardPartial{} + + if start.Before(aggFullStart) { + part, err := r.queryRawPartial(ctx, filter, start, minTime(end, aggFullStart)) + if err != nil { + return nil, err + } + head = *part + } + if aggFullEnd.Before(end) { + part, err := r.queryRawPartial(ctx, filter, maxTime(start, aggFullEnd), end) + if err != nil { + return nil, err + } + tail = *part + } + + // Merge counts. 
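+ // The three segments cover disjoint time ranges, so the additive counters
+ // below (request/error counts, tokens) merge exactly; only the latency
+ // percentiles are approximated further down. Illustrative example with an
+ // assumed window of 09:20-13:40 UTC: head = raw [09:20, 10:00), body =
+ // pre-aggregated [10:00, 13:00) (3 hourly buckets), tail = raw [13:00, 13:40).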
+ successCount := preagg.successCount + head.successCount + tail.successCount + errorTotal := preagg.errorCountTotal + head.errorCountTotal + tail.errorCountTotal + businessLimited := preagg.businessLimitedCount + head.businessLimitedCount + tail.businessLimitedCount + errorCountSLA := preagg.errorCountSLA + head.errorCountSLA + tail.errorCountSLA + + upstreamExcl := preagg.upstreamErrorCountExcl429529 + head.upstreamErrorCountExcl429529 + tail.upstreamErrorCountExcl429529 + upstream429 := preagg.upstream429Count + head.upstream429Count + tail.upstream429Count + upstream529 := preagg.upstream529Count + head.upstream529Count + tail.upstream529Count + + tokenConsumed := preagg.tokenConsumed + head.tokenConsumed + tail.tokenConsumed + + // Approximate percentiles across segments: + // - p50/p90/avg: weighted average by success_count + // - p95/p99/max: max (conservative tail) + duration := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.duration}, + {weight: head.successCount, p: head.duration}, + {weight: tail.successCount, p: tail.duration}, + }) + ttft := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.ttft}, + {weight: head.successCount, p: head.ttft}, + {weight: tail.successCount, p: tail.ttft}, + }) + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + // Keep "current" rates as raw, to preserve realtime semantics. + qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + // NOTE: peak still uses raw logs (minute granularity). This is typically cheaper than percentile_cont + // and keeps semantics consistent across modes. 
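+ // Rate semantics, with assumed numbers for illustration: "current" divides the
+ // last 60s of raw logs by 60 (e.g. 540 requests -> QPS 9.0), "peak" divides the
+ // busiest raw minute by 60 (e.g. 1230 requests -> QPS 20.5), and "avg" divides
+ // the window totals by the window length in seconds. All three are rounded to
+ // one decimal place by roundTo1DP.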
+ qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsHourlyMetricsRow struct { + bucketStart time.Time + + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + durationP50 sql.NullInt64 + durationP90 sql.NullInt64 + durationP95 sql.NullInt64 + durationP99 sql.NullInt64 + durationAvg sql.NullFloat64 + durationMax sql.NullInt64 + + ttftP50 sql.NullInt64 + ttftP90 sql.NullInt64 + ttftP95 sql.NullInt64 + ttftP99 sql.NullInt64 + ttftAvg sql.NullFloat64 + ttftMax sql.NullInt64 +} + +func (r *opsRepository) listHourlyMetricsRows(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) ([]opsHourlyMetricsRow, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if start.IsZero() || end.IsZero() || !start.Before(end) { + return []opsHourlyMetricsRow{}, nil + } + + where := "bucket_start >= $1 AND bucket_start < $2" + args := []any{start.UTC(), end.UTC()} + idx := 3 + + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + switch { + case groupID != nil && *groupID > 0: + where += fmt.Sprintf(" AND group_id = $%d", idx) + args = append(args, *groupID) + idx++ + if platform != "" { + where += fmt.Sprintf(" AND platform = $%d", idx) + args = append(args, platform) + // idx++ removed - not used after this + } + case platform != "": + where += fmt.Sprintf(" AND platform = $%d AND group_id IS NULL", idx) + args = append(args, platform) + // idx++ removed - not used after this + default: + where += " AND platform IS NULL AND group_id IS NULL" + } + + q := ` +SELECT + bucket_start, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms +FROM ops_metrics_hourly +WHERE ` + where + ` +ORDER BY bucket_start ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]opsHourlyMetricsRow, 0, 64) + for rows.Next() { + var row opsHourlyMetricsRow + if err := rows.Scan( + &row.bucketStart, + &row.successCount, + &row.errorCountTotal, + &row.businessLimitedCount, + &row.errorCountSLA, + &row.upstreamErrorCountExcl429529, + &row.upstream429Count, + &row.upstream529Count, + &row.tokenConsumed, + &row.durationP50, + &row.durationP90, + &row.durationP95, + &row.durationP99, + &row.durationAvg, + &row.durationMax, + &row.ttftP50, + &row.ttftP90, + &row.ttftP95, + &row.ttftP99, + &row.ttftAvg, + &row.ttftMax, + ); err != nil { + return nil, err + } + out = append(out, row) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func aggregateHourlyRows(rows []opsHourlyMetricsRow) opsDashboardPartial { + out := opsDashboardPartial{} + if len(rows) == 0 { + return out + } + + var ( + p50Sum float64 + p50W int64 + p90Sum float64 + p90W int64 + avgSum float64 + avgW int64 + ) + var ( + ttftP50Sum float64 + ttftP50W int64 + ttftP90Sum float64 + ttftP90W int64 + ttftAvgSum float64 + ttftAvgW int64 + ) + + var ( + p95Max *int + p99Max *int + maxMax *int + + ttftP95Max *int + ttftP99Max *int + ttftMaxMax *int + ) + + for _, row := range rows { + out.successCount += row.successCount + out.errorCountTotal += row.errorCountTotal + out.businessLimitedCount += row.businessLimitedCount + out.errorCountSLA += row.errorCountSLA + + out.upstreamErrorCountExcl429529 += row.upstreamErrorCountExcl429529 + out.upstream429Count += row.upstream429Count + out.upstream529Count += row.upstream529Count + + out.tokenConsumed += row.tokenConsumed + + if row.successCount > 0 { + if row.durationP50.Valid { + p50Sum += float64(row.durationP50.Int64) * float64(row.successCount) + p50W += row.successCount + } + if row.durationP90.Valid { + p90Sum += float64(row.durationP90.Int64) * float64(row.successCount) + p90W += row.successCount + } + if row.durationAvg.Valid { + avgSum += row.durationAvg.Float64 * float64(row.successCount) + avgW += row.successCount + } + if row.ttftP50.Valid { + ttftP50Sum += float64(row.ttftP50.Int64) * float64(row.successCount) + ttftP50W += row.successCount + } + if row.ttftP90.Valid { + ttftP90Sum += float64(row.ttftP90.Int64) * float64(row.successCount) + ttftP90W += row.successCount + } + if row.ttftAvg.Valid { + ttftAvgSum += row.ttftAvg.Float64 * float64(row.successCount) + ttftAvgW += row.successCount + } + } + + if row.durationP95.Valid { + v := int(row.durationP95.Int64) + if p95Max == nil || v > *p95Max { + p95Max = &v + } + } + if row.durationP99.Valid { + v := int(row.durationP99.Int64) + if p99Max == nil || v > *p99Max { + p99Max = &v + } + } + if row.durationMax.Valid { + v := int(row.durationMax.Int64) + if maxMax == nil || v > *maxMax { + maxMax = &v + } + } + + if row.ttftP95.Valid { + v := int(row.ttftP95.Int64) + if ttftP95Max == nil || v > *ttftP95Max { + ttftP95Max = &v + } + } + if row.ttftP99.Valid { + v := int(row.ttftP99.Int64) + if ttftP99Max == nil || v > *ttftP99Max { + ttftP99Max = &v + } + } + if row.ttftMax.Valid { + v := int(row.ttftMax.Int64) + if ttftMaxMax == nil || v > *ttftMaxMax { + ttftMaxMax = &v + } + } + } + + // duration + if p50W > 0 { + v := int(math.Round(p50Sum / float64(p50W))) + out.duration.P50 = &v + } + if p90W > 0 { + v := int(math.Round(p90Sum / float64(p90W))) + out.duration.P90 = &v + } + out.duration.P95 = p95Max + out.duration.P99 = p99Max + if avgW > 0 { + v := int(math.Round(avgSum / 
float64(avgW))) + out.duration.Avg = &v + } + out.duration.Max = maxMax + + // ttft + if ttftP50W > 0 { + v := int(math.Round(ttftP50Sum / float64(ttftP50W))) + out.ttft.P50 = &v + } + if ttftP90W > 0 { + v := int(math.Round(ttftP90Sum / float64(ttftP90W))) + out.ttft.P90 = &v + } + out.ttft.P95 = ttftP95Max + out.ttft.P99 = ttftP99Max + if ttftAvgW > 0 { + v := int(math.Round(ttftAvgSum / float64(ttftAvgW))) + out.ttft.Avg = &v + } + out.ttft.Max = ttftMaxMax + + return out +} + +func (r *opsRepository) queryRawPartial(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (*opsDashboardPartial, error) { + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &opsDashboardPartial{ + successCount: successCount, + errorCountTotal: errorTotal, + businessLimitedCount: businessLimited, + errorCountSLA: errorCountSLA, + upstreamErrorCountExcl429529: upstreamExcl, + upstream429Count: upstream429, + upstream529Count: upstream529, + tokenConsumed: tokenConsumed, + duration: duration, + ttft: ttft, + }, nil +} + +func (r *opsRepository) rawOpsDataExists(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (bool, error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM usage_logs ul ` + join + ` ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + if exists { + return true, nil + } + } + + { + where, args, _ := buildErrorWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM ops_error_logs ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + return exists, nil + } +} + +type opsPercentileSegment struct { + weight int64 + p service.OpsPercentiles +} + +func combineApproxPercentiles(segments []opsPercentileSegment) service.OpsPercentiles { + weightedInt := func(get func(service.OpsPercentiles) *int) *int { + var sum float64 + var w int64 + for _, seg := range segments { + if seg.weight <= 0 { + continue + } + v := get(seg.p) + if v == nil { + continue + } + sum += float64(*v) * float64(seg.weight) + w += seg.weight + } + if w <= 0 { + return nil + } + out := int(math.Round(sum / float64(w))) + return &out + } + + maxInt := func(get func(service.OpsPercentiles) *int) *int { + var max *int + for _, seg := range segments { + v := get(seg.p) + if v == nil { + continue + } + if max == nil || *v > *max { + c := *v + max = &c + } + } + return max + } + + return service.OpsPercentiles{ + P50: weightedInt(func(p service.OpsPercentiles) *int { return p.P50 }), + P90: weightedInt(func(p service.OpsPercentiles) *int { return p.P90 }), + P95: maxInt(func(p service.OpsPercentiles) *int { return p.P95 }), + P99: maxInt(func(p service.OpsPercentiles) *int { return p.P99 }), + Avg: weightedInt(func(p service.OpsPercentiles) *int { return p.Avg }), + Max: maxInt(func(p service.OpsPercentiles) *int { return p.Max }), + } +} + +func preaggSafeEnd(endTime time.Time) time.Time { + now := time.Now().UTC() + cutoff := now.Add(-5 * time.Minute) + if 
endTime.After(cutoff) { + return cutoff + } + return endTime +} + +func utcCeilToHour(t time.Time) time.Time { + u := t.UTC() + f := u.Truncate(time.Hour) + if f.Equal(u) { + return f + } + return f.Add(time.Hour) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} + +func maxTime(a, b time.Time) time.Time { + if a.After(b) { + return a + } + return b +} + +func (r *opsRepository) queryUsageCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs ul +` + join + ` +` + where + + var tokens sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (r *opsRepository) queryUsageLatency(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (duration service.OpsPercentiles, ttft service.OpsPercentiles, err error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + duration.P50 = floatToIntPtr(p50) + duration.P90 = floatToIntPtr(p90) + duration.P95 = floatToIntPtr(p95) + duration.P99 = floatToIntPtr(p99) + duration.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + duration.Max = &v + } + } + + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + ttft.P50 = floatToIntPtr(p50) + ttft.P90 = floatToIntPtr(p90) + ttft.P95 = floatToIntPtr(p95) + ttft.P99 = floatToIntPtr(p99) + ttft.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + ttft.Max = &v + } + } + + return duration, ttft, nil +} + +func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, 
end time.Time) ( + errorTotal int64, + businessLimited int64, + errorCountSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +` + where + + if err := r.db.QueryRowContext(ctx, q, args...).Scan( + &errorTotal, + &businessLimited, + &errorCountSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorCountSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +func (r *opsRepository) queryCurrentRates(ctx context.Context, filter *service.OpsDashboardFilter, end time.Time) (qpsCurrent float64, tpsCurrent float64, err error) { + windowStart := end.Add(-1 * time.Minute) + + successCount1m, token1m, err := r.queryUsageCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + errorCount1m, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + + qpsCurrent = roundTo1DP(float64(successCount1m+errorCount1m) / 60.0) + tpsCurrent = roundTo1DP(float64(token1m) / 60.0) + return qpsCurrent, tpsCurrent, nil +} + +func (r *opsRepository) queryPeakQPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + q := ` +WITH usage_buckets AS ( + SELECT date_trunc('minute', ul.created_at) AS bucket, COUNT(*) AS cnt + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt + FROM ops_error_logs + ` + errorWhere + ` + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.cnt, 0) + COALESCE(e.cnt, 0) AS total + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT COALESCE(MAX(total), 0) FROM combined` + + args := append(usageArgs, errorArgs...) 
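+ // Placeholder bookkeeping: buildUsageWhere numbers its parameters from $1 and
+ // returns the next free index, and buildErrorWhere continues from there, so
+ // appending errorArgs after usageArgs keeps the positional parameters of the
+ // combined CTE aligned. For example, with only a platform filter the usage
+ // CTE uses $1..$3 and the error CTE uses $4..$6.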
+ + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func (r *opsRepository) queryPeakTPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT COALESCE(MAX(tokens_per_min), 0) +FROM ( + SELECT + date_trunc('minute', ul.created_at) AS bucket, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS tokens_per_min + FROM usage_logs ul + ` + join + ` + ` + where + ` + GROUP BY 1 +) t` + + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func buildUsageWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (join string, where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("ul.created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("ul.created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("ul.group_id = $%d", idx)) + idx++ + } + if platform != "" { + // Prefer group.platform when available; fall back to account.platform so we don't + // drop rows where group_id is NULL. 
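+ // For a hypothetical filter {platform: "claude", group_id: 42} with startIndex 1,
+ // the full builder output is:
+ //   join : LEFT JOIN groups g ON g.id = ul.group_id
+ //          LEFT JOIN accounts a ON a.id = ul.account_id
+ //   where: WHERE ul.created_at >= $1 AND ul.created_at < $2
+ //          AND ul.group_id = $3
+ //          AND COALESCE(NULLIF(g.platform,''), a.platform) = $4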
+ join = "LEFT JOIN groups g ON g.id = ul.group_id LEFT JOIN accounts a ON a.id = ul.account_id" + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("COALESCE(NULLIF(g.platform,''), a.platform) = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return join, where, args, idx +} + +func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx)) + idx++ + } + if platform != "" { + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("platform = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return where, args, idx +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func safeDivideFloat64(numerator float64, denominator float64) float64 { + if denominator == 0 { + return 0 + } + return numerator / denominator +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func roundTo4DP(v float64) float64 { + return math.Round(v*10000) / 10000 +} diff --git a/backend/internal/repository/ops_repo_histograms.go b/backend/internal/repository/ops_repo_histograms.go new file mode 100644 index 00000000..c2978798 --- /dev/null +++ b/backend/internal/repository/ops_repo_histograms.go @@ -0,0 +1,79 @@ +package repository + +import ( + "context" + "fmt" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms") + orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms") + + q := ` +SELECT + ` + rangeExpr + ` AS range, + COALESCE(COUNT(*), 0) AS count, + ` + orderExpr + ` AS ord +FROM usage_logs ul +` + join + ` +` + where + ` +AND ul.duration_ms IS NOT NULL +GROUP BY 1, 3 +ORDER BY 3 ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + counts := make(map[string]int64, len(latencyHistogramOrderedRanges)) + var total int64 + for rows.Next() { + var label string + var count int64 + var _ord int + if err := rows.Scan(&label, &count, &_ord); err != nil { + return nil, err + } + counts[label] = count + total += count + } + if err := rows.Err(); err != nil { + return nil, err + } + + buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges)) + for _, label := range latencyHistogramOrderedRanges { + buckets = append(buckets, &service.OpsLatencyHistogramBucket{ + Range: label, + Count: counts[label], + }) + } + + return &service.OpsLatencyHistogramResponse{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + TotalRequests: total, + Buckets: buckets, + }, nil +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets.go b/backend/internal/repository/ops_repo_latency_histogram_buckets.go new file mode 100644 index 00000000..cd5bed37 --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets.go @@ -0,0 +1,64 @@ +package repository + +import ( + "fmt" + "strings" +) + +type latencyHistogramBucket struct { + upperMs int + label string +} + +var latencyHistogramBuckets = []latencyHistogramBucket{ + {upperMs: 100, label: "0-100ms"}, + {upperMs: 200, label: "100-200ms"}, + {upperMs: 500, label: "200-500ms"}, + {upperMs: 1000, label: "500-1000ms"}, + {upperMs: 2000, label: "1000-2000ms"}, + {upperMs: 0, label: "2000ms+"}, // default bucket +} + +var latencyHistogramOrderedRanges = func() []string { + out := make([]string, 0, len(latencyHistogramBuckets)) + for _, b := range latencyHistogramBuckets { + out = append(out, b.label) + } + return out +}() + +func latencyHistogramRangeCaseExpr(column string) string { + var sb strings.Builder + _, _ = sb.WriteString("CASE\n") + + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + _, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label)) + } + + // Default bucket. 
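+ // (The final entry, upperMs == 0, becomes the ELSE arm below.) With the default
+ // buckets above, the expression generated for column "ul.duration_ms" is:
+ //
+ //   CASE
+ //       WHEN ul.duration_ms < 100 THEN '0-100ms'
+ //       WHEN ul.duration_ms < 200 THEN '100-200ms'
+ //       WHEN ul.duration_ms < 500 THEN '200-500ms'
+ //       WHEN ul.duration_ms < 1000 THEN '500-1000ms'
+ //       WHEN ul.duration_ms < 2000 THEN '1000-2000ms'
+ //       ELSE '2000ms+'
+ //   END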
+ last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1] + _, _ = sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label)) + _, _ = sb.WriteString("END") + return sb.String() +} + +func latencyHistogramRangeOrderCaseExpr(column string) string { + var sb strings.Builder + _, _ = sb.WriteString("CASE\n") + + order := 1 + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + _, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order)) + order++ + } + + _, _ = sb.WriteString(fmt.Sprintf("\tELSE %d\n", order)) + _, _ = sb.WriteString("END") + return sb.String() +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go new file mode 100644 index 00000000..dc79f6cc --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go @@ -0,0 +1,14 @@ +package repository + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) { + require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges)) + for i, b := range latencyHistogramBuckets { + require.Equal(t, b.label, latencyHistogramOrderedRanges[i]) + } +} diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go new file mode 100644 index 00000000..bc80ed6e --- /dev/null +++ b/backend/internal/repository/ops_repo_metrics.go @@ -0,0 +1,422 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + + window := input.WindowMinutes + if window <= 0 { + window = 1 + } + createdAt := input.CreatedAt + if createdAt.IsZero() { + createdAt = time.Now().UTC() + } + + q := ` +INSERT INTO ops_system_metrics ( + created_at, + window_minutes, + platform, + group_id, + + success_count, + error_count_total, + business_limited_count, + error_count_sla, + + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + + token_consumed, + qps, + tps, + + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + redis_conn_total, + redis_conn_idle, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +) VALUES ( + $1,$2,$3,$4, + $5,$6,$7,$8, + $9,$10,$11, + $12,$13,$14, + $15,$16,$17,$18,$19,$20, + $21,$22,$23,$24,$25,$26, + $27,$28,$29,$30, + $31,$32, + $33,$34, + $35,$36,$37, + $38,$39 +)` + + _, err := r.db.ExecContext( + ctx, + q, + createdAt, + window, + opsNullString(input.Platform), + opsNullInt64(input.GroupID), + + input.SuccessCount, + input.ErrorCountTotal, + input.BusinessLimitedCount, + input.ErrorCountSLA, + + input.UpstreamErrorCountExcl429529, + input.Upstream429Count, + input.Upstream529Count, + + input.TokenConsumed, + opsNullFloat64(input.QPS), + opsNullFloat64(input.TPS), + + opsNullInt(input.DurationP50Ms), + opsNullInt(input.DurationP90Ms), + 
opsNullInt(input.DurationP95Ms), + opsNullInt(input.DurationP99Ms), + opsNullFloat64(input.DurationAvgMs), + opsNullInt(input.DurationMaxMs), + + opsNullInt(input.TTFTP50Ms), + opsNullInt(input.TTFTP90Ms), + opsNullInt(input.TTFTP95Ms), + opsNullInt(input.TTFTP99Ms), + opsNullFloat64(input.TTFTAvgMs), + opsNullInt(input.TTFTMaxMs), + + opsNullFloat64(input.CPUUsagePercent), + opsNullInt(input.MemoryUsedMB), + opsNullInt(input.MemoryTotalMB), + opsNullFloat64(input.MemoryUsagePercent), + + opsNullBool(input.DBOK), + opsNullBool(input.RedisOK), + + opsNullInt(input.RedisConnTotal), + opsNullInt(input.RedisConnIdle), + + opsNullInt(input.DBConnActive), + opsNullInt(input.DBConnIdle), + opsNullInt(input.DBConnWaiting), + + opsNullInt(input.GoroutineCount), + opsNullInt(input.ConcurrencyQueueDepth), + ) + return err +} + +func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + + q := ` +SELECT + id, + created_at, + window_minutes, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + redis_conn_total, + redis_conn_idle, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +FROM ops_system_metrics +WHERE window_minutes = $1 + AND platform IS NULL + AND group_id IS NULL +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsSystemMetricsSnapshot + var cpu sql.NullFloat64 + var memUsed sql.NullInt64 + var memTotal sql.NullInt64 + var memPct sql.NullFloat64 + var dbOK sql.NullBool + var redisOK sql.NullBool + var redisTotal sql.NullInt64 + var redisIdle sql.NullInt64 + var dbActive sql.NullInt64 + var dbIdle sql.NullInt64 + var dbWaiting sql.NullInt64 + var goroutines sql.NullInt64 + var queueDepth sql.NullInt64 + + if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan( + &out.ID, + &out.CreatedAt, + &out.WindowMinutes, + &cpu, + &memUsed, + &memTotal, + &memPct, + &dbOK, + &redisOK, + &redisTotal, + &redisIdle, + &dbActive, + &dbIdle, + &dbWaiting, + &goroutines, + &queueDepth, + ); err != nil { + return nil, err + } + + if cpu.Valid { + v := cpu.Float64 + out.CPUUsagePercent = &v + } + if memUsed.Valid { + v := memUsed.Int64 + out.MemoryUsedMB = &v + } + if memTotal.Valid { + v := memTotal.Int64 + out.MemoryTotalMB = &v + } + if memPct.Valid { + v := memPct.Float64 + out.MemoryUsagePercent = &v + } + if dbOK.Valid { + v := dbOK.Bool + out.DBOK = &v + } + if redisOK.Valid { + v := redisOK.Bool + out.RedisOK = &v + } + if redisTotal.Valid { + v := int(redisTotal.Int64) + out.RedisConnTotal = &v + } + if redisIdle.Valid { + v := int(redisIdle.Int64) + out.RedisConnIdle = &v + } + if dbActive.Valid { + v := int(dbActive.Int64) + out.DBConnActive = &v + } + if dbIdle.Valid { + v := int(dbIdle.Int64) + out.DBConnIdle = &v + } + if dbWaiting.Valid { + v := int(dbWaiting.Int64) + out.DBConnWaiting = &v + } + if goroutines.Valid { + v := int(goroutines.Int64) + out.GoroutineCount = &v + } + if queueDepth.Valid { + v := int(queueDepth.Int64) + out.ConcurrencyQueueDepth = &v + } + + return &out, nil +} + +func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.JobName == "" { + return 
fmt.Errorf("job_name required") + } + + q := ` +INSERT INTO ops_job_heartbeats ( + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,NOW() +) +ON CONFLICT (job_name) DO UPDATE SET + last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at), + last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at), + last_error_at = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at) + END, + last_error = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error) + END, + last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms), + updated_at = NOW()` + + _, err := r.db.ExecContext( + ctx, + q, + input.JobName, + opsNullTime(input.LastRunAt), + opsNullTime(input.LastSuccessAt), + opsNullTime(input.LastErrorAt), + opsNullString(input.LastError), + opsNullInt(input.LastDurationMs), + ) + return err +} + +func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +FROM ops_job_heartbeats +ORDER BY job_name ASC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsJobHeartbeat, 0, 8) + for rows.Next() { + var item service.OpsJobHeartbeat + var lastRun sql.NullTime + var lastSuccess sql.NullTime + var lastErrorAt sql.NullTime + var lastError sql.NullString + var lastDuration sql.NullInt64 + + if err := rows.Scan( + &item.JobName, + &lastRun, + &lastSuccess, + &lastErrorAt, + &lastError, + &lastDuration, + &item.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastRun.Valid { + v := lastRun.Time + item.LastRunAt = &v + } + if lastSuccess.Valid { + v := lastSuccess.Time + item.LastSuccessAt = &v + } + if lastErrorAt.Valid { + v := lastErrorAt.Time + item.LastErrorAt = &v + } + if lastError.Valid { + v := lastError.String + item.LastError = &v + } + if lastDuration.Valid { + v := lastDuration.Int64 + item.LastDurationMs = &v + } + + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func opsNullBool(v *bool) any { + if v == nil { + return sql.NullBool{} + } + return sql.NullBool{Bool: *v, Valid: true} +} + +func opsNullFloat64(v *float64) any { + if v == nil { + return sql.NullFloat64{} + } + return sql.NullFloat64{Float64: *v, Valid: true} +} + +func opsNullTime(v *time.Time) any { + if v == nil || v.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: *v, Valid: true} +} diff --git a/backend/internal/repository/ops_repo_preagg.go b/backend/internal/repository/ops_repo_preagg.go new file mode 100644 index 00000000..fc74e4f6 --- /dev/null +++ b/backend/internal/repository/ops_repo_preagg.go @@ -0,0 +1,359 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start 
:= startTime.UTC() + end := endTime.UTC() + + // NOTE: + // - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly. + // - We emit three dimension granularities via GROUPING SETS: + // 1) overall: (bucket_start) + // 2) platform: (bucket_start, platform) + // 3) group: (bucket_start, platform, group_id) + // + // IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based + // unique index; our ON CONFLICT target must match that expression set. + q := ` +WITH usage_base AS ( + SELECT + date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + g.platform AS platform, + ul.group_id AS group_id, + ul.duration_ms AS duration_ms, + ul.first_token_ms AS first_token_ms, + (ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens + FROM usage_logs ul + JOIN groups g ON g.id = ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 +), +usage_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) AS success_count, + COALESCE(SUM(tokens), 0) AS token_consumed, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms, + AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms, + MAX(duration_ms) AS duration_max_ms, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms, + AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms, + MAX(first_token_ms) AS ttft_max_ms + FROM usage_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) +), +error_base AS ( + SELECT + date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + platform AS platform, + group_id AS group_id, + is_business_limited AS is_business_limited, + error_owner AS error_owner, + status_code AS client_status_code, + COALESCE(upstream_status_code, status_code, 0) AS effective_status_code + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 +), +error_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count, + COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT 
is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count + FROM error_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) + HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL +), +combined AS ( + SELECT + COALESCE(u.bucket_start, e.bucket_start) AS bucket_start, + COALESCE(u.platform, e.platform) AS platform, + COALESCE(u.group_id, e.group_id) AS group_id, + + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count_total, 0) AS error_count_total, + COALESCE(e.business_limited_count, 0) AS business_limited_count, + COALESCE(e.error_count_sla, 0) AS error_count_sla, + COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529, + COALESCE(e.upstream_429_count, 0) AS upstream_429_count, + COALESCE(e.upstream_529_count, 0) AS upstream_529_count, + + COALESCE(u.token_consumed, 0) AS token_consumed, + + u.duration_p50_ms, + u.duration_p90_ms, + u.duration_p95_ms, + u.duration_p99_ms, + u.duration_avg_ms, + u.duration_max_ms, + + u.ttft_p50_ms, + u.ttft_p90_ms, + u.ttft_p95_ms, + u.ttft_p99_ms, + u.ttft_avg_ms, + u.ttft_max_ms + FROM usage_agg u + FULL OUTER JOIN error_agg e + ON u.bucket_start = e.bucket_start + AND COALESCE(u.platform, '') = COALESCE(e.platform, '') + AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0) +) +INSERT INTO ops_metrics_hourly ( + bucket_start, + platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + bucket_start, + NULLIF(platform, '') AS platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms::int, + duration_p90_ms::int, + duration_p95_ms::int, + duration_p99_ms::int, + duration_avg_ms, + duration_max_ms::int, + ttft_p50_ms::int, + ttft_p90_ms::int, + ttft_p95_ms::int, + ttft_p99_ms::int, + ttft_avg_ms, + ttft_max_ms::int, + NOW() +FROM combined +WHERE bucket_start IS NOT NULL + AND (platform IS NULL OR platform <> '') +ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = 
EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start := startTime.UTC() + end := endTime.UTC() + + q := ` +INSERT INTO ops_metrics_daily ( + bucket_date, + platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + (bucket_start AT TIME ZONE 'UTC')::date AS bucket_date, + platform, + group_id, + + COALESCE(SUM(success_count), 0) AS success_count, + COALESCE(SUM(error_count_total), 0) AS error_count_total, + COALESCE(SUM(business_limited_count), 0) AS business_limited_count, + COALESCE(SUM(error_count_sla), 0) AS error_count_sla, + COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529, + COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count, + COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count, + COALESCE(SUM(token_consumed), 0) AS token_consumed, + + -- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail). 
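+ -- Worked example with assumed numbers: two hourly rows with success_count
+ -- 900 and 100 and duration_p50_ms 800 and 1200 roll up to
+ -- (800*900 + 1200*100) / 1000 = 840, while duration_p95_ms values of
+ -- 2500 and 4000 roll up to MAX(...) = 4000.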
+ ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms, + ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms, + MAX(duration_p95_ms) AS duration_p95_ms, + MAX(duration_p99_ms) AS duration_p99_ms, + SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms, + MAX(duration_max_ms) AS duration_max_ms, + + ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms, + ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms, + MAX(ttft_p95_ms) AS ttft_p95_ms, + MAX(ttft_p99_ms) AS ttft_p99_ms, + SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms, + MAX(ttft_max_ms) AS ttft_max_ms, + + NOW() +FROM ops_metrics_hourly +WHERE bucket_start >= $1 AND bucket_start < $2 +GROUP BY 1, 2, 3 +ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + return value.Time.UTC(), true, nil +} + +func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + t := value.Time.UTC() + return 
time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil +} diff --git a/backend/internal/repository/ops_repo_request_details.go b/backend/internal/repository/ops_repo_request_details.go new file mode 100644 index 00000000..d8d5d111 --- /dev/null +++ b/backend/internal/repository/ops_repo_request_details.go @@ -0,0 +1,286 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) { + if r == nil || r.db == nil { + return nil, 0, fmt.Errorf("nil ops repository") + } + + page, pageSize, startTime, endTime := filter.Normalize() + offset := (page - 1) * pageSize + + conditions := make([]string, 0, 16) + args := make([]any, 0, 24) + + // Placeholders $1/$2 reserved for time window inside the CTE. + args = append(args, startTime.UTC(), endTime.UTC()) + + addCondition := func(condition string, values ...any) { + conditions = append(conditions, condition) + args = append(args, values...) + } + + if filter != nil { + if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" { + if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) { + return nil, 0, fmt.Errorf("invalid kind") + } + addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind) + } + + if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" { + addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID) + } + + if filter.UserID != nil && *filter.UserID > 0 { + addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID) + } + if filter.APIKeyID != nil && *filter.APIKeyID > 0 { + addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID) + } + + if model := strings.TrimSpace(filter.Model); model != "" { + addCondition(fmt.Sprintf("model = $%d", len(args)+1), model) + } + if requestID := strings.TrimSpace(filter.RequestID); requestID != "" { + addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID) + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + strings.ToLower(q) + "%" + startIdx := len(args) + 1 + addCondition( + fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)", + startIdx, startIdx+1, startIdx+2, + ), + like, like, like, + ) + } + + if filter.MinDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs) + } + if filter.MaxDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs) + } + } + + where := "" + if len(conditions) > 0 { + where = "WHERE " + strings.Join(conditions, " AND ") + } + + cte := ` +WITH combined AS ( + SELECT + 'success'::TEXT AS kind, + ul.created_at AS created_at, + ul.request_id AS request_id, + COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + ul.model AS model, + ul.duration_ms AS duration_ms, + NULL::INT AS status_code, + NULL::BIGINT AS error_id, + NULL::TEXT AS phase, + NULL::TEXT AS severity, + NULL::TEXT AS message, + 
ul.user_id AS user_id, + ul.api_key_id AS api_key_id, + ul.account_id AS account_id, + ul.group_id AS group_id, + ul.stream AS stream + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + + UNION ALL + + SELECT + 'error'::TEXT AS kind, + o.created_at AS created_at, + COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id, + COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + o.model AS model, + o.duration_ms AS duration_ms, + o.status_code AS status_code, + o.id AS error_id, + o.error_phase AS phase, + o.severity AS severity, + o.error_message AS message, + o.user_id AS user_id, + o.api_key_id AS api_key_id, + o.account_id AS account_id, + o.group_id AS group_id, + o.stream AS stream + FROM ops_error_logs o + LEFT JOIN groups g ON g.id = o.group_id + LEFT JOIN accounts a ON a.id = o.account_id + WHERE o.created_at >= $1 AND o.created_at < $2 + AND COALESCE(o.status_code, 0) >= 400 +) +` + + countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where) + var total int64 + if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil { + if err == sql.ErrNoRows { + total = 0 + } else { + return nil, 0, err + } + } + + sort := "ORDER BY created_at DESC" + if filter != nil { + switch strings.TrimSpace(strings.ToLower(filter.Sort)) { + case "", "created_at_desc": + // default + case "duration_desc": + sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC" + default: + return nil, 0, fmt.Errorf("invalid sort") + } + } + + listQuery := fmt.Sprintf(` +%s +SELECT + kind, + created_at, + request_id, + platform, + model, + duration_ms, + status_code, + error_id, + phase, + severity, + message, + user_id, + api_key_id, + account_id, + group_id, + stream +FROM combined +%s +%s +LIMIT $%d OFFSET $%d +`, cte, where, sort, len(args)+1, len(args)+2) + + listArgs := append(append([]any{}, args...), pageSize, offset) + rows, err := r.db.QueryContext(ctx, listQuery, listArgs...) 
+ if err != nil { + return nil, 0, err + } + defer func() { _ = rows.Close() }() + + toIntPtr := func(v sql.NullInt64) *int { + if !v.Valid { + return nil + } + i := int(v.Int64) + return &i + } + toInt64Ptr := func(v sql.NullInt64) *int64 { + if !v.Valid { + return nil + } + i := v.Int64 + return &i + } + + out := make([]*service.OpsRequestDetail, 0, pageSize) + for rows.Next() { + var ( + kind string + createdAt time.Time + requestID sql.NullString + platform sql.NullString + model sql.NullString + + durationMs sql.NullInt64 + statusCode sql.NullInt64 + errorID sql.NullInt64 + + phase sql.NullString + severity sql.NullString + message sql.NullString + + userID sql.NullInt64 + apiKeyID sql.NullInt64 + accountID sql.NullInt64 + groupID sql.NullInt64 + + stream bool + ) + + if err := rows.Scan( + &kind, + &createdAt, + &requestID, + &platform, + &model, + &durationMs, + &statusCode, + &errorID, + &phase, + &severity, + &message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &stream, + ); err != nil { + return nil, 0, err + } + + item := &service.OpsRequestDetail{ + Kind: service.OpsRequestKind(kind), + CreatedAt: createdAt, + RequestID: strings.TrimSpace(requestID.String), + Platform: strings.TrimSpace(platform.String), + Model: strings.TrimSpace(model.String), + + DurationMs: toIntPtr(durationMs), + StatusCode: toIntPtr(statusCode), + ErrorID: toInt64Ptr(errorID), + Phase: phase.String, + Severity: severity.String, + Message: message.String, + + UserID: toInt64Ptr(userID), + APIKeyID: toInt64Ptr(apiKeyID), + AccountID: toInt64Ptr(accountID), + GroupID: toInt64Ptr(groupID), + + Stream: stream, + } + + if item.Platform == "" { + item.Platform = "unknown" + } + + out = append(out, item) + } + if err := rows.Err(); err != nil { + return nil, 0, err + } + + return out, total, nil +} diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go new file mode 100644 index 00000000..e4ac96d3 --- /dev/null +++ b/backend/internal/repository/ops_repo_trends.go @@ -0,0 +1,571 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + // Keep a small, predictable set of supported buckets for now. 
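+ // Any other value (e.g. 120 or 900) is coerced to the 60s default below rather
+ // than being rejected with an error.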
+ bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + usageBucketExpr := opsBucketExprForUsage(bucketSeconds) + errorBucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +WITH usage_buckets AS ( + SELECT ` + usageBucketExpr + ` AS bucket, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT ` + errorBucketExpr + ` AS bucket, + COUNT(*) AS error_count + FROM ops_error_logs + ` + errorWhere + ` + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT + bucket, + (success_count + error_count) AS request_count, + token_consumed +FROM combined +ORDER BY bucket ASC` + + args := append(usageArgs, errorArgs...) + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + points := make([]*service.OpsThroughputTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&bucket, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + + denom := float64(bucketSeconds) + if denom <= 0 { + denom = 60 + } + qps := roundTo1DP(float64(requests) / denom) + tps := roundTo1DP(float64(tokenConsumed) / denom) + + points = append(points, &service.OpsThroughputTrendPoint{ + BucketStart: bucket.UTC(), + RequestCount: requests, + TokenConsumed: tokenConsumed, + QPS: qps, + TPS: tps, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Fill missing buckets with zeros so charts render continuous timelines. 
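+ // Illustration: a 10:00-10:10 UTC window at 60s resolution always yields ten points;
+ // minutes with no traffic come back with zero requests, tokens, QPS and TPS.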
+ points = fillOpsThroughputBuckets(start, end, bucketSeconds, points) + + var byPlatform []*service.OpsThroughputPlatformBreakdownItem + var topGroups []*service.OpsThroughputGroupBreakdownItem + + platform := "" + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + } + groupID := (*int64)(nil) + if filter != nil { + groupID = filter.GroupID + } + + // Drilldown helpers: + // - No platform/group: totals by platform + // - Platform selected but no group: top groups in that platform + if platform == "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputBreakdownByPlatform(ctx, start, end) + if err != nil { + return nil, err + } + byPlatform = items + } else if platform != "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10) + if err != nil { + return nil, err + } + topGroups = items + } + + return &service.OpsThroughputTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + + ByPlatform: byPlatform, + TopGroups: topGroups, + }, nil +} + +func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) { + q := ` +WITH usage_totals AS ( + SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + GROUP BY 1 +), +error_totals AS ( + SELECT platform, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.platform, e.platform) AS platform, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.platform = e.platform +) +SELECT platform, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE platform IS NOT NULL AND platform <> '' +ORDER BY request_count DESC` + + rows, err := r.db.QueryContext(ctx, q, start, end) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8) + for rows.Next() { + var platform string + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&platform, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + items = append(items, &service.OpsThroughputPlatformBreakdownItem{ + Platform: platform, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) { + if strings.TrimSpace(platform) == "" { + return nil, nil + } + if limit <= 0 || limit > 100 { + limit = 10 + } + + q := ` +WITH usage_totals AS ( + SELECT ul.group_id AS group_id, + g.name AS group_name, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS 
token_consumed + FROM usage_logs ul + JOIN groups g ON g.id = ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + AND g.platform = $3 + GROUP BY 1, 2 +), +error_totals AS ( + SELECT group_id, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + AND platform = $3 + AND group_id IS NOT NULL + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.group_id, e.group_id) AS group_id, + COALESCE(u.group_name, g2.name, '') AS group_name, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.group_id = e.group_id + LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id) +) +SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE group_id IS NOT NULL +ORDER BY request_count DESC +LIMIT $4` + + rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit) + for rows.Next() { + var groupID int64 + var groupName sql.NullString + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + name := "" + if groupName.Valid { + name = groupName.String + } + items = append(items, &service.OpsThroughputGroupBreakdownItem{ + GroupID: groupID, + GroupName: name, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func opsBucketExprForUsage(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', ul.created_at)" + case 300: + // 5-minute buckets in UTC. 
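+ // e.g. a created_at of 12:07:41Z lands in the 12:05:00Z bucket.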
+ return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)" + default: + return "date_trunc('minute', ul.created_at)" + } +} + +func opsBucketExprForError(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', created_at)" + case 300: + return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)" + default: + return "date_trunc('minute', created_at)" + } +} + +func opsBucketLabel(bucketSeconds int) string { + if bucketSeconds <= 0 { + return "1m" + } + if bucketSeconds%3600 == 0 { + h := bucketSeconds / 3600 + if h <= 0 { + h = 1 + } + return fmt.Sprintf("%dh", h) + } + m := bucketSeconds / 60 + if m <= 0 { + m = 1 + } + return fmt.Sprintf("%dm", m) +} + +func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time { + t = t.UTC() + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + secs := t.Unix() + floored := secs - (secs % int64(bucketSeconds)) + return time.Unix(floored, 0).UTC() +} + +func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsThroughputTrendPoint{ + BucketStart: cursor, + RequestCount: 0, + TokenConsumed: 0, + QPS: 0, + TPS: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + bucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +SELECT + ` + bucketExpr + ` AS bucket, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited, + COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT 
is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529 +FROM ops_error_logs +` + where + ` +GROUP BY 1 +ORDER BY 1 ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + points := make([]*service.OpsErrorTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64 + if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil { + return nil, err + } + points = append(points, &service.OpsErrorTrendPoint{ + BucketStart: bucket.UTC(), + + ErrorCountTotal: total, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: sla, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points) + + return &service.OpsErrorTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + }, nil +} + +func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsErrorTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsErrorTrendPoint{ + BucketStart: cursor, + + ErrorCountTotal: 0, + BusinessLimitedCount: 0, + ErrorCountSLA: 0, + + UpstreamErrorCountExcl429529: 0, + Upstream429Count: 0, + Upstream529Count: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(upstream_status_code, status_code, 0) AS status_code, + COUNT(*) AS total, + COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla, + COUNT(*) FILTER (WHERE is_business_limited) AS business_limited +FROM ops_error_logs +` + where + ` + AND COALESCE(status_code, 0) >= 400 +GROUP BY 1 +ORDER BY total DESC +LIMIT 20` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + items := make([]*service.OpsErrorDistributionItem, 0, 16) + var total int64 + for rows.Next() { + var statusCode int + var cntTotal, cntSLA, cntBiz int64 + if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil { + return nil, err + } + total += cntTotal + items = append(items, &service.OpsErrorDistributionItem{ + StatusCode: statusCode, + Total: cntTotal, + SLA: cntSLA, + BusinessLimited: cntBiz, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorDistributionResponse{ + Total: total, + Items: items, + }, nil +} diff --git a/backend/internal/repository/ops_repo_window_stats.go b/backend/internal/repository/ops_repo_window_stats.go new file mode 100644 index 00000000..8221c473 --- /dev/null +++ b/backend/internal/repository/ops_repo_window_stats.go @@ -0,0 +1,50 @@ +package repository + +import ( + "context" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + if start.After(end) { + return nil, fmt.Errorf("start_time must be <= end_time") + } + // Bound excessively large windows to prevent accidental heavy queries. + if end.Sub(start) > 24*time.Hour { + return nil, fmt.Errorf("window too large") + } + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &service.OpsWindowStats{ + StartTime: start, + EndTime: end, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + TokenConsumed: tokenConsumed, + }, nil +} diff --git a/backend/internal/repository/usage_log_repo_integration_test.go b/backend/internal/repository/usage_log_repo_integration_test.go index 51964782..3f90e49e 100644 --- a/backend/internal/repository/usage_log_repo_integration_test.go +++ b/backend/internal/repository/usage_log_repo_integration_test.go @@ -204,7 +204,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() { userToday := mustCreateUser(s.T(), s.client, &service.User{ Email: "today@example.com", - CreatedAt: maxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)), + CreatedAt: testMaxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)), UpdatedAt: now, }) userOld := mustCreateUser(s.T(), s.client, &service.User{ @@ -237,7 +237,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() { TotalCost: 1.5, ActualCost: 1.2, DurationMs: &d1, - CreatedAt: maxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)), + CreatedAt: testMaxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)), } _, err = s.repo.Create(s.ctx, logToday) s.Require().NoError(err, "Create logToday") @@ -413,9 +413,17 @@ func (s *UsageLogRepoSuite) TestGetAccountTodayStats() { func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() { now := time.Now().UTC().Truncate(time.Second) - hour1 := now.Add(-90 * time.Minute).Truncate(time.Hour) 
- hour2 := now.Add(-30 * time.Minute).Truncate(time.Hour) + // 使用固定的时间偏移确保 hour1 和 hour2 在同一天且都在过去 + // 选择当天 02:00 和 03:00 作为测试时间点(基于 now 的日期) dayStart := truncateToDayUTC(now) + hour1 := dayStart.Add(2 * time.Hour) // 当天 02:00 + hour2 := dayStart.Add(3 * time.Hour) // 当天 03:00 + // 如果当前时间早于 hour2,则使用昨天的时间 + if now.Before(hour2.Add(time.Hour)) { + dayStart = dayStart.Add(-24 * time.Hour) + hour1 = dayStart.Add(2 * time.Hour) + hour2 = dayStart.Add(3 * time.Hour) + } user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"}) user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"}) @@ -473,7 +481,7 @@ func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() { aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx) aggStart := hour1.Add(-5 * time.Minute) - aggEnd := now.Add(5 * time.Minute) + aggEnd := hour2.Add(time.Hour) // 确保覆盖 hour2 的所有数据 s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd)) type hourlyRow struct { @@ -621,7 +629,7 @@ func (s *UsageLogRepoSuite) TestGetGlobalStats() { s.Require().Equal(int64(45), stats.TotalOutputTokens) } -func maxTime(a, b time.Time) time.Time { +func testMaxTime(a, b time.Time) time.Time { if a.After(b) { return a } diff --git a/backend/internal/repository/wire.go b/backend/internal/repository/wire.go index 8cc937bb..e1c6c3d4 100644 --- a/backend/internal/repository/wire.go +++ b/backend/internal/repository/wire.go @@ -49,6 +49,7 @@ var ProviderSet = wire.NewSet( NewUsageLogRepository, NewDashboardAggregationRepository, NewSettingRepository, + NewOpsRepository, NewUserSubscriptionRepository, NewUserAttributeDefinitionRepository, NewUserAttributeValueRepository, diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go index ebb98a50..d96732bd 100644 --- a/backend/internal/server/api_contract_test.go +++ b/backend/internal/server/api_contract_test.go @@ -262,11 +262,11 @@ func TestAPIContracts(t *testing.T) { name: "GET /api/v1/admin/settings", setup: func(t *testing.T, deps *contractDeps) { t.Helper() - deps.settingRepo.SetAll(map[string]string{ - service.SettingKeyRegistrationEnabled: "true", - service.SettingKeyEmailVerifyEnabled: "false", + deps.settingRepo.SetAll(map[string]string{ + service.SettingKeyRegistrationEnabled: "true", + service.SettingKeyEmailVerifyEnabled: "false", - service.SettingKeySMTPHost: "smtp.example.com", + service.SettingKeySMTPHost: "smtp.example.com", service.SettingKeySMTPPort: "587", service.SettingKeySMTPUsername: "user", service.SettingKeySMTPPassword: "secret", @@ -285,10 +285,15 @@ func TestAPIContracts(t *testing.T) { service.SettingKeyContactInfo: "support", service.SettingKeyDocURL: "https://docs.example.com", - service.SettingKeyDefaultConcurrency: "5", - service.SettingKeyDefaultBalance: "1.25", - }) - }, + service.SettingKeyDefaultConcurrency: "5", + service.SettingKeyDefaultBalance: "1.25", + + service.SettingKeyOpsMonitoringEnabled: "false", + service.SettingKeyOpsRealtimeMonitoringEnabled: "true", + service.SettingKeyOpsQueryModeDefault: "auto", + service.SettingKeyOpsMetricsIntervalSeconds: "60", + }) + }, method: http.MethodGet, path: "/api/v1/admin/settings", wantStatus: http.StatusOK, @@ -309,13 +314,17 @@ func TestAPIContracts(t *testing.T) { "turnstile_site_key": "site-key", "turnstile_secret_key_configured": true, "linuxdo_connect_enabled": false, - "linuxdo_connect_client_id": "", - "linuxdo_connect_client_secret_configured": false, - "linuxdo_connect_redirect_url": "", - 
"site_name": "Sub2API", - "site_logo": "", - "site_subtitle": "Subtitle", - "api_base_url": "https://api.example.com", + "linuxdo_connect_client_id": "", + "linuxdo_connect_client_secret_configured": false, + "linuxdo_connect_redirect_url": "", + "ops_monitoring_enabled": false, + "ops_realtime_monitoring_enabled": true, + "ops_query_mode_default": "auto", + "ops_metrics_interval_seconds": 60, + "site_name": "Sub2API", + "site_logo": "", + "site_subtitle": "Subtitle", + "api_base_url": "https://api.example.com", "contact_info": "support", "doc_url": "https://docs.example.com", "default_concurrency": 5, @@ -430,7 +439,7 @@ func newContractDeps(t *testing.T) *contractDeps { authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil) apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService) usageHandler := handler.NewUsageHandler(usageService, apiKeyService) - adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil) + adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil, nil) adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil) jwtAuth := func(c *gin.Context) { diff --git a/backend/internal/server/http.go b/backend/internal/server/http.go index a7d1d3b5..52d5c926 100644 --- a/backend/internal/server/http.go +++ b/backend/internal/server/http.go @@ -31,6 +31,7 @@ func ProvideRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, settingService *service.SettingService, redisClient *redis.Client, ) *gin.Engine { @@ -50,7 +51,7 @@ func ProvideRouter( } } - return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, settingService, cfg, redisClient) + return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient) } // ProvideHTTPServer 提供 HTTP 服务器 diff --git a/backend/internal/server/middleware/admin_auth.go b/backend/internal/server/middleware/admin_auth.go index e02a7b0a..8f30107c 100644 --- a/backend/internal/server/middleware/admin_auth.go +++ b/backend/internal/server/middleware/admin_auth.go @@ -30,6 +30,20 @@ func adminAuth( settingService *service.SettingService, ) gin.HandlerFunc { return func(c *gin.Context) { + // WebSocket upgrade requests cannot set Authorization headers in browsers. + // For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via + // Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item: + // Sec-WebSocket-Protocol: sub2api-admin, jwt. 
+ if isWebSocketUpgradeRequest(c) { + if token := extractJWTFromWebSocketSubprotocol(c); token != "" { + if !validateJWTForAdmin(c, token, authService, userService) { + return + } + c.Next() + return + } + } + // 检查 x-api-key header(Admin API Key 认证) apiKey := c.GetHeader("x-api-key") if apiKey != "" { @@ -58,6 +72,44 @@ func adminAuth( } } +func isWebSocketUpgradeRequest(c *gin.Context) bool { + if c == nil || c.Request == nil { + return false + } + // RFC6455 handshake uses: + // Connection: Upgrade + // Upgrade: websocket + upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade"))) + if upgrade != "websocket" { + return false + } + connection := strings.ToLower(c.GetHeader("Connection")) + return strings.Contains(connection, "upgrade") +} + +func extractJWTFromWebSocketSubprotocol(c *gin.Context) string { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol")) + if raw == "" { + return "" + } + + // The header is a comma-separated list of tokens. We reserve the prefix "jwt." + // for carrying the admin JWT. + for _, part := range strings.Split(raw, ",") { + p := strings.TrimSpace(part) + if strings.HasPrefix(p, "jwt.") { + token := strings.TrimSpace(strings.TrimPrefix(p, "jwt.")) + if token != "" { + return token + } + } + } + return "" +} + // validateAdminAPIKey 验证管理员 API Key func validateAdminAPIKey( c *gin.Context, diff --git a/backend/internal/server/middleware/client_request_id.go b/backend/internal/server/middleware/client_request_id.go new file mode 100644 index 00000000..d22b6cc5 --- /dev/null +++ b/backend/internal/server/middleware/client_request_id.go @@ -0,0 +1,30 @@ +package middleware + +import ( + "context" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +// ClientRequestID ensures every request has a unique client_request_id in request.Context(). +// +// This is used by the Ops monitoring module for end-to-end request correlation. 
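+//
+// Downstream code can read the ID back from the request context, roughly (sketch):
+//
+//	id, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
+//
+// The string assertion matches the uuid value stored below; an ID set by an earlier
+// layer is left untouched and may use a different representation.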
+func ClientRequestID() gin.HandlerFunc { + return func(c *gin.Context) { + if c.Request == nil { + c.Next() + return + } + + if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil { + c.Next() + return + } + + id := uuid.New().String() + c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id)) + c.Next() + } +} diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 70f7da84..cf9015e4 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -23,6 +23,7 @@ func SetupRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, settingService *service.SettingService, cfg *config.Config, redisClient *redis.Client, @@ -46,7 +47,7 @@ func SetupRouter( } // 注册路由 - registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg, redisClient) + registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg, redisClient) return r } @@ -60,6 +61,7 @@ func registerRoutes( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, redisClient *redis.Client, ) { @@ -73,5 +75,5 @@ func registerRoutes( routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient) routes.RegisterUserRoutes(v1, h, jwtAuth) routes.RegisterAdminRoutes(v1, h, adminAuth) - routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg) + routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg) } diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index c9c5352c..a2f1b8c7 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -50,6 +50,9 @@ func RegisterAdminRoutes( // 系统设置 registerSettingsRoutes(admin, h) + // 运维监控(Ops) + registerOpsRoutes(admin, h) + // 系统管理 registerSystemRoutes(admin, h) @@ -64,6 +67,58 @@ func RegisterAdminRoutes( } } +func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { + ops := admin.Group("/ops") + { + // Realtime ops signals + ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats) + ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability) + + // Alerts (rules + events) + ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules) + ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule) + ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) + ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) + ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + + // Email notification config (DB-backed) + ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) + ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig) + + // Runtime settings (DB-backed) + runtime := ops.Group("/runtime") + { + runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings) + runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings) + } + + // Advanced settings (DB-backed) + ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings) + ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings) + + // WebSocket realtime (QPS/TPS) + ws := ops.Group("/ws") + { + ws.GET("/qps", h.Admin.Ops.QPSWSHandler) + } + + // Error logs (MVP-1) + ops.GET("/errors", 
h.Admin.Ops.GetErrorLogs) + ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + + // Request drilldown (success + error) + ops.GET("/requests", h.Admin.Ops.ListRequestDetails) + + // Dashboard (vNext - raw path for MVP) + ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview) + ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend) + ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram) + ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend) + ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution) + } +} + func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) { dashboard := admin.Group("/dashboard") { diff --git a/backend/internal/server/routes/gateway.go b/backend/internal/server/routes/gateway.go index 0b62185e..bf019ce3 100644 --- a/backend/internal/server/routes/gateway.go +++ b/backend/internal/server/routes/gateway.go @@ -16,13 +16,18 @@ func RegisterGatewayRoutes( apiKeyAuth middleware.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, ) { bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize) + clientRequestID := middleware.ClientRequestID() + opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService) // API网关(Claude API兼容) gateway := r.Group("/v1") gateway.Use(bodyLimit) + gateway.Use(clientRequestID) + gateway.Use(opsErrorLogger) gateway.Use(gin.HandlerFunc(apiKeyAuth)) { gateway.POST("/messages", h.Gateway.Messages) @@ -36,6 +41,8 @@ func RegisterGatewayRoutes( // Gemini 原生 API 兼容层(Gemini SDK/CLI 直连) gemini := r.Group("/v1beta") gemini.Use(bodyLimit) + gemini.Use(clientRequestID) + gemini.Use(opsErrorLogger) gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { gemini.GET("/models", h.Gateway.GeminiV1BetaListModels) @@ -45,7 +52,7 @@ func RegisterGatewayRoutes( } // OpenAI Responses API(不带v1前缀的别名) - r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) + r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) // Antigravity 模型列表 r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels) @@ -53,6 +60,8 @@ func RegisterGatewayRoutes( // Antigravity 专用路由(仅使用 antigravity 账户,不混合调度) antigravityV1 := r.Group("/antigravity/v1") antigravityV1.Use(bodyLimit) + antigravityV1.Use(clientRequestID) + antigravityV1.Use(opsErrorLogger) antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1.Use(gin.HandlerFunc(apiKeyAuth)) { @@ -64,6 +73,8 @@ func RegisterGatewayRoutes( antigravityV1Beta := r.Group("/antigravity/v1beta") antigravityV1Beta.Use(bodyLimit) + antigravityV1Beta.Use(clientRequestID) + antigravityV1Beta.Use(opsErrorLogger) antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { diff --git a/backend/internal/service/antigravity_gateway_service.go b/backend/internal/service/antigravity_gateway_service.go index 4fd55757..4dd4d303 100644 --- a/backend/internal/service/antigravity_gateway_service.go +++ b/backend/internal/service/antigravity_gateway_service.go @@ -564,6 +564,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, 
proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -579,6 +587,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries") } @@ -586,6 +595,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -596,6 +625,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < antigravityMaxRetries { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500)) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -628,6 +677,27 @@ urlFallbackLoop: // Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验, // 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。 if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg 
= sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + // Conservative two-stage fallback: // 1) Disable top-level thinking + thinking->text // 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text. @@ -661,6 +731,13 @@ urlFallbackLoop: } retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency) if retryErr != nil { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr.Error()), + }) log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr) continue } @@ -674,6 +751,25 @@ urlFallbackLoop: retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() + kind := "signature_retry" + if strings.TrimSpace(stage.name) != "" { + kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_") + } + retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody)) + retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg) + retryUpstreamDetail := "" + if logBody { + retryUpstreamDetail = truncateString(string(retryBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: kind, + Message: retryUpstreamMsg, + Detail: retryUpstreamDetail, + }) // If this stage fixed the signature issue, we stop; otherwise we may try the next stage. 
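+ // (Concretely: any non-400 response, or a 400 that is no longer signature-related,
+ // ends the staged fallback; a 400 that still looks signature-related advances to the
+ // next stage.)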
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) { @@ -701,10 +797,30 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) if s.shouldFailoverUpstreamError(resp.StatusCode) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody) + return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody) } } @@ -1108,6 +1224,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -1123,6 +1247,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries") } @@ -1130,6 +1255,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -1140,6 +1285,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < 
antigravityMaxRetries { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -1205,21 +1370,59 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) - if s.shouldFailoverUpstreamError(resp.StatusCode) { - return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} - } - - // 解包并返回错误 requestID := resp.Header.Get("x-request-id") if requestID != "" { c.Header("x-request-id", requestID) } - unwrapped, _ := s.unwrapV1InternalResponse(respBody) + + unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody) + unwrappedForOps := unwrapped + if unwrapErr != nil || len(unwrappedForOps) == 0 { + unwrappedForOps = respBody + } + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(unwrappedForOps), maxBytes) + } + + // Always record upstream context for Ops error logs, even when we will failover. 
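+ // (Inferred from usage: setOpsUpstreamError appears to keep only the latest upstream
+ // status/message on the request context, while appendOpsUpstreamError accumulates one
+ // event per attempt; both helpers are presumably defined elsewhere in this package.)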
+ setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.shouldFailoverUpstreamError(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} + } + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } - c.Data(resp.StatusCode, contentType, unwrapped) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + c.Data(resp.StatusCode, contentType, unwrappedForOps) return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode) } @@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int, return fmt.Errorf("%s", message) } -func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error { - // 记录上游错误详情便于调试 - log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body)) +func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + // 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端) + if logBody { + log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes)) + } var statusCode int var errType, errMsg string @@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error { diff --git a/backend/internal/service/auth_service.go b/backend/internal/service/auth_service.go index 61b15cd8..386b43fc 100644 --- a/backend/internal/service/auth_service.go +++ b/backend/internal/service/auth_service.go @@ -357,7 +357,7 @@ func (s *AuthService) Login(ctx context.Context, email, password string) (string // - 如果邮箱已存在:直接登录(不需要本地密码) // - 如果邮箱不存在:创建新用户并登录 // -// 
注意:该函数用于“终端用户登录 Sub2API 本身”的场景(不同于上游账号的 OAuth,例如 OpenAI/Gemini)。 +// 注意:该函数用于 LinuxDo OAuth 登录场景(不同于上游账号的 OAuth,例如 Claude/OpenAI/Gemini)。 // 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。 func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) { email = strings.TrimSpace(email) @@ -376,8 +376,8 @@ func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username user, err := s.userRepo.GetByEmail(ctx, email) if err != nil { if errors.Is(err, ErrUserNotFound) { - // OAuth 首次登录视为注册。 - if s.settingService != nil && !s.settingService.IsRegistrationEnabled(ctx) { + // OAuth 首次登录视为注册(fail-close:settingService 未配置时不允许注册) + if s.settingService == nil || !s.settingService.IsRegistrationEnabled(ctx) { return "", nil, ErrRegDisabled } diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go index 77709553..398d9fbd 100644 --- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -63,6 +63,9 @@ const ( SubscriptionStatusSuspended = "suspended" ) +// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。 +const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid" + // Setting keys const ( // 注册设置 @@ -83,6 +86,12 @@ const ( SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key + // LinuxDo Connect OAuth 登录设置 + SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled" + SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id" + SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret" + SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url" + // OEM设置 SettingKeySiteName = "site_name" // 网站名称 SettingKeySiteLogo = "site_logo" // 网站Logo (base64) @@ -113,16 +122,31 @@ const ( SettingKeyEnableIdentityPatch = "enable_identity_patch" SettingKeyIdentityPatchPrompt = "identity_patch_prompt" - // LinuxDo Connect OAuth 登录(终端用户 SSO) - SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled" - SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id" - SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret" - SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url" -) + // ========================= + // Ops Monitoring (vNext) + // ========================= -// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。 -// 目的:避免第三方登录返回的用户标识与本地真实邮箱发生碰撞,进而造成账号被接管的风险。 -const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid" + // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime. + SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled" + + // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push). + SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled" + + // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg). + SettingKeyOpsQueryModeDefault = "ops_query_mode_default" + + // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications. + SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config" + + // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings. + SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings" + + // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60). 
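+ // (The value is persisted as a string setting and surfaced as an integer by the admin
+ // settings API; consumers are expected to parse it and clamp values below 60, per the
+ // ">=60" note above.)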
+ SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds" + + // SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation). + SettingKeyOpsAdvancedSettings = "ops_advanced_settings" +) // AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys). const AdminAPIKeyPrefix = "admin-" diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index 31148b17..b48af7b0 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1399,7 +1399,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if resp != nil && resp.Body != nil { _ = resp.Body.Close() } - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) + c.JSON(http.StatusBadGateway, gin.H{ + "type": "error", + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } // 优先检测thinking block签名错误(400)并重试一次 @@ -1409,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A _ = resp.Body.Close() if s.isThinkingBlockSignatureError(respBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) + looksLikeToolSignatureError := func(msg string) bool { m := strings.ToLower(msg) return strings.Contains(m, "tool_use") || @@ -1445,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: "signature_retry_thinking", + Message: extractUpstreamErrorMessage(retryRespBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) msg2 := extractUpstreamErrorMessage(retryRespBody) if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed { log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID) @@ -1459,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if retryResp2 != nil && retryResp2.Body != nil { _ = retryResp2.Body.Close() } + appendOpsUpstreamError(c, 
OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_tools_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr2.Error()), + }) log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2) } else { log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2) @@ -1508,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A break } + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)", account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed) - _ = resp.Body.Close() if err := sleepWithContext(ctx, delay); err != nil { return nil, err } @@ -1538,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理重试耗尽的情况 if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted_failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } return s.handleRetryExhaustedError(ctx, resp, c, account) @@ -1546,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理可切换账号的错误 if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleFailoverSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -1563,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A resp.Body = io.NopCloser(bytes.NewReader(respBody)) if s.shouldFailoverOn400(respBody) { + upstreamMsg := 
strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover_on_400", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + if s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( "Account %d: 400 error, attempting failover: %s", @@ -1859,7 +1983,30 @@ func extractUpstreamErrorMessage(body []byte) string { } func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + // Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet. + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) // 处理上游错误,标记账号状态 shouldDisable := false @@ -1870,24 +2017,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } + // 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端) + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 根据状态码返回适当的自定义错误响应(不透传上游详细信息) var errType, errMsg string var statusCode int switch resp.StatusCode { case 400: - // 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开 - if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { - log.Printf( - "Upstream 400 error (account=%d platform=%s type=%s): %s", - account.ID, - account.Platform, - account.Type, - truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), - ) - } c.Data(http.StatusBadRequest, "application/json", body) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + summary := upstreamMsg + if summary == "" { + summary = truncateForLog(body, 512) + } + if summary == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary) case 401: statusCode = http.StatusBadGateway errType = "upstream_error" @@ -1923,11 +2079,14 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + 
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) statusCode := resp.StatusCode // OAuth/Setup Token 账号的 403:标记账号异常 @@ -1941,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re } func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -1949,8 +2108,45 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht // OAuth 403:标记账号异常 // API Key 未配置错误码:仅返回错误,不标记账号 func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { + // Capture upstream error body before side-effects consume the stream. + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 返回统一的重试耗尽错误响应 c.JSON(http.StatusBadGateway, gin.H{ "type": "error", @@ -1960,7 +2156,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg) } // streamingResult 流式响应结果 @@ -2490,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 发送请求 resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "") s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed") return fmt.Errorf("upstream request failed: %w", err) } @@ -2527,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 标记账号状态(429/529等) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) + upstreamMsg := 
strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + // 记录上游错误摘要便于排障(不回显请求内容) if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( @@ -2548,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, errMsg = "Service overloaded" } s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg) - return fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // 透传成功响应 diff --git a/backend/internal/service/gemini_messages_compat_service.go b/backend/internal/service/gemini_messages_compat_service.go index 78452b1e..d1b65b71 100644 --- a/backend/internal/service/gemini_messages_compat_service.go +++ b/backend/internal/service/gemini_messages_compat_service.go @@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) continue } - return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr) } // Special-case: signature/thought_signature validation errors are not transient, but may be fixed by @@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex _ = resp.Body.Close() if isGeminiSignatureRelatedError(respBody) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + var strippedClaudeBody []byte stageName := "" switch signatureRetryStage { @@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt 
< geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex } s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) if tempMatched { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody) + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody) } requestID := resp.Header.Get(requestIDHeader) @@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) @@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. FirstTokenMs: nil, }, nil } - return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr) } if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) { @@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt < geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
} if tempMatched { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } respBody = unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } c.Data(resp.StatusCode, contentType, respBody) - return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } var usage *ClaudeUsage @@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string { return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`) } -func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error { +func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := 
s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + var statusCode int var errType, errMsg string @@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } type claudeErrorMapping struct { diff --git a/backend/internal/service/openai_codex_transform.go b/backend/internal/service/openai_codex_transform.go index 965fb770..94e74f22 100644 --- a/backend/internal/service/openai_codex_transform.go +++ b/backend/internal/service/openai_codex_transform.go @@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { existingInstructions = strings.TrimSpace(existingInstructions) if instructions != "" { - if existingInstructions != "" && existingInstructions != instructions { - if input, ok := reqBody["input"].([]any); ok { - reqBody["input"] = prependSystemInstruction(input, existingInstructions) - result.Modified = true - } - } if existingInstructions != instructions { reqBody["instructions"] = instructions result.Modified = true @@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { if input, ok := reqBody["input"].([]any); ok { input = filterCodexInput(input) - input = normalizeOrphanedToolOutputs(input) reqBody["input"] = input result.Modified = true } @@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any { return filtered } -func prependSystemInstruction(input []any, instructions string) []any { - message := map[string]any{ - "role": "system", - "content": []any{ - map[string]any{ - "type": "input_text", - "text": instructions, - }, - }, - } - return append([]any{message}, input...) 
-} - func normalizeCodexTools(reqBody map[string]any) bool { rawTools, ok := reqBody["tools"] if !ok || rawTools == nil { @@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool { return modified } -func normalizeOrphanedToolOutputs(input []any) []any { - functionCallIDs := map[string]bool{} - localShellCallIDs := map[string]bool{} - customToolCallIDs := map[string]bool{} - - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - continue - } - callID := getCallID(m) - if callID == "" { - continue - } - switch m["type"] { - case "function_call": - functionCallIDs[callID] = true - case "local_shell_call": - localShellCallIDs[callID] = true - case "custom_tool_call": - customToolCallIDs[callID] = true - } - } - - output := make([]any, 0, len(input)) - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - output = append(output, item) - continue - } - switch m["type"] { - case "function_call_output": - callID := getCallID(m) - if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "custom_tool_call_output": - callID := getCallID(m) - if callID == "" || !customToolCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "local_shell_call_output": - callID := getCallID(m) - if callID == "" || !localShellCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - } - output = append(output, m) - } - return output -} - -func getCallID(item map[string]any) string { - raw, ok := item["call_id"] - if !ok { - return "" - } - callID, ok := raw.(string) - if !ok { - return "" - } - callID = strings.TrimSpace(callID) - if callID == "" { - return "" - } - return callID -} - -func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any { - toolName := "tool" - if name, ok := item["name"].(string); ok && name != "" { - toolName = name - } - labelID := callID - if labelID == "" { - labelID = "unknown" - } - text := stringifyOutput(item["output"]) - if len(text) > 16000 { - text = text[:16000] + "\n...[truncated]" - } - return map[string]any{ - "type": "message", - "role": "assistant", - "content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text), - } -} - -func stringifyOutput(output any) string { - switch v := output.(type) { - case string: - return v - default: - if data, err := json.Marshal(v); err == nil { - return string(data) - } - return fmt.Sprintf("%v", v) - } -} - func codexCachePath(filename string) string { home, err := os.UserHomeDir() if err != nil { diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 8b1f214b..b3ee469a 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -12,7 +12,6 @@ import ( "io" "log" "net/http" - "os" "regexp" "sort" "strconv" @@ -513,7 +512,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool } func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -594,13 +593,53 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c 
*gin.Context, acco // Send request resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) + c.JSON(http.StatusBadGateway, gin.H{ + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } defer func() { _ = resp.Body.Close() }() // Handle error response if resp.StatusCode >= 400 { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + s.handleFailoverSideEffects(ctx, resp, account) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -724,18 +763,52 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin. 
} func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) { - body, _ := io.ReadAll(resp.Body) - logUpstreamErrorBody(account.ID, resp.StatusCode, body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "OpenAI upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } // Check custom error codes if !account.ShouldHandleErrorCode(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) c.JSON(http.StatusInternalServerError, gin.H{ "error": gin.H{ "type": "upstream_error", "message": "Upstream gateway error", }, }) - return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg) } // Handle upstream error (mark account status) @@ -743,6 +816,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht if s.rateLimitService != nil { shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } + kind := "http_error" + if shouldDisable { + kind = "failover" + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: kind, + Message: upstreamMsg, + Detail: upstreamDetail, + }) if shouldDisable { return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -781,25 +867,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) -} - -func logUpstreamErrorBody(accountID int64, statusCode int, body []byte) { - if strings.ToLower(strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY"))) != "true" { - return + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) } - - maxBytes := 2048 - if rawMax := strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY_MAX_BYTES")); rawMax != "" { - if parsed, err := strconv.Atoi(rawMax); err == nil && parsed > 0 { - maxBytes = parsed - } - } - if len(body) > maxBytes { - body = body[:maxBytes] - } - - log.Printf("Upstream error body: account=%d status=%d body=%q", accountID, statusCode, string(body)) + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // openaiStreamingResult streaming response result diff 
--git a/backend/internal/service/ops_account_availability.go b/backend/internal/service/ops_account_availability.go new file mode 100644 index 00000000..da66ec4d --- /dev/null +++ b/backend/internal/service/ops_account_availability.go @@ -0,0 +1,194 @@ +package service + +import ( + "context" + "errors" + "time" +) + +// GetAccountAvailabilityStats returns current account availability stats. +// +// Query-level filtering is intentionally limited to platform/group to match the dashboard scope. +func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) ( + map[string]*PlatformAvailability, + map[int64]*GroupAvailability, + map[int64]*AccountAvailability, + *time.Time, + error, +) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + if groupIDFilter != nil && *groupIDFilter > 0 { + filtered := make([]Account, 0, len(accounts)) + for _, acc := range accounts { + for _, grp := range acc.Groups { + if grp != nil && grp.ID == *groupIDFilter { + filtered = append(filtered, acc) + break + } + } + } + accounts = filtered + } + + now := time.Now() + collectedAt := now + + platform := make(map[string]*PlatformAvailability) + group := make(map[int64]*GroupAvailability) + account := make(map[int64]*AccountAvailability) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + isTempUnsched := false + if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) { + isTempUnsched = true + } + + isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt) + isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil) + hasError := acc.Status == StatusError + + // Normalize exclusive status flags so the UI doesn't show conflicting badges. 
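A compact restatement of the availability predicate computed in this hunk, as a pure function over a minimal snapshot struct, so the flag interactions can be checked in isolation. The struct and the literal "active" status value are stand-ins for illustration; the real code works on *Account and the StatusActive constant.

package main

import (
	"fmt"
	"time"
)

// availabilitySnapshot mirrors only the Account fields the predicate consults.
type availabilitySnapshot struct {
	Status                 string
	Schedulable            bool
	RateLimitResetAt       *time.Time
	OverloadUntil          *time.Time
	TempUnschedulableUntil *time.Time
}

func isAvailable(a availabilitySnapshot, now time.Time) bool {
	rateLimited := a.RateLimitResetAt != nil && now.Before(*a.RateLimitResetAt)
	overloaded := a.OverloadUntil != nil && now.Before(*a.OverloadUntil)
	tempUnsched := a.TempUnschedulableUntil != nil && now.Before(*a.TempUnschedulableUntil)
	return a.Status == "active" && a.Schedulable && !rateLimited && !overloaded && !tempUnsched
}

func main() {
	now := time.Now()
	reset := now.Add(5 * time.Minute)
	fmt.Println(isAvailable(availabilitySnapshot{Status: "active", Schedulable: true}, now))                          // true
	fmt.Println(isAvailable(availabilitySnapshot{Status: "active", Schedulable: true, RateLimitResetAt: &reset}, now)) // false: still rate limited
}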
+ if hasError { + isRateLimited = false + isOverloaded = false + } + + isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched + + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformAvailability{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.TotalAccounts++ + if isAvailable { + p.AvailableCount++ + } + if isRateLimited { + p.RateLimitCount++ + } + if hasError { + p.ErrorCount++ + } + } + + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupAvailability{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + g.TotalAccounts++ + if isAvailable { + g.AvailableCount++ + } + if isRateLimited { + g.RateLimitCount++ + } + if hasError { + g.ErrorCount++ + } + } + + displayGroupID := int64(0) + displayGroupName := "" + if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + item := &AccountAvailability{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + Status: acc.Status, + + IsAvailable: isAvailable, + IsRateLimited: isRateLimited, + IsOverloaded: isOverloaded, + HasError: hasError, + + ErrorMessage: acc.ErrorMessage, + } + + if isRateLimited && acc.RateLimitResetAt != nil { + item.RateLimitResetAt = acc.RateLimitResetAt + remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds()) + if remainingSec > 0 { + item.RateLimitRemainingSec = &remainingSec + } + } + if isOverloaded && acc.OverloadUntil != nil { + item.OverloadUntil = acc.OverloadUntil + remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds()) + if remainingSec > 0 { + item.OverloadRemainingSec = &remainingSec + } + } + if isTempUnsched && acc.TempUnschedulableUntil != nil { + item.TempUnschedulableUntil = acc.TempUnschedulableUntil + } + + account[acc.ID] = item + } + + return platform, group, account, &collectedAt, nil +} + +type OpsAccountAvailability struct { + Group *GroupAvailability + Accounts map[int64]*AccountAvailability + CollectedAt *time.Time +} + +func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) { + if s == nil { + return nil, errors.New("ops service is nil") + } + + if s.getAccountAvailability != nil { + return s.getAccountAvailability(ctx, platformFilter, groupIDFilter) + } + + _, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter) + if err != nil { + return nil, err + } + + var group *GroupAvailability + if groupIDFilter != nil && *groupIDFilter > 0 { + group = groupStats[*groupIDFilter] + } + + if accountStats == nil { + accountStats = map[int64]*AccountAvailability{} + } + + return &OpsAccountAvailability{ + Group: group, + Accounts: accountStats, + CollectedAt: collectedAt, + }, nil +} diff --git a/backend/internal/service/ops_advisory_lock.go b/backend/internal/service/ops_advisory_lock.go new file mode 100644 index 00000000..f7ef4cee --- /dev/null +++ b/backend/internal/service/ops_advisory_lock.go @@ -0,0 +1,46 @@ +package service + +import ( + "context" + "database/sql" + "hash/fnv" + "time" +) + +func hashAdvisoryLockID(key string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(key)) + return int64(h.Sum64()) +} + +func 
tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) { + if db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID) + _ = conn.Close() + } + return release, true +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go new file mode 100644 index 00000000..2a6afbba --- /dev/null +++ b/backend/internal/service/ops_aggregation_service.go @@ -0,0 +1,443 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAggHourlyJobName = "ops_preaggregation_hourly" + opsAggDailyJobName = "ops_preaggregation_daily" + + opsAggHourlyInterval = 10 * time.Minute + opsAggDailyInterval = 1 * time.Hour + + // Keep in sync with ops retention target (vNext default 30d). + opsAggBackfillWindow = 30 * 24 * time.Hour + + // Recompute overlap to absorb late-arriving rows near boundaries. + opsAggHourlyOverlap = 2 * time.Hour + opsAggDailyOverlap = 48 * time.Hour + + opsAggHourlyChunk = 24 * time.Hour + opsAggDailyChunk = 7 * 24 * time.Hour + + // Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets + // that may still receive late inserts. + opsAggSafeDelay = 5 * time.Minute + + opsAggMaxQueryTimeout = 3 * time.Second + opsAggHourlyTimeout = 5 * time.Minute + opsAggDailyTimeout = 2 * time.Minute + + opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader" + opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader" + + opsAggHourlyLeaderLockTTL = 15 * time.Minute + opsAggDailyLeaderLockTTL = 10 * time.Minute +) + +// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily +// for stable long-window dashboard queries. +// +// It is safe to run in multi-replica deployments when Redis is available (leader lock). 
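A usage sketch for the two advisory-lock helpers above, showing how a periodic job can guard a single-writer section. The wrapper function and the lock key are invented for the example; besides the helpers it only needs context and database/sql.

// runSingleInstanceJob is an illustrative call pattern, not part of the patch.
func runSingleInstanceJob(ctx context.Context, db *sql.DB, body func(context.Context)) {
	lockID := hashAdvisoryLockID("ops:example:job:leader") // key name is illustrative
	release, ok := tryAcquireDBAdvisoryLock(ctx, db, lockID)
	if !ok {
		return // another instance holds the lock, or the DB is unavailable: skip this run
	}
	defer release() // pg_advisory_unlock plus closing the pinned connection
	body(ctx)
}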
+type OpsAggregationService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + hourlyMu sync.Mutex + dailyMu sync.Mutex + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + return &OpsAggregationService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (s *OpsAggregationService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.hourlyLoop() + go s.dailyLoop() + }) +} + +func (s *OpsAggregationService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) +} + +func (s *OpsAggregationService) hourlyLoop() { + // First run immediately. + s.aggregateHourly() + + ticker := time.NewTicker(opsAggHourlyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateHourly() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) dailyLoop() { + // First run immediately. + s.aggregateDaily() + + ticker := time.NewTicker(opsAggDailyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateDaily() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) aggregateHourly() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.hourlyMu.Lock() + defer s.hourlyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + // Aggregate stable full hours only. + end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay)) + start := end.Add(-opsAggBackfillWindow) + + // Resume from the latest bucket with overlap. 
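The hourly pass only aggregates fully elapsed hours and re-does a small overlap near the newest bucket. A self-contained restatement of that window arithmetic with the constants inlined; the timestamps are invented for the example.

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Date(2024, 5, 1, 10, 3, 0, 0, time.UTC)
	end := now.Add(-5 * time.Minute).Truncate(time.Hour) // safe delay, then full hours only -> 09:00
	start := end.Add(-30 * 24 * time.Hour)               // default 30d backfill window

	latest := time.Date(2024, 5, 1, 7, 0, 0, 0, time.UTC) // newest already-aggregated bucket
	if candidate := latest.Add(-2 * time.Hour); candidate.After(start) {
		start = candidate // resume near the newest bucket, re-doing 2h of overlap
	}

	// The [start, end) range is then upserted in 24h chunks.
	fmt.Println(start.Format(time.RFC3339), "->", end.Format(time.RFC3339))
	// 2024-05-01T05:00:00Z -> 2024-05-01T09:00:00Z
}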
+ { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggHourlyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToHour(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) { + chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end) + if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) aggregateDaily() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.dailyMu.Lock() + defer s.dailyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + end := utcFloorToDay(time.Now().UTC()) + start := end.Add(-opsAggBackfillWindow) + + { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggDailyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToDay(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) { + chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end) + if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := 
context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool { + if s == nil { + return false + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +var opsAggReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { + if s == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + // Prefer Redis leader lock when available (multi-instance), but avoid stampeding + // the DB when Redis is flaky by falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true + } + // Redis error: fall through to DB advisory lock. 
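The release script above deletes the lock key only while it still stores this instance's ID, so a slow holder cannot delete a lock that already expired and was re-acquired elsewhere. A standalone sketch of the same acquire/release pair; client wiring, key, and TTL are illustrative, and the production helper additionally falls back to the DB advisory lock on Redis errors.

package leaderlock

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9"
)

// Delete the key only if it still stores our owner ID (compare-and-delete).
var releaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
  return redis.call("DEL", KEYS[1])
end
return 0
`)

func tryLeaderLock(ctx context.Context, rdb *redis.Client, key, owner string, ttl time.Duration) (func(), bool, error) {
	ok, err := rdb.SetNX(ctx, key, owner, ttl).Result()
	if err != nil || !ok {
		return nil, false, err
	}
	release := func() {
		ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = releaseScript.Run(ctx2, rdb, []string{key}, owner).Result()
	}
	return release, true, nil
}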
+ } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + return release, true +} + +func (s *OpsAggregationService) maybeLogSkip(prefix string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute { + return + } + s.skipLogAt = now + if prefix == "" { + prefix = "[OpsAggregation]" + } + log.Printf("%s leader lock held by another instance; skipping", prefix) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func utcFloorToDay(t time.Time) time.Time { + u := t.UTC() + y, m, d := u.Date() + return time.Date(y, m, d, 0, 0, 0, 0, time.UTC) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go new file mode 100644 index 00000000..f376c246 --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -0,0 +1,913 @@ +package service + +import ( + "context" + "fmt" + "log" + "math" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAlertEvaluatorJobName = "ops_alert_evaluator" + + opsAlertEvaluatorTimeout = 45 * time.Second + opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTL = 90 * time.Second + opsAlertEvaluatorSkipLogInterval = 1 * time.Minute +) + +var opsAlertEvaluatorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +type OpsAlertEvaluatorService struct { + opsService *OpsService + opsRepo OpsRepository + emailService *EmailService + + redisClient *redis.Client + cfg *config.Config + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + wg sync.WaitGroup + + mu sync.Mutex + ruleStates map[int64]*opsAlertRuleState + + emailLimiter *slidingWindowLimiter + + skipLogMu sync.Mutex + skipLogAt time.Time + + warnNoRedisOnce sync.Once +} + +type opsAlertRuleState struct { + LastEvaluatedAt time.Time + ConsecutiveBreaches int +} + +func NewOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + return &OpsAlertEvaluatorService{ + opsService: opsService, + opsRepo: opsRepo, + emailService: emailService, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + ruleStates: map[int64]*opsAlertRuleState{}, + emailLimiter: newSlidingWindowLimiter(0, time.Hour), + } +} + +func (s *OpsAlertEvaluatorService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.run() + }) +} + +func (s *OpsAlertEvaluatorService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) + s.wg.Wait() +} + +func (s *OpsAlertEvaluatorService) run() { + s.wg.Add(1) + defer s.wg.Done() + + // Start immediately to produce early feedback in ops dashboard. 
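newSlidingWindowLimiter is constructed by the evaluator above but not defined in this hunk. Purely as an illustration of the shape such a limiter usually has (this is a guess, not the project's implementation; in this sketch a non-positive limit means "unlimited"):

package limiter

import (
	"sync"
	"time"
)

// slidingWindowLimiter counts events inside a rolling window.
type slidingWindowLimiter struct {
	mu     sync.Mutex
	limit  int
	window time.Duration
	events []time.Time
}

func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
	return &slidingWindowLimiter{limit: limit, window: window}
}

// Allow reports whether another event fits into the window and records it.
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	cutoff := now.Add(-l.window)
	kept := l.events[:0]
	for _, t := range l.events {
		if t.After(cutoff) {
			kept = append(kept, t)
		}
	}
	l.events = kept
	if l.limit > 0 && len(l.events) >= l.limit {
		return false
	}
	l.events = append(l.events, now)
	return true
}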
+ timer := time.NewTimer(0) + defer timer.Stop() + + for { + select { + case <-timer.C: + interval := s.getInterval() + s.evaluateOnce(interval) + timer.Reset(interval) + case <-s.stopCh: + return + } + } +} + +func (s *OpsAlertEvaluatorService) getInterval() time.Duration { + // Default. + interval := 60 * time.Second + + if s == nil || s.opsService == nil { + return interval + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx) + if err != nil || cfg == nil { + return interval + } + if cfg.EvaluationIntervalSeconds <= 0 { + return interval + } + if cfg.EvaluationIntervalSeconds < 1 { + return interval + } + if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) { + return interval + } + return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second +} + +func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout) + defer cancel() + + if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + runtimeCfg := defaultOpsAlertRuntimeSettings() + if s.opsService != nil { + if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil { + runtimeCfg = loaded + } + } + + release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + rules, err := s.opsRepo.ListAlertRules(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsAlertEvaluator] list rules failed: %v", err) + return + } + + now := time.Now().UTC() + safeEnd := now.Truncate(time.Minute) + if safeEnd.IsZero() { + safeEnd = now + } + + systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1) + + // Cleanup stale state for removed rules. 
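Each ops job run reports a heartbeat row with its run time, duration, and either a success or an error timestamp. A small wrapper showing that bookkeeping in one place; the wrapper itself is invented, while OpsRepository, OpsUpsertJobHeartbeatInput, and the 2048-byte error truncation are the ones used in this patch.

// reportHeartbeat is an illustrative wrapper, not part of the patch.
func reportHeartbeat(repo OpsRepository, jobName string, runAt time.Time, dur time.Duration, runErr error) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	ms := dur.Milliseconds()
	in := &OpsUpsertJobHeartbeatInput{JobName: jobName, LastRunAt: &runAt, LastDurationMs: &ms}
	now := time.Now().UTC()
	if runErr != nil {
		msg := truncateString(runErr.Error(), 2048) // same cap the jobs above apply
		in.LastErrorAt, in.LastError = &now, &msg
	} else {
		in.LastSuccessAt = &now
	}
	_ = repo.UpsertJobHeartbeat(ctx, in)
}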
+ s.pruneRuleStates(rules) + + for _, rule := range rules { + if rule == nil || !rule.Enabled || rule.ID <= 0 { + continue + } + + scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters) + + windowMinutes := rule.WindowMinutes + if windowMinutes <= 0 { + windowMinutes = 1 + } + windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute) + windowEnd := safeEnd + + metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID) + if !ok { + s.resetRuleState(rule.ID, now) + continue + } + + breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold) + required := requiredSustainedBreaches(rule.SustainedMinutes, interval) + consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow) + + activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err) + continue + } + + if breachedNow && consecutive >= required { + if activeEvent != nil { + continue + } + + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) + continue + } + if latestEvent != nil && rule.CooldownMinutes > 0 { + cooldown := time.Duration(rule.CooldownMinutes) * time.Minute + if now.Sub(latestEvent.FiredAt) < cooldown { + continue + } + } + + firedEvent := &OpsAlertEvent{ + RuleID: rule.ID, + Severity: strings.TrimSpace(rule.Severity), + Status: OpsAlertStatusFiring, + Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)), + Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID), + MetricValue: float64Ptr(metricValue), + ThresholdValue: float64Ptr(rule.Threshold), + Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID), + FiredAt: now, + CreatedAt: now, + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent) + if err != nil { + log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err) + continue + } + + if created != nil && created.ID > 0 { + s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) + } + continue + } + + // Not breached: resolve active event if present. 
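How many consecutive breaching evaluations are needed before an event fires depends on both SustainedMinutes and the evaluation interval; the cooldown gate then applies on top, since a rule may not fire again until CooldownMinutes have passed since the last FiredAt. A worked example mirroring requiredSustainedBreaches, which is defined just below (values invented):

package main

import (
	"fmt"
	"math"
	"time"
)

// required mirrors requiredSustainedBreaches from the hunk below.
func required(sustainedMinutes int, interval time.Duration) int {
	if sustainedMinutes <= 0 {
		return 1
	}
	if interval <= 0 {
		return sustainedMinutes
	}
	n := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
	if n < 1 {
		return 1
	}
	return n
}

func main() {
	fmt.Println(required(5, 60*time.Second)) // 5  -> five consecutive breaching evaluations
	fmt.Println(required(5, 15*time.Second)) // 20 -> shorter interval, more samples needed
	fmt.Println(required(0, 60*time.Second)) // 1  -> fire on the first breach
}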
+ if activeEvent != nil { + resolvedAt := now + if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { + log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err) + } + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) { + s.mu.Lock() + defer s.mu.Unlock() + + live := map[int64]struct{}{} + for _, r := range rules { + if r != nil && r.ID > 0 { + live[r.ID] = struct{}{} + } + } + for id := range s.ruleStates { + if _, ok := live[id]; !ok { + delete(s.ruleStates, id) + } + } +} + +func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) { + if ruleID <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + state.LastEvaluatedAt = now + state.ConsecutiveBreaches = 0 +} + +func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int { + if ruleID <= 0 { + return 0 + } + s.mu.Lock() + defer s.mu.Unlock() + + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + + if !state.LastEvaluatedAt.IsZero() && interval > 0 { + if now.Sub(state.LastEvaluatedAt) > interval*2 { + state.ConsecutiveBreaches = 0 + } + } + + state.LastEvaluatedAt = now + if breached { + state.ConsecutiveBreaches++ + } else { + state.ConsecutiveBreaches = 0 + } + return state.ConsecutiveBreaches +} + +func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int { + if sustainedMinutes <= 0 { + return 1 + } + if interval <= 0 { + return sustainedMinutes + } + required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds())) + if required < 1 { + return 1 + } + return required +} + +func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) { + if filters == nil { + return "", nil + } + if v, ok := filters["platform"]; ok { + if s, ok := v.(string); ok { + platform = strings.TrimSpace(s) + } + } + if v, ok := filters["group_id"]; ok { + switch t := v.(type) { + case float64: + if t > 0 { + id := int64(t) + groupID = &id + } + case int64: + if t > 0 { + id := t + groupID = &id + } + case int: + if t > 0 { + id := int64(t) + groupID = &id + } + case string: + n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64) + if err == nil && n > 0 { + groupID = &n + } + } + } + return platform, groupID +} + +func (s *OpsAlertEvaluatorService) computeRuleMetric( + ctx context.Context, + rule *OpsAlertRule, + systemMetrics *OpsSystemMetricsSnapshot, + start time.Time, + end time.Time, + platform string, + groupID *int64, +) (float64, bool) { + if rule == nil { + return 0, false + } + switch strings.TrimSpace(rule.MetricType) { + case "cpu_usage_percent": + if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil { + return *systemMetrics.CPUUsagePercent, true + } + return 0, false + case "memory_usage_percent": + if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil { + return *systemMetrics.MemoryUsagePercent, true + } + return 0, false + case "concurrency_queue_depth": + if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil { + return float64(*systemMetrics.ConcurrencyQueueDepth), true + } + return 0, false + case "group_available_accounts": + if groupID == nil || *groupID <= 0 { + return 0, false + } + if s 
== nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + if availability.Group == nil { + return 0, true + } + return float64(availability.Group.AvailableCount), true + case "group_available_ratio": + if groupID == nil || *groupID <= 0 { + return 0, false + } + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return computeGroupAvailableRatio(availability.Group), true + case "account_rate_limited_count": + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + })), true + case "account_error_count": + if s == nil || s.opsService == nil { + return 0, false + } + availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID) + if err != nil || availability == nil { + return 0, false + } + return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool { + return acc.HasError && acc.TempUnschedulableUntil == nil + })), true + } + + overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: platform, + GroupID: groupID, + QueryMode: OpsQueryModeRaw, + }) + if err != nil { + return 0, false + } + if overview == nil { + return 0, false + } + + switch strings.TrimSpace(rule.MetricType) { + case "success_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.SLA * 100, true + case "error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.ErrorRate * 100, true + case "upstream_error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.UpstreamErrorRate * 100, true + case "p95_latency_ms": + if overview.Duration.P95 == nil { + return 0, false + } + return float64(*overview.Duration.P95), true + case "p99_latency_ms": + if overview.Duration.P99 == nil { + return 0, false + } + return float64(*overview.Duration.P99), true + default: + return 0, false + } +} + +func compareMetric(value float64, operator string, threshold float64) bool { + switch strings.TrimSpace(operator) { + case ">": + return value > threshold + case ">=": + return value >= threshold + case "<": + return value < threshold + case "<=": + return value <= threshold + case "==": + return value == threshold + case "!=": + return value != threshold + default: + return false + } +} + +func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any { + dims := map[string]any{} + if strings.TrimSpace(platform) != "" { + dims["platform"] = strings.TrimSpace(platform) + } + if groupID != nil && *groupID > 0 { + dims["group_id"] = *groupID + } + if len(dims) == 0 { + return nil + } + return dims +} + +func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string { + if rule == nil { + return "" + } + scope := "overall" + if strings.TrimSpace(platform) != "" { + scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform)) + } + if groupID != nil && *groupID > 0 { + scope = fmt.Sprintf("%s 
group_id=%d", scope, *groupID) + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)", + strings.TrimSpace(rule.MetricType), + strings.TrimSpace(rule.Operator), + rule.Threshold, + value, + windowMinutes, + strings.TrimSpace(scope), + ) +} + +func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) { + if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil { + return + } + if event.EmailSent { + return + } + if !rule.NotifyEmail { + return + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled { + return + } + + if len(emailCfg.Alert.Recipients) == 0 { + return + } + if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) { + return + } + + if runtimeCfg != nil && runtimeCfg.Silencing.Enabled { + if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) { + return + } + } + + // Apply/update rate limiter. + s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour) + + subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)) + body := buildOpsAlertEmailBody(rule, event) + + anySent := false + for _, to := range emailCfg.Alert.Recipients { + addr := strings.TrimSpace(to) + if addr == "" { + continue + } + if !s.emailLimiter.Allow(time.Now().UTC()) { + continue + } + if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil { + // Ignore per-recipient failures; continue best-effort. + continue + } + anySent = true + } + + if anySent { + _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true) + } +} + +func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string { + if rule == nil || event == nil { + return "" + } + metric := strings.TrimSpace(rule.MetricType) + value := "-" + threshold := fmt.Sprintf("%.2f", rule.Threshold) + if event.MetricValue != nil { + value = fmt.Sprintf("%.2f", *event.MetricValue) + } + if event.ThresholdValue != nil { + threshold = fmt.Sprintf("%.2f", *event.ThresholdValue) + } + return fmt.Sprintf(` +
+Ops Alert
+Rule: %s
+Severity: %s
+Status: %s
+Metric: %s %s %s
+Fired at: %s
+Description: %s
+`, + htmlEscape(rule.Name), + htmlEscape(rule.Severity), + htmlEscape(event.Status), + htmlEscape(metric), + htmlEscape(rule.Operator), + htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)), + event.FiredAt.Format(time.RFC3339), + htmlEscape(event.Description), + ) +} + +func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool { + minSeverity = strings.ToLower(strings.TrimSpace(minSeverity)) + if minSeverity == "" { + return true + } + + eventLevel := opsEmailSeverityForOps(ruleSeverity) + minLevel := strings.ToLower(minSeverity) + + rank := func(level string) int { + switch level { + case "critical": + return 3 + case "warning": + return 2 + case "info": + return 1 + default: + return 0 + } + } + return rank(eventLevel) >= rank(minLevel) +} + +func opsEmailSeverityForOps(severity string) string { + switch strings.ToUpper(strings.TrimSpace(severity)) { + case "P0": + return "critical" + case "P1": + return "warning" + default: + return "info" + } +} + +func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool { + if !silencing.Enabled { + return false + } + if now.IsZero() { + now = time.Now().UTC() + } + if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" { + if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil { + if now.Before(t) { + return true + } + } + } + + for _, entry := range silencing.Entries { + untilRaw := strings.TrimSpace(entry.UntilRFC3339) + if untilRaw == "" { + continue + } + until, err := time.Parse(time.RFC3339, untilRaw) + if err != nil { + continue + } + if now.After(until) { + continue + } + if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID { + continue + } + if len(entry.Severities) > 0 { + match := false + for _, s := range entry.Severities { + if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) { + match = true + break + } + } + if !match { + continue + } + } + return true + } + + return false +} + +func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) { + if !lock.Enabled { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock") + }) + return nil, true + } + key := strings.TrimSpace(lock.Key) + if key == "" { + key = opsAlertEvaluatorLeaderLockKey + } + ttl := time.Duration(lock.TTLSeconds) * time.Second + if ttl <= 0 { + ttl = opsAlertEvaluatorLeaderLockTTL + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky. + // Single-node deployments can disable the distributed lock via runtime settings. 
+ s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err) + }) + return nil, false + } + if !ok { + s.maybeLogSkip(key) + return nil, false + } + return func() { + _, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval { + return + } + s.skipLogAt = now + log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key) +} + +func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAlertEvaluatorJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAlertEvaluatorJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} + +func htmlEscape(s string) string { + replacer := strings.NewReplacer( + "&", "&amp;", + "<", "&lt;", + ">", "&gt;", + `"`, "&quot;", + "'", "&#39;", + ) + return replacer.Replace(s) +} + +type slidingWindowLimiter struct { + mu sync.Mutex + limit int + window time.Duration + sent []time.Time +} + +func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter { + if window <= 0 { + window = time.Hour + } + return &slidingWindowLimiter{ + limit: limit, + window: window, + sent: []time.Time{}, + } +} + +func (l *slidingWindowLimiter) SetLimit(limit int) { + l.mu.Lock() + defer l.mu.Unlock() + l.limit = limit +} + +func (l *slidingWindowLimiter) Allow(now time.Time) bool { + l.mu.Lock() + defer l.mu.Unlock() + + if l.limit <= 0 { + return true + } + cutoff := now.Add(-l.window) + keep := l.sent[:0] + for _, t := range l.sent { + if t.After(cutoff) { + keep = append(keep, t) + } + } + l.sent = keep + if len(l.sent) >= l.limit { + return false + } + l.sent = append(l.sent, now) + return true +} + +// computeGroupAvailableRatio returns the available percentage for a group. +// Formula: (AvailableCount / TotalAccounts) * 100. +// Returns 0 when TotalAccounts is 0. +func computeGroupAvailableRatio(group *GroupAvailability) float64 { + if group == nil || group.TotalAccounts <= 0 { + return 0 + } + return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100 +} + +// countAccountsByCondition counts accounts that satisfy the given condition.
+func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 { + if len(accounts) == 0 || condition == nil { + return 0 + } + var count int64 + for _, account := range accounts { + if account != nil && condition(account) { + count++ + } + } + return count +} diff --git a/backend/internal/service/ops_alert_evaluator_service_test.go b/backend/internal/service/ops_alert_evaluator_service_test.go new file mode 100644 index 00000000..068ab6bb --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service_test.go @@ -0,0 +1,210 @@ +//go:build unit + +package service + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +type stubOpsRepo struct { + OpsRepository + overview *OpsDashboardOverview + err error +} + +func (s *stubOpsRepo) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if s.err != nil { + return nil, s.err + } + if s.overview != nil { + return s.overview, nil + } + return &OpsDashboardOverview{}, nil +} + +func TestComputeGroupAvailableRatio(t *testing.T) { + t.Parallel() + + t.Run("normal case: 10 accounts, 8 available = 80%", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 10, + AvailableCount: 8, + }) + require.InDelta(t, 80.0, got, 0.0001) + }) + + t.Run("edge case: TotalAccounts = 0 should return 0", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 0, + AvailableCount: 8, + }) + require.Equal(t, 0.0, got) + }) + + t.Run("edge case: AvailableCount = 0 should return 0%", func(t *testing.T) { + t.Parallel() + + got := computeGroupAvailableRatio(&GroupAvailability{ + TotalAccounts: 10, + AvailableCount: 0, + }) + require.Equal(t, 0.0, got) + }) +} + +func TestCountAccountsByCondition(t *testing.T) { + t.Parallel() + + t.Run("rate-limited account count: acc.IsRateLimited", func(t *testing.T) { + t.Parallel() + + accounts := map[int64]*AccountAvailability{ + 1: {IsRateLimited: true}, + 2: {IsRateLimited: false}, + 3: {IsRateLimited: true}, + } + + got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + }) + require.Equal(t, int64(2), got) + }) + + t.Run("error account count (excluding temporarily unschedulable): acc.HasError && acc.TempUnschedulableUntil == nil", func(t *testing.T) { + t.Parallel() + + until := time.Now().UTC().Add(5 * time.Minute) + accounts := map[int64]*AccountAvailability{ + 1: {HasError: true}, + 2: {HasError: true, TempUnschedulableUntil: &until}, + 3: {HasError: false}, + } + + got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool { + return acc.HasError && acc.TempUnschedulableUntil == nil + }) + require.Equal(t, int64(1), got) + }) + + t.Run("edge case: empty map should return 0", func(t *testing.T) { + t.Parallel() + + got := countAccountsByCondition(map[int64]*AccountAvailability{}, func(acc *AccountAvailability) bool { + return acc.IsRateLimited + }) + require.Equal(t, int64(0), got) + }) +} + +func TestComputeRuleMetricNewIndicators(t *testing.T) { + t.Parallel() + + groupID := int64(101) + platform := "openai" + + availability := &OpsAccountAvailability{ + Group: &GroupAvailability{ + GroupID: groupID, + TotalAccounts: 10, + AvailableCount: 8, + }, + Accounts: map[int64]*AccountAvailability{ + 1: {IsRateLimited: true}, + 2: {IsRateLimited: true}, + 3: {HasError: true}, + 4: {HasError: true, TempUnschedulableUntil: timePtr(time.Now().UTC().Add(2 * time.Minute))}, + 5: {HasError: false, IsRateLimited: false}, + }, + }
+ opsService := &OpsService{ + getAccountAvailability: func(_ context.Context, _ string, _ *int64) (*OpsAccountAvailability, error) { + return availability, nil + }, + } + + svc := &OpsAlertEvaluatorService{ + opsService: opsService, + opsRepo: &stubOpsRepo{overview: &OpsDashboardOverview{}}, + } + + start := time.Now().UTC().Add(-5 * time.Minute) + end := time.Now().UTC() + ctx := context.Background() + + tests := []struct { + name string + metricType string + groupID *int64 + wantValue float64 + wantOK bool + }{ + { + name: "group_available_accounts", + metricType: "group_available_accounts", + groupID: &groupID, + wantValue: 8, + wantOK: true, + }, + { + name: "group_available_ratio", + metricType: "group_available_ratio", + groupID: &groupID, + wantValue: 80.0, + wantOK: true, + }, + { + name: "account_rate_limited_count", + metricType: "account_rate_limited_count", + groupID: nil, + wantValue: 2, + wantOK: true, + }, + { + name: "account_error_count", + metricType: "account_error_count", + groupID: nil, + wantValue: 1, + wantOK: true, + }, + { + name: "group_available_accounts without group_id returns false", + metricType: "group_available_accounts", + groupID: nil, + wantValue: 0, + wantOK: false, + }, + { + name: "group_available_ratio without group_id returns false", + metricType: "group_available_ratio", + groupID: nil, + wantValue: 0, + wantOK: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + rule := &OpsAlertRule{ + MetricType: tt.metricType, + } + gotValue, gotOK := svc.computeRuleMetric(ctx, rule, nil, start, end, platform, tt.groupID) + require.Equal(t, tt.wantOK, gotOK) + if !tt.wantOK { + return + } + require.InDelta(t, tt.wantValue, gotValue, 0.0001) + }) + } +} diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go new file mode 100644 index 00000000..0acf13ab --- /dev/null +++ b/backend/internal/service/ops_alert_models.go @@ -0,0 +1,74 @@ +package service + +import "time" + +// Ops alert rule/event models. +// +// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned +// with the existing ops dashboard frontend (backup style). 
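+//
+// Illustrative rule payload (hypothetical values; field names follow the struct tags below):
+//   {"id":1,"name":"High error rate","enabled":true,"severity":"P1","metric_type":"error_rate",
+//    "operator":">=","threshold":5,"window_minutes":5,"sustained_minutes":3,"cooldown_minutes":30,
+//    "notify_email":true,"filters":{"platform":"openai","group_id":42}}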
+ +const ( + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" +) + +type OpsAlertRule struct { + ID int64 `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + + Enabled bool `json:"enabled"` + Severity string `json:"severity"` + + MetricType string `json:"metric_type"` + Operator string `json:"operator"` + Threshold float64 `json:"threshold"` + + WindowMinutes int `json:"window_minutes"` + SustainedMinutes int `json:"sustained_minutes"` + CooldownMinutes int `json:"cooldown_minutes"` + + NotifyEmail bool `json:"notify_email"` + + Filters map[string]any `json:"filters,omitempty"` + + LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsAlertEvent struct { + ID int64 `json:"id"` + RuleID int64 `json:"rule_id"` + Severity string `json:"severity"` + Status string `json:"status"` + + Title string `json:"title"` + Description string `json:"description"` + + MetricValue *float64 `json:"metric_value,omitempty"` + ThresholdValue *float64 `json:"threshold_value,omitempty"` + + Dimensions map[string]any `json:"dimensions,omitempty"` + + FiredAt time.Time `json:"fired_at"` + ResolvedAt *time.Time `json:"resolved_at,omitempty"` + + EmailSent bool `json:"email_sent"` + CreatedAt time.Time `json:"created_at"` +} + +type OpsAlertEventFilter struct { + Limit int + + // Optional filters. + Status string + Severity string + + StartTime *time.Time + EndTime *time.Time + + // Dimensions filters (best-effort). + Platform string + GroupID *int64 +} diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go new file mode 100644 index 00000000..b6c3d1c3 --- /dev/null +++ b/backend/internal/service/ops_alerts.go @@ -0,0 +1,162 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertRule{}, nil + } + return s.opsRepo.ListAlertRules(ctx) +} + +func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + created, err := s.opsRepo.CreateAlertRule(ctx, rule) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil || rule.ID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + updated, err := s.opsRepo.UpdateAlertRule(ctx, rule) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return nil, err + } + return updated, nil +} + +func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error { + if err := 
s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if id <= 0 { + return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return err + } + return nil +} + +func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertEvent{}, nil + } + return s.opsRepo.ListAlertEvents(ctx, filter) +} + +func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) +} + +func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetLatestAlertEvent(ctx, ruleID) +} + +func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if event == nil { + return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event") + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, event) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + if strings.TrimSpace(status) == "" { + return infraerrors.BadRequest("INVALID_STATUS", "invalid status") + } + return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) +} + +func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent) +} diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go new file mode 100644 index 00000000..afd2d22c --- /dev/null +++ 
b/backend/internal/service/ops_cleanup_service.go @@ -0,0 +1,365 @@ +package service + +import ( + "context" + "database/sql" + "fmt" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsCleanupJobName = "ops_cleanup" + + opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader" + opsCleanupLeaderLockTTLDefault = 30 * time.Minute +) + +var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsCleanupReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth. +// +// - Scheduling: 5-field cron spec (minute hour dom month dow). +// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup. +// - Safety: deletes in batches to avoid long transactions. +type OpsCleanupService struct { + opsRepo OpsRepository + db *sql.DB + redisClient *redis.Client + cfg *config.Config + + instanceID string + + cron *cron.Cron + + startOnce sync.Once + stopOnce sync.Once + + warnNoRedisOnce sync.Once +} + +func NewOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + return &OpsCleanupService{ + opsRepo: opsRepo, + db: db, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + } +} + +func (s *OpsCleanupService) Start() { + if s == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled { + log.Printf("[OpsCleanup] not started (disabled)") + return + } + if s.opsRepo == nil || s.db == nil { + log.Printf("[OpsCleanup] not started (missing deps)") + return + } + + s.startOnce.Do(func() { + schedule := "0 2 * * *" + if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" { + schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) + } + + loc := time.Local + if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil { + loc = parsed + } + } + + c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc)) + _, err := c.AddFunc(schedule, func() { s.runScheduled() }) + if err != nil { + log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err) + return + } + s.cron = c + s.cron.Start() + log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String()) + }) +} + +func (s *OpsCleanupService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.cron != nil { + ctx := s.cron.Stop() + select { + case <-ctx.Done(): + case <-time.After(3 * time.Second): + log.Printf("[OpsCleanup] cron stop timed out") + } + } + }) +} + +func (s *OpsCleanupService) runScheduled() { + if s == nil || s.db == nil || s.opsRepo == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + counts, err := s.runCleanupOnce(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsCleanup] cleanup failed: %v", err) + return + } + s.recordHeartbeatSuccess(runAt, 
time.Since(startedAt)) + log.Printf("[OpsCleanup] cleanup complete: %s", counts) +} + +type opsCleanupDeletedCounts struct { + errorLogs int64 + retryAttempts int64 + alertEvents int64 + systemMetrics int64 + hourlyPreagg int64 + dailyPreagg int64 +} + +func (c opsCleanupDeletedCounts) String() string { + return fmt.Sprintf( + "error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d", + c.errorLogs, + c.retryAttempts, + c.alertEvents, + c.systemMetrics, + c.hourlyPreagg, + c.dailyPreagg, + ) +} + +func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) { + out := opsCleanupDeletedCounts{} + if s == nil || s.db == nil || s.cfg == nil { + return out, nil + } + + batchSize := 5000 + + now := time.Now().UTC() + + // Error-like tables: error logs / retry attempts / alert events. + if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.errorLogs = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.retryAttempts = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.alertEvents = n + } + + // Minute-level metrics snapshots. + if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.systemMetrics = n + } + + // Pre-aggregation tables (hourly/daily). + if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.hourlyPreagg = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true) + if err != nil { + return out, err + } + out.dailyPreagg = n + } + + return out, nil +} + +func deleteOldRowsByID( + ctx context.Context, + db *sql.DB, + table string, + timeColumn string, + cutoff time.Time, + batchSize int, + castCutoffToDate bool, +) (int64, error) { + if db == nil { + return 0, nil + } + if batchSize <= 0 { + batchSize = 5000 + } + + where := fmt.Sprintf("%s < $1", timeColumn) + if castCutoffToDate { + where = fmt.Sprintf("%s < $1::date", timeColumn) + } + + q := fmt.Sprintf(` +WITH batch AS ( + SELECT id FROM %s + WHERE %s + ORDER BY id + LIMIT $2 +) +DELETE FROM %s +WHERE id IN (SELECT id FROM batch) +`, table, where, table) + + var total int64 + for { + res, err := db.ExecContext(ctx, q, cutoff, batchSize) + if err != nil { + // If ops tables aren't present yet (partial deployments), treat as no-op. 
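+ // e.g. PostgreSQL reports `ERROR: relation "ops_error_logs" does not exist` (SQLSTATE 42P01);
+ // the substring check below treats that as "nothing to clean up" rather than a hard failure.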
+ if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") { + return total, nil + } + return total, err + } + affected, err := res.RowsAffected() + if err != nil { + return total, err + } + total += affected + if affected == 0 { + break + } + } + return total, nil +} + +func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil { + return nil, false + } + // In simple run mode, assume single instance. + if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple { + return nil, true + } + + key := opsCleanupLeaderLockKeyDefault + ttl := opsCleanupLeaderLockTTLDefault + + // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by + // falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + return nil, false + } + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true + } + // Redis error: fall back to DB advisory lock. + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err) + }) + } else { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; using DB advisory lock") + }) + } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) + if !ok { + return nil, false + } + return release, true +} + +func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} diff --git a/backend/internal/service/ops_concurrency.go b/backend/internal/service/ops_concurrency.go new file mode 100644 index 00000000..c3b7b853 --- /dev/null +++ b/backend/internal/service/ops_concurrency.go @@ -0,0 +1,257 @@ +package service + +import ( + "context" + "log" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/pagination" +) + +const ( + opsAccountsPageSize = 100 + opsConcurrencyBatchChunkSize = 200 +) + +func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) { + if s == nil || s.accountRepo == nil { + return []Account{}, nil + } + + out := make([]Account, 0, 128) + page := 1 + for { + accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{ + Page: page, + PageSize: opsAccountsPageSize, + }, platformFilter, "", "", "") + if err != nil { + return nil, err + } + if len(accounts) == 0 { + break + } + + out = append(out, accounts...) 
+ if pageInfo != nil && int64(len(out)) >= pageInfo.Total { + break + } + if len(accounts) < opsAccountsPageSize { + break + } + + page++ + if page > 10_000 { + log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter) + break + } + } + + return out, nil +} + +func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo { + if s == nil || s.concurrencyService == nil { + return map[int64]*AccountLoadInfo{} + } + if len(accounts) == 0 { + return map[int64]*AccountLoadInfo{} + } + + // De-duplicate IDs (and keep the max concurrency to avoid under-reporting). + unique := make(map[int64]int, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev { + unique[acc.ID] = acc.Concurrency + } + } + + batch := make([]AccountWithConcurrency, 0, len(unique)) + for id, maxConc := range unique { + batch = append(batch, AccountWithConcurrency{ + ID: id, + MaxConcurrency: maxConc, + }) + } + + out := make(map[int64]*AccountLoadInfo, len(batch)) + for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize { + end := i + opsConcurrencyBatchChunkSize + if end > len(batch) { + end = len(batch) + } + part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end]) + if err != nil { + // Best-effort: return zeros rather than failing the ops UI. + log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err) + continue + } + for k, v := range part { + out[k] = v + } + } + + return out +} + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// +// Optional filters: +// - platformFilter: only include accounts in that platform (best-effort reduces DB load) +// - groupIDFilter: only include accounts that belong to that group +func (s *OpsService) GetConcurrencyStats( + ctx context.Context, + platformFilter string, + groupIDFilter *int64, +) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + collectedAt := time.Now() + loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts) + + platform := make(map[string]*PlatformConcurrencyInfo) + group := make(map[int64]*GroupConcurrencyInfo) + account := make(map[int64]*AccountConcurrencyInfo) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + var matchedGroup *Group + if groupIDFilter != nil && *groupIDFilter > 0 { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if grp.ID == *groupIDFilter { + matchedGroup = grp + break + } + } + // Group filter provided: skip accounts not in that group. + if matchedGroup == nil { + continue + } + } + + load := loadMap[acc.ID] + currentInUse := int64(0) + waiting := int64(0) + if load != nil { + currentInUse = int64(load.CurrentConcurrency) + waiting = int64(load.WaitingCount) + } + + // Account-level view picks one display group (the first group). 
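+ // e.g. an account attached to groups [A, B] is listed once under A in the per-account view,
+ // while the group aggregation further below still counts its capacity toward both A and B.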
+ displayGroupID := int64(0) + displayGroupName := "" + if matchedGroup != nil { + displayGroupID = matchedGroup.ID + displayGroupName = matchedGroup.Name + } else if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + if _, ok := account[acc.ID]; !ok { + info := &AccountConcurrencyInfo{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + CurrentInUse: currentInUse, + MaxCapacity: int64(acc.Concurrency), + WaitingInQueue: waiting, + } + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + account[acc.ID] = info + } + + // Platform aggregation. + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformConcurrencyInfo{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.MaxCapacity += int64(acc.Concurrency) + p.CurrentInUse += currentInUse + p.WaitingInQueue += waiting + } + + // Group aggregation (one account may contribute to multiple groups). + if matchedGroup != nil { + grp := matchedGroup + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } else { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. 
+ g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } + } + } + + for _, info := range platform { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + for _, info := range group { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go new file mode 100644 index 00000000..31822ba8 --- /dev/null +++ b/backend/internal/service/ops_dashboard.go @@ -0,0 +1,90 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + + // Resolve query mode (requested via query param, or DB default). + filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode) + + overview, err := s.opsRepo.GetDashboardOverview(ctx, filter) + if err != nil { + if errors.Is(err, ErrOpsPreaggregatedNotPopulated) { + return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet") + } + return nil, err + } + + // Best-effort system health + jobs; dashboard metrics should still render if these are missing. + if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + // Attach config-derived limits so the UI can show "current / max" for connection pools. + // These are best-effort and should never block the dashboard rendering. + if s != nil && s.cfg != nil { + if s.cfg.Database.MaxOpenConns > 0 { + metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns) + } + if s.cfg.Redis.PoolSize > 0 { + metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize) + } + } + overview.SystemMetrics = metrics + } else if err != nil && !errors.Is(err, sql.ErrNoRows) { + log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) + } + + if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil { + overview.JobHeartbeats = heartbeats + } else { + log.Printf("[Ops] ListJobHeartbeats failed: %v", err) + } + + overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview) + + return overview, nil +} + +func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode { + if requested.IsValid() { + // Allow "auto" to be disabled via config until preagg is proven stable in production. + // Forced `preagg` via query param still works. 
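+ // Resolution summary (illustrative):
+ //   requested=raw                                  -> raw
+ //   requested=preagg                               -> preagg (explicit override always wins)
+ //   requested=auto + UsePreaggregatedTables=true   -> auto
+ //   requested=auto + UsePreaggregatedTables=false  -> raw
+ //   requested empty/invalid                        -> DB default (SettingKeyOpsQueryModeDefault), then the same auto check applies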
+ if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return requested + } + + mode := OpsQueryModeAuto + if s != nil && s.settingRepo != nil { + if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil { + mode = ParseOpsQueryMode(raw) + } + } + + if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return mode +} diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go new file mode 100644 index 00000000..f189031b --- /dev/null +++ b/backend/internal/service/ops_dashboard_models.go @@ -0,0 +1,87 @@ +package service + +import "time" + +type OpsDashboardFilter struct { + StartTime time.Time + EndTime time.Time + + Platform string + GroupID *int64 + + // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables. + // Expected values: auto/raw/preagg (see OpsQueryMode). + QueryMode OpsQueryMode +} + +type OpsRateSummary struct { + Current float64 `json:"current"` + Peak float64 `json:"peak"` + Avg float64 `json:"avg"` +} + +type OpsPercentiles struct { + P50 *int `json:"p50_ms"` + P90 *int `json:"p90_ms"` + P95 *int `json:"p95_ms"` + P99 *int `json:"p99_ms"` + Avg *int `json:"avg_ms"` + Max *int `json:"max_ms"` +} + +type OpsDashboardOverview struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + // HealthScore is a backend-computed overall health score (0-100). + // It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats. + HealthScore int `json:"health_score"` + + // Latest system-level snapshot (window=1m, global). + SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` + + // Background jobs health (heartbeats). + JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + + ErrorCountSLA int64 `json:"error_count_sla"` + RequestCountTotal int64 `json:"request_count_total"` + RequestCountSLA int64 `json:"request_count_sla"` + + TokenConsumed int64 `json:"token_consumed"` + + SLA float64 `json:"sla"` + ErrorRate float64 `json:"error_rate"` + UpstreamErrorRate float64 `json:"upstream_error_rate"` + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` + + QPS OpsRateSummary `json:"qps"` + TPS OpsRateSummary `json:"tps"` + + Duration OpsPercentiles `json:"duration"` + TTFT OpsPercentiles `json:"ttft"` +} + +type OpsLatencyHistogramBucket struct { + Range string `json:"range"` + Count int64 `json:"count"` +} + +// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only). +// It is used by the Ops dashboard to quickly identify tail latency regressions. 
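+//
+// Abridged example payload (bucket labels and counts are illustrative):
+//   {"total_requests": 1240, "buckets": [{"range": "<500ms", "count": 980}, {"range": "500ms-1s", "count": 200}, {"range": ">1s", "count": 60}]}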
+type OpsLatencyHistogramResponse struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + TotalRequests int64 `json:"total_requests"` + Buckets []*OpsLatencyHistogramBucket `json:"buckets"` +} diff --git a/backend/internal/service/ops_errors.go b/backend/internal/service/ops_errors.go new file mode 100644 index 00000000..76b5ce8b --- /dev/null +++ b/backend/internal/service/ops_errors.go @@ -0,0 +1,45 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds) +} + +func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorDistribution(ctx, filter) +} diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go new file mode 100644 index 00000000..feb0d843 --- /dev/null +++ b/backend/internal/service/ops_health_score.go @@ -0,0 +1,154 @@ +package service + +import ( + "math" + "time" +) + +// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview. +// +// Design goals: +// - Backend-owned scoring (UI only displays). +// - Layered scoring: Business Health (70%) + Infrastructure Health (30%) +// - Avoids double-counting (e.g., DB failure affects both infra and business metrics) +// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. +func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { + if overview == nil { + return 0 + } + + // Idle/no-data: avoid showing a "bad" score when there is no traffic. + // UI can still render a gray/idle state based on QPS + error rate. 
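+ // Worked example for a window with traffic (illustrative numbers): SLA 96% -> (96-95)/4.5*100 ~ 22.2,
+ // worst-case error rate 2% -> (5-2)/4.5*100 ~ 66.7, P99 3000ms -> (10000-3000)/9000*100 ~ 77.8,
+ // so business = 22.2*0.5 + 66.7*0.3 + 77.8*0.2 ~ 46.7; with a fully healthy infra score of 100
+ // the overall score is round(46.7*0.7 + 100*0.3) = 63.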
+ if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 { + return 100 + } + + businessHealth := computeBusinessHealth(overview) + infraHealth := computeInfraHealth(now, overview) + + // Weighted combination: 70% business + 30% infrastructure + score := businessHealth*0.7 + infraHealth*0.3 + return int(math.Round(clampFloat64(score, 0, 100))) +} + +// computeBusinessHealth calculates business health score (0-100) +// Components: SLA (50%) + Error Rate (30%) + Latency (20%) +func computeBusinessHealth(overview *OpsDashboardOverview) float64 { + // SLA score: 99.5% → 100, 95% → 0 (linear) + slaScore := 100.0 + slaPct := clampFloat64(overview.SLA*100, 0, 100) + if slaPct < 99.5 { + if slaPct >= 95 { + slaScore = (slaPct - 95) / 4.5 * 100 + } else { + slaScore = 0 + } + } + + // Error rate score: 0.5% → 100, 5% → 0 (linear) + // Combines request errors and upstream errors + errorScore := 100.0 + errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) + upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) + combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case + if combinedErrorPct > 0.5 { + if combinedErrorPct <= 5 { + errorScore = (5 - combinedErrorPct) / 4.5 * 100 + } else { + errorScore = 0 + } + } + + // Latency score: 1s → 100, 10s → 0 (linear) + // Uses P99 of duration (TTFT is less critical for overall health) + latencyScore := 100.0 + if overview.Duration.P99 != nil { + p99 := float64(*overview.Duration.P99) + if p99 > 1000 { + if p99 <= 10000 { + latencyScore = (10000 - p99) / 9000 * 100 + } else { + latencyScore = 0 + } + } + } + + // Weighted combination + return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2 +} + +// computeInfraHealth calculates infrastructure health score (0-100) +// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%) +func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 { + // Storage score: DB critical, Redis less critical + storageScore := 100.0 + if overview.SystemMetrics != nil { + if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK { + storageScore = 0 // DB failure is critical + } else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { + storageScore = 50 // Redis failure is degraded but not critical + } + } + + // Compute resources score: CPU + Memory + computeScore := 100.0 + if overview.SystemMetrics != nil { + cpuScore := 100.0 + if overview.SystemMetrics.CPUUsagePercent != nil { + cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) + if cpuPct > 80 { + if cpuPct <= 100 { + cpuScore = (100 - cpuPct) / 20 * 100 + } else { + cpuScore = 0 + } + } + } + + memScore := 100.0 + if overview.SystemMetrics.MemoryUsagePercent != nil { + memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) + if memPct > 85 { + if memPct <= 100 { + memScore = (100 - memPct) / 15 * 100 + } else { + memScore = 0 + } + } + } + + computeScore = (cpuScore + memScore) / 2 + } + + // Background jobs score + jobScore := 100.0 + failedJobs := 0 + totalJobs := 0 + for _, hb := range overview.JobHeartbeats { + if hb == nil { + continue + } + totalJobs++ + if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) { + failedJobs++ + } else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { + failedJobs++ + } + } + if totalJobs > 0 && failedJobs > 0 { + jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100 + 
} + + // Weighted combination + return storageScore*0.4 + computeScore*0.3 + jobScore*0.3 +} + +func clampFloat64(v float64, min float64, max float64) float64 { + if v < min { + return min + } + if v > max { + return max + } + return v +} diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go new file mode 100644 index 00000000..849ba146 --- /dev/null +++ b/backend/internal/service/ops_health_score_test.go @@ -0,0 +1,431 @@ +//go:build unit + +package service + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) { + t.Parallel() + + score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}) + require.Equal(t, 100, score) +} + +func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { + t.Parallel() + + ov := &OpsDashboardOverview{ + RequestCountTotal: 100, + RequestCountSLA: 100, + SuccessCount: 90, + ErrorCountTotal: 10, + ErrorCountSLA: 10, + + SLA: 0.90, + ErrorRate: 0.10, + UpstreamErrorRate: 0.08, + + Duration: OpsPercentiles{P99: intPtr(20_000)}, + TTFT: OpsPercentiles{P99: intPtr(2_000)}, + + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(98.0), + MemoryUsagePercent: float64Ptr(97.0), + DBConnWaiting: intPtr(3), + ConcurrencyQueueDepth: intPtr(10), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "job-a", + LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)), + LastError: stringPtr("boom"), + }, + }, + } + + score := computeDashboardHealthScore(time.Now().UTC(), ov) + require.Less(t, score, 80) + require.GreaterOrEqual(t, score, 0) +} + +func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin int + wantMax int + }{ + { + name: "nil overview returns 0", + overview: nil, + wantMin: 0, + wantMax: 0, + }, + { + name: "perfect health", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + TTFT: OpsPercentiles{P99: intPtr(100)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "good health - SLA 99.8%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.003, + UpstreamErrorRate: 0.001, + Duration: OpsPercentiles{P99: intPtr(800)}, + TTFT: OpsPercentiles{P99: intPtr(200)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(50), + MemoryUsagePercent: float64Ptr(60), + }, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "medium health - SLA 96%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.96, + ErrorRate: 0.02, + UpstreamErrorRate: 0.01, + Duration: OpsPercentiles{P99: intPtr(3000)}, + TTFT: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(70), + MemoryUsagePercent: float64Ptr(75), + }, + }, + wantMin: 60, + wantMax: 85, + }, + { + name: "DB failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 
1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 70, + wantMax: 90, + }, + { + name: "Redis failure", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "high CPU usage", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 100, + }, + { + name: "combined failures - business degraded + infra healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.90, + ErrorRate: 0.05, + UpstreamErrorRate: 0.02, + Duration: OpsPercentiles{P99: intPtr(10000)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(20), + MemoryUsagePercent: float64Ptr(30), + }, + }, + wantMin: 25, + wantMax: 50, + }, + { + name: "combined failures - business healthy + infra degraded", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + RequestCountSLA: 1000, + SLA: 0.998, + ErrorRate: 0.001, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(600)}, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(95), + MemoryUsagePercent: float64Ptr(95), + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeDashboardHealthScore(time.Now().UTC(), tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax) + require.GreaterOrEqual(t, score, 0, "score must be >= 0") + require.LessOrEqual(t, score, 100, "score must be <= 100") + }) + } +} + +func TestComputeBusinessHealth(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "perfect metrics", + overview: &OpsDashboardOverview{ + SLA: 1.0, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 99.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "SLA boundary 95%", + overview: &OpsDashboardOverview{ + SLA: 0.95, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 50, + wantMax: 60, + }, + { + name: "error rate boundary 0.5%", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.005, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: 
intPtr(500)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "latency boundary 1000ms", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(1000)}, + }, + wantMin: 95, + wantMax: 100, + }, + { + name: "upstream error dominates", + overview: &OpsDashboardOverview{ + SLA: 0.995, + ErrorRate: 0.001, + UpstreamErrorRate: 0.03, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 75, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeBusinessHealth(tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + +func TestComputeInfraHealth(t *testing.T) { + t.Parallel() + + now := time.Now().UTC() + + tests := []struct { + name string + overview *OpsDashboardOverview + wantMin float64 + wantMax float64 + }{ + { + name: "all infrastructure healthy", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 100, + wantMax: 100, + }, + { + name: "DB down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 50, + wantMax: 70, + }, + { + name: "Redis down", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 80, + wantMax: 95, + }, + { + name: "CPU at 90%", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(90), + MemoryUsagePercent: float64Ptr(40), + }, + }, + wantMin: 85, + wantMax: 95, + }, + { + name: "failed background job", + overview: &OpsDashboardOverview{ + RequestCountTotal: 1000, + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(true), + RedisOK: boolPtr(true), + CPUUsagePercent: float64Ptr(30), + MemoryUsagePercent: float64Ptr(40), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "test-job", + LastErrorAt: &now, + }, + }, + }, + wantMin: 70, + wantMax: 90, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := computeInfraHealth(now, tt.overview) + require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin) + require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax) + require.GreaterOrEqual(t, score, 0.0, "score must be >= 0") + require.LessOrEqual(t, score, 100.0, "score must be <= 100") + }) + } +} + +func timePtr(v time.Time) *time.Time { return &v } + +func stringPtr(v string) *string { return &v } diff --git a/backend/internal/service/ops_histograms.go b/backend/internal/service/ops_histograms.go new file mode 100644 index 00000000..9f5b514f --- /dev/null +++ b/backend/internal/service/ops_histograms.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors 
"github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetLatencyHistogram(ctx, filter) +} diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go new file mode 100644 index 00000000..edf32cf2 --- /dev/null +++ b/backend/internal/service/ops_metrics_collector.go @@ -0,0 +1,920 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "fmt" + "log" + "math" + "os" + "runtime" + "strconv" + "strings" + "sync" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" +) + +const ( + opsMetricsCollectorJobName = "ops_metrics_collector" + opsMetricsCollectorMinInterval = 60 * time.Second + opsMetricsCollectorMaxInterval = 1 * time.Hour + + opsMetricsCollectorTimeout = 10 * time.Second + + opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader" + opsMetricsCollectorLeaderLockTTL = 90 * time.Second + + opsMetricsCollectorHeartbeatTimeout = 2 * time.Second + + bytesPerMB = 1024 * 1024 +) + +var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey) + +type OpsMetricsCollector struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + accountRepo AccountRepository + concurrencyService *ConcurrencyService + + db *sql.DB + redisClient *redis.Client + instanceID string + + lastCgroupCPUUsageNanos uint64 + lastCgroupCPUSampleAt time.Time + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + return &OpsMetricsCollector{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + accountRepo: accountRepo, + concurrencyService: concurrencyService, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (c *OpsMetricsCollector) Start() { + if c == nil { + return + } + c.startOnce.Do(func() { + if c.stopCh == nil { + c.stopCh = make(chan struct{}) + } + go c.run() + }) +} + +func (c *OpsMetricsCollector) Stop() { + if c == nil { + return + } + c.stopOnce.Do(func() { + if c.stopCh != nil { + close(c.stopCh) + } + }) +} + +func (c *OpsMetricsCollector) run() { + // First run immediately so the dashboard has data soon after startup. 
+ c.collectOnce() + + for { + interval := c.getInterval() + timer := time.NewTimer(interval) + select { + case <-timer.C: + c.collectOnce() + case <-c.stopCh: + timer.Stop() + return + } + } +} + +func (c *OpsMetricsCollector) getInterval() time.Duration { + interval := opsMetricsCollectorMinInterval + + if c.settingRepo == nil { + return interval + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds) + if err != nil { + return interval + } + raw = strings.TrimSpace(raw) + if raw == "" { + return interval + } + + seconds, err := strconv.Atoi(raw) + if err != nil { + return interval + } + if seconds < int(opsMetricsCollectorMinInterval.Seconds()) { + seconds = int(opsMetricsCollectorMinInterval.Seconds()) + } + if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) { + seconds = int(opsMetricsCollectorMaxInterval.Seconds()) + } + return time.Duration(seconds) * time.Second +} + +func (c *OpsMetricsCollector) collectOnce() { + if c == nil { + return + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return + } + if c.opsRepo == nil { + return + } + if c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout) + defer cancel() + + if !c.isMonitoringEnabled(ctx) { + return + } + + release, ok := c.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + err := c.collectAndPersist(ctx) + finishedAt := time.Now().UTC() + + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + runAt := startedAt + + if err != nil { + msg := truncateString(err.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + log.Printf("[OpsMetricsCollector] collect failed: %v", err) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool { + if c == nil { + return false + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return false + } + if c.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + // Fail-open: collector should not become a hard dependency. + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { + if ctx == nil { + ctx = context.Background() + } + + // Align to stable minute boundaries to avoid partial buckets and to maximize cache hits. 
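+	// For example, a tick at 12:05:37 UTC yields windowEnd = 12:05:00 and
+	// windowStart = 12:04:00, i.e. the last fully completed minute [12:04, 12:05).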
+ now := time.Now().UTC() + windowEnd := now.Truncate(time.Minute) + windowStart := windowEnd.Add(-1 * time.Minute) + + sys, err := c.collectSystemStats(ctx) + if err != nil { + // Continue; system stats are best-effort. + log.Printf("[OpsMetricsCollector] system stats error: %v", err) + } + + dbOK := c.checkDB(ctx) + redisOK := c.checkRedis(ctx) + active, idle := c.dbPoolStats() + redisTotal, redisIdle, redisStatsOK := c.redisPoolStats() + + successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage counts: %w", err) + } + + duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage latency: %w", err) + } + + errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query error counts: %w", err) + } + + windowSeconds := windowEnd.Sub(windowStart).Seconds() + if windowSeconds <= 0 { + windowSeconds = 60 + } + requestTotal := successCount + errorTotal + qps := float64(requestTotal) / windowSeconds + tps := float64(tokenConsumed) / windowSeconds + + goroutines := runtime.NumGoroutine() + concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx) + + input := &OpsInsertSystemMetricsInput{ + CreatedAt: windowEnd, + WindowMinutes: 1, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorSLA, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + TokenConsumed: tokenConsumed, + QPS: float64Ptr(roundTo1DP(qps)), + TPS: float64Ptr(roundTo1DP(tps)), + + DurationP50Ms: duration.p50, + DurationP90Ms: duration.p90, + DurationP95Ms: duration.p95, + DurationP99Ms: duration.p99, + DurationAvgMs: duration.avg, + DurationMaxMs: duration.max, + + TTFTP50Ms: ttft.p50, + TTFTP90Ms: ttft.p90, + TTFTP95Ms: ttft.p95, + TTFTP99Ms: ttft.p99, + TTFTAvgMs: ttft.avg, + TTFTMaxMs: ttft.max, + + CPUUsagePercent: sys.cpuUsagePercent, + MemoryUsedMB: sys.memoryUsedMB, + MemoryTotalMB: sys.memoryTotalMB, + MemoryUsagePercent: sys.memoryUsagePercent, + + DBOK: boolPtr(dbOK), + RedisOK: boolPtr(redisOK), + + RedisConnTotal: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisTotal) + }(), + RedisConnIdle: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisIdle) + }(), + + DBConnActive: intPtr(active), + DBConnIdle: intPtr(idle), + GoroutineCount: intPtr(goroutines), + ConcurrencyQueueDepth: concurrencyQueueDepth, + } + + return c.opsRepo.InsertSystemMetrics(ctx, input) +} + +func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int { + if c == nil || c.accountRepo == nil || c.concurrencyService == nil { + return nil + } + if parentCtx == nil { + parentCtx = context.Background() + } + + // Best-effort: never let concurrency sampling break the metrics collector. 
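+	// Any error below returns nil, leaving the queue-depth sample unset for this
+	// window rather than recording a misleading zero.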
+ ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second) + defer cancel() + + accounts, err := c.accountRepo.ListSchedulable(ctx) + if err != nil { + return nil + } + if len(accounts) == 0 { + zero := 0 + return &zero + } + + batch := make([]AccountWithConcurrency, 0, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + maxConc := acc.Concurrency + if maxConc < 0 { + maxConc = 0 + } + batch = append(batch, AccountWithConcurrency{ + ID: acc.ID, + MaxConcurrency: maxConc, + }) + } + if len(batch) == 0 { + zero := 0 + return &zero + } + + loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch) + if err != nil { + return nil + } + + var total int64 + for _, info := range loadMap { + if info == nil || info.WaitingCount <= 0 { + continue + } + total += int64(info.WaitingCount) + } + if total < 0 { + total = 0 + } + + maxInt := int64(^uint(0) >> 1) + if total > maxInt { + total = maxInt + } + v := int(total) + return &v +} + +type opsCollectedPercentiles struct { + p50 *int + p90 *int + p95 *int + p99 *int + avg *float64 + max *int +} + +func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2` + + var tokens sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) { + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + duration.p50 = floatToIntPtr(p50) + duration.p90 = floatToIntPtr(p90) + duration.p95 = floatToIntPtr(p95) + duration.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + duration.avg = &v + } + if max.Valid { + v := int(max.Int64) + duration.max = &v + } + } + + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err 
!= nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + ttft.p50 = floatToIntPtr(p50) + ttft.p90 = floatToIntPtr(p90) + ttft.p95 = floatToIntPtr(p95) + ttft.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + ttft.avg = &v + } + if max.Valid { + v := int(max.Int64) + ttft.max = &v + } + } + + return duration, ttft, nil +} + +func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) ( + errorTotal int64, + businessLimited int64, + errorSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + q := ` +SELECT + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +WHERE created_at >= $1 AND created_at < $2` + + if err := c.db.QueryRowContext(ctx, q, start, end).Scan( + &errorTotal, + &businessLimited, + &errorSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +type opsCollectedSystemStats struct { + cpuUsagePercent *float64 + memoryUsedMB *int64 + memoryTotalMB *int64 + memoryUsagePercent *float64 +} + +func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) { + out := &opsCollectedSystemStats{} + if ctx == nil { + ctx = context.Background() + } + + sampleAt := time.Now().UTC() + + // Prefer cgroup (container) metrics when available. + if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil { + out.cpuUsagePercent = cpuPct + } + + cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes() + if cgroupOK { + usedMB := int64(cgroupUsed / bytesPerMB) + out.memoryUsedMB = &usedMB + if cgroupTotal > 0 { + totalMB := int64(cgroupTotal / bytesPerMB) + out.memoryTotalMB = &totalMB + pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100) + out.memoryUsagePercent = &pct + } + } + + // Fallback to host metrics if cgroup metrics are unavailable (or incomplete). + if out.cpuUsagePercent == nil { + if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 { + v := roundTo1DP(cpuPercents[0]) + out.cpuUsagePercent = &v + } + } + + // If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host. 
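+	// In the mixed case the percentage uses cgroup "used" against the host total,
+	// e.g. memory.current = 512 MiB on a 16 GiB host reports roughly 3.1%.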
+ if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil { + if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil { + if out.memoryUsedMB == nil { + usedMB := int64(vm.Used / bytesPerMB) + out.memoryUsedMB = &usedMB + } + if out.memoryTotalMB == nil { + totalMB := int64(vm.Total / bytesPerMB) + out.memoryTotalMB = &totalMB + } + if out.memoryUsagePercent == nil { + if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 { + pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100) + out.memoryUsagePercent = &pct + } else { + pct := roundTo1DP(vm.UsedPercent) + out.memoryUsagePercent = &pct + } + } + } + } + + return out, nil +} + +func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 { + usageNanos, ok := readCgroupCPUUsageNanos() + if !ok { + return nil + } + + // Initialize baseline sample. + if c.lastCgroupCPUSampleAt.IsZero() { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + elapsed := now.Sub(c.lastCgroupCPUSampleAt) + if elapsed <= 0 { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + prev := c.lastCgroupCPUUsageNanos + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + + if usageNanos < prev { + // Counter reset (container restarted). + return nil + } + + deltaUsageSec := float64(usageNanos-prev) / 1e9 + elapsedSec := elapsed.Seconds() + if elapsedSec <= 0 { + return nil + } + + cores := readCgroupCPULimitCores() + if cores <= 0 { + // Can't reliably normalize; skip and fall back to gopsutil. + return nil + } + + pct := (deltaUsageSec / (elapsedSec * cores)) * 100 + if pct < 0 { + pct = 0 + } + // Clamp to avoid noise/jitter showing impossible values. + if pct > 100 { + pct = 100 + } + v := roundTo1DP(pct) + return &v +} + +func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) { + // cgroup v2 (most common in modern containers) + if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 { + usedBytes = used + rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max") + if err == nil { + s := strings.TrimSpace(string(rawMax)) + if s != "" && s != "max" { + if v, err := strconv.ParseUint(s, 10, 64); err == nil { + totalBytes = v + } + } + } + return usedBytes, totalBytes, true + } + + // cgroup v1 fallback + if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 { + usedBytes = used + if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 { + // Some environments report a very large number when unlimited. 
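+			// e.g. an unlimited cgroup v1 limit commonly reads as 9223372036854771712
+			// (max int64 rounded down to a page boundary); the 1<<60 guard treats such
+			// values as "no limit" and leaves totalBytes at zero.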
+ if limit > 0 && limit < (1<<60) { + totalBytes = limit + } + } + return usedBytes, totalBytes, true + } + + return 0, 0, false +} + +func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) { + // cgroup v2: cpu.stat has usage_usec + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil { + lines := strings.Split(string(raw), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) != 2 { + continue + } + if fields[0] != "usage_usec" { + continue + } + v, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + continue + } + return v * 1000, true + } + } + + // cgroup v1: cpuacct.usage is in nanoseconds + if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok { + return v, true + } + + return 0, false +} + +func readCgroupCPULimitCores() float64 { + // cgroup v2: cpu.max => " " or "max " + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil { + fields := strings.Fields(string(raw)) + if len(fields) >= 2 && fields[0] != "max" { + quota, err1 := strconv.ParseFloat(fields[0], 64) + period, err2 := strconv.ParseFloat(fields[1], 64) + if err1 == nil && err2 == nil && quota > 0 && period > 0 { + return quota / period + } + } + } + + // cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us + quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + if okQuota && okPeriod && quota > 0 && period > 0 { + return float64(quota) / float64(period) + } + + return 0 +} + +func readUintFile(path string) (uint64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func readIntFile(path string) (int64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool { + if c == nil || c.db == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + var one int + if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil { + return false + } + return one == 1 +} + +func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { + if c == nil || c.redisClient == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + return c.redisClient.Ping(ctx).Err() == nil +} + +func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) { + if c == nil || c.redisClient == nil { + return 0, 0, false + } + stats := c.redisClient.PoolStats() + if stats == nil { + return 0, 0, false + } + return int(stats.TotalConns), int(stats.IdleConns), true +} + +func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { + if c == nil || c.db == nil { + return 0, 0 + } + stats := c.db.Stats() + return stats.InUse, stats.Idle +} + +var opsMetricsCollectorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if c == nil || c.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := c.redisClient.SetNX(ctx, 
opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result() + if err != nil { + // Prefer fail-closed to avoid stampeding the database when Redis is flaky. + // Fallback to a DB advisory lock when Redis is present but unavailable. + release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID) + if !ok { + c.maybeLogSkip() + return nil, false + } + return release, true + } + if !ok { + c.maybeLogSkip() + return nil, false + } + + release := func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result() + } + return release, true +} + +func (c *OpsMetricsCollector) maybeLogSkip() { + c.skipLogMu.Lock() + defer c.skipLogMu.Unlock() + + now := time.Now() + if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute { + return + } + c.skipLogAt = now + log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping") +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func boolPtr(v bool) *bool { + out := v + return &out +} + +func intPtr(v int) *int { + out := v + return &out +} + +func float64Ptr(v float64) *float64 { + out := v + return &out +} diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go new file mode 100644 index 00000000..996267fd --- /dev/null +++ b/backend/internal/service/ops_models.go @@ -0,0 +1,124 @@ +package service + +import "time" + +type OpsErrorLog struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + Phase string `json:"phase"` + Type string `json:"type"` + Severity string `json:"severity"` + + StatusCode int `json:"status_code"` + Platform string `json:"platform"` + Model string `json:"model"` + + LatencyMs *int `json:"latency_ms"` + + ClientRequestID string `json:"client_request_id"` + RequestID string `json:"request_id"` + Message string `json:"message"` + + UserID *int64 `json:"user_id"` + APIKeyID *int64 `json:"api_key_id"` + AccountID *int64 `json:"account_id"` + GroupID *int64 `json:"group_id"` + + ClientIP *string `json:"client_ip"` + RequestPath string `json:"request_path"` + Stream bool `json:"stream"` +} + +type OpsErrorLogDetail struct { + OpsErrorLog + + ErrorBody string `json:"error_body"` + UserAgent string `json:"user_agent"` + + // Upstream context (optional) + UpstreamStatusCode *int `json:"upstream_status_code,omitempty"` + UpstreamErrorMessage string `json:"upstream_error_message,omitempty"` + UpstreamErrorDetail string `json:"upstream_error_detail,omitempty"` + UpstreamErrors string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing + + // Timings (optional) + AuthLatencyMs *int64 `json:"auth_latency_ms"` + RoutingLatencyMs *int64 `json:"routing_latency_ms"` + UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` + ResponseLatencyMs *int64 `json:"response_latency_ms"` + TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` + + // Retry context + RequestBody string `json:"request_body"` + RequestBodyTruncated bool 
`json:"request_body_truncated"` + RequestBodyBytes *int `json:"request_body_bytes"` + RequestHeaders string `json:"request_headers,omitempty"` + + // vNext metric semantics + IsBusinessLimited bool `json:"is_business_limited"` +} + +type OpsErrorLogFilter struct { + StartTime *time.Time + EndTime *time.Time + + Platform string + GroupID *int64 + AccountID *int64 + + StatusCodes []int + Phase string + Query string + + Page int + PageSize int +} + +type OpsErrorLogList struct { + Errors []*OpsErrorLog `json:"errors"` + Total int `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +type OpsRetryAttempt struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + RequestedByUserID int64 `json:"requested_by_user_id"` + SourceErrorID int64 `json:"source_error_id"` + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` + + Status string `json:"status"` + StartedAt *time.Time `json:"started_at"` + FinishedAt *time.Time `json:"finished_at"` + DurationMs *int64 `json:"duration_ms"` + + ResultRequestID *string `json:"result_request_id"` + ResultErrorID *int64 `json:"result_error_id"` + + ErrorMessage *string `json:"error_message"` +} + +type OpsRetryResult struct { + AttemptID int64 `json:"attempt_id"` + Mode string `json:"mode"` + Status string `json:"status"` + + PinnedAccountID *int64 `json:"pinned_account_id"` + UsedAccountID *int64 `json:"used_account_id"` + + HTTPStatusCode int `json:"http_status_code"` + UpstreamRequestID string `json:"upstream_request_id"` + + ResponsePreview string `json:"response_preview"` + ResponseTruncated bool `json:"response_truncated"` + + ErrorMessage string `json:"error_message"` + + StartedAt time.Time `json:"started_at"` + FinishedAt time.Time `json:"finished_at"` + DurationMs int64 `json:"duration_ms"` +} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go new file mode 100644 index 00000000..39f3aaf2 --- /dev/null +++ b/backend/internal/service/ops_port.go @@ -0,0 +1,242 @@ +package service + +import ( + "context" + "time" +) + +type OpsRepository interface { + InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error) + ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) + GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) + ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error) + + InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) + UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error + GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error) + + // Lightweight window stats (for realtime WS / quick sampling). 
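+	// A minimal caller sketch (illustrative; the one-minute window and poll cadence
+	// are assumptions, not part of this change):
+	//
+	//	end := time.Now().UTC()
+	//	stats, err := repo.GetWindowStats(ctx, &OpsDashboardFilter{
+	//		StartTime: end.Add(-time.Minute),
+	//		EndTime:   end,
+	//	})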
+ GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) + + GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) + GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) + GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) + GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) + GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) + + InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error + GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error) + + UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error + ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error) + + // Alerts (rules + events) + ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) + CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + DeleteAlertRule(ctx context.Context, id int64) error + + ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) + UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error + UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. + UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error + UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error + GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) + GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) +} + +type OpsInsertErrorLogInput struct { + RequestID string + ClientRequestID string + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + GroupID *int64 + ClientIP *string + + Platform string + Model string + RequestPath string + Stream bool + UserAgent string + + ErrorPhase string + ErrorType string + Severity string + StatusCode int + IsBusinessLimited bool + + ErrorMessage string + ErrorBody string + + ErrorSource string + ErrorOwner string + + UpstreamStatusCode *int + UpstreamErrorMessage *string + UpstreamErrorDetail *string + // UpstreamErrors captures all upstream error attempts observed during handling this request. + // It is populated during request processing (gin context) and sanitized+serialized by OpsService. + UpstreamErrors []*OpsUpstreamErrorEvent + // UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors. + // It is set by OpsService.RecordError before persisting. 
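+	// The persisted string surfaces on reads as OpsErrorLogDetail.UpstreamErrors
+	// (a JSON array encoded as a string).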
+ UpstreamErrorsJSON *string + + DurationMs *int + TimeToFirstTokenMs *int64 + + RequestBodyJSON *string // sanitized json string (not raw bytes) + RequestBodyTruncated bool + RequestBodyBytes *int + RequestHeadersJSON *string // optional json string + + IsRetryable bool + RetryCount int + + CreatedAt time.Time +} + +type OpsInsertRetryAttemptInput struct { + RequestedByUserID int64 + SourceErrorID int64 + Mode string + PinnedAccountID *int64 + + // running|queued etc. + Status string + StartedAt time.Time +} + +type OpsUpdateRetryAttemptInput struct { + ID int64 + + // succeeded|failed + Status string + FinishedAt time.Time + DurationMs int64 + + // Optional correlation + ResultRequestID *string + ResultErrorID *int64 + + ErrorMessage *string +} + +type OpsInsertSystemMetricsInput struct { + CreatedAt time.Time + WindowMinutes int + + Platform *string + GroupID *int64 + + SuccessCount int64 + ErrorCountTotal int64 + BusinessLimitedCount int64 + ErrorCountSLA int64 + + UpstreamErrorCountExcl429529 int64 + Upstream429Count int64 + Upstream529Count int64 + + TokenConsumed int64 + + QPS *float64 + TPS *float64 + + DurationP50Ms *int + DurationP90Ms *int + DurationP95Ms *int + DurationP99Ms *int + DurationAvgMs *float64 + DurationMaxMs *int + + TTFTP50Ms *int + TTFTP90Ms *int + TTFTP95Ms *int + TTFTP99Ms *int + TTFTAvgMs *float64 + TTFTMaxMs *int + + CPUUsagePercent *float64 + MemoryUsedMB *int64 + MemoryTotalMB *int64 + MemoryUsagePercent *float64 + + DBOK *bool + RedisOK *bool + + RedisConnTotal *int + RedisConnIdle *int + + DBConnActive *int + DBConnIdle *int + DBConnWaiting *int + + GoroutineCount *int + ConcurrencyQueueDepth *int +} + +type OpsSystemMetricsSnapshot struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + WindowMinutes int `json:"window_minutes"` + + CPUUsagePercent *float64 `json:"cpu_usage_percent"` + MemoryUsedMB *int64 `json:"memory_used_mb"` + MemoryTotalMB *int64 `json:"memory_total_mb"` + MemoryUsagePercent *float64 `json:"memory_usage_percent"` + + DBOK *bool `json:"db_ok"` + RedisOK *bool `json:"redis_ok"` + + // Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max". 
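+	// Purely illustrative: db_conn_active = 38 with db_max_open_conns = 40 lets
+	// the dashboard render the pool as "38 / 40".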
+ DBMaxOpenConns *int `json:"db_max_open_conns"` + RedisPoolSize *int `json:"redis_pool_size"` + + RedisConnTotal *int `json:"redis_conn_total"` + RedisConnIdle *int `json:"redis_conn_idle"` + + DBConnActive *int `json:"db_conn_active"` + DBConnIdle *int `json:"db_conn_idle"` + DBConnWaiting *int `json:"db_conn_waiting"` + + GoroutineCount *int `json:"goroutine_count"` + ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"` +} + +type OpsUpsertJobHeartbeatInput struct { + JobName string + + LastRunAt *time.Time + LastSuccessAt *time.Time + LastErrorAt *time.Time + LastError *string + LastDurationMs *int64 +} + +type OpsJobHeartbeat struct { + JobName string `json:"job_name"` + + LastRunAt *time.Time `json:"last_run_at"` + LastSuccessAt *time.Time `json:"last_success_at"` + LastErrorAt *time.Time `json:"last_error_at"` + LastError *string `json:"last_error"` + LastDurationMs *int64 `json:"last_duration_ms"` + + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsWindowStats struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + TokenConsumed int64 `json:"token_consumed"` +} diff --git a/backend/internal/service/ops_query_mode.go b/backend/internal/service/ops_query_mode.go new file mode 100644 index 00000000..e6fa9c1e --- /dev/null +++ b/backend/internal/service/ops_query_mode.go @@ -0,0 +1,40 @@ +package service + +import ( + "errors" + "strings" +) + +type OpsQueryMode string + +const ( + OpsQueryModeAuto OpsQueryMode = "auto" + OpsQueryModeRaw OpsQueryMode = "raw" + OpsQueryModePreagg OpsQueryMode = "preagg" +) + +// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the +// pre-aggregation tables are not populated yet. This is primarily used to implement +// the forced `preagg` mode UX. +var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated") + +func ParseOpsQueryMode(raw string) OpsQueryMode { + v := strings.ToLower(strings.TrimSpace(raw)) + switch v { + case string(OpsQueryModeRaw): + return OpsQueryModeRaw + case string(OpsQueryModePreagg): + return OpsQueryModePreagg + default: + return OpsQueryModeAuto + } +} + +func (m OpsQueryMode) IsValid() bool { + switch m { + case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg: + return true + default: + return false + } +} diff --git a/backend/internal/service/ops_realtime.go b/backend/internal/service/ops_realtime.go new file mode 100644 index 00000000..479b9482 --- /dev/null +++ b/backend/internal/service/ops_realtime.go @@ -0,0 +1,36 @@ +package service + +import ( + "context" + "errors" + "strings" +) + +// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled. +// +// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`, +// and it is also gated by the hard switch/soft switch of overall ops monitoring. +func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool { + if !s.IsMonitoringEnabled(ctx) { + return false + } + if s.settingRepo == nil { + return true + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled) + if err != nil { + // Default enabled when key is missing; fail-open on transient errors. 
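+		// Same fail-open behaviour as OpsMetricsCollector.isMonitoringEnabled: read
+		// failures never disable the feature; only an explicit
+		// "false"/"0"/"off"/"disabled" value does.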
+ if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} diff --git a/backend/internal/service/ops_realtime_models.go b/backend/internal/service/ops_realtime_models.go new file mode 100644 index 00000000..f7514a24 --- /dev/null +++ b/backend/internal/service/ops_realtime_models.go @@ -0,0 +1,81 @@ +package service + +import "time" + +// PlatformConcurrencyInfo aggregates concurrency usage by platform. +type PlatformConcurrencyInfo struct { + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// GroupConcurrencyInfo aggregates concurrency usage by group. +// +// Note: one account can belong to multiple groups; group totals are therefore not additive across groups. +type GroupConcurrencyInfo struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// AccountConcurrencyInfo represents real-time concurrency usage for a single account. +type AccountConcurrencyInfo struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// PlatformAvailability aggregates account availability by platform. +type PlatformAvailability struct { + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// GroupAvailability aggregates account availability by group. +type GroupAvailability struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// AccountAvailability represents current availability for a single account. 
+type AccountAvailability struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + + Status string `json:"status"` + + IsAvailable bool `json:"is_available"` + IsRateLimited bool `json:"is_rate_limited"` + IsOverloaded bool `json:"is_overloaded"` + HasError bool `json:"has_error"` + + RateLimitResetAt *time.Time `json:"rate_limit_reset_at"` + RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"` + OverloadUntil *time.Time `json:"overload_until"` + OverloadRemainingSec *int64 `json:"overload_remaining_sec"` + ErrorMessage string `json:"error_message"` + TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"` +} diff --git a/backend/internal/service/ops_request_details.go b/backend/internal/service/ops_request_details.go new file mode 100644 index 00000000..12b9aa1b --- /dev/null +++ b/backend/internal/service/ops_request_details.go @@ -0,0 +1,151 @@ +package service + +import ( + "context" + "time" +) + +type OpsRequestKind string + +const ( + OpsRequestKindSuccess OpsRequestKind = "success" + OpsRequestKindError OpsRequestKind = "error" +) + +// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs). +// It powers "request drilldown" UIs without exposing full request bodies for successful requests. +type OpsRequestDetail struct { + Kind OpsRequestKind `json:"kind"` + CreatedAt time.Time `json:"created_at"` + RequestID string `json:"request_id"` + + Platform string `json:"platform,omitempty"` + Model string `json:"model,omitempty"` + + DurationMs *int `json:"duration_ms,omitempty"` + StatusCode *int `json:"status_code,omitempty"` + + // When Kind == "error", ErrorID links to /admin/ops/errors/:id. + ErrorID *int64 `json:"error_id,omitempty"` + + Phase string `json:"phase,omitempty"` + Severity string `json:"severity,omitempty"` + Message string `json:"message,omitempty"` + + UserID *int64 `json:"user_id,omitempty"` + APIKeyID *int64 `json:"api_key_id,omitempty"` + AccountID *int64 `json:"account_id,omitempty"` + GroupID *int64 `json:"group_id,omitempty"` + + Stream bool `json:"stream"` +} + +type OpsRequestDetailFilter struct { + StartTime *time.Time + EndTime *time.Time + + // kind: success|error|all + Kind string + + Platform string + GroupID *int64 + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + + Model string + RequestID string + Query string + + MinDurationMs *int + MaxDurationMs *int + + // Sort: created_at_desc (default) or duration_desc. 
+ Sort string + + Page int + PageSize int +} + +func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) { + page = 1 + pageSize = 50 + endTime = time.Now() + startTime = endTime.Add(-1 * time.Hour) + + if f == nil { + return page, pageSize, startTime, endTime + } + + if f.Page > 0 { + page = f.Page + } + if f.PageSize > 0 { + pageSize = f.PageSize + } + if pageSize > 100 { + pageSize = 100 + } + + if f.EndTime != nil { + endTime = *f.EndTime + } + if f.StartTime != nil { + startTime = *f.StartTime + } else if f.EndTime != nil { + startTime = endTime.Add(-1 * time.Hour) + } + + if startTime.After(endTime) { + startTime, endTime = endTime, startTime + } + + return page, pageSize, startTime, endTime +} + +type OpsRequestDetailList struct { + Items []*OpsRequestDetail `json:"items"` + Total int64 `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsRequestDetailList{ + Items: []*OpsRequestDetail{}, + Total: 0, + Page: 1, + PageSize: 50, + }, nil + } + + page, pageSize, startTime, endTime := filter.Normalize() + filterCopy := &OpsRequestDetailFilter{} + if filter != nil { + *filterCopy = *filter + } + filterCopy.Page = page + filterCopy.PageSize = pageSize + filterCopy.StartTime = &startTime + filterCopy.EndTime = &endTime + + items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy) + if err != nil { + return nil, err + } + if items == nil { + items = []*OpsRequestDetail{} + } + + return &OpsRequestDetailList{ + Items: items, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go new file mode 100644 index 00000000..747aa3b8 --- /dev/null +++ b/backend/internal/service/ops_retry.go @@ -0,0 +1,632 @@ +package service + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" + "github.com/gin-gonic/gin" + "github.com/lib/pq" +) + +const ( + OpsRetryModeClient = "client" + OpsRetryModeUpstream = "upstream" +) + +const ( + opsRetryStatusRunning = "running" + opsRetryStatusSucceeded = "succeeded" + opsRetryStatusFailed = "failed" +) + +const ( + opsRetryTimeout = 60 * time.Second + opsRetryCaptureBytesLimit = 64 * 1024 + opsRetryResponsePreviewMax = 8 * 1024 + opsRetryMinIntervalPerError = 10 * time.Second + opsRetryMaxAccountSwitches = 3 +) + +var opsRetryRequestHeaderAllowlist = map[string]bool{ + "anthropic-beta": true, + "anthropic-version": true, +} + +type opsRetryRequestType string + +const ( + opsRetryTypeMessages opsRetryRequestType = "messages" + opsRetryTypeOpenAI opsRetryRequestType = "openai_responses" + opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta" +) + +type limitedResponseWriter struct { + header http.Header + wroteHeader bool + + limit int + totalWritten int64 + buf bytes.Buffer +} + +func newLimitedResponseWriter(limit int) *limitedResponseWriter { + if limit <= 0 { + limit = 1 + } + return &limitedResponseWriter{ + header: make(http.Header), + limit: limit, + } +} + +func (w *limitedResponseWriter) Header() http.Header { + return w.header +} + +func (w *limitedResponseWriter) WriteHeader(statusCode int) { + if 
w.wroteHeader { + return + } + w.wroteHeader = true +} + +func (w *limitedResponseWriter) Write(p []byte) (int, error) { + if !w.wroteHeader { + w.WriteHeader(http.StatusOK) + } + w.totalWritten += int64(len(p)) + + if w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(p) > remaining { + _, _ = w.buf.Write(p[:remaining]) + } else { + _, _ = w.buf.Write(p) + } + } + + // Pretend we wrote everything to avoid upstream/client code treating it as an error. + return len(p), nil +} + +func (w *limitedResponseWriter) Flush() {} + +func (w *limitedResponseWriter) bodyBytes() []byte { + return w.buf.Bytes() +} + +func (w *limitedResponseWriter) truncated() bool { + return w.totalWritten > int64(w.limit) +} + +func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + + mode = strings.ToLower(strings.TrimSpace(mode)) + switch mode { + case OpsRetryModeClient, OpsRetryModeUpstream: + default: + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") + } + + latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) + } + if latest != nil { + if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + + lastAttemptAt := latest.CreatedAt + if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() { + lastAttemptAt = *latest.FinishedAt + } else if latest.StartedAt != nil && !latest.StartedAt.IsZero() { + lastAttemptAt = *latest.StartedAt + } + + if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError { + return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again") + } + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + startedAt := time.Now() + attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{ + RequestedByUserID: requestedByUserID, + SourceErrorID: errorID, + Mode: mode, + PinnedAccountID: pinned, + Status: opsRetryStatusRunning, + StartedAt: startedAt, + }) + if err != nil { + var pqErr *pq.Error + if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err) + } + + result := &OpsRetryResult{ + AttemptID: attemptID, 
+ Mode: mode, + Status: opsRetryStatusFailed, + PinnedAccountID: pinned, + HTTPStatusCode: 0, + UpstreamRequestID: "", + ResponsePreview: "", + ResponseTruncated: false, + ErrorMessage: "", + StartedAt: startedAt, + } + + execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) + defer cancel() + + execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + + finishedAt := time.Now() + result.FinishedAt = finishedAt + result.DurationMs = finishedAt.Sub(startedAt).Milliseconds() + + if execRes != nil { + result.Status = execRes.status + result.UsedAccountID = execRes.usedAccountID + result.HTTPStatusCode = execRes.httpStatusCode + result.UpstreamRequestID = execRes.upstreamRequestID + result.ResponsePreview = execRes.responsePreview + result.ResponseTruncated = execRes.responseTruncated + result.ErrorMessage = execRes.errorMessage + } + + updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second) + defer updateCancel() + + var updateErrMsg *string + if strings.TrimSpace(result.ErrorMessage) != "" { + msg := result.ErrorMessage + updateErrMsg = &msg + } + var resultRequestID *string + if strings.TrimSpace(result.UpstreamRequestID) != "" { + v := result.UpstreamRequestID + resultRequestID = &v + } + + finalStatus := result.Status + if strings.TrimSpace(finalStatus) == "" { + finalStatus = opsRetryStatusFailed + } + + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, + }); err != nil { + // Best-effort: retry itself already executed; do not fail the API response. + log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } + + return result, nil +} + +type opsRetryExecution struct { + status string + + usedAccountID *int64 + httpStatusCode int + upstreamRequestID string + + responsePreview string + responseTruncated bool + + errorMessage string +} + +func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution { + if errorLog == nil { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "missing error log", + } + } + + reqType := detectOpsRetryType(errorLog.RequestPath) + bodyBytes := []byte(errorLog.RequestBody) + + switch reqType { + case opsRetryTypeMessages: + bodyBytes = FilterThinkingBlocksForRetry(bodyBytes) + case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B: + // No-op + } + + switch strings.ToLower(strings.TrimSpace(mode)) { + case OpsRetryModeUpstream: + if pinnedAccountID == nil || *pinnedAccountID <= 0 { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "pinned_account_id required for upstream retry", + } + } + return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID) + case OpsRetryModeClient: + return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes) + default: + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "invalid retry mode", + } + } +} + +func detectOpsRetryType(path string) opsRetryRequestType { + p := strings.ToLower(strings.TrimSpace(path)) + switch { + case strings.Contains(p, "/responses"): + return opsRetryTypeOpenAI + case strings.Contains(p, "/v1beta/"): + return opsRetryTypeGeminiV1B + default: + return opsRetryTypeMessages + } +} + +func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, 
pinnedAccountID int64) *opsRetryExecution { + if s.accountRepo == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"} + } + + account, err := s.accountRepo.GetByID(ctx, pinnedAccountID) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)} + } + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"} + } + if !account.IsSchedulable() { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"} + } + if errorLog.GroupID != nil && *errorLog.GroupID > 0 { + if !containsInt64(account.GroupIDs, *errorLog.GroupID) { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"} + } + } + + var release func() + if s.concurrencyService != nil { + acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)} + } + if acq == nil || !acq.Acquired { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"} + } + release = acq.ReleaseFunc + } + if release != nil { + defer release() + } + + usedID := account.ID + exec := s.executeWithAccount(ctx, reqType, errorLog, body, account) + exec.usedAccountID = &usedID + if exec.status == "" { + exec.status = opsRetryStatusFailed + } + return exec +} + +func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution { + groupID := errorLog.GroupID + if groupID == nil || *groupID <= 0 { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"} + } + + model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body) + if parsedErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()} + } + _ = stream + + excluded := make(map[int64]struct{}) + switches := 0 + + for { + if switches >= opsRetryMaxAccountSwitches { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"} + } + + selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded) + if selErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()} + } + if selection == nil || selection.Account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"} + } + + account := selection.Account + if !selection.Acquired || selection.ReleaseFunc == nil { + excluded[account.ID] = struct{}{} + switches++ + continue + } + + exec := func() *opsRetryExecution { + defer selection.ReleaseFunc() + return s.executeWithAccount(ctx, reqType, errorLog, body, account) + }() + + if exec != nil { + if exec.status == opsRetryStatusSucceeded { + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + // If the gateway services ask for failover, try another account. 
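// Illustrative note (the message text below is hypothetical): isFailoverError is a substring
// check, so an error such as
//
//	upstream error: status 529 from upstream, failover to next account
//
// sends the loop back to selectAccountForRetry with this account excluded, while any other
// failure is returned immediately; the failover path gives up after
// opsRetryMaxAccountSwitches (3) switches.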
+ if s.isFailoverError(exec.errorMessage) { + excluded[account.ID] = struct{}{} + switches++ + continue + } + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + + excluded[account.ID] = struct{}{} + switches++ + } +} + +func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) { + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return nil, fmt.Errorf("openai gateway service not available") + } + return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + case opsRetryTypeGeminiV1B, opsRetryTypeMessages: + if s.gatewayService == nil { + return nil, fmt.Errorf("gateway service not available") + } + return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + default: + return nil, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) { + switch reqType { + case opsRetryTypeMessages: + parsed, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr) + } + return parsed.Model, parsed.Stream, nil + case opsRetryTypeOpenAI: + var v struct { + Model string `json:"model"` + Stream bool `json:"stream"` + } + if err := json.Unmarshal(body, &v); err != nil { + return "", false, fmt.Errorf("failed to parse openai request body: %w", err) + } + return strings.TrimSpace(v.Model), v.Stream, nil + case opsRetryTypeGeminiV1B: + if strings.TrimSpace(errorLog.Model) == "" { + return "", false, fmt.Errorf("missing model for gemini v1beta retry") + } + return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil + default: + return "", false, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution { + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"} + } + + c, w := newOpsRetryContext(ctx, errorLog) + + var err error + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"} + } + _, err = s.openAIGatewayService.Forward(ctx, c, account, body) + case opsRetryTypeGeminiV1B: + if s.geminiCompatService == nil || s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"} + } + modelName := strings.TrimSpace(errorLog.Model) + action := "generateContent" + if errorLog.Stream { + action = "streamGenerateContent" + } + if account.Platform == PlatformAntigravity { + _, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body) + } else { + _, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body) + } + case opsRetryTypeMessages: + switch account.Platform { + case PlatformAntigravity: + if s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"} + } + _, err = 
s.antigravityGatewayService.Forward(ctx, c, account, body) + case PlatformGemini: + if s.geminiCompatService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"} + } + _, err = s.geminiCompatService.Forward(ctx, c, account, body) + default: + if s.gatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"} + } + parsedReq, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"} + } + _, err = s.gatewayService.Forward(ctx, c, account, parsedReq) + } + default: + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"} + } + + statusCode := http.StatusOK + if c != nil && c.Writer != nil { + statusCode = c.Writer.Status() + } + + upstreamReqID := extractUpstreamRequestID(c) + preview, truncated := extractResponsePreview(w) + + exec := &opsRetryExecution{ + status: opsRetryStatusFailed, + httpStatusCode: statusCode, + upstreamRequestID: upstreamReqID, + responsePreview: preview, + responseTruncated: truncated, + errorMessage: "", + } + + if err == nil && statusCode < 400 { + exec.status = opsRetryStatusSucceeded + return exec + } + + if err != nil { + exec.errorMessage = err.Error() + } else { + exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode) + } + + return exec +} + +func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) { + w := newLimitedResponseWriter(opsRetryCaptureBytesLimit) + c, _ := gin.CreateTestContext(w) + + path := "/" + if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" { + path = errorLog.RequestPath + } + + req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil)) + req.Header.Set("content-type", "application/json") + if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" { + req.Header.Set("user-agent", errorLog.UserAgent) + } + // Restore a minimal, whitelisted subset of request headers to improve retry fidelity + // (e.g. anthropic-beta / anthropic-version). Never replay auth credentials. 
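// A sketch of the intended filtering, assuming stored headers such as
//
//	{"Authorization": "Bearer sk-...", "X-Api-Key": "...", "anthropic-version": "2023-06-01"}
//
// only "anthropic-version" (and "anthropic-beta") survive the allowlist check below; credentials
// are dropped here and are expected to be attached by the gateway service for the selected account.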
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" { + var stored map[string]string + if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil { + for k, v := range stored { + key := strings.TrimSpace(k) + if key == "" { + continue + } + if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] { + continue + } + val := strings.TrimSpace(v) + if val == "" { + continue + } + req.Header.Set(key, val) + } + } + } + + c.Request = req + return c, w +} + +func extractUpstreamRequestID(c *gin.Context) string { + if c == nil || c.Writer == nil { + return "" + } + h := c.Writer.Header() + if h == nil { + return "" + } + for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} { + if v := strings.TrimSpace(h.Get(key)); v != "" { + return v + } + } + return "" +} + +func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) { + if w == nil { + return "", false + } + b := bytes.TrimSpace(w.bodyBytes()) + if len(b) == 0 { + return "", w.truncated() + } + if len(b) > opsRetryResponsePreviewMax { + return string(b[:opsRetryResponsePreviewMax]), true + } + return string(b), w.truncated() +} + +func containsInt64(items []int64, needle int64) bool { + for _, v := range items { + if v == needle { + return true + } + } + return false +} + +func (s *OpsService) isFailoverError(message string) bool { + msg := strings.ToLower(strings.TrimSpace(message)) + if msg == "" { + return false + } + return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover") +} diff --git a/backend/internal/service/ops_scheduled_report_service.go b/backend/internal/service/ops_scheduled_report_service.go new file mode 100644 index 00000000..28902cbc --- /dev/null +++ b/backend/internal/service/ops_scheduled_report_service.go @@ -0,0 +1,705 @@ +package service + +import ( + "context" + "fmt" + "log" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsScheduledReportJobName = "ops_scheduled_reports" + + opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader" + opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute + + opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:" + + opsScheduledReportTickInterval = 1 * time.Minute +) + +var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsScheduledReportReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +type OpsScheduledReportService struct { + opsService *OpsService + userService *UserService + emailService *EmailService + redisClient *redis.Client + cfg *config.Config + + instanceID string + loc *time.Location + + distributedLockOn bool + warnNoRedisOnce sync.Once + + startOnce sync.Once + stopOnce sync.Once + stopCtx context.Context + stop context.CancelFunc + wg sync.WaitGroup +} + +func NewOpsScheduledReportService( + opsService *OpsService, + userService *UserService, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsScheduledReportService { + lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple + + loc := time.Local + if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil { + loc = 
parsed + } + } + return &OpsScheduledReportService{ + opsService: opsService, + userService: userService, + emailService: emailService, + redisClient: redisClient, + cfg: cfg, + + instanceID: uuid.NewString(), + loc: loc, + distributedLockOn: lockOn, + warnNoRedisOnce: sync.Once{}, + startOnce: sync.Once{}, + stopOnce: sync.Once{}, + stopCtx: nil, + stop: nil, + wg: sync.WaitGroup{}, + } +} + +func (s *OpsScheduledReportService) Start() { + s.StartWithContext(context.Background()) +} + +func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) { + if s == nil { + return + } + if ctx == nil { + ctx = context.Background() + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.opsService == nil || s.emailService == nil { + return + } + + s.startOnce.Do(func() { + s.stopCtx, s.stop = context.WithCancel(ctx) + s.wg.Add(1) + go s.run() + }) +} + +func (s *OpsScheduledReportService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stop != nil { + s.stop() + } + }) + s.wg.Wait() +} + +func (s *OpsScheduledReportService) run() { + defer s.wg.Done() + + ticker := time.NewTicker(opsScheduledReportTickInterval) + defer ticker.Stop() + + s.runOnce() + for { + select { + case <-ticker.C: + s.runOnce() + case <-s.stopCtx.Done(): + return + } + } +} + +func (s *OpsScheduledReportService) runOnce() { + if s == nil || s.opsService == nil || s.emailService == nil { + return + } + + startedAt := time.Now().UTC() + runAt := startedAt + + ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second) + defer cancel() + + // Respect ops monitoring enabled switch. + if !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + now := time.Now() + if s.loc != nil { + now = now.In(s.loc) + } + + reports := s.listScheduledReports(ctx, now) + if len(reports) == 0 { + return + } + + for _, report := range reports { + if report == nil || !report.Enabled { + continue + } + if report.NextRunAt.After(now) { + continue + } + + if err := s.runReport(ctx, report, now); err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + return + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +type opsScheduledReport struct { + Name string + ReportType string + Schedule string + Enabled bool + + TimeRange time.Duration + + Recipients []string + + ErrorDigestMinCount int + AccountHealthErrorRateThreshold float64 + + LastRunAt *time.Time + NextRunAt time.Time +} + +func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport { + if s == nil || s.opsService == nil { + return nil + } + if ctx == nil { + ctx = context.Background() + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil { + return nil + } + if !emailCfg.Report.Enabled { + return nil + } + + recipients := normalizeEmails(emailCfg.Report.Recipients) + + type reportDef struct { + enabled bool + name string + kind string + timeRange time.Duration + schedule string + } + + defs := []reportDef{ + {enabled: emailCfg.Report.DailySummaryEnabled, name: "日报", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule}, + {enabled: emailCfg.Report.WeeklySummaryEnabled, name: "周报", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule}, + {enabled: emailCfg.Report.ErrorDigestEnabled, name: "错误摘要", kind: 
"error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule}, + {enabled: emailCfg.Report.AccountHealthEnabled, name: "账号健康", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule}, + } + + out := make([]*opsScheduledReport, 0, len(defs)) + for _, d := range defs { + if !d.enabled { + continue + } + spec := strings.TrimSpace(d.schedule) + if spec == "" { + continue + } + sched, err := opsScheduledReportCronParser.Parse(spec) + if err != nil { + log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err) + continue + } + + lastRun := s.getLastRunAt(ctx, d.kind) + base := lastRun + if base.IsZero() { + // Allow a schedule matching the current minute to trigger right after startup. + base = now.Add(-1 * time.Minute) + } + next := sched.Next(base) + if next.IsZero() { + continue + } + + var lastRunPtr *time.Time + if !lastRun.IsZero() { + lastCopy := lastRun + lastRunPtr = &lastCopy + } + + out = append(out, &opsScheduledReport{ + Name: d.name, + ReportType: d.kind, + Schedule: spec, + Enabled: true, + + TimeRange: d.timeRange, + + Recipients: recipients, + + ErrorDigestMinCount: emailCfg.Report.ErrorDigestMinCount, + AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold, + + LastRunAt: lastRunPtr, + NextRunAt: next, + }) + } + + return out +} + +func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error { + if s == nil || s.opsService == nil || s.emailService == nil || report == nil { + return nil + } + if ctx == nil { + ctx = context.Background() + } + + // Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute. + s.setLastRunAt(ctx, report.ReportType, now) + + content, err := s.generateReportHTML(ctx, report, now) + if err != nil { + return err + } + if strings.TrimSpace(content) == "" { + // Skip sending when the report decides not to emit content (e.g., digest below min count). + return nil + } + + recipients := report.Recipients + if len(recipients) == 0 && s.userService != nil { + admin, err := s.userService.GetFirstAdmin(ctx) + if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" { + recipients = []string{strings.TrimSpace(admin.Email)} + } + } + if len(recipients) == 0 { + return nil + } + + subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name)) + + for _, to := range recipients { + addr := strings.TrimSpace(to) + if addr == "" { + continue + } + if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil { + // Ignore per-recipient failures; continue best-effort. + continue + } + } + return nil +} + +func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) { + if s == nil || s.opsService == nil || report == nil { + return "", fmt.Errorf("service not initialized") + } + if report.TimeRange <= 0 { + return "", fmt.Errorf("invalid time range") + } + + end := now.UTC() + start := end.Add(-report.TimeRange) + + switch strings.TrimSpace(report.ReportType) { + case "daily_summary", "weekly_summary": + overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: "", + GroupID: nil, + QueryMode: OpsQueryModeAuto, + }) + if err != nil { + // If pre-aggregation isn't ready but the report is requested, fall back to raw. 
+ if strings.TrimSpace(report.ReportType) == "daily_summary" || strings.TrimSpace(report.ReportType) == "weekly_summary" { + overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, + EndTime: end, + Platform: "", + GroupID: nil, + QueryMode: OpsQueryModeRaw, + }) + } + if err != nil { + return "", err + } + } + return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil + case "error_digest": + // Lightweight digest: list recent errors (status>=400) and breakdown by type. + startTime := start + endTime := end + filter := &OpsErrorLogFilter{ + StartTime: &startTime, + EndTime: &endTime, + Page: 1, + PageSize: 100, + } + out, err := s.opsService.GetErrorLogs(ctx, filter) + if err != nil { + return "", err + } + if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount { + return "", nil + } + return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil + case "account_health": + // Best-effort: use account availability (not error rate yet). + avail, err := s.opsService.GetAccountAvailability(ctx, "", nil) + if err != nil { + return "", err + } + _ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report + return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil + default: + return "", fmt.Errorf("unknown report type: %s", report.ReportType) + } +} + +func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string { + if overview == nil { + return fmt.Sprintf("

<h2>%s</h2><p>No data.</p>
", htmlEscape(title)) + } + + latP50 := "-" + latP99 := "-" + if overview.Duration.P50 != nil { + latP50 = fmt.Sprintf("%dms", *overview.Duration.P50) + } + if overview.Duration.P99 != nil { + latP99 = fmt.Sprintf("%dms", *overview.Duration.P99) + } + + ttftP50 := "-" + ttftP99 := "-" + if overview.TTFT.P50 != nil { + ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50) + } + if overview.TTFT.P99 != nil { + ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99) + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<ul>
+  <li>Total Requests: %d</li>
+  <li>Success: %d</li>
+  <li>Errors (SLA): %d</li>
+  <li>Business Limited: %d</li>
+  <li>SLA: %.2f%%</li>
+  <li>Error Rate: %.2f%%</li>
+  <li>Upstream Error Rate (excl 429/529): %.2f%%</li>
+  <li>Upstream Errors: excl429/529=%d, 429=%d, 529=%d</li>
+  <li>Latency: p50=%s, p99=%s</li>
+  <li>TTFT: p50=%s, p99=%s</li>
+  <li>Tokens: %d</li>
+  <li>QPS: current=%.1f, peak=%.1f, avg=%.1f</li>
+  <li>TPS: current=%.1f, peak=%.1f, avg=%.1f</li>
+</ul>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + overview.RequestCountTotal, + overview.SuccessCount, + overview.ErrorCountSLA, + overview.BusinessLimitedCount, + overview.SLA*100, + overview.ErrorRate*100, + overview.UpstreamErrorRate*100, + overview.UpstreamErrorCountExcl429529, + overview.Upstream429Count, + overview.Upstream529Count, + htmlEscape(latP50), + htmlEscape(latP99), + htmlEscape(ttftP50), + htmlEscape(ttftP99), + overview.TokenConsumed, + overview.QPS.Current, + overview.QPS.Peak, + overview.QPS.Avg, + overview.TPS.Current, + overview.TPS.Peak, + overview.TPS.Avg, + ) +} + +func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string { + total := 0 + recent := []*OpsErrorLog{} + if list != nil { + total = list.Total + recent = list.Errors + } + if len(recent) > 10 { + recent = recent[:10] + } + + rows := "" + for _, item := range recent { + if item == nil { + continue + } + rows += fmt.Sprintf( + "%s%s%d%s", + htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)), + htmlEscape(item.Platform), + item.StatusCode, + htmlEscape(truncateString(item.Message, 180)), + ) + } + if rows == "" { + rows = "No recent errors." + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<p>Total Errors: %d</p>
+<h3>Recent</h3>
+<table>
+  <tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr>
+  %s
+</table>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + total, + rows, + ) +} + +func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string { + total := 0 + available := 0 + rateLimited := 0 + hasError := 0 + + if avail != nil && avail.Accounts != nil { + for _, a := range avail.Accounts { + if a == nil { + continue + } + total++ + if a.IsAvailable { + available++ + } + if a.IsRateLimited { + rateLimited++ + } + if a.HasError { + hasError++ + } + } + } + + return fmt.Sprintf(` +

+<h2>%s</h2>
+<p>Period: %s ~ %s (UTC)</p>
+<ul>
+  <li>Total Accounts: %d</li>
+  <li>Available: %d</li>
+  <li>Rate Limited: %d</li>
+  <li>Error: %d</li>
+</ul>
+<p>Note: This report currently reflects account availability status only.</p>
+`, + htmlEscape(strings.TrimSpace(title)), + htmlEscape(start.UTC().Format(time.RFC3339)), + htmlEscape(end.UTC().Format(time.RFC3339)), + total, + available, + rateLimited, + hasError, + ) +} + +func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil || !s.distributedLockOn { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock") + }) + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + key := opsScheduledReportLeaderLockKeyDefault + ttl := opsScheduledReportLeaderLockTTLDefault + if strings.TrimSpace(key) == "" { + key = "ops:scheduled_reports:leader" + } + if ttl <= 0 { + ttl = 5 * time.Minute + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Prefer fail-closed to avoid duplicate report sends when Redis is flaky. + log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err) + return nil, false + } + if !ok { + return nil, false + } + return func() { + _, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time { + if s == nil || s.redisClient == nil { + return time.Time{} + } + kind := strings.TrimSpace(reportType) + if kind == "" { + return time.Time{} + } + key := opsScheduledReportLastRunKeyPrefix + kind + + raw, err := s.redisClient.Get(ctx, key).Result() + if err != nil || strings.TrimSpace(raw) == "" { + return time.Time{} + } + sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64) + if err != nil || sec <= 0 { + return time.Time{} + } + last := time.Unix(sec, 0) + // Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time + // passed into cron.Next() uses the same location; otherwise the job will drift by timezone + // offset (e.g. Asia/Shanghai default would run 8h later after the first execution). 
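// Concrete illustration of that drift (dates hypothetical), assuming spec "0 9 * * *" and
// cfg.Timezone = "Asia/Shanghai" (UTC+8):
//
//	sched, _ := opsScheduledReportCronParser.Parse("0 9 * * *")
//	base := time.Date(2024, 1, 1, 1, 0, 0, 0, time.UTC) // 09:00 local, persisted as UTC seconds
//	sched.Next(base)           // 09:00 UTC the same day -> 17:00 local (8h late)
//	sched.Next(base.In(s.loc)) // 09:00 local the next day (intended)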
+ if s.loc != nil { + return last.In(s.loc) + } + return last.UTC() +} + +func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) { + if s == nil || s.redisClient == nil { + return + } + kind := strings.TrimSpace(reportType) + if kind == "" { + return + } + if t.IsZero() { + t = time.Now().UTC() + } + key := opsScheduledReportLastRunKeyPrefix + kind + _ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err() +} + +func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsService == nil || s.opsService.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsScheduledReportJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsScheduledReportJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} + +func normalizeEmails(in []string) []string { + if len(in) == 0 { + return nil + } + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, raw := range in { + addr := strings.ToLower(strings.TrimSpace(raw)) + if addr == "" { + continue + } + if _, ok := seen[addr]; ok { + continue + } + seen[addr] = struct{}{} + out = append(out, addr) + } + return out +} diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go new file mode 100644 index 00000000..426d46f1 --- /dev/null +++ b/backend/internal/service/ops_service.go @@ -0,0 +1,537 @@ +package service + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "log" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled") + +const ( + opsMaxStoredRequestBodyBytes = 10 * 1024 + opsMaxStoredErrorBodyBytes = 20 * 1024 +) + +// OpsService provides ingestion and query APIs for the Ops monitoring module. +type OpsService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + accountRepo AccountRepository + + // getAccountAvailability is a unit-test hook for overriding account availability lookup. 
+ getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) + + concurrencyService *ConcurrencyService + gatewayService *GatewayService + openAIGatewayService *OpenAIGatewayService + geminiCompatService *GeminiMessagesCompatService + antigravityGatewayService *AntigravityGatewayService +} + +func NewOpsService( + opsRepo OpsRepository, + settingRepo SettingRepository, + cfg *config.Config, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + gatewayService *GatewayService, + openAIGatewayService *OpenAIGatewayService, + geminiCompatService *GeminiMessagesCompatService, + antigravityGatewayService *AntigravityGatewayService, +) *OpsService { + return &OpsService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + + accountRepo: accountRepo, + + concurrencyService: concurrencyService, + gatewayService: gatewayService, + openAIGatewayService: openAIGatewayService, + geminiCompatService: geminiCompatService, + antigravityGatewayService: antigravityGatewayService, + } +} + +func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error { + if s.IsMonitoringEnabled(ctx) { + return nil + } + return ErrOpsDisabled +} + +func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool { + // Hard switch: disable ops entirely. + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + // Default enabled when key is missing, and fail-open on transient errors + // (ops should never block gateway traffic). + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error { + if entry == nil { + return nil + } + if !s.IsMonitoringEnabled(ctx) { + return nil + } + if s.opsRepo == nil { + return nil + } + + // Ensure timestamps are always populated. + if entry.CreatedAt.IsZero() { + entry.CreatedAt = time.Now() + } + + // Ensure required fields exist (DB has NOT NULL constraints). + entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase) + entry.ErrorType = strings.TrimSpace(entry.ErrorType) + if entry.ErrorPhase == "" { + entry.ErrorPhase = "internal" + } + if entry.ErrorType == "" { + entry.ErrorType = "api_error" + } + + // Sanitize + trim request body (errors only). + if len(rawRequestBody) > 0 { + sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes) + if sanitized != "" { + entry.RequestBodyJSON = &sanitized + } + entry.RequestBodyTruncated = truncated + entry.RequestBodyBytes = &bytesLen + } + + // Sanitize + truncate error_body to avoid storing sensitive data. + if strings.TrimSpace(entry.ErrorBody) != "" { + sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes) + entry.ErrorBody = sanitized + } + + // Sanitize upstream error context if provided by gateway services. 
+ if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 { + entry.UpstreamStatusCode = nil + } + if entry.UpstreamErrorMessage != nil { + msg := strings.TrimSpace(*entry.UpstreamErrorMessage) + msg = sanitizeUpstreamErrorMessage(msg) + msg = truncateString(msg, 2048) + if strings.TrimSpace(msg) == "" { + entry.UpstreamErrorMessage = nil + } else { + entry.UpstreamErrorMessage = &msg + } + } + if entry.UpstreamErrorDetail != nil { + detail := strings.TrimSpace(*entry.UpstreamErrorDetail) + if detail == "" { + entry.UpstreamErrorDetail = nil + } else { + sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + if strings.TrimSpace(sanitized) == "" { + entry.UpstreamErrorDetail = nil + } else { + entry.UpstreamErrorDetail = &sanitized + } + } + } + + // Sanitize + serialize upstream error events list. + if len(entry.UpstreamErrors) > 0 { + const maxEvents = 32 + events := entry.UpstreamErrors + if len(events) > maxEvents { + events = events[len(events)-maxEvents:] + } + + sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events)) + for _, ev := range events { + if ev == nil { + continue + } + out := *ev + + out.Platform = strings.TrimSpace(out.Platform) + out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128) + out.Kind = truncateString(strings.TrimSpace(out.Kind), 64) + + if out.AccountID < 0 { + out.AccountID = 0 + } + if out.UpstreamStatusCode < 0 { + out.UpstreamStatusCode = 0 + } + if out.AtUnixMs < 0 { + out.AtUnixMs = 0 + } + + msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message)) + msg = truncateString(msg, 2048) + out.Message = msg + + detail := strings.TrimSpace(out.Detail) + if detail != "" { + // Keep upstream detail small; request bodies are not stored here, only upstream error payloads. + sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + out.Detail = sanitizedDetail + } else { + out.Detail = "" + } + + // Drop fully-empty events (can happen if only status code was known). + if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { + continue + } + + evCopy := out + sanitized = append(sanitized, &evCopy) + } + + entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized) + entry.UpstreamErrors = nil + } + + if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil { + // Never bubble up to gateway; best-effort logging. 
+ log.Printf("[Ops] RecordError failed: %v", err) + return err + } + return nil +} + +func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil + } + return s.opsRepo.ListErrorLogs(ctx, filter) +} + +func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + detail, err := s.opsRepo.GetErrorLogByID(ctx, id) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return detail, nil +} + +func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { + bytesLen = len(raw) + if len(raw) == 0 { + return "", false, 0 + } + + var decoded any + if err := json.Unmarshal(raw, &decoded); err != nil { + // If it's not valid JSON, don't store (retry would not be reliable anyway). + return "", false, bytesLen + } + + decoded = redactSensitiveJSON(decoded) + + encoded, err := json.Marshal(decoded) + if err != nil { + return "", false, bytesLen + } + if len(encoded) <= maxBytes { + return string(encoded), false, bytesLen + } + + // Trim conversation history to keep the most recent context. + if root, ok := decoded.(map[string]any); ok { + if trimmed, ok := trimConversationArrays(root, maxBytes); ok { + encoded2, err2 := json.Marshal(trimmed) + if err2 == nil && len(encoded2) <= maxBytes { + return string(encoded2), true, bytesLen + } + // Fallthrough: keep shrinking. + decoded = trimmed + } + + essential := shrinkToEssentials(root) + encoded3, err3 := json.Marshal(essential) + if err3 == nil && len(encoded3) <= maxBytes { + return string(encoded3), true, bytesLen + } + } + + // Last resort: store a minimal placeholder (still valid JSON). + placeholder := map[string]any{ + "request_body_truncated": true, + } + if model := extractString(decoded, "model"); model != "" { + placeholder["model"] = model + } + encoded4, err4 := json.Marshal(placeholder) + if err4 != nil { + return "", true, bytesLen + } + return string(encoded4), true, bytesLen +} + +func redactSensitiveJSON(v any) any { + switch t := v.(type) { + case map[string]any: + out := make(map[string]any, len(t)) + for k, vv := range t { + if isSensitiveKey(k) { + out[k] = "[REDACTED]" + continue + } + out[k] = redactSensitiveJSON(vv) + } + return out + case []any: + out := make([]any, 0, len(t)) + for _, vv := range t { + out = append(out, redactSensitiveJSON(vv)) + } + return out + default: + return v + } +} + +func isSensitiveKey(key string) bool { + k := strings.ToLower(strings.TrimSpace(key)) + if k == "" { + return false + } + + // Exact matches (common credential fields). 
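// Net effect on typical payload keys: "x-api-key", "client_secret" and "refresh_token" hit the
// exact-match list; "session-token" is caught by the substring rules further down; plain fields
// such as "model" or "messages" pass through untouched. The substring rules are deliberately
// broad: even "max_tokens" contains "token", so its value is redacted.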
+ switch k { + case "authorization", + "proxy-authorization", + "x-api-key", + "api_key", + "apikey", + "access_token", + "refresh_token", + "id_token", + "session_token", + "token", + "password", + "passwd", + "passphrase", + "secret", + "client_secret", + "private_key", + "jwt", + "signature", + "accesskeyid", + "secretaccesskey": + return true + } + + // Suffix matches. + for _, suffix := range []string{ + "_secret", + "_token", + "_id_token", + "_session_token", + "_password", + "_passwd", + "_passphrase", + "_key", + "secret_key", + "private_key", + } { + if strings.HasSuffix(k, suffix) { + return true + } + } + + // Substring matches (conservative, but errs on the side of privacy). + for _, sub := range []string{ + "secret", + "token", + "password", + "passwd", + "passphrase", + "privatekey", + "private_key", + "apikey", + "api_key", + "accesskeyid", + "secretaccesskey", + "bearer", + "cookie", + "credential", + "session", + "jwt", + "signature", + } { + if strings.Contains(k, sub) { + return true + } + } + + return false +} + +func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) { + // Supported: anthropic/openai: messages; gemini: contents. + if out, ok := trimArrayField(root, "messages", maxBytes); ok { + return out, true + } + if out, ok := trimArrayField(root, "contents", maxBytes); ok { + return out, true + } + return root, false +} + +func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) { + raw, ok := root[field] + if !ok { + return nil, false + } + arr, ok := raw.([]any) + if !ok || len(arr) == 0 { + return nil, false + } + + // Keep at least the last message/content. Use binary search so we don't marshal O(n) times. + // We are dropping from the *front* of the array (oldest context first). + lo := 0 + hi := len(arr) - 1 // inclusive; hi ensures at least one item remains + + var best map[string]any + found := false + + for lo <= hi { + mid := (lo + hi) / 2 + candidateArr := arr[mid:] + if len(candidateArr) == 0 { + lo = mid + 1 + continue + } + + next := shallowCopyMap(root) + next[field] = candidateArr + encoded, err := json.Marshal(next) + if err != nil { + // If marshal fails, try dropping more. + lo = mid + 1 + continue + } + + if len(encoded) <= maxBytes { + best = next + found = true + // Try to keep more context by dropping fewer items. + hi = mid - 1 + continue + } + + // Need to drop more. + lo = mid + 1 + } + + if found { + return best, true + } + + // Nothing fit (even with only one element); return the smallest slice and let the + // caller fall back to shrinkToEssentials(). + next := shallowCopyMap(root) + next[field] = arr[len(arr)-1:] + return next, true +} + +func shrinkToEssentials(root map[string]any) map[string]any { + out := make(map[string]any) + for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} { + if v, ok := root[key]; ok { + out[key] = v + } + } + + // Keep only the last element of the conversation array. 
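// Illustrative input/output (field values hypothetical):
//
//	in:  {"model": "claude-x", "temperature": 0.7, "metadata": {...}, "messages": [m1, m2, m3]}
//	out: {"model": "claude-x", "temperature": 0.7, "messages": [m3]}
//
// Only the scalar knobs listed above survive, plus the newest messages/contents element.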
+ if v, ok := root["messages"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["messages"] = []any{arr[len(arr)-1]} + } + } + if v, ok := root["contents"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["contents"] = []any{arr[len(arr)-1]} + } + } + return out +} + +func shallowCopyMap(m map[string]any) map[string]any { + out := make(map[string]any, len(m)) + for k, v := range m { + out[k] = v + } + return out +} + +func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", false + } + + // Prefer JSON-safe sanitization when possible. + if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" { + return out, trunc + } + + // Non-JSON: best-effort truncate. + if maxBytes > 0 && len(raw) > maxBytes { + return truncateString(raw, maxBytes), true + } + return raw, false +} + +func extractString(v any, key string) string { + root, ok := v.(map[string]any) + if !ok { + return "" + } + s, _ := root[key].(string) + return strings.TrimSpace(s) +} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go new file mode 100644 index 00000000..fbf8f069 --- /dev/null +++ b/backend/internal/service/ops_settings.go @@ -0,0 +1,465 @@ +package service + +import ( + "context" + "encoding/json" + "errors" + "strings" + "time" +) + +const ( + opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second +) + +// ========================= +// Email notification config +// ========================= + +func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) { + defaultCfg := defaultOpsEmailNotificationConfig() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + // Initialize defaults on first read (best-effort). + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsEmailNotificationConfig{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + // Corrupted JSON should not break ops UI; fall back to defaults. 
+ return defaultCfg, nil + } + normalizeOpsEmailNotificationConfig(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if req == nil { + return nil, errors.New("invalid request") + } + + cfg, err := s.GetEmailNotificationConfig(ctx) + if err != nil { + return nil, err + } + + if req.Alert != nil { + cfg.Alert.Enabled = req.Alert.Enabled + if req.Alert.Recipients != nil { + cfg.Alert.Recipients = req.Alert.Recipients + } + cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity) + cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour + cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds + cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts + } + + if req.Report != nil { + cfg.Report.Enabled = req.Report.Enabled + if req.Report.Recipients != nil { + cfg.Report.Recipients = req.Report.Recipients + } + cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled + cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule) + cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled + cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule) + cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount + cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled + cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule) + cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold + } + + if err := validateOpsEmailNotificationConfig(cfg); err != nil { + return nil, err + } + + normalizeOpsEmailNotificationConfig(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil { + return nil, err + } + return cfg, nil +} + +func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig { + return &OpsEmailNotificationConfig{ + Alert: OpsEmailAlertConfig{ + Enabled: true, + Recipients: []string{}, + MinSeverity: "", + RateLimitPerHour: 0, + BatchingWindowSeconds: 0, + IncludeResolvedAlerts: false, + }, + Report: OpsEmailReportConfig{ + Enabled: false, + Recipients: []string{}, + DailySummaryEnabled: false, + DailySummarySchedule: "0 9 * * *", + WeeklySummaryEnabled: false, + WeeklySummarySchedule: "0 9 * * 1", + ErrorDigestEnabled: false, + ErrorDigestSchedule: "0 9 * * *", + ErrorDigestMinCount: 10, + AccountHealthEnabled: false, + AccountHealthSchedule: "0 9 * * *", + AccountHealthErrorRateThreshold: 10.0, + }, + } +} + +func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) { + if cfg == nil { + return + } + if cfg.Alert.Recipients == nil { + cfg.Alert.Recipients = []string{} + } + if cfg.Report.Recipients == nil { + cfg.Report.Recipients = []string{} + } + + cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity) + cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule) + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestSchedule = 
strings.TrimSpace(cfg.Report.ErrorDigestSchedule) + cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule) + + // Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings. + if cfg.Report.DailySummarySchedule == "" { + cfg.Report.DailySummarySchedule = "0 9 * * *" + } + if cfg.Report.WeeklySummarySchedule == "" { + cfg.Report.WeeklySummarySchedule = "0 9 * * 1" + } + if cfg.Report.ErrorDigestSchedule == "" { + cfg.Report.ErrorDigestSchedule = "0 9 * * *" + } + if cfg.Report.AccountHealthSchedule == "" { + cfg.Report.AccountHealthSchedule = "0 9 * * *" + } +} + +func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error { + if cfg == nil { + return errors.New("invalid config") + } + + if cfg.Alert.RateLimitPerHour < 0 { + return errors.New("alert.rate_limit_per_hour must be >= 0") + } + if cfg.Alert.BatchingWindowSeconds < 0 { + return errors.New("alert.batching_window_seconds must be >= 0") + } + switch strings.TrimSpace(cfg.Alert.MinSeverity) { + case "", "critical", "warning", "info": + default: + return errors.New("alert.min_severity must be one of: critical, warning, info, or empty") + } + + if cfg.Report.ErrorDigestMinCount < 0 { + return errors.New("report.error_digest_min_count must be >= 0") + } + if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 { + return errors.New("report.account_health_error_rate_threshold must be between 0 and 100") + } + return nil +} + +// ========================= +// Alert runtime settings +// ========================= + +func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings { + return &OpsAlertRuntimeSettings{ + EvaluationIntervalSeconds: 60, + DistributedLock: OpsDistributedLockSettings{ + Enabled: true, + Key: opsAlertEvaluatorLeaderLockKeyDefault, + TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()), + }, + Silencing: OpsAlertSilencingSettings{ + Enabled: false, + GlobalUntilRFC3339: "", + GlobalReason: "", + Entries: []OpsAlertSilenceEntry{}, + }, + } +} + +func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) { + if s == nil { + return + } + s.Key = strings.TrimSpace(s.Key) + if s.Key == "" { + s.Key = defaultKey + } + if s.TTLSeconds <= 0 { + s.TTLSeconds = defaultTTLSeconds + } +} + +func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) { + if s == nil { + return + } + s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339) + s.GlobalReason = strings.TrimSpace(s.GlobalReason) + if s.Entries == nil { + s.Entries = []OpsAlertSilenceEntry{} + } + for i := range s.Entries { + s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339) + s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason) + } +} + +func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error { + if strings.TrimSpace(s.Key) == "" { + return errors.New("distributed_lock.key is required") + } + if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) { + return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400") + } + return nil +} + +func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error { + parse := func(raw string) error { + if strings.TrimSpace(raw) == "" { + return nil + } + if _, err := time.Parse(time.RFC3339, raw); err != nil { + return errors.New("silencing time must be RFC3339") + } + return nil + } + + if err := parse(s.GlobalUntilRFC3339); err != 
nil { + return err + } + for _, entry := range s.Entries { + if strings.TrimSpace(entry.UntilRFC3339) == "" { + return errors.New("silencing.entries.until_rfc3339 is required") + } + if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil { + return errors.New("silencing.entries.until_rfc3339 must be RFC3339") + } + } + return nil +} + +func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) { + defaultCfg := defaultOpsAlertRuntimeSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAlertRuntimeSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + if cfg.EvaluationIntervalSeconds <= 0 { + cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds + } + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + return cfg, nil +} + +func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) { + return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400") + } + if cfg.DistributedLock.Enabled { + if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil { + return nil, err + } + } + if cfg.Silencing.Enabled { + if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil { + return nil, err + } + } + + defaultCfg := defaultOpsAlertRuntimeSettings() + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil { + return nil, err + } + + // Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated). 
+ updated := &OpsAlertRuntimeSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} + +// ========================= +// Advanced settings +// ========================= + +func defaultOpsAdvancedSettings() *OpsAdvancedSettings { + return &OpsAdvancedSettings{ + DataRetention: OpsDataRetentionSettings{ + CleanupEnabled: false, + CleanupSchedule: "0 2 * * *", + ErrorLogRetentionDays: 30, + MinuteMetricsRetentionDays: 30, + HourlyMetricsRetentionDays: 30, + }, + Aggregation: OpsAggregationSettings{ + AggregationEnabled: false, + }, + } +} + +func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) { + if cfg == nil { + return + } + cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule) + if cfg.DataRetention.CleanupSchedule == "" { + cfg.DataRetention.CleanupSchedule = "0 2 * * *" + } + if cfg.DataRetention.ErrorLogRetentionDays <= 0 { + cfg.DataRetention.ErrorLogRetentionDays = 30 + } + if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 { + cfg.DataRetention.MinuteMetricsRetentionDays = 30 + } + if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 { + cfg.DataRetention.HourlyMetricsRetentionDays = 30 + } +} + +func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error { + if cfg == nil { + return errors.New("invalid config") + } + if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 { + return errors.New("error_log_retention_days must be between 1 and 365") + } + if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 { + return errors.New("minute_metrics_retention_days must be between 1 and 365") + } + if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 { + return errors.New("hourly_metrics_retention_days must be between 1 and 365") + } + return nil +} + +func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) { + defaultCfg := defaultOpsAdvancedSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAdvancedSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + normalizeOpsAdvancedSettings(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if err := validateOpsAdvancedSettings(cfg); err != nil { + return nil, err + } + + normalizeOpsAdvancedSettings(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil { + return nil, err + } + + updated := &OpsAdvancedSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go new file mode 100644 index 
00000000..7d9a823c --- /dev/null +++ b/backend/internal/service/ops_settings_models.go @@ -0,0 +1,87 @@ +package service + +// Ops settings models stored in DB `settings` table (JSON blobs). + +type OpsEmailNotificationConfig struct { + Alert OpsEmailAlertConfig `json:"alert"` + Report OpsEmailReportConfig `json:"report"` +} + +type OpsEmailAlertConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + MinSeverity string `json:"min_severity"` + RateLimitPerHour int `json:"rate_limit_per_hour"` + BatchingWindowSeconds int `json:"batching_window_seconds"` + IncludeResolvedAlerts bool `json:"include_resolved_alerts"` +} + +type OpsEmailReportConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + DailySummaryEnabled bool `json:"daily_summary_enabled"` + DailySummarySchedule string `json:"daily_summary_schedule"` + WeeklySummaryEnabled bool `json:"weekly_summary_enabled"` + WeeklySummarySchedule string `json:"weekly_summary_schedule"` + ErrorDigestEnabled bool `json:"error_digest_enabled"` + ErrorDigestSchedule string `json:"error_digest_schedule"` + ErrorDigestMinCount int `json:"error_digest_min_count"` + AccountHealthEnabled bool `json:"account_health_enabled"` + AccountHealthSchedule string `json:"account_health_schedule"` + AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"` +} + +// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the +// frontend can still send the full config shape. +type OpsEmailNotificationConfigUpdateRequest struct { + Alert *OpsEmailAlertConfig `json:"alert"` + Report *OpsEmailReportConfig `json:"report"` +} + +type OpsDistributedLockSettings struct { + Enabled bool `json:"enabled"` + Key string `json:"key"` + TTLSeconds int `json:"ttl_seconds"` +} + +type OpsAlertSilenceEntry struct { + RuleID *int64 `json:"rule_id,omitempty"` + Severities []string `json:"severities,omitempty"` + + UntilRFC3339 string `json:"until_rfc3339"` + Reason string `json:"reason"` +} + +type OpsAlertSilencingSettings struct { + Enabled bool `json:"enabled"` + + GlobalUntilRFC3339 string `json:"global_until_rfc3339"` + GlobalReason string `json:"global_reason"` + + Entries []OpsAlertSilenceEntry `json:"entries,omitempty"` +} + +type OpsAlertRuntimeSettings struct { + EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"` + + DistributedLock OpsDistributedLockSettings `json:"distributed_lock"` + Silencing OpsAlertSilencingSettings `json:"silencing"` +} + +// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation). 
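For orientation, the OpsAdvancedSettings struct defined just below is persisted as a single JSON blob in the settings table; here is a minimal sketch of the default payload (values taken from defaultOpsAdvancedSettings above, keys from the struct's json tags — illustrative only, not the canonical serializer output):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative only: the JSON shape stored for the ops advanced settings,
// mirroring defaultOpsAdvancedSettings (cleanup disabled, 02:00 daily schedule,
// 30-day retention everywhere, aggregation disabled).
func main() {
	def := map[string]any{
		"data_retention": map[string]any{
			"cleanup_enabled":               false,
			"cleanup_schedule":              "0 2 * * *",
			"error_log_retention_days":      30,
			"minute_metrics_retention_days": 30,
			"hourly_metrics_retention_days": 30,
		},
		"aggregation": map[string]any{
			"aggregation_enabled": false,
		},
	}
	b, _ := json.MarshalIndent(def, "", "  ")
	fmt.Println(string(b))
}
```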
+type OpsAdvancedSettings struct { + DataRetention OpsDataRetentionSettings `json:"data_retention"` + Aggregation OpsAggregationSettings `json:"aggregation"` +} + +type OpsDataRetentionSettings struct { + CleanupEnabled bool `json:"cleanup_enabled"` + CleanupSchedule string `json:"cleanup_schedule"` + ErrorLogRetentionDays int `json:"error_log_retention_days"` + MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"` +} + +type OpsAggregationSettings struct { + AggregationEnabled bool `json:"aggregation_enabled"` +} diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go new file mode 100644 index 00000000..f6d07c14 --- /dev/null +++ b/backend/internal/service/ops_trend_models.go @@ -0,0 +1,65 @@ +package service + +import "time" + +type OpsThroughputTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` + QPS float64 `json:"qps"` + TPS float64 `json:"tps"` +} + +type OpsThroughputPlatformBreakdownItem struct { + Platform string `json:"platform"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputGroupBreakdownItem struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputTrendResponse struct { + Bucket string `json:"bucket"` + + Points []*OpsThroughputTrendPoint `json:"points"` + + // Optional drilldown helpers: + // - When no platform/group is selected: returns totals by platform. + // - When platform is selected but group is not: returns top groups in that platform. 
+ ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"` + TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"` +} + +type OpsErrorTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + ErrorCountSLA int64 `json:"error_count_sla"` + + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` +} + +type OpsErrorTrendResponse struct { + Bucket string `json:"bucket"` + Points []*OpsErrorTrendPoint `json:"points"` +} + +type OpsErrorDistributionItem struct { + StatusCode int `json:"status_code"` + Total int64 `json:"total"` + SLA int64 `json:"sla"` + BusinessLimited int64 `json:"business_limited"` +} + +type OpsErrorDistributionResponse struct { + Total int64 `json:"total"` + Items []*OpsErrorDistributionItem `json:"items"` +} diff --git a/backend/internal/service/ops_trends.go b/backend/internal/service/ops_trends.go new file mode 100644 index 00000000..ec55c6ce --- /dev/null +++ b/backend/internal/service/ops_trends.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds) +} diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go new file mode 100644 index 00000000..615ae6a1 --- /dev/null +++ b/backend/internal/service/ops_upstream_context.go @@ -0,0 +1,94 @@ +package service + +import ( + "encoding/json" + "strings" + "time" + + "github.com/gin-gonic/gin" +) + +// Gin context keys used by Ops error logger for capturing upstream error details. +// These keys are set by gateway services and consumed by handler/ops_error_logger.go. +const ( + OpsUpstreamStatusCodeKey = "ops_upstream_status_code" + OpsUpstreamErrorMessageKey = "ops_upstream_error_message" + OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" + OpsUpstreamErrorsKey = "ops_upstream_errors" +) + +func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { + if c == nil { + return + } + if upstreamStatusCode > 0 { + c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode) + } + if msg := strings.TrimSpace(upstreamMessage); msg != "" { + c.Set(OpsUpstreamErrorMessageKey, msg) + } + if detail := strings.TrimSpace(upstreamDetail); detail != "" { + c.Set(OpsUpstreamErrorDetailKey, detail) + } +} + +// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request. 
+// It is stored in ops_error_logs.upstream_errors as a JSON array. +type OpsUpstreamErrorEvent struct { + AtUnixMs int64 `json:"at_unix_ms,omitempty"` + + // Context + Platform string `json:"platform,omitempty"` + AccountID int64 `json:"account_id,omitempty"` + + // Outcome + UpstreamStatusCode int `json:"upstream_status_code,omitempty"` + UpstreamRequestID string `json:"upstream_request_id,omitempty"` + + // Kind: http_error | request_error | retry_exhausted | failover + Kind string `json:"kind,omitempty"` + + Message string `json:"message,omitempty"` + Detail string `json:"detail,omitempty"` +} + +func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { + if c == nil { + return + } + if ev.AtUnixMs <= 0 { + ev.AtUnixMs = time.Now().UnixMilli() + } + ev.Platform = strings.TrimSpace(ev.Platform) + ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.Kind = strings.TrimSpace(ev.Kind) + ev.Message = strings.TrimSpace(ev.Message) + ev.Detail = strings.TrimSpace(ev.Detail) + if ev.Message != "" { + ev.Message = sanitizeUpstreamErrorMessage(ev.Message) + } + + var existing []*OpsUpstreamErrorEvent + if v, ok := c.Get(OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { + existing = arr + } + } + + evCopy := ev + existing = append(existing, &evCopy) + c.Set(OpsUpstreamErrorsKey, existing) +} + +func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { + if len(events) == 0 { + return nil + } + // Ensure we always store a valid JSON value. + raw, err := json.Marshal(events) + if err != nil || len(raw) == 0 { + return nil + } + s := string(raw) + return &s +} diff --git a/backend/internal/service/ops_window_stats.go b/backend/internal/service/ops_window_stats.go new file mode 100644 index 00000000..71021d15 --- /dev/null +++ b/backend/internal/service/ops_window_stats.go @@ -0,0 +1,24 @@ +package service + +import ( + "context" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +// GetWindowStats returns lightweight request/token counts for the provided window. +// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks. 
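As a usage illustration of the realtime-sampling intent described in the comment above, here is a hedged sketch of a poller that calls GetWindowStats on a short interval and hands the result to a push callback (the interval, window size, and pushFn are assumptions for the example, not the project's actual WebSocket wiring; it is assumed to live in the service package next to OpsService and OpsWindowStats):

```go
// Sketch only: sample a short trailing window periodically and forward it.
func runWindowStatsSampler(ctx context.Context, ops *OpsService, pushFn func(*OpsWindowStats)) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			end := time.Now()
			start := end.Add(-time.Minute) // lightweight 1-minute window, no percentiles/peaks
			stats, err := ops.GetWindowStats(ctx, start, end)
			if err != nil {
				continue // e.g. monitoring disabled or ops repo unavailable
			}
			pushFn(stats)
		}
	}
}
```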
+func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + filter := &OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + } + return s.opsRepo.GetWindowStats(ctx, filter) +} diff --git a/backend/internal/service/ratelimit_service.go b/backend/internal/service/ratelimit_service.go index f1362646..d570b92e 100644 --- a/backend/internal/service/ratelimit_service.go +++ b/backend/internal/service/ratelimit_service.go @@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc } tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + if upstreamMsg != "" { + upstreamMsg = truncateForLog([]byte(upstreamMsg), 512) + } switch statusCode { case 401: // 认证失败:停止调度,记录错误 - s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") + msg := "Authentication failed (401): invalid or expired credentials" + if upstreamMsg != "" { + msg = "Authentication failed (401): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 402: // 支付要求:余额不足或计费问题,停止调度 - s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") + msg := "Payment required (402): insufficient balance or billing issue" + if upstreamMsg != "" { + msg = "Payment required (402): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 403: // 禁止访问:停止调度,记录错误 - s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") + msg := "Access forbidden (403): account may be suspended or lack permissions" + if upstreamMsg != "" { + msg = "Access forbidden (403): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 429: s.handle429(ctx, account, headers) diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 3e47d9d4..863d8a57 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey } - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled) updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL @@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt + // Ops monitoring (vNext) + updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) + updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) + updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) + if 
settings.OpsMetricsIntervalSeconds > 0 { + updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds) + } + err := s.settingRepo.SetMultiple(ctx, updates) if err == nil && s.onUpdate != nil { s.onUpdate() // Invalidate cache after settings update @@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { // Identity patch defaults SettingKeyEnableIdentityPatch: "true", SettingKeyIdentityPatchPrompt: "", + + // Ops monitoring defaults (vNext) + SettingKeyOpsMonitoringEnabled: "true", + SettingKeyOpsRealtimeMonitoringEnabled: "true", + SettingKeyOpsQueryModeDefault: "auto", + SettingKeyOpsMetricsIntervalSeconds: "60", } return s.settingRepo.SetMultiple(ctx, defaults) @@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin } result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] + // Ops monitoring settings (default: enabled, fail-open) + result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) + result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) + result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) + result.OpsMetricsIntervalSeconds = 60 + if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" { + if v, err := strconv.Atoi(raw); err == nil { + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + result.OpsMetricsIntervalSeconds = v + } + } + return result } -// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 -// -// 优先级: -// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 -// - 否则回退到 config.yaml/env 的值 -func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { - if s == nil || s.cfg == nil { - return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") - } - - effective := s.cfg.LinuxDo - - keys := []string{ - SettingKeyLinuxDoConnectEnabled, - SettingKeyLinuxDoConnectClientID, - SettingKeyLinuxDoConnectClientSecret, - SettingKeyLinuxDoConnectRedirectURL, - } - settings, err := s.settingRepo.GetMultiple(ctx, keys) - if err != nil { - return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err) - } - - if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok { - effective.Enabled = raw == "true" - } - if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" { - effective.ClientID = strings.TrimSpace(v) - } - if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" { - effective.ClientSecret = strings.TrimSpace(v) - } - if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" { - effective.RedirectURL = strings.TrimSpace(v) - } - - if !effective.Enabled { - return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled") - } - - // 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。 - if strings.TrimSpace(effective.ClientID) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured") - } - if strings.TrimSpace(effective.AuthorizeURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured") - } - if strings.TrimSpace(effective.TokenURL) == "" { - return 
config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured") - } - if strings.TrimSpace(effective.UserInfoURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured") - } - if strings.TrimSpace(effective.RedirectURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured") - } - if strings.TrimSpace(effective.FrontendRedirectURL) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured") - } - - if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid") - } - if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid") - } - if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid") - } - - method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) - switch method { - case "", "client_secret_post", "client_secret_basic": - if strings.TrimSpace(effective.ClientSecret) == "" { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured") - } - case "none": - if !effective.UsePKCE { - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none") - } +func isFalseSettingValue(value string) bool { + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return true default: - return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") + return false } - - return effective, nil } // getStringOrDefault 获取字符串值或默认值 @@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) } return value } + +// GetLinuxDoConnectOAuthConfig 返回用于登录的"最终生效" LinuxDo Connect 配置。 +// +// 优先级: +// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 +// - 否则回退到 config.yaml/env 的值 +func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { + if s == nil || s.cfg == nil { + return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") + } + + effective := s.cfg.LinuxDo + + keys := []string{ + SettingKeyLinuxDoConnectEnabled, + SettingKeyLinuxDoConnectClientID, + SettingKeyLinuxDoConnectClientSecret, + SettingKeyLinuxDoConnectRedirectURL, + } + settings, err := s.settingRepo.GetMultiple(ctx, keys) + if err != nil { + return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err) + } + + 
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok { + effective.Enabled = raw == "true" + } + if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" { + effective.ClientID = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" { + effective.ClientSecret = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" { + effective.RedirectURL = strings.TrimSpace(v) + } + + if !effective.Enabled { + return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled") + } + + // 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。 + if strings.TrimSpace(effective.ClientID) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured") + } + if strings.TrimSpace(effective.AuthorizeURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured") + } + if strings.TrimSpace(effective.TokenURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured") + } + if strings.TrimSpace(effective.UserInfoURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured") + } + if strings.TrimSpace(effective.RedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured") + } + if strings.TrimSpace(effective.FrontendRedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured") + } + + if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid") + } + if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid") + } + + method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) + switch method { + case "", "client_secret_post", "client_secret_basic": + if strings.TrimSpace(effective.ClientSecret) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured") + } + case "none": + if !effective.UsePKCE { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none") + } + default: + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method 
invalid") + } + + return effective, nil +} diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go index 325b7f8f..e20a230a 100644 --- a/backend/internal/service/settings_view.go +++ b/backend/internal/service/settings_view.go @@ -18,7 +18,7 @@ type SystemSettings struct { TurnstileSecretKey string TurnstileSecretKeyConfigured bool - // LinuxDo Connect OAuth 登录(终端用户 SSO) + // LinuxDo Connect OAuth 登录 LinuxDoConnectEnabled bool LinuxDoConnectClientID string LinuxDoConnectClientSecret string @@ -46,6 +46,12 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool + OpsRealtimeMonitoringEnabled bool + OpsQueryModeDefault string + OpsMetricsIntervalSeconds int } type PublicSettings struct { diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go index f1074e9d..f2cb9c44 100644 --- a/backend/internal/service/wire.go +++ b/backend/internal/service/wire.go @@ -1,10 +1,12 @@ package service import ( + "database/sql" "time" "github.com/Wei-Shaw/sub2api/internal/config" "github.com/google/wire" + "github.com/redis/go-redis/v9" ) // BuildInfo contains build information @@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi return svc } +// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector. +func ProvideOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg) + collector.Start() + return collector +} + +// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation). +func ProvideOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService. +func ProvideOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled). +func ProvideOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsScheduledReportService creates and starts OpsScheduledReportService. 
+func ProvideOpsScheduledReportService( + opsService *OpsService, + userService *UserService, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsScheduledReportService { + svc := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg) + svc.Start() + return svc +} + // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { return apiKeyService @@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( NewAccountUsageService, NewAccountTestService, NewSettingService, + NewOpsService, + ProvideOpsMetricsCollector, + ProvideOpsAggregationService, + ProvideOpsAlertEvaluatorService, + ProvideOpsCleanupService, + ProvideOpsScheduledReportService, NewEmailService, ProvideEmailQueueService, NewTurnstileService, diff --git a/backend/migrations/033_ops_monitoring_vnext.sql b/backend/migrations/033_ops_monitoring_vnext.sql new file mode 100644 index 00000000..a18c061d --- /dev/null +++ b/backend/migrations/033_ops_monitoring_vnext.sql @@ -0,0 +1,717 @@ +-- Ops Monitoring (vNext): squashed migration (030) +-- +-- This repository originally planned Ops vNext as migrations 030-036: +-- 030 drop legacy ops tables +-- 031 core schema +-- 032 pre-aggregation tables +-- 033 indexes + optional extensions +-- 034 add avg/max to preagg +-- 035 add notify_email to alert rules +-- 036 seed default alert rules +-- +-- Since these migrations have NOT been applied to any environment yet, we squash them +-- into a single 030 migration for easier review and a cleaner migration history. +-- +-- Notes: +-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts). +-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run. + +-- ===================================================================== +-- 030_ops_drop_legacy_ops_tables.sql +-- ===================================================================== + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Legacy pre-aggregation tables (from 026 and/or previous branches) +DROP TABLE IF EXISTS ops_metrics_daily CASCADE; +DROP TABLE IF EXISTS ops_metrics_hourly CASCADE; + +-- Core ops tables that may exist in some deployments / branches +DROP TABLE IF EXISTS ops_system_metrics CASCADE; +DROP TABLE IF EXISTS ops_error_logs CASCADE; +DROP TABLE IF EXISTS ops_alert_events CASCADE; +DROP TABLE IF EXISTS ops_alert_rules CASCADE; +DROP TABLE IF EXISTS ops_job_heartbeats CASCADE; +DROP TABLE IF EXISTS ops_retry_attempts CASCADE; + +-- Optional legacy tables (best-effort cleanup) +DROP TABLE IF EXISTS ops_scheduled_reports CASCADE; +DROP TABLE IF EXISTS ops_group_availability_configs CASCADE; +DROP TABLE IF EXISTS ops_group_availability_events CASCADE; + +-- Optional legacy views/indexes +DROP VIEW IF EXISTS ops_latest_metrics CASCADE; + +-- ===================================================================== +-- 031_ops_core_schema.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts) +-- +-- Design goals: +-- - Support global filtering (time/platform/group) across all ops modules. +-- - Persist enough context for two retry modes (client retry / pinned upstream retry). +-- - Make ops background jobs observable via job heartbeats. +-- - Keep schema stable and indexes targeted (high-write tables). 
+-- +-- Notes: +-- - This migration is idempotent. +-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: error log details (high-write) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_error_logs ( + id BIGSERIAL PRIMARY KEY, + + -- Correlation / identities + request_id VARCHAR(64), + client_request_id VARCHAR(64), + user_id BIGINT, + api_key_id BIGINT, + account_id BIGINT, + group_id BIGINT, + client_ip inet, + + -- Dimensions for global filtering + platform VARCHAR(32), + + -- Request metadata + model VARCHAR(100), + request_path VARCHAR(256), + stream BOOLEAN NOT NULL DEFAULT false, + user_agent TEXT, + + -- Core error classification + error_phase VARCHAR(32) NOT NULL, + error_type VARCHAR(64) NOT NULL, + severity VARCHAR(8) NOT NULL DEFAULT 'P2', + status_code INT, + + -- vNext metric semantics + is_business_limited BOOLEAN NOT NULL DEFAULT false, + + -- Error details (sanitized/truncated at ingest time) + error_message TEXT, + error_body TEXT, + + -- Provider/upstream details (optional; useful for trends & account health) + error_source VARCHAR(64), + error_owner VARCHAR(32), + account_status VARCHAR(50), + upstream_status_code INT, + upstream_error_message TEXT, + upstream_error_detail TEXT, + provider_error_code VARCHAR(64), + provider_error_type VARCHAR(64), + network_error_type VARCHAR(50), + retry_after_seconds INT, + + -- Timings (ms) - optional + duration_ms INT, + time_to_first_token_ms BIGINT, + auth_latency_ms BIGINT, + routing_latency_ms BIGINT, + upstream_latency_ms BIGINT, + response_latency_ms BIGINT, + + -- Retry context (only stored for error requests) + request_body JSONB, + request_headers JSONB, + request_body_truncated BOOLEAN NOT NULL DEFAULT false, + request_body_bytes INT, + + -- Retryability flags (best-effort classification) + is_retryable BOOLEAN NOT NULL DEFAULT false, + retry_count INT NOT NULL DEFAULT 0, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). 
Stores sanitized error details and request_body for retries (errors only).'; + +-- ============================================ +-- 2) ops_retry_attempts: audit log for retries +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_retry_attempts ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + requested_by_user_id BIGINT, + source_error_id BIGINT, + + -- client|upstream + mode VARCHAR(16) NOT NULL, + pinned_account_id BIGINT, + + -- queued|running|succeeded|failed + status VARCHAR(16) NOT NULL DEFAULT 'queued', + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + duration_ms BIGINT, + + -- Optional result correlation + result_request_id VARCHAR(64), + result_error_id BIGINT, + result_usage_request_id VARCHAR(64), + + error_message TEXT +); + +COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).'; + +-- ============================================ +-- 3) ops_system_metrics: system + request window snapshots +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_system_metrics ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + window_minutes INT NOT NULL DEFAULT 1, + + -- Optional dimensions (only if collector chooses to write per-dimension snapshots) + platform VARCHAR(32), + group_id BIGINT, + + -- Core counts + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Rates + qps DOUBLE PRECISION, + tps DOUBLE PRECISION, + + -- Duration percentiles (ms) - success requests + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + duration_avg_ms DOUBLE PRECISION, + duration_max_ms INT, + + -- TTFT percentiles (ms) - success requests (streaming) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + ttft_avg_ms DOUBLE PRECISION, + ttft_max_ms INT, + + -- System resources + cpu_usage_percent DOUBLE PRECISION, + memory_used_mb BIGINT, + memory_total_mb BIGINT, + memory_usage_percent DOUBLE PRECISION, + + -- Dependency health (best-effort) + db_ok BOOLEAN, + redis_ok BOOLEAN, + + -- DB pool & runtime + db_conn_active INT, + db_conn_idle INT, + db_conn_waiting INT, + goroutine_count INT, + + -- Queue / concurrency + concurrency_queue_depth INT +); + +COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). 
Used for dashboard overview and realtime rates.'; + +-- ============================================ +-- 4) ops_job_heartbeats: background jobs health +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_job_heartbeats ( + job_name VARCHAR(64) PRIMARY KEY, + + last_run_at TIMESTAMPTZ, + last_success_at TIMESTAMPTZ, + last_error_at TIMESTAMPTZ, + last_error TEXT, + last_duration_ms BIGINT, + + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).'; + +-- ============================================ +-- 5) ops_alert_rules / ops_alert_events +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_alert_rules ( + id BIGSERIAL PRIMARY KEY, + + name VARCHAR(128) NOT NULL, + description TEXT, + enabled BOOLEAN NOT NULL DEFAULT true, + + severity VARCHAR(16) NOT NULL DEFAULT 'warning', + + -- Metric definition + metric_type VARCHAR(64) NOT NULL, + operator VARCHAR(8) NOT NULL, + threshold DOUBLE PRECISION NOT NULL, + + window_minutes INT NOT NULL DEFAULT 5, + sustained_minutes INT NOT NULL DEFAULT 5, + cooldown_minutes INT NOT NULL DEFAULT 10, + + -- Optional scoping: platform/group filters etc. + filters JSONB, + + last_triggered_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique + ON ops_alert_rules (name); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled + ON ops_alert_rules (enabled); + +CREATE TABLE IF NOT EXISTS ops_alert_events ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT, + severity VARCHAR(16) NOT NULL, + status VARCHAR(16) NOT NULL DEFAULT 'firing', + + title VARCHAR(200), + description TEXT, + + metric_value DOUBLE PRECISION, + threshold_value DOUBLE PRECISION, + dimensions JSONB, + + fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + resolved_at TIMESTAMPTZ, + + email_sent BOOLEAN NOT NULL DEFAULT false, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status + ON ops_alert_events (rule_id, status); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at + ON ops_alert_events (fired_at DESC); + +-- ===================================================================== +-- 032_ops_preaggregation_tables.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): pre-aggregation tables +-- +-- Purpose: +-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive +-- percentile_cont scans on raw logs for every dashboard refresh. +-- - Support global filter dimensions: overall / platform / group. +-- +-- Design note: +-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a +-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
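To make the COALESCE-based uniqueness concrete: because the unique constraint lives in an expression index, an idempotent writer must name the same expressions as its conflict target. A hedged Go sketch of such an upsert (column list abbreviated, *sql.DB handle assumed; this is not the aggregation service's actual statement):

```go
package service

import (
	"context"
	"database/sql"
	"time"
)

// Sketch: upsert one hourly bucket keyed by (bucket_start, platform, group_id).
// NULL platform/group_id collapse onto ''/0 via COALESCE so the expression-based
// unique index can serve as the ON CONFLICT target.
const upsertHourlySQL = `
INSERT INTO ops_metrics_hourly (bucket_start, platform, group_id, success_count, error_count_total)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
DO UPDATE SET
    success_count     = EXCLUDED.success_count,
    error_count_total = EXCLUDED.error_count_total,
    computed_at       = NOW()`

func upsertHourlyBucket(ctx context.Context, db *sql.DB, bucket time.Time, platform *string, groupID *int64, successCount, errorCountTotal int64) error {
	_, err := db.ExecContext(ctx, upsertHourlySQL, bucket, platform, groupID, successCount, errorCountTotal)
	return err
}
```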
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_metrics_hourly +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_hourly ( + id BIGSERIAL PRIMARY KEY, + + bucket_start TIMESTAMPTZ NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Duration percentiles (ms) + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + -- TTFT percentiles (ms) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Uniqueness across three “dimension modes” (overall / platform / group). +-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE. +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim + ON ops_metrics_hourly ( + bucket_start, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket + ON ops_metrics_hourly (bucket_start DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket + ON ops_metrics_hourly (platform, bucket_start DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket + ON ops_metrics_hourly (group_id, bucket_start DESC) + WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).'; + +-- ============================================ +-- 2) ops_metrics_daily (optional; for longer windows) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_daily ( + id BIGSERIAL PRIMARY KEY, + + bucket_date DATE NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim + ON ops_metrics_daily ( + bucket_date, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket + ON ops_metrics_daily (bucket_date DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket + ON ops_metrics_daily (platform, bucket_date DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket + ON ops_metrics_daily (group_id, bucket_date DESC) + 
WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).'; + +-- ===================================================================== +-- 033_ops_indexes_and_extensions.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): indexes and optional extensions +-- +-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort, +-- so environments without extension privileges won't fail the whole migration chain. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) Core btree indexes (always safe) +-- ============================================ + +-- ops_error_logs +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at + ON ops_error_logs (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time + ON ops_error_logs (platform, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time + ON ops_error_logs (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time + ON ops_error_logs (account_id, created_at DESC) + WHERE account_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time + ON ops_error_logs (status_code, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time + ON ops_error_logs (error_phase, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time + ON ops_error_logs (error_type, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id + ON ops_error_logs (request_id); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id + ON ops_error_logs (client_request_id); + +-- ops_system_metrics +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at + ON ops_system_metrics (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time + ON ops_system_metrics (window_minutes, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time + ON ops_system_metrics (platform, created_at DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time + ON ops_system_metrics (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +-- ops_retry_attempts +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at + ON ops_retry_attempts (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error + ON ops_retry_attempts (source_error_id, created_at DESC) + WHERE source_error_id IS NOT NULL; + +-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe). +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active + ON ops_retry_attempts (source_error_id) + WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running'); + +-- ============================================ +-- 2) Optional: pg_trgm + trigram indexes for fuzzy search +-- ============================================ + +DO $$ +BEGIN + BEGIN + CREATE EXTENSION IF NOT EXISTS pg_trgm; + EXCEPTION WHEN OTHERS THEN + -- Missing privileges or extension package should not block migrations. 
+ RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM; + END; + + IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN + -- request_id / client_request_id fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm + ON ops_error_logs USING gin (request_id gin_trgm_ops)'; + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm + ON ops_error_logs USING gin (client_request_id gin_trgm_ops)'; + + -- error_message fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm + ON ops_error_logs USING gin (error_message gin_trgm_ops)'; + END IF; +END $$; + +-- ===================================================================== +-- 034_ops_preaggregation_add_avg_max.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields +-- +-- Why: +-- - The dashboard overview returns avg/max for duration/TTFT. +-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes +-- it impossible to answer avg/max in preagg mode without falling back to raw scans. +-- +-- This migration is idempotent and safe to run multiple times. +-- +-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for +-- approximate long-window summaries. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Hourly table +ALTER TABLE ops_metrics_hourly + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- Daily table +ALTER TABLE ops_metrics_daily + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- ===================================================================== +-- 035_ops_alert_rules_notify_email.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): alert rule notify settings +-- +-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard. +-- Migration is idempotent. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +ALTER TABLE ops_alert_rules + ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true; + +-- ===================================================================== +-- 036_ops_seed_default_alert_rules.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): seed default alert rules (idempotent) +-- +-- Goal: +-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events. +-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING. +-- +-- Notes: +-- - Thresholds are intentionally conservative defaults and should be tuned per deployment. +-- - Metric semantics follow vNext: +-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited). +-- - upstream_error_rate excludes 429/529. 
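Since the seeded rules lean on these vNext metric semantics, here is a small hedged sketch of how the rates could be derived from the counters defined earlier in this migration (the denominator choice is an assumption for illustration; the evaluator's real formulas are not part of this file):

```go
package service

// Sketch only: SLA-scope error rate and upstream error rate (excluding 429/529)
// for one metrics window. The denominator (success + SLA errors) is an assumption.
type opsWindowCounts struct {
	SuccessCount                 int64
	ErrorCountSLA                int64 // excludes is_business_limited errors
	UpstreamErrorCountExcl429529 int64
}

func slaErrorRatePercent(w opsWindowCounts) float64 {
	total := w.SuccessCount + w.ErrorCountSLA
	if total == 0 {
		return 0
	}
	return float64(w.ErrorCountSLA) / float64(total) * 100
}

func upstreamErrorRatePercent(w opsWindowCounts) float64 {
	total := w.SuccessCount + w.ErrorCountSLA
	if total == 0 {
		return 0
	}
	return float64(w.UpstreamErrorCountExcl429529) / float64(total) * 100
}
```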
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- 1) High error rate (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率过高', + '当错误率超过 5% 且持续 5 分钟时触发告警', + true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 2) Low success rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '成功率过低', + '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)', + true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 3) P99 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P99延迟过高', + '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警', + true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 4) P95 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P95延迟过高', + '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警', + true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 5) CPU usage too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'CPU使用率过高', + '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警', + true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 6) Memory usage too high (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '内存使用率过高', + '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)', + true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 7) Concurrency queue buildup (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '并发队列积压', + '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)', + true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 8) Extremely high error rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率极高', + '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)', + true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots. +-- This migration is intentionally idempotent. 
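For reference, a hedged sketch of how a collector might read these values from go-redis before writing them into the columns added below (the surrounding collector wiring is assumed):

```go
package service

import "github.com/redis/go-redis/v9"

// Sketch: read go-redis connection pool stats; TotalConns / IdleConns map onto
// the redis_conn_total / redis_conn_idle columns added below.
func redisPoolCounts(rdb *redis.Client) (total, idle int) {
	if rdb == nil {
		return 0, 0
	}
	stats := rdb.PoolStats()
	return int(stats.TotalConns), int(stats.IdleConns)
}
```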
+ +ALTER TABLE ops_system_metrics + ADD COLUMN IF NOT EXISTS redis_conn_total INT, + ADD COLUMN IF NOT EXISTS redis_conn_idle INT; + +COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).'; +COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).'; diff --git a/backend/migrations/034_ops_upstream_error_events.sql b/backend/migrations/034_ops_upstream_error_events.sql new file mode 100644 index 00000000..f8bfa5e2 --- /dev/null +++ b/backend/migrations/034_ops_upstream_error_events.sql @@ -0,0 +1,9 @@ +-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation. +-- +-- This is intentionally idempotent. + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS upstream_errors JSONB; + +COMMENT ON COLUMN ops_error_logs.upstream_errors IS + 'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.'; diff --git a/config.yaml b/config.yaml index b5272aac..424ce9eb 100644 --- a/config.yaml +++ b/config.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 @@ -302,6 +302,41 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Hard switch: disable all ops background jobs and APIs when false + # 硬开关:为 false 时禁用所有 Ops 后台任务与接口 + enabled: true + + # Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries. + # 优先使用预聚合表(用于长时间窗口查询性能) + use_preaggregated_tables: false + + # Data cleanup configuration + # 数据清理配置(vNext 默认统一保留 30 天) + cleanup: + enabled: true + # Cron expression (minute hour dom month dow), e.g. 
"0 2 * * *" = daily at 2 AM + # Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点 + schedule: "0 2 * * *" + error_log_retention_days: 30 + minute_metrics_retention_days: 30 + hourly_metrics_retention_days: 30 + + # Pre-aggregation configuration + # 预聚合任务配置 + aggregation: + enabled: true + + # OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments) + # 指标采集 Redis 缓存(多副本部署时减少重复计算) + metrics_collector_cache: + enabled: true + ttl: 65s + # ============================================================================= # JWT Configuration # JWT 配置 diff --git a/deploy/.env.example b/deploy/.env.example index 83e58a50..27618284 100644 --- a/deploy/.env.example +++ b/deploy/.env.example @@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} GEMINI_QUOTA_POLICY= +# ----------------------------------------------------------------------------- +# Ops Monitoring Configuration (运维监控配置) +# ----------------------------------------------------------------------------- +# Enable ops monitoring features (background jobs and APIs) +# 是否启用运维监控功能(后台任务和接口) +# Set to false to hide ops menu in sidebar and disable all ops features +# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能 +OPS_ENABLED=true + # ----------------------------------------------------------------------------- # Update Configuration (在线更新配置) # ----------------------------------------------------------------------------- diff --git a/deploy/config.example.yaml b/deploy/config.example.yaml index 57239f8e..b1fc9bbd 100644 --- a/deploy/config.example.yaml +++ b/deploy/config.example.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 @@ -302,6 +302,19 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Enable ops monitoring features (background jobs and APIs) + # 是否启用运维监控功能(后台任务和接口) + # Set to false to hide ops menu in sidebar and disable all ops features + # 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能 + # Other detailed settings (cleanup, aggregation, etc.) 
are configured in ops settings dialog + # 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置 + enabled: true + # ============================================================================= # JWT Configuration # JWT 配置 diff --git a/frontend/src/api/admin/index.ts b/frontend/src/api/admin/index.ts index c90017a8..e86f6348 100644 --- a/frontend/src/api/admin/index.ts +++ b/frontend/src/api/admin/index.ts @@ -17,6 +17,7 @@ import usageAPI from './usage' import geminiAPI from './gemini' import antigravityAPI from './antigravity' import userAttributesAPI from './userAttributes' +import opsAPI from './ops' /** * Unified admin API object for convenient access @@ -35,7 +36,8 @@ export const adminAPI = { usage: usageAPI, gemini: geminiAPI, antigravity: antigravityAPI, - userAttributes: userAttributesAPI + userAttributes: userAttributesAPI, + ops: opsAPI } export { @@ -52,7 +54,8 @@ export { usageAPI, geminiAPI, antigravityAPI, - userAttributesAPI + userAttributesAPI, + opsAPI } export default adminAPI diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts new file mode 100644 index 00000000..1d1453f5 --- /dev/null +++ b/frontend/src/api/admin/ops.ts @@ -0,0 +1,958 @@ +/** + * Admin Ops API endpoints (vNext) + * - Error logs list/detail + retry (client/upstream) + * - Dashboard overview (raw path) + */ + +import { apiClient } from '../client' +import type { PaginatedResponse } from '@/types' + +export type OpsRetryMode = 'client' | 'upstream' +export type OpsQueryMode = 'auto' | 'raw' | 'preagg' + +export interface OpsRequestOptions { + signal?: AbortSignal +} + +export interface OpsRetryRequest { + mode: OpsRetryMode + pinned_account_id?: number +} + +export interface OpsRetryResult { + attempt_id: number + mode: OpsRetryMode + status: 'running' | 'succeeded' | 'failed' | string + + pinned_account_id?: number | null + used_account_id?: number | null + + http_status_code: number + upstream_request_id: string + + response_preview: string + response_truncated: boolean + + error_message: string + + started_at: string + finished_at: string + duration_ms: number +} + +export interface OpsDashboardOverview { + start_time: string + end_time: string + platform: string + group_id?: number | null + + health_score?: number + + system_metrics?: OpsSystemMetricsSnapshot | null + job_heartbeats?: OpsJobHeartbeat[] | null + + success_count: number + error_count_total: number + business_limited_count: number + error_count_sla: number + request_count_total: number + request_count_sla: number + + token_consumed: number + + sla: number + error_rate: number + upstream_error_rate: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number + + qps: { + current: number + peak: number + avg: number + } + tps: { + current: number + peak: number + avg: number + } + + duration: OpsPercentiles + ttft: OpsPercentiles +} + +export interface OpsPercentiles { + p50_ms?: number | null + p90_ms?: number | null + p95_ms?: number | null + p99_ms?: number | null + avg_ms?: number | null + max_ms?: number | null +} + +export interface OpsThroughputTrendPoint { + bucket_start: string + request_count: number + token_consumed: number + qps: number + tps: number +} + +export interface OpsThroughputPlatformBreakdownItem { + platform: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputGroupBreakdownItem { + group_id: number + group_name: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputTrendResponse { + 
bucket: string + points: OpsThroughputTrendPoint[] + by_platform?: OpsThroughputPlatformBreakdownItem[] + top_groups?: OpsThroughputGroupBreakdownItem[] +} + +export type OpsRequestKind = 'success' | 'error' +export type OpsRequestDetailsKind = OpsRequestKind | 'all' +export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc' + +export interface OpsRequestDetail { + kind: OpsRequestKind + created_at: string + request_id: string + + platform?: string + model?: string + duration_ms?: number | null + status_code?: number | null + + error_id?: number | null + phase?: string + severity?: string + message?: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + stream?: boolean +} + +export interface OpsRequestDetailsParams { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + + kind?: OpsRequestDetailsKind + + platform?: string + group_id?: number | null + + user_id?: number + api_key_id?: number + account_id?: number + + model?: string + request_id?: string + q?: string + + min_duration_ms?: number + max_duration_ms?: number + + sort?: OpsRequestDetailsSort + + page?: number + page_size?: number +} + +export type OpsRequestDetailsResponse = PaginatedResponse + +export interface OpsLatencyHistogramBucket { + range: string + count: number +} + +export interface OpsLatencyHistogramResponse { + start_time: string + end_time: string + platform: string + group_id?: number | null + + total_requests: number + buckets: OpsLatencyHistogramBucket[] +} + +export interface OpsErrorTrendPoint { + bucket_start: string + error_count_total: number + business_limited_count: number + error_count_sla: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number +} + +export interface OpsErrorTrendResponse { + bucket: string + points: OpsErrorTrendPoint[] +} + +export interface OpsErrorDistributionItem { + status_code: number + total: number + sla: number + business_limited: number +} + +export interface OpsErrorDistributionResponse { + total: number + items: OpsErrorDistributionItem[] +} + +export interface OpsSystemMetricsSnapshot { + id: number + created_at: string + window_minutes: number + + cpu_usage_percent?: number | null + memory_used_mb?: number | null + memory_total_mb?: number | null + memory_usage_percent?: number | null + + db_ok?: boolean | null + redis_ok?: boolean | null + + // Config-derived limits (best-effort) for rendering "current vs max". 
+ db_max_open_conns?: number | null + redis_pool_size?: number | null + + redis_conn_total?: number | null + redis_conn_idle?: number | null + + db_conn_active?: number | null + db_conn_idle?: number | null + db_conn_waiting?: number | null + + goroutine_count?: number | null + concurrency_queue_depth?: number | null +} + +export interface OpsJobHeartbeat { + job_name: string + last_run_at?: string | null + last_success_at?: string | null + last_error_at?: string | null + last_error?: string | null + last_duration_ms?: number | null + updated_at: string +} + +export interface PlatformConcurrencyInfo { + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface GroupConcurrencyInfo { + group_id: number + group_name: string + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface AccountConcurrencyInfo { + account_id: number + account_name?: string + platform: string + group_id: number + group_name: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface OpsConcurrencyStatsResponse { + enabled: boolean + platform: Record<string, PlatformConcurrencyInfo> + group: Record<string, GroupConcurrencyInfo> + account: Record<string, AccountConcurrencyInfo> + timestamp?: string +} + +export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> { + const params: Record<string, string | number> = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + + const { data } = await apiClient.get('/admin/ops/concurrency', { params }) + return data +} + +export interface PlatformAvailability { + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface GroupAvailability { + group_id: number + group_name: string + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface AccountAvailability { + account_id: number + account_name: string + platform: string + group_id: number + group_name: string + status: string + is_available: boolean + is_rate_limited: boolean + rate_limit_reset_at?: string + rate_limit_remaining_sec?: number + is_overloaded: boolean + overload_until?: string + overload_remaining_sec?: number + has_error: boolean + error_message?: string +} + +export interface OpsAccountAvailabilityStatsResponse { + enabled: boolean + platform: Record<string, PlatformAvailability> + group: Record<string, GroupAvailability> + account: Record<string, AccountAvailability> + timestamp?: string +} + +export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> { + const params: Record<string, string | number> = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + const { data } = await apiClient.get('/admin/ops/account-availability', { params }) + return data +} + +/** + * Subscribe to realtime QPS updates via WebSocket. + * + * Note: browsers cannot set Authorization headers for WebSockets. + * We authenticate via Sec-WebSocket-Protocol using a prefixed token item: + * ["sub2api-admin", "jwt.<token>"] + */ +export interface SubscribeQPSOptions { + token?: string | null + onOpen?: () => void + onClose?: (event: CloseEvent) => void + onError?: (event: Event) => void + /** + * Called when the server closes with an application close code that indicates + * reconnecting is not useful (e.g.
feature flag disabled). + */ + onFatalClose?: (event: CloseEvent) => void + /** + * More granular status updates for UI (connecting/reconnecting/offline/etc). + */ + onStatusChange?: (status: OpsWSStatus) => void + /** + * Called when a reconnect is scheduled (helps display "retry in Xs"). + */ + onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void + wsBaseUrl?: string + /** + * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live. + * Set to 0 to disable reconnect. + */ + maxReconnectAttempts?: number + reconnectBaseDelayMs?: number + reconnectMaxDelayMs?: number + /** + * Stale connection detection (heartbeat-by-observation). + * If no messages are received within this window, the socket is closed to trigger a reconnect. + * Set to 0 to disable. + */ + staleTimeoutMs?: number + /** + * How often to check staleness. Only used when `staleTimeoutMs > 0`. + */ + staleCheckIntervalMs?: number +} + +export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed' + +export const OPS_WS_CLOSE_CODES = { + REALTIME_DISABLED: 4001 +} as const + +const OPS_WS_BASE_PROTOCOL = 'sub2api-admin' + +export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void { + let ws: WebSocket | null = null + let reconnectAttempts = 0 + const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number) + ? (options.maxReconnectAttempts as number) + : Infinity + const baseDelayMs = options.reconnectBaseDelayMs ?? 1000 + const maxDelayMs = options.reconnectMaxDelayMs ?? 30000 + let reconnectTimer: ReturnType<typeof setTimeout> | null = null + let shouldReconnect = true + let isConnecting = false + let hasConnectedOnce = false + let lastMessageAt = 0 + const staleTimeoutMs = options.staleTimeoutMs ?? 120_000 + const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000 + let staleTimer: ReturnType<typeof setInterval> | null = null + + const setStatus = (status: OpsWSStatus) => { + options.onStatusChange?.(status) + } + + const clearReconnectTimer = () => { + if (reconnectTimer) { + clearTimeout(reconnectTimer) + reconnectTimer = null + } + } + + const clearStaleTimer = () => { + if (staleTimer) { + clearInterval(staleTimer) + staleTimer = null + } + } + + const startStaleTimer = () => { + clearStaleTimer() + if (!staleTimeoutMs || staleTimeoutMs <= 0) return + staleTimer = setInterval(() => { + if (!shouldReconnect) return + if (!ws || ws.readyState !== WebSocket.OPEN) return + if (!lastMessageAt) return + const ageMs = Date.now() - lastMessageAt + if (ageMs > staleTimeoutMs) { + // Treat as a half-open connection; closing triggers the normal reconnect path. + ws.close() + } + }, staleCheckIntervalMs) + } + + const scheduleReconnect = () => { + if (!shouldReconnect) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + // If we're offline, wait for the browser to come back online.
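+ // (Reconnecting resumes via the 'online' listener registered at subscribe time, which calls connect() again.)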
+ if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) { + setStatus('offline') + return + } + + const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts) + const delay = Math.min(expDelay, maxDelayMs) + const jitter = Math.floor(Math.random() * 250) + clearReconnectTimer() + reconnectTimer = setTimeout(() => { + reconnectAttempts++ + connect() + }, delay + jitter) + options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter }) + } + + const handleOnline = () => { + if (!shouldReconnect) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + connect() + } + + const handleOffline = () => { + setStatus('offline') + } + + const connect = () => { + if (!shouldReconnect) return + if (isConnecting) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + isConnecting = true + setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting') + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host + const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`) + + // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc). + // Browsers cannot set Authorization headers for WebSockets, so we pass the token via + // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt."]. + const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim() + const protocols: string[] = [OPS_WS_BASE_PROTOCOL] + if (rawToken) protocols.push(`jwt.${rawToken}`) + + ws = new WebSocket(wsURL.toString(), protocols) + + ws.onopen = () => { + reconnectAttempts = 0 + isConnecting = false + hasConnectedOnce = true + clearReconnectTimer() + lastMessageAt = Date.now() + startStaleTimer() + setStatus('connected') + options.onOpen?.() + } + + ws.onmessage = (e) => { + try { + const data = JSON.parse(e.data) + lastMessageAt = Date.now() + onMessage(data) + } catch (err) { + console.warn('[OpsWS] Failed to parse message:', err) + } + } + + ws.onerror = (error) => { + console.error('[OpsWS] Connection error:', error) + options.onError?.(error) + } + + ws.onclose = (event) => { + isConnecting = false + options.onClose?.(event) + clearStaleTimer() + ws = null + + // If the server explicitly tells us to stop reconnecting, honor it. 
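+ // In practice this is close code 4001 (OPS_WS_CLOSE_CODES.REALTIME_DISABLED), sent e.g. when realtime monitoring is turned off.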
+ if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) { + shouldReconnect = false + clearReconnectTimer() + setStatus('closed') + options.onFatalClose?.(event) + return + } + + scheduleReconnect() + } + } + + window.addEventListener('online', handleOnline) + window.addEventListener('offline', handleOffline) + connect() + + return () => { + shouldReconnect = false + window.removeEventListener('online', handleOnline) + window.removeEventListener('offline', handleOffline) + clearReconnectTimer() + clearStaleTimer() + if (ws) ws.close() + ws = null + setStatus('closed') + } +} + +export type OpsSeverity = string +export type OpsPhase = string + +export type AlertSeverity = 'critical' | 'warning' | 'info' +export type ThresholdMode = 'count' | 'percentage' | 'both' +export type MetricType = + | 'success_rate' + | 'error_rate' + | 'upstream_error_rate' + | 'p95_latency_ms' + | 'p99_latency_ms' + | 'cpu_usage_percent' + | 'memory_usage_percent' + | 'concurrency_queue_depth' + | 'group_available_accounts' + | 'group_available_ratio' + | 'group_rate_limit_ratio' + | 'account_rate_limited_count' + | 'account_error_count' + | 'account_error_ratio' + | 'overload_account_count' +export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!=' + +export interface AlertRule { + id?: number + name: string + description?: string + enabled: boolean + metric_type: MetricType + operator: Operator + threshold: number + window_minutes: number + sustained_minutes: number + severity: OpsSeverity + cooldown_minutes: number + notify_email: boolean + filters?: Record + created_at?: string + updated_at?: string + last_triggered_at?: string | null +} + +export interface AlertEvent { + id: number + rule_id: number + severity: OpsSeverity | string + status: 'firing' | 'resolved' | string + title?: string + description?: string + metric_value?: number + threshold_value?: number + dimensions?: Record + fired_at: string + resolved_at?: string | null + email_sent: boolean + created_at: string +} + +export interface EmailNotificationConfig { + alert: { + enabled: boolean + recipients: string[] + min_severity: AlertSeverity | '' + rate_limit_per_hour: number + batching_window_seconds: number + include_resolved_alerts: boolean + } + report: { + enabled: boolean + recipients: string[] + daily_summary_enabled: boolean + daily_summary_schedule: string + weekly_summary_enabled: boolean + weekly_summary_schedule: string + error_digest_enabled: boolean + error_digest_schedule: string + error_digest_min_count: number + account_health_enabled: boolean + account_health_schedule: string + account_health_error_rate_threshold: number + } +} + +export interface OpsDistributedLockSettings { + enabled: boolean + key: string + ttl_seconds: number +} + +export interface OpsAlertRuntimeSettings { + evaluation_interval_seconds: number + distributed_lock: OpsDistributedLockSettings + silencing: { + enabled: boolean + global_until_rfc3339: string + global_reason: string + entries?: Array<{ + rule_id?: number + severities?: Array + until_rfc3339: string + reason: string + }> + } +} + +export interface OpsAdvancedSettings { + data_retention: OpsDataRetentionSettings + aggregation: OpsAggregationSettings +} + +export interface OpsDataRetentionSettings { + cleanup_enabled: boolean + cleanup_schedule: string + error_log_retention_days: number + minute_metrics_retention_days: number + hourly_metrics_retention_days: number +} + +export interface OpsAggregationSettings { + aggregation_enabled: boolean +} + 
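+// Illustrative usage sketch for subscribeQPS above (assumes a Vue component holding qps/wsStatus refs; not part of this module): +//   let stop: (() => void) | null = null +//   onMounted(() => { stop = subscribeQPS((msg) => { qps.value = msg?.qps }, { onStatusChange: (s) => { wsStatus.value = s }, onFatalClose: () => { realtimeDisabled.value = true } }) }) +//   onUnmounted(() => stop?.()) +// The payload field (msg.qps) is an assumption; rely on whatever shape the server actually pushes. +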
+export interface OpsErrorLog { + id: number + created_at: string + phase: OpsPhase + type: string + severity: OpsSeverity + status_code: number + platform: string + model: string + latency_ms?: number | null + client_request_id: string + request_id: string + message: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + client_ip?: string | null + request_path?: string + stream?: boolean +} + +export interface OpsErrorDetail extends OpsErrorLog { + error_body: string + user_agent: string + + // Upstream context (optional; enriched by gateway services) + upstream_status_code?: number | null + upstream_error_message?: string + upstream_error_detail?: string + upstream_errors?: string + + auth_latency_ms?: number | null + routing_latency_ms?: number | null + upstream_latency_ms?: number | null + response_latency_ms?: number | null + time_to_first_token_ms?: number | null + + request_body: string + request_body_truncated: boolean + request_body_bytes?: number | null + + is_business_limited: boolean +} + +export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog> + +export async function getDashboardOverview( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsDashboardOverview> { + const { data } = await apiClient.get('/admin/ops/dashboard/overview', { + params, + signal: options.signal + }) + return data +} + +export async function getThroughputTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsThroughputTrendResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/throughput-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getLatencyHistogram( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsLatencyHistogramResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/latency-histogram', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsErrorTrendResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/error-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorDistribution( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise<OpsErrorDistributionResponse> { + const { data } = await apiClient.get('/admin/ops/dashboard/error-distribution', { + params, + signal: options.signal + }) + return data +} + +export async function listErrorLogs(params: { + page?: number + page_size?: number + time_range?: string + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + account_id?: number | null + phase?: string + q?: string + status_codes?: string +}): Promise<OpsErrorLogsResponse> { + const { data } = await apiClient.get('/admin/ops/errors', { params }) + 
return data +} + +export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> { + const { data } = await apiClient.get(`/admin/ops/errors/${id}`) + return data +} + +export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> { + const { data } = await apiClient.post(`/admin/ops/errors/${id}/retry`, req) + return data +} + +export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> { + const { data } = await apiClient.get('/admin/ops/requests', { params }) + return data +} + +// Alert rules +export async function listAlertRules(): Promise<AlertRule[]> { + const { data } = await apiClient.get('/admin/ops/alert-rules') + return data +} + +export async function createAlertRule(rule: AlertRule): Promise<AlertRule> { + const { data } = await apiClient.post('/admin/ops/alert-rules', rule) + return data +} + +export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> { + const { data } = await apiClient.put(`/admin/ops/alert-rules/${id}`, rule) + return data +} + +export async function deleteAlertRule(id: number): Promise<void> { + await apiClient.delete(`/admin/ops/alert-rules/${id}`) +} + +export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> { + const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } }) + return data +} + +// Email notification config +export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> { + const { data } = await apiClient.get('/admin/ops/email-notification/config') + return data +} + +export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> { + const { data } = await apiClient.put('/admin/ops/email-notification/config', config) + return data +} + +// Runtime settings (DB-backed) +export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> { + const { data } = await apiClient.get('/admin/ops/runtime/alert') + return data +} + +export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> { + const { data } = await apiClient.put('/admin/ops/runtime/alert', config) + return data +} + +// Advanced settings (DB-backed) +export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> { + const { data } = await apiClient.get('/admin/ops/advanced-settings') + return data +} + +export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> { + const { data } = await apiClient.put('/admin/ops/advanced-settings', config) + return data +} + +export const opsAPI = { + getDashboardOverview, + getThroughputTrend, + getLatencyHistogram, + getErrorTrend, + getErrorDistribution, + getConcurrencyStats, + getAccountAvailabilityStats, + subscribeQPS, + listErrorLogs, + getErrorLogDetail, + retryErrorRequest, + listRequestDetails, + listAlertRules, + createAlertRule, + updateAlertRule, + deleteAlertRule, + listAlertEvents, + getEmailNotificationConfig, + updateEmailNotificationConfig, + getAlertRuntimeSettings, + updateAlertRuntimeSettings, + getAdvancedSettings, + updateAdvancedSettings +} + +export default opsAPI diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts index fc68eee4..913c9652 100644 --- a/frontend/src/api/admin/settings.ts +++ b/frontend/src/api/admin/settings.ts @@ -35,14 +35,29 @@ export interface SystemSettings { turnstile_enabled: boolean turnstile_site_key: string turnstile_secret_key_configured: boolean - // LinuxDo Connect OAuth 登录(终端用户 SSO) + + // LinuxDo Connect OAuth settings linuxdo_connect_enabled: boolean linuxdo_connect_client_id: string linuxdo_connect_client_secret_configured: boolean
linuxdo_connect_redirect_url: string + + // Model fallback configuration + enable_model_fallback: boolean + fallback_model_anthropic: string + fallback_model_openai: string + fallback_model_gemini: string + fallback_model_antigravity: string + // Identity patch configuration (Claude -> Gemini) enable_identity_patch: boolean identity_patch_prompt: string + + // Ops Monitoring (vNext) + ops_monitoring_enabled: boolean + ops_realtime_monitoring_enabled: boolean + ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds: number } export interface UpdateSettingsRequest { @@ -71,8 +86,17 @@ export interface UpdateSettingsRequest { linuxdo_connect_client_id?: string linuxdo_connect_client_secret?: string linuxdo_connect_redirect_url?: string + enable_model_fallback?: boolean + fallback_model_anthropic?: string + fallback_model_openai?: string + fallback_model_gemini?: string + fallback_model_antigravity?: string enable_identity_patch?: boolean identity_patch_prompt?: string + ops_monitoring_enabled?: boolean + ops_realtime_monitoring_enabled?: boolean + ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds?: number } /** diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 4e53069a..3827498b 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -80,9 +80,45 @@ apiClient.interceptors.response.use( return response }, (error: AxiosError>) => { + // Request cancellation: keep the original axios cancellation error so callers can ignore it. + // Otherwise we'd misclassify it as a generic "network error". + if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) { + return Promise.reject(error) + } + // Handle common errors if (error.response) { const { status, data } = error.response + const url = String(error.config?.url || '') + + // Validate `data` shape to avoid HTML error pages breaking our error handling. + const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record + + // Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away + // from ops pages to avoid broken UI states. 
+ if (status === 404 && apiData.message === 'Ops monitoring is disabled') { + try { + localStorage.setItem('ops_monitoring_enabled_cached', 'false') + } catch { + // ignore localStorage failures + } + try { + window.dispatchEvent(new CustomEvent('ops-monitoring-disabled')) + } catch { + // ignore event failures + } + + if (window.location.pathname.startsWith('/admin/ops')) { + window.location.href = '/admin/settings' + } + + return Promise.reject({ + status, + code: 'OPS_DISABLED', + message: apiData.message || error.message, + url + }) + } // 401: Unauthorized - clear token and redirect to login if (status === 401) { @@ -113,8 +149,8 @@ apiClient.interceptors.response.use( // Return structured error return Promise.reject({ status, - code: data?.code, - message: data?.message || error.message + code: apiData.code, + message: apiData.message || apiData.detail || error.message }) } diff --git a/frontend/src/components/common/HelpTooltip.vue b/frontend/src/components/common/HelpTooltip.vue new file mode 100644 index 00000000..7679ced4 --- /dev/null +++ b/frontend/src/components/common/HelpTooltip.vue @@ -0,0 +1,44 @@ + + + + diff --git a/frontend/src/components/common/Select.vue b/frontend/src/components/common/Select.vue index 74f564f5..c90d0201 100644 --- a/frontend/src/components/common/Select.vue +++ b/frontend/src/components/common/Select.vue @@ -67,12 +67,13 @@ :aria-selected="isSelected(option)" :aria-disabled="isOptionDisabled(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)" - @mouseenter="focusedIndex = index" + @mouseenter="handleOptionMouseEnter(option, index)" :class="[ 'select-option', + isGroupHeaderOption(option) && 'select-option-group', isSelected(option) && 'select-option-selected', - isOptionDisabled(option) && 'select-option-disabled', - focusedIndex === index && 'select-option-focused' + isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled', + focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused' ]" > @@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { return false } +const isGroupHeaderOption = (option: any): boolean => { + if (typeof option === 'object' && option !== null) { + return option.kind === 'group' + } + return false +} + const selectedOption = computed(() => { return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null }) @@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { return getOptionValue(option) === props.modelValue } +const findNextEnabledIndex = (startIndex: number): number => { + const opts = filteredOptions.value + if (opts.length === 0) return -1 + for (let offset = 0; offset < opts.length; offset++) { + const idx = (startIndex + offset) % opts.length + if (!isOptionDisabled(opts[idx])) return idx + } + return -1 +} + +const findPrevEnabledIndex = (startIndex: number): number => { + const opts = filteredOptions.value + if (opts.length === 0) return -1 + for (let offset = 0; offset < opts.length; offset++) { + const idx = (startIndex - offset + opts.length) % opts.length + if (!isOptionDisabled(opts[idx])) return idx + } + return -1 +} + +const handleOptionMouseEnter = (option: any, index: number) => { + if (isOptionDisabled(option) || isGroupHeaderOption(option)) return + focusedIndex.value = index +} + // Update trigger rect periodically while open to follow scroll/resize const updateTriggerRect = () => { if (containerRef.value) { @@ -259,8 +292,15 @@ watch(isOpen, (open) => { if (open) { 
calculateDropdownPosition() // Reset focused index to current selection or first item - const selectedIdx = filteredOptions.value.findIndex(isSelected) - focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 + if (filteredOptions.value.length === 0) { + focusedIndex.value = -1 + } else { + const selectedIdx = filteredOptions.value.findIndex(isSelected) + const initialIdx = selectedIdx >= 0 ? selectedIdx : 0 + focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx]) + ? findNextEnabledIndex(initialIdx + 1) + : initialIdx + } if (props.searchable) { nextTick(() => searchInputRef.value?.focus()) @@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { switch (e.key) { case 'ArrowDown': e.preventDefault() - focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length - scrollToFocused() + focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1) + if (focusedIndex.value >= 0) scrollToFocused() break case 'ArrowUp': e.preventDefault() - focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length - scrollToFocused() + focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1) + if (focusedIndex.value >= 0) scrollToFocused() break case 'Enter': e.preventDefault() @@ -441,6 +481,17 @@ onUnmounted(() => { @apply cursor-not-allowed opacity-40; } +.select-dropdown-portal .select-option-group { + @apply cursor-default select-none; + @apply bg-gray-50 dark:bg-dark-900; + @apply text-[11px] font-bold uppercase tracking-wider; + @apply text-gray-500 dark:text-gray-400; +} + +.select-dropdown-portal .select-option-group:hover { + @apply bg-gray-50 dark:bg-dark-900; +} + .select-dropdown-portal .select-option-label { @apply flex-1 min-w-0 truncate text-left; } diff --git a/frontend/src/components/keys/UseKeyModal.vue b/frontend/src/components/keys/UseKeyModal.vue index 546a53ab..58f42ae6 100644 --- a/frontend/src/components/keys/UseKeyModal.vue +++ b/frontend/src/components/keys/UseKeyModal.vue @@ -28,8 +28,8 @@ {{ platformDescription }}

- -
+ +