This commit is contained in:
yangjianbo
2026-01-12 10:38:44 +08:00
124 changed files with 26865 additions and 605 deletions

2
backend/.dockerignore Normal file
View File

@@ -0,0 +1,2 @@
.cache/
.DS_Store

View File

@@ -18,6 +18,12 @@ linters:
list-mode: original
files:
- "**/internal/service/**"
- "!**/internal/service/ops_aggregation_service.go"
- "!**/internal/service/ops_alert_evaluator_service.go"
- "!**/internal/service/ops_cleanup_service.go"
- "!**/internal/service/ops_metrics_collector.go"
- "!**/internal/service/ops_scheduled_report_service.go"
- "!**/internal/service/wire.go"
deny:
- pkg: github.com/Wei-Shaw/sub2api/internal/repository
desc: "service must not import repository"

View File

@@ -62,6 +62,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
func provideCleanup(
entClient *ent.Client,
rdb *redis.Client,
opsMetricsCollector *service.OpsMetricsCollector,
opsAggregation *service.OpsAggregationService,
opsAlertEvaluator *service.OpsAlertEvaluatorService,
opsCleanup *service.OpsCleanupService,
opsScheduledReport *service.OpsScheduledReportService,
tokenRefresh *service.TokenRefreshService,
accountExpiry *service.AccountExpiryService,
pricing *service.PricingService,
@@ -81,6 +86,36 @@ func provideCleanup(
name string
fn func() error
}{
{"OpsScheduledReportService", func() error {
if opsScheduledReport != nil {
opsScheduledReport.Stop()
}
return nil
}},
{"OpsCleanupService", func() error {
if opsCleanup != nil {
opsCleanup.Stop()
}
return nil
}},
{"OpsAlertEvaluatorService", func() error {
if opsAlertEvaluator != nil {
opsAlertEvaluator.Stop()
}
return nil
}},
{"OpsAggregationService", func() error {
if opsAggregation != nil {
opsAggregation.Stop()
}
return nil
}},
{"OpsMetricsCollector", func() error {
if opsMetricsCollector != nil {
opsMetricsCollector.Stop()
}
return nil
}},
{"TokenRefreshService", func() error {
tokenRefresh.Stop()
return nil

View File

@@ -120,7 +120,22 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
proxyHandler := admin.NewProxyHandler(adminService)
adminRedeemHandler := admin.NewRedeemHandler(adminService)
promoHandler := admin.NewPromoHandler(promoService)
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService)
opsRepository := repository.NewOpsRepository(db)
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
if err != nil {
return nil, err
}
billingService := service.NewBillingService(configConfig, pricingService)
identityCache := repository.NewIdentityCache(redisClient)
identityService := service.NewIdentityService(identityCache)
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService)
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService, opsService)
opsHandler := admin.NewOpsHandler(opsService)
updateCache := repository.NewUpdateCache(redisClient)
gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig)
serviceBuildInfo := provideServiceBuildInfo(buildInfo)
@@ -132,31 +147,24 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
if err != nil {
return nil, err
}
billingService := service.NewBillingService(configConfig, pricingService)
identityCache := repository.NewIdentityCache(redisClient)
identityService := service.NewIdentityService(identityCache)
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, settingService, redisClient)
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient)
httpServer := server.ProvideHTTPServer(configConfig, engine)
opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig)
opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)
opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig)
opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig)
opsScheduledReportService := service.ProvideOpsScheduledReportService(opsService, userService, emailService, redisClient, configConfig)
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
accountExpiryService := service.ProvideAccountExpiryService(accountRepository)
v := provideCleanup(client, redisClient, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, opsScheduledReportService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
application := &Application{
Server: httpServer,
Cleanup: v,
@@ -181,6 +189,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
func provideCleanup(
entClient *ent.Client,
rdb *redis.Client,
opsMetricsCollector *service.OpsMetricsCollector,
opsAggregation *service.OpsAggregationService,
opsAlertEvaluator *service.OpsAlertEvaluatorService,
opsCleanup *service.OpsCleanupService,
opsScheduledReport *service.OpsScheduledReportService,
tokenRefresh *service.TokenRefreshService,
accountExpiry *service.AccountExpiryService,
pricing *service.PricingService,
@@ -199,6 +212,36 @@ func provideCleanup(
name string
fn func() error
}{
{"OpsScheduledReportService", func() error {
if opsScheduledReport != nil {
opsScheduledReport.Stop()
}
return nil
}},
{"OpsCleanupService", func() error {
if opsCleanup != nil {
opsCleanup.Stop()
}
return nil
}},
{"OpsAlertEvaluatorService", func() error {
if opsAlertEvaluator != nil {
opsAlertEvaluator.Stop()
}
return nil
}},
{"OpsAggregationService", func() error {
if opsAggregation != nil {
opsAggregation.Stop()
}
return nil
}},
{"OpsMetricsCollector", func() error {
if opsMetricsCollector != nil {
opsMetricsCollector.Stop()
}
return nil
}},
{"TokenRefreshService", func() error {
tokenRefresh.Stop()
return nil

View File

@@ -8,9 +8,11 @@ require (
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/google/uuid v1.6.0
github.com/google/wire v0.7.0
github.com/gorilla/websocket v1.5.3
github.com/imroc/req/v3 v3.57.0
github.com/lib/pq v1.10.9
github.com/redis/go-redis/v9 v9.17.2
github.com/shirou/gopsutil/v4 v4.25.6
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.11.1
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
@@ -106,9 +108,9 @@ require (
github.com/quic-go/quic-go v0.57.1 // indirect
github.com/refraction-networking/utls v1.8.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/shirou/gopsutil/v4 v4.25.6 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect

View File

@@ -117,6 +117,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
@@ -224,6 +226,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=

View File

@@ -43,6 +43,7 @@ type Config struct {
Turnstile TurnstileConfig `mapstructure:"turnstile"`
Database DatabaseConfig `mapstructure:"database"`
Redis RedisConfig `mapstructure:"redis"`
Ops OpsConfig `mapstructure:"ops"`
JWT JWTConfig `mapstructure:"jwt"`
LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"`
Default DefaultConfig `mapstructure:"default"`
@@ -60,14 +61,6 @@ type Config struct {
Update UpdateConfig `mapstructure:"update"`
}
// UpdateConfig 在线更新相关配置
type UpdateConfig struct {
// ProxyURL 用于访问 GitHub 的代理地址
// 支持 http/https/socks5/socks5h 协议
// 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080"
ProxyURL string `mapstructure:"proxy_url"`
}
type GeminiConfig struct {
OAuth GeminiOAuthConfig `mapstructure:"oauth"`
Quota GeminiQuotaConfig `mapstructure:"quota"`
@@ -90,6 +83,33 @@ type GeminiTierQuotaConfig struct {
CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"`
}
// UpdateConfig holds settings for the online update feature.
type UpdateConfig struct {
	// ProxyURL is the proxy address used to reach GitHub.
	// Supported protocols: http/https/socks5/socks5h.
	// Examples: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080".
	// Empty means connect to GitHub directly (suitable for servers with direct access).
	ProxyURL string `mapstructure:"proxy_url"`
}
// LinuxDoConnectConfig configures LinuxDo Connect OAuth login (end-user SSO).
//
// Note: this is unrelated to upstream-account OAuth (e.g. OpenAI/Gemini
// account onboarding); it is the login flow for this service's own user system.
type LinuxDoConnectConfig struct {
	Enabled             bool   `mapstructure:"enabled"`
	ClientID            string `mapstructure:"client_id"`
	ClientSecret        string `mapstructure:"client_secret"`
	AuthorizeURL        string `mapstructure:"authorize_url"`
	TokenURL            string `mapstructure:"token_url"`
	UserInfoURL         string `mapstructure:"userinfo_url"`
	Scopes              string `mapstructure:"scopes"`
	RedirectURL         string `mapstructure:"redirect_url"`          // backend callback URL (must be registered with the provider)
	FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // frontend route that receives the token (default: /auth/linuxdo/callback)
	TokenAuthMethod     string `mapstructure:"token_auth_method"`     // client_secret_post / client_secret_basic / none
	UsePKCE             bool   `mapstructure:"use_pkce"`
	// Optional gjson paths used to extract fields from the userinfo JSON.
	// When empty, the server tries a set of common field names.
	UserInfoEmailPath    string `mapstructure:"userinfo_email_path"`
	UserInfoIDPath       string `mapstructure:"userinfo_id_path"`
	UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
}
// TokenRefreshConfig OAuth token自动刷新配置
type TokenRefreshConfig struct {
// 是否启用自动刷新
@@ -332,6 +352,47 @@ func (r *RedisConfig) Address() string {
return fmt.Sprintf("%s:%d", r.Host, r.Port)
}
// OpsConfig groups configuration for the ops/monitoring subsystem.
type OpsConfig struct {
	// Enabled controls whether ops features should run.
	//
	// NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off.
	// This config flag is the "hard switch" for deployments that want to disable ops completely.
	Enabled bool `mapstructure:"enabled"`
	// UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries.
	UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"`
	// Cleanup controls periodic deletion of old ops data to prevent unbounded growth.
	Cleanup OpsCleanupConfig `mapstructure:"cleanup"`
	// MetricsCollectorCache controls Redis caching for expensive per-window collector queries.
	MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"`
	// Pre-aggregation configuration.
	Aggregation OpsAggregationConfig `mapstructure:"aggregation"`
}
// OpsCleanupConfig controls scheduled deletion of aged ops data.
type OpsCleanupConfig struct {
	// Enabled turns the periodic cleanup job on or off.
	Enabled bool `mapstructure:"enabled"`
	// Schedule is a cron-style expression for when cleanup runs
	// (default "0 2 * * *" — daily at 02:00).
	Schedule string `mapstructure:"schedule"`
	// Retention days (0 disables that cleanup target).
	//
	// vNext requirement: default 30 days across ops datasets.
	ErrorLogRetentionDays      int `mapstructure:"error_log_retention_days"`
	MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"`
	HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"`
}
// OpsAggregationConfig controls pre-aggregation of ops metrics.
type OpsAggregationConfig struct {
	// Enabled turns the aggregation job on or off.
	Enabled bool `mapstructure:"enabled"`
}
// OpsMetricsCollectorCacheConfig controls Redis caching of collector queries.
type OpsMetricsCollectorCacheConfig struct {
	// Enabled turns the collector cache on or off.
	Enabled bool `mapstructure:"enabled"`
	// TTL is the cache entry lifetime. It should be slightly larger than the
	// collection interval to maximize cross-replica cache hits
	// (default 65s against a 1m collection interval).
	TTL time.Duration `mapstructure:"ttl"`
}
type JWTConfig struct {
Secret string `mapstructure:"secret"`
ExpireHour int `mapstructure:"expire_hour"`
@@ -341,30 +402,6 @@ type TurnstileConfig struct {
Required bool `mapstructure:"required"`
}
// LinuxDoConnectConfig 用于 LinuxDo Connect OAuth 登录(终端用户 SSO
//
// 注意:这与上游账号的 OAuth例如 OpenAI/Gemini 账号接入)不是一回事。
// 这里是用于登录 Sub2API 本身的用户体系。
type LinuxDoConnectConfig struct {
Enabled bool `mapstructure:"enabled"`
ClientID string `mapstructure:"client_id"`
ClientSecret string `mapstructure:"client_secret"`
AuthorizeURL string `mapstructure:"authorize_url"`
TokenURL string `mapstructure:"token_url"`
UserInfoURL string `mapstructure:"userinfo_url"`
Scopes string `mapstructure:"scopes"`
RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记)
FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback
TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none
UsePKCE bool `mapstructure:"use_pkce"`
// 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。
// 为空时,服务端会尝试一组常见字段名。
UserInfoEmailPath string `mapstructure:"userinfo_email_path"`
UserInfoIDPath string `mapstructure:"userinfo_id_path"`
UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
}
type DefaultConfig struct {
AdminEmail string `mapstructure:"admin_email"`
AdminPassword string `mapstructure:"admin_password"`
@@ -531,81 +568,6 @@ func Load() (*Config, error) {
return &cfg, nil
}
// ValidateAbsoluteHTTPURL 校验一个绝对 http(s) URL禁止 fragment
func ValidateAbsoluteHTTPURL(raw string) error {
raw = strings.TrimSpace(raw)
if raw == "" {
return fmt.Errorf("empty url")
}
u, err := url.Parse(raw)
if err != nil {
return err
}
if !u.IsAbs() {
return fmt.Errorf("must be absolute")
}
if !isHTTPScheme(u.Scheme) {
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
}
if strings.TrimSpace(u.Host) == "" {
return fmt.Errorf("missing host")
}
if u.Fragment != "" {
return fmt.Errorf("must not include fragment")
}
return nil
}
// ValidateFrontendRedirectURL 校验前端回调地址:
// - 允许同源相对路径(以 / 开头)
// - 或绝对 http(s) URL禁止 fragment
func ValidateFrontendRedirectURL(raw string) error {
raw = strings.TrimSpace(raw)
if raw == "" {
return fmt.Errorf("empty url")
}
if strings.ContainsAny(raw, "\r\n") {
return fmt.Errorf("contains invalid characters")
}
if strings.HasPrefix(raw, "/") {
if strings.HasPrefix(raw, "//") {
return fmt.Errorf("must not start with //")
}
return nil
}
u, err := url.Parse(raw)
if err != nil {
return err
}
if !u.IsAbs() {
return fmt.Errorf("must be absolute http(s) url or relative path")
}
if !isHTTPScheme(u.Scheme) {
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
}
if strings.TrimSpace(u.Host) == "" {
return fmt.Errorf("missing host")
}
if u.Fragment != "" {
return fmt.Errorf("must not include fragment")
}
return nil
}
func isHTTPScheme(scheme string) bool {
return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
}
func warnIfInsecureURL(field, raw string) {
u, err := url.Parse(strings.TrimSpace(raw))
if err != nil {
return
}
if strings.EqualFold(u.Scheme, "http") {
log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
}
}
func setDefaults() {
viper.SetDefault("run_mode", RunModeStandard)
@@ -655,7 +617,7 @@ func setDefaults() {
// Turnstile
viper.SetDefault("turnstile.required", false)
// LinuxDo Connect OAuth 登录(终端用户 SSO
// LinuxDo Connect OAuth 登录
viper.SetDefault("linuxdo_connect.enabled", false)
viper.SetDefault("linuxdo_connect.client_id", "")
viper.SetDefault("linuxdo_connect.client_secret", "")
@@ -694,6 +656,20 @@ func setDefaults() {
viper.SetDefault("redis.pool_size", 128)
viper.SetDefault("redis.min_idle_conns", 10)
// Ops (vNext)
viper.SetDefault("ops.enabled", true)
viper.SetDefault("ops.use_preaggregated_tables", false)
viper.SetDefault("ops.cleanup.enabled", true)
viper.SetDefault("ops.cleanup.schedule", "0 2 * * *")
// Retention days: vNext defaults to 30 days across ops datasets.
viper.SetDefault("ops.cleanup.error_log_retention_days", 30)
viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30)
viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30)
viper.SetDefault("ops.aggregation.enabled", true)
viper.SetDefault("ops.metrics_collector_cache.enabled", true)
// TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits.
viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second)
// JWT
viper.SetDefault("jwt.secret", "")
viper.SetDefault("jwt.expire_hour", 24)
@@ -750,7 +726,7 @@ func setDefaults() {
// Gateway
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头LLM高负载时可能排队较久
viper.SetDefault("gateway.log_upstream_error_body", false)
viper.SetDefault("gateway.log_upstream_error_body", true)
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
viper.SetDefault("gateway.inject_beta_for_apikey", false)
viper.SetDefault("gateway.failover_on_400", false)
@@ -766,7 +742,7 @@ func setDefaults() {
viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求)
viper.SetDefault("gateway.stream_data_interval_timeout", 180)
viper.SetDefault("gateway.stream_keepalive_interval", 10)
viper.SetDefault("gateway.max_line_size", 40*1024*1024)
viper.SetDefault("gateway.max_line_size", 10*1024*1024)
viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3)
viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second)
viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second)
@@ -789,10 +765,6 @@ func setDefaults() {
viper.SetDefault("gemini.oauth.client_secret", "")
viper.SetDefault("gemini.oauth.scopes", "")
viper.SetDefault("gemini.quota.policy", "")
// Update - 在线更新配置
// 代理地址为空表示直连 GitHub适用于海外服务器
viper.SetDefault("update.proxy_url", "")
}
func (c *Config) Validate() error {
@@ -833,7 +805,8 @@ func (c *Config) Validate() error {
if method == "none" && !c.LinuxDo.UsePKCE {
return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none")
}
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") &&
strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic")
}
if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" {
@@ -1048,6 +1021,21 @@ func (c *Config) Validate() error {
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
}
if c.Ops.MetricsCollectorCache.TTL < 0 {
return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative")
}
if c.Ops.Cleanup.ErrorLogRetentionDays < 0 {
return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative")
}
if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 {
return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative")
}
if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 {
return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative")
}
if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" {
return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true")
}
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
}
@@ -1124,3 +1112,77 @@ func GetServerAddress() string {
port := v.GetInt("server.port")
return fmt.Sprintf("%s:%d", host, port)
}
// ValidateAbsoluteHTTPURL checks that raw is an absolute http(s) URL with a
// non-empty host and no fragment. Leading/trailing whitespace is ignored.
func ValidateAbsoluteHTTPURL(raw string) error {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return fmt.Errorf("empty url")
	}
	u, err := url.Parse(trimmed)
	if err != nil {
		return err
	}
	switch {
	case !u.IsAbs():
		return fmt.Errorf("must be absolute")
	case !strings.EqualFold(u.Scheme, "http") && !strings.EqualFold(u.Scheme, "https"):
		return fmt.Errorf("unsupported scheme: %s", u.Scheme)
	case strings.TrimSpace(u.Host) == "":
		return fmt.Errorf("missing host")
	case u.Fragment != "":
		return fmt.Errorf("must not include fragment")
	}
	return nil
}
// ValidateFrontendRedirectURL checks a frontend redirect target. Two forms
// are accepted: a same-origin relative path (starting with "/" but not "//"),
// or an absolute http(s) URL with a host and no fragment.
func ValidateFrontendRedirectURL(raw string) error {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return fmt.Errorf("empty url")
	}
	if strings.ContainsAny(trimmed, "\r\n") {
		return fmt.Errorf("contains invalid characters")
	}
	if strings.HasPrefix(trimmed, "/") {
		// "//host" parses as a scheme-relative URL, not a same-origin path,
		// so it is rejected to avoid open-redirect style surprises.
		if strings.HasPrefix(trimmed, "//") {
			return fmt.Errorf("must not start with //")
		}
		return nil
	}
	u, err := url.Parse(trimmed)
	if err != nil {
		return err
	}
	switch {
	case !u.IsAbs():
		return fmt.Errorf("must be absolute http(s) url or relative path")
	case !strings.EqualFold(u.Scheme, "http") && !strings.EqualFold(u.Scheme, "https"):
		return fmt.Errorf("unsupported scheme: %s", u.Scheme)
	case strings.TrimSpace(u.Host) == "":
		return fmt.Errorf("missing host")
	case u.Fragment != "":
		return fmt.Errorf("must not include fragment")
	}
	return nil
}
// isHTTPScheme reports whether scheme is "http" or "https",
// compared case-insensitively.
func isHTTPScheme(scheme string) bool {
	for _, allowed := range []string{"http", "https"} {
		if strings.EqualFold(scheme, allowed) {
			return true
		}
	}
	return false
}
// warnIfInsecureURL logs a warning when the configured URL uses plain http,
// which risks token leakage in transit. Unparsable values are ignored —
// validation elsewhere is responsible for rejecting them.
func warnIfInsecureURL(field, raw string) {
	u, err := url.Parse(strings.TrimSpace(raw))
	if err != nil || !strings.EqualFold(u.Scheme, "http") {
		return
	}
	log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
}

View File

@@ -0,0 +1,432 @@
package admin
import (
"encoding/json"
"fmt"
"math"
"net/http"
"strconv"
"strings"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
"github.com/gin-gonic/gin/binding"
)
// stringSet builds a membership set from a list of allowed values.
func stringSet(values []string) map[string]struct{} {
	set := make(map[string]struct{}, len(values))
	for _, v := range values {
		set[v] = struct{}{}
	}
	return set
}

// Allowed enum values for ops alert rule payloads, plus prebuilt sets for
// O(1) membership checks during validation.
var (
	validOpsAlertMetricTypes = []string{
		"success_rate",
		"error_rate",
		"upstream_error_rate",
		"p95_latency_ms",
		"p99_latency_ms",
		"cpu_usage_percent",
		"memory_usage_percent",
		"concurrency_queue_depth",
	}
	validOpsAlertMetricTypeSet = stringSet(validOpsAlertMetricTypes)

	validOpsAlertOperators   = []string{">", "<", ">=", "<=", "==", "!="}
	validOpsAlertOperatorSet = stringSet(validOpsAlertOperators)

	validOpsAlertSeverities   = []string{"P0", "P1", "P2", "P3"}
	validOpsAlertSeveritySet = stringSet(validOpsAlertSeverities)
)
// opsAlertRuleValidatedInput holds the validated, default-filled fields of an
// ops alert rule create/update payload. The *Provided flags record which
// optional fields were explicitly present in the request body, so callers can
// distinguish "field absent" from "field set to its default value".
type opsAlertRuleValidatedInput struct {
	// Required fields (always validated).
	Name       string
	MetricType string
	Operator   string
	Threshold  float64
	// Optional fields, filled with defaults when absent from the payload.
	Severity         string
	WindowMinutes    int
	SustainedMinutes int
	CooldownMinutes  int
	Enabled          bool
	NotifyEmail      bool
	// Presence flags for the optional fields above.
	WindowProvided    bool
	SustainedProvided bool
	CooldownProvided  bool
	SeverityProvided  bool
	EnabledProvided   bool
	NotifyProvided    bool
}
// isPercentOrRateMetric reports whether the metric is expressed as a
// percentage or rate, which bounds its valid threshold range to [0, 100].
func isPercentOrRateMetric(metricType string) bool {
	percentMetrics := []string{
		"success_rate",
		"error_rate",
		"upstream_error_rate",
		"cpu_usage_percent",
		"memory_usage_percent",
	}
	for _, m := range percentMetrics {
		if metricType == m {
			return true
		}
	}
	return false
}
// requireTrimmedString extracts a required string field from raw, returning
// the trimmed value or a "<field> is required" error when the field is not a
// string or is blank. Callers must have already checked the key is present.
func requireTrimmedString(raw map[string]json.RawMessage, field string) (string, error) {
	var s string
	if err := json.Unmarshal(raw[field], &s); err != nil || strings.TrimSpace(s) == "" {
		return "", fmt.Errorf("%s is required", field)
	}
	return strings.TrimSpace(s), nil
}

// validateOpsAlertRulePayload validates the JSON payload for creating or
// updating an ops alert rule.
//
// Required fields: name, metric_type, operator, threshold. Optional fields
// receive defaults when absent (severity=P2, enabled=true, notify_email=true,
// window_minutes=1, sustained_minutes=1, cooldown_minutes=0); the *Provided
// flags on the result record which optional fields were explicitly present.
// Returns a descriptive error for the first invalid field encountered.
func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) {
	if raw == nil {
		return nil, fmt.Errorf("invalid request body")
	}
	for _, field := range []string{"name", "metric_type", "operator", "threshold"} {
		if _, ok := raw[field]; !ok {
			return nil, fmt.Errorf("%s is required", field)
		}
	}

	name, err := requireTrimmedString(raw, "name")
	if err != nil {
		return nil, err
	}
	metricType, err := requireTrimmedString(raw, "metric_type")
	if err != nil {
		return nil, err
	}
	if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok {
		return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", "))
	}
	operator, err := requireTrimmedString(raw, "operator")
	if err != nil {
		return nil, err
	}
	if _, ok := validOpsAlertOperatorSet[operator]; !ok {
		return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", "))
	}

	var threshold float64
	if err := json.Unmarshal(raw["threshold"], &threshold); err != nil {
		return nil, fmt.Errorf("threshold must be a number")
	}
	if math.IsNaN(threshold) || math.IsInf(threshold, 0) {
		return nil, fmt.Errorf("threshold must be a finite number")
	}
	// Percent/rate metrics are bounded to [0, 100]; everything else only
	// needs to be non-negative.
	if isPercentOrRateMetric(metricType) {
		if threshold < 0 || threshold > 100 {
			return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType)
		}
	} else if threshold < 0 {
		return nil, fmt.Errorf("threshold must be >= 0")
	}

	validated := &opsAlertRuleValidatedInput{
		Name:       name,
		MetricType: metricType,
		Operator:   operator,
		Threshold:  threshold,
	}
	if err := applyOptionalAlertRuleFields(raw, validated); err != nil {
		return nil, err
	}
	return validated, nil
}

// applyOptionalAlertRuleFields fills the optional fields of validated from
// raw, applying defaults for absent fields and recording presence flags.
func applyOptionalAlertRuleFields(raw map[string]json.RawMessage, validated *opsAlertRuleValidatedInput) error {
	if v, ok := raw["severity"]; ok {
		validated.SeverityProvided = true
		var sev string
		if err := json.Unmarshal(v, &sev); err != nil {
			return fmt.Errorf("severity must be a string")
		}
		// Severity is normalized to upper case; an explicit empty string
		// falls through to the default below.
		sev = strings.ToUpper(strings.TrimSpace(sev))
		if sev != "" {
			if _, ok := validOpsAlertSeveritySet[sev]; !ok {
				return fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", "))
			}
			validated.Severity = sev
		}
	}
	if validated.Severity == "" {
		validated.Severity = "P2"
	}

	if v, ok := raw["enabled"]; ok {
		validated.EnabledProvided = true
		if err := json.Unmarshal(v, &validated.Enabled); err != nil {
			return fmt.Errorf("enabled must be a boolean")
		}
	} else {
		validated.Enabled = true
	}
	if v, ok := raw["notify_email"]; ok {
		validated.NotifyProvided = true
		if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil {
			return fmt.Errorf("notify_email must be a boolean")
		}
	} else {
		validated.NotifyEmail = true
	}

	if v, ok := raw["window_minutes"]; ok {
		validated.WindowProvided = true
		if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil {
			return fmt.Errorf("window_minutes must be an integer")
		}
		// Only these evaluation windows are supported.
		switch validated.WindowMinutes {
		case 1, 5, 60:
		default:
			return fmt.Errorf("window_minutes must be one of: 1, 5, 60")
		}
	} else {
		validated.WindowMinutes = 1
	}
	if v, ok := raw["sustained_minutes"]; ok {
		validated.SustainedProvided = true
		if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil {
			return fmt.Errorf("sustained_minutes must be an integer")
		}
		if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 {
			return fmt.Errorf("sustained_minutes must be between 1 and 1440")
		}
	} else {
		validated.SustainedMinutes = 1
	}
	if v, ok := raw["cooldown_minutes"]; ok {
		validated.CooldownProvided = true
		if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil {
			return fmt.Errorf("cooldown_minutes must be an integer")
		}
		if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 {
			return fmt.Errorf("cooldown_minutes must be between 0 and 1440")
		}
	} else {
		validated.CooldownMinutes = 0
	}
	return nil
}
// ListAlertRules returns all ops alert rules.
// GET /api/v1/admin/ops/alert-rules
func (h *OpsHandler) ListAlertRules(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	rules, err := svc.ListAlertRules(ctx)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, rules)
}
// CreateAlertRule creates an ops alert rule.
// POST /api/v1/admin/ops/alert-rules
func (h *OpsHandler) CreateAlertRule(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	// Pass 1: bind the body as a raw field map so the payload can be
	// validated with field-level error messages.
	var payload map[string]json.RawMessage
	if err := c.ShouldBindBodyWith(&payload, binding.JSON); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	checked, err := validateOpsAlertRulePayload(payload)
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	// Pass 2: bind the typed struct, then overwrite every validated field
	// so the sanitized values always win over raw binding.
	var rule service.OpsAlertRule
	if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	rule.Name = checked.Name
	rule.MetricType = checked.MetricType
	rule.Operator = checked.Operator
	rule.Threshold = checked.Threshold
	rule.WindowMinutes = checked.WindowMinutes
	rule.SustainedMinutes = checked.SustainedMinutes
	rule.CooldownMinutes = checked.CooldownMinutes
	rule.Severity = checked.Severity
	rule.Enabled = checked.Enabled
	rule.NotifyEmail = checked.NotifyEmail
	created, err := svc.CreateAlertRule(ctx, &rule)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, created)
}
// UpdateAlertRule updates an existing ops alert rule.
// PUT /api/v1/admin/ops/alert-rules/:id
func (h *OpsHandler) UpdateAlertRule(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	ruleID, parseErr := strconv.ParseInt(c.Param("id"), 10, 64)
	if parseErr != nil || ruleID <= 0 {
		response.BadRequest(c, "Invalid rule ID")
		return
	}
	// Pass 1: raw map for field-level validation.
	var payload map[string]json.RawMessage
	if err := c.ShouldBindBodyWith(&payload, binding.JSON); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	checked, err := validateOpsAlertRulePayload(payload)
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	// Pass 2: typed struct; validated values take precedence.
	var rule service.OpsAlertRule
	if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	rule.ID = ruleID
	rule.Name = checked.Name
	rule.MetricType = checked.MetricType
	rule.Operator = checked.Operator
	rule.Threshold = checked.Threshold
	rule.WindowMinutes = checked.WindowMinutes
	rule.SustainedMinutes = checked.SustainedMinutes
	rule.CooldownMinutes = checked.CooldownMinutes
	rule.Severity = checked.Severity
	rule.Enabled = checked.Enabled
	rule.NotifyEmail = checked.NotifyEmail
	updated, err := svc.UpdateAlertRule(ctx, &rule)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, updated)
}
// DeleteAlertRule deletes an ops alert rule.
// DELETE /api/v1/admin/ops/alert-rules/:id
func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	ruleID, parseErr := strconv.ParseInt(c.Param("id"), 10, 64)
	if parseErr != nil || ruleID <= 0 {
		response.BadRequest(c, "Invalid rule ID")
		return
	}
	if err := svc.DeleteAlertRule(ctx, ruleID); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, gin.H{"deleted": true})
}
// ListAlertEvents lists recent ops alert events.
// GET /api/v1/admin/ops/alert-events
func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	limit := 100
	if rawLimit := strings.TrimSpace(c.Query("limit")); rawLimit != "" {
		parsed, err := strconv.Atoi(rawLimit)
		if err != nil || parsed <= 0 {
			response.BadRequest(c, "Invalid limit")
			return
		}
		limit = parsed
	}
	filter := &service.OpsAlertEventFilter{
		Limit:    limit,
		Status:   strings.TrimSpace(c.Query("status")),
		Severity: strings.TrimSpace(c.Query("severity")),
	}
	// Optional global filter support (platform/group/time range).
	filter.Platform = strings.TrimSpace(c.Query("platform"))
	if rawGroup := strings.TrimSpace(c.Query("group_id")); rawGroup != "" {
		gid, err := strconv.ParseInt(rawGroup, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	startTime, endTime, err := parseOpsTimeRange(c, "24h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	// Only apply when explicitly provided to avoid surprising default narrowing.
	explicit := strings.TrimSpace(c.Query("start_time")) != "" ||
		strings.TrimSpace(c.Query("end_time")) != "" ||
		strings.TrimSpace(c.Query("time_range")) != ""
	if explicit {
		filter.StartTime = &startTime
		filter.EndTime = &endTime
	}
	events, err := svc.ListAlertEvents(ctx, filter)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, events)
}

View File

@@ -0,0 +1,243 @@
package admin
import (
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetDashboardOverview returns vNext ops dashboard overview (raw path).
// GET /api/v1/admin/ops/dashboard/overview
func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
		Platform:  strings.TrimSpace(c.Query("platform")),
		QueryMode: parseOpsQueryMode(c),
	}
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	data, err := svc.GetDashboardOverview(ctx, filter)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, data)
}
// GetDashboardThroughputTrend returns throughput time series (raw path).
// GET /api/v1/admin/ops/dashboard/throughput-trend
func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
		Platform:  strings.TrimSpace(c.Query("platform")),
		QueryMode: parseOpsQueryMode(c),
	}
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	// Bucket width scales with the requested window to bound response size.
	bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
	data, err := svc.GetThroughputTrend(ctx, filter, bucketSeconds)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, data)
}
// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests).
// GET /api/v1/admin/ops/dashboard/latency-histogram
func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
		Platform:  strings.TrimSpace(c.Query("platform")),
		QueryMode: parseOpsQueryMode(c),
	}
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	data, err := svc.GetLatencyHistogram(ctx, filter)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, data)
}
// GetDashboardErrorTrend returns error counts time series (raw path).
// GET /api/v1/admin/ops/dashboard/error-trend
func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
		Platform:  strings.TrimSpace(c.Query("platform")),
		QueryMode: parseOpsQueryMode(c),
	}
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	// Same bucketing policy as the throughput trend endpoint.
	bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
	data, err := svc.GetErrorTrend(ctx, filter, bucketSeconds)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, data)
}
// GetDashboardErrorDistribution returns error distribution by status code (raw path).
// GET /api/v1/admin/ops/dashboard/error-distribution
func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
		Platform:  strings.TrimSpace(c.Query("platform")),
		QueryMode: parseOpsQueryMode(c),
	}
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		filter.GroupID = &gid
	}
	data, err := svc.GetErrorDistribution(ctx, filter)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, data)
}
// pickThroughputBucketSeconds maps a query window to a time-series bucket
// width in seconds. Coarser buckets for longer windows keep buckets
// predictable and avoid huge responses.
func pickThroughputBucketSeconds(window time.Duration) int {
	if window > 24*time.Hour {
		return 3600
	}
	if window > 2*time.Hour {
		return 300
	}
	return 60
}
// parseOpsQueryMode resolves the requested query mode from the "mode" query
// param. An empty value means "use server default" (DB setting
// ops_query_mode_default).
func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode {
	if c == nil {
		return ""
	}
	mode := strings.TrimSpace(c.Query("mode"))
	if mode == "" {
		return ""
	}
	return service.ParseOpsQueryMode(mode)
}

View File

@@ -0,0 +1,364 @@
package admin
import (
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// OpsHandler serves the admin ops/observability endpoints.
type OpsHandler struct {
	// opsService may be nil; every handler checks and returns 503 in that case.
	opsService *service.OpsService
}

// NewOpsHandler constructs an OpsHandler backed by the given ops service.
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
	return &OpsHandler{opsService: opsService}
}
// GetErrorLogs lists ops error logs.
// GET /api/v1/admin/ops/errors
func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	page, pageSize := response.ParsePagination(c)
	// Ops list can be larger than standard admin tables.
	if pageSize > 500 {
		pageSize = 500
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsErrorLogFilter{
		Page:     page,
		PageSize: pageSize,
	}
	if !startTime.IsZero() {
		filter.StartTime = &startTime
	}
	if !endTime.IsZero() {
		filter.EndTime = &endTime
	}
	filter.Platform = strings.TrimSpace(c.Query("platform"))
	// parseID reads an optional positive int64 query param; nil when absent.
	parseID := func(name string) (*int64, bool) {
		raw := strings.TrimSpace(c.Query(name))
		if raw == "" {
			return nil, true
		}
		id, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || id <= 0 {
			response.BadRequest(c, "Invalid "+name)
			return nil, false
		}
		return &id, true
	}
	gid, ok := parseID("group_id")
	if !ok {
		return
	}
	filter.GroupID = gid
	aid, ok := parseID("account_id")
	if !ok {
		return
	}
	filter.AccountID = aid
	filter.Phase = strings.TrimSpace(c.Query("phase"))
	filter.Query = strings.TrimSpace(c.Query("q"))
	if rawCodes := strings.TrimSpace(c.Query("status_codes")); rawCodes != "" {
		parts := strings.Split(rawCodes, ",")
		codes := make([]int, 0, len(parts))
		for _, part := range parts {
			p := strings.TrimSpace(part)
			if p == "" {
				continue
			}
			n, err := strconv.Atoi(p)
			if err != nil || n < 0 {
				response.BadRequest(c, "Invalid status_codes")
				return
			}
			codes = append(codes, n)
		}
		filter.StatusCodes = codes
	}
	result, err := svc.GetErrorLogs(ctx, filter)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
}
// GetErrorLogByID returns a single error log detail.
// GET /api/v1/admin/ops/errors/:id
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	errorID, parseErr := strconv.ParseInt(strings.TrimSpace(c.Param("id")), 10, 64)
	if parseErr != nil || errorID <= 0 {
		response.BadRequest(c, "Invalid error id")
		return
	}
	detail, err := svc.GetErrorLogByID(ctx, errorID)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, detail)
}
// ListRequestDetails returns a request-level list (success + error) for drill-down.
// GET /api/v1/admin/ops/requests
func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	page, pageSize := response.ParsePagination(c)
	if pageSize > 100 {
		pageSize = 100
	}
	startTime, endTime, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		response.BadRequest(c, err.Error())
		return
	}
	filter := &service.OpsRequestDetailFilter{
		Page:      page,
		PageSize:  pageSize,
		StartTime: &startTime,
		EndTime:   &endTime,
		Kind:      strings.TrimSpace(c.Query("kind")),
		Platform:  strings.TrimSpace(c.Query("platform")),
		Model:     strings.TrimSpace(c.Query("model")),
		RequestID: strings.TrimSpace(c.Query("request_id")),
		Query:     strings.TrimSpace(c.Query("q")),
		Sort:      strings.TrimSpace(c.Query("sort")),
	}
	// parseOptionalID reads an optional positive int64 query param; nil when absent.
	parseOptionalID := func(name string) (*int64, bool) {
		raw := strings.TrimSpace(c.Query(name))
		if raw == "" {
			return nil, true
		}
		id, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || id <= 0 {
			response.BadRequest(c, "Invalid "+name)
			return nil, false
		}
		return &id, true
	}
	var ok bool
	if filter.UserID, ok = parseOptionalID("user_id"); !ok {
		return
	}
	if filter.APIKeyID, ok = parseOptionalID("api_key_id"); !ok {
		return
	}
	if filter.AccountID, ok = parseOptionalID("account_id"); !ok {
		return
	}
	if filter.GroupID, ok = parseOptionalID("group_id"); !ok {
		return
	}
	// parseOptionalMs reads an optional non-negative int query param; nil when absent.
	parseOptionalMs := func(name string) (*int, bool) {
		raw := strings.TrimSpace(c.Query(name))
		if raw == "" {
			return nil, true
		}
		n, err := strconv.Atoi(raw)
		if err != nil || n < 0 {
			response.BadRequest(c, "Invalid "+name)
			return nil, false
		}
		return &n, true
	}
	if filter.MinDurationMs, ok = parseOptionalMs("min_duration_ms"); !ok {
		return
	}
	if filter.MaxDurationMs, ok = parseOptionalMs("max_duration_ms"); !ok {
		return
	}
	out, err := svc.ListRequestDetails(ctx, filter)
	if err != nil {
		// Invalid sort/kind/platform etc should be a bad request; keep it simple.
		if strings.Contains(strings.ToLower(err.Error()), "invalid") {
			response.BadRequest(c, err.Error())
			return
		}
		response.Error(c, http.StatusInternalServerError, "Failed to list request details")
		return
	}
	response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize)
}
// opsRetryRequest is the optional JSON body for POST /errors/:id/retry.
type opsRetryRequest struct {
	Mode            string `json:"mode"`              // retry mode; defaults to service.OpsRetryModeClient when empty
	PinnedAccountID *int64 `json:"pinned_account_id"` // presumably pins the retry to a specific account — confirm in service.RetryError
}
// RetryErrorRequest retries a failed request using stored request_body.
// POST /api/v1/admin/ops/errors/:id/retry
func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	subject, ok := middleware.GetAuthSubjectFromContext(c)
	if !ok || subject.UserID <= 0 {
		response.Error(c, http.StatusUnauthorized, "Unauthorized")
		return
	}
	errorID, parseErr := strconv.ParseInt(strings.TrimSpace(c.Param("id")), 10, 64)
	if parseErr != nil || errorID <= 0 {
		response.BadRequest(c, "Invalid error id")
		return
	}
	// Body is optional: an empty body (io.EOF) falls back to the default mode.
	req := opsRetryRequest{Mode: service.OpsRetryModeClient}
	if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) {
		response.BadRequest(c, "Invalid request: "+err.Error())
		return
	}
	if strings.TrimSpace(req.Mode) == "" {
		req.Mode = service.OpsRetryModeClient
	}
	result, err := svc.RetryError(ctx, subject.UserID, errorID, req.Mode, req.PinnedAccountID)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	response.Success(c, result)
}
// parseOpsTimeRange resolves the [start, end] window for an ops query.
// Explicit start_time/end_time (RFC3339 or RFC3339Nano) take precedence,
// with a missing side filled from "now" / defaultRange. Otherwise the
// relative time_range param (or defaultRange) is used. The window is
// capped at 30 days.
func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
	const maxWindow = 30 * 24 * time.Hour
	startStr := strings.TrimSpace(c.Query("start_time"))
	endStr := strings.TrimSpace(c.Query("end_time"))
	parseTS := func(s string) (time.Time, error) {
		if s == "" {
			return time.Time{}, nil
		}
		t, err := time.Parse(time.RFC3339Nano, s)
		if err == nil {
			return t, nil
		}
		return time.Parse(time.RFC3339, s)
	}
	start, err := parseTS(startStr)
	if err != nil {
		return time.Time{}, time.Time{}, err
	}
	end, err := parseTS(endStr)
	if err != nil {
		return time.Time{}, time.Time{}, err
	}
	if startStr != "" || endStr != "" {
		// Explicit bounds (possibly partial): fill in the missing side.
		if end.IsZero() {
			end = time.Now()
		}
		if start.IsZero() {
			dur, _ := parseOpsDuration(defaultRange)
			start = end.Add(-dur)
		}
		if start.After(end) {
			return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time")
		}
		if end.Sub(start) > maxWindow {
			return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
		}
		return start, end, nil
	}
	// Relative window via time_range, falling back to defaultRange.
	tr := strings.TrimSpace(c.Query("time_range"))
	if tr == "" {
		tr = defaultRange
	}
	dur, ok := parseOpsDuration(tr)
	if !ok {
		dur, _ = parseOpsDuration(defaultRange)
	}
	end = time.Now()
	start = end.Add(-dur)
	if end.Sub(start) > maxWindow {
		return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
	}
	return start, end, nil
}
// parseOpsDuration maps a whitelisted relative-range token ("5m", "30m",
// "1h", "6h", "24h") to its duration. The second return value reports
// whether the token was recognized.
func parseOpsDuration(v string) (time.Duration, bool) {
	known := map[string]time.Duration{
		"5m":  5 * time.Minute,
		"30m": 30 * time.Minute,
		"1h":  time.Hour,
		"6h":  6 * time.Hour,
		"24h": 24 * time.Hour,
	}
	d, ok := known[strings.TrimSpace(v)]
	return d, ok
}

View File

@@ -0,0 +1,120 @@
package admin
import (
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
// GET /api/v1/admin/ops/concurrency
func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	if !svc.IsRealtimeMonitoringEnabled(ctx) {
		// Realtime collection is off: answer with an explicit empty snapshot.
		response.Success(c, gin.H{
			"enabled":   false,
			"platform":  map[string]*service.PlatformConcurrencyInfo{},
			"group":     map[int64]*service.GroupConcurrencyInfo{},
			"account":   map[int64]*service.AccountConcurrencyInfo{},
			"timestamp": time.Now().UTC(),
		})
		return
	}
	platformFilter := strings.TrimSpace(c.Query("platform"))
	var groupID *int64
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		groupID = &gid
	}
	platform, group, account, collectedAt, err := svc.GetConcurrencyStats(ctx, platformFilter, groupID)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	out := gin.H{
		"enabled":  true,
		"platform": platform,
		"group":    group,
		"account":  account,
	}
	if collectedAt != nil {
		out["timestamp"] = collectedAt.UTC()
	}
	response.Success(c, out)
}
// GetAccountAvailability returns account availability statistics.
// GET /api/v1/admin/ops/account-availability
//
// Query params:
//   - platform: optional
//   - group_id: optional
func (h *OpsHandler) GetAccountAvailability(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	if !svc.IsRealtimeMonitoringEnabled(ctx) {
		// Realtime collection is off: answer with an explicit empty snapshot.
		response.Success(c, gin.H{
			"enabled":   false,
			"platform":  map[string]*service.PlatformAvailability{},
			"group":     map[int64]*service.GroupAvailability{},
			"account":   map[int64]*service.AccountAvailability{},
			"timestamp": time.Now().UTC(),
		})
		return
	}
	platformFilter := strings.TrimSpace(c.Query("platform"))
	var groupID *int64
	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
		gid, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || gid <= 0 {
			response.BadRequest(c, "Invalid group_id")
			return
		}
		groupID = &gid
	}
	platformStats, groupStats, accountStats, collectedAt, err := svc.GetAccountAvailabilityStats(ctx, platformFilter, groupID)
	if err != nil {
		response.ErrorFrom(c, err)
		return
	}
	out := gin.H{
		"enabled":  true,
		"platform": platformStats,
		"group":    groupStats,
		"account":  accountStats,
	}
	if collectedAt != nil {
		out["timestamp"] = collectedAt.UTC()
	}
	response.Success(c, out)
}

View File

@@ -0,0 +1,148 @@
package admin
import (
"net/http"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetEmailNotificationConfig returns Ops email notification config (DB-backed).
// GET /api/v1/admin/ops/email-notification/config
func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	cfg, err := svc.GetEmailNotificationConfig(ctx)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get email notification config")
		return
	}
	response.Success(c, cfg)
}
// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed).
// PUT /api/v1/admin/ops/email-notification/config
func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	var req service.OpsEmailNotificationConfigUpdateRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	updated, err := svc.UpdateEmailNotificationConfig(ctx, &req)
	if err != nil {
		// Most failures here are validation errors from request payload; treat as 400.
		response.Error(c, http.StatusBadRequest, err.Error())
		return
	}
	response.Success(c, updated)
}
// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed).
// GET /api/v1/admin/ops/runtime/alert
func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	cfg, err := svc.GetOpsAlertRuntimeSettings(ctx)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings")
		return
	}
	response.Success(c, cfg)
}
// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed).
// PUT /api/v1/admin/ops/runtime/alert
func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	var req service.OpsAlertRuntimeSettings
	if err := c.ShouldBindJSON(&req); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	updated, err := svc.UpdateOpsAlertRuntimeSettings(ctx, &req)
	if err != nil {
		// Failures here are treated as payload validation errors.
		response.Error(c, http.StatusBadRequest, err.Error())
		return
	}
	response.Success(c, updated)
}
// GetAdvancedSettings returns Ops advanced settings (DB-backed).
// GET /api/v1/admin/ops/advanced-settings
func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	cfg, err := svc.GetOpsAdvancedSettings(ctx)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings")
		return
	}
	response.Success(c, cfg)
}
// UpdateAdvancedSettings updates Ops advanced settings (DB-backed).
// PUT /api/v1/admin/ops/advanced-settings
func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) {
	svc := h.opsService
	if svc == nil {
		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
		return
	}
	ctx := c.Request.Context()
	if err := svc.RequireMonitoringEnabled(ctx); err != nil {
		response.ErrorFrom(c, err)
		return
	}
	var req service.OpsAdvancedSettings
	if err := c.ShouldBindJSON(&req); err != nil {
		response.BadRequest(c, "Invalid request body")
		return
	}
	updated, err := svc.UpdateOpsAdvancedSettings(ctx, &req)
	if err != nil {
		// Failures here are treated as payload validation errors.
		response.Error(c, http.StatusBadRequest, err.Error())
		return
	}
	response.Success(c, updated)
}

View File

@@ -0,0 +1,771 @@
package admin
import (
"context"
"encoding/json"
"log"
"math"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
"github.com/gorilla/websocket"
)
// OpsWSProxyConfig holds proxy/origin-related settings for the ops
// WebSocket endpoints, resolved from the OPS_WS_* environment variables.
type OpsWSProxyConfig struct {
	TrustProxy     bool           // whether forwarded client info from proxies is trusted (OPS_WS_TRUST_PROXY)
	TrustedProxies []netip.Prefix // CIDR prefixes of proxies allowed to forward (OPS_WS_TRUSTED_PROXIES)
	OriginPolicy   string         // OriginPolicyStrict or OriginPolicyPermissive (OPS_WS_ORIGIN_POLICY)
}
// Environment variable names for ops WebSocket proxy/origin/limit configuration.
const (
	envOpsWSTrustProxy     = "OPS_WS_TRUST_PROXY"
	envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
	envOpsWSOriginPolicy   = "OPS_WS_ORIGIN_POLICY"
	envOpsWSMaxConns       = "OPS_WS_MAX_CONNS"
	envOpsWSMaxConnsPerIP  = "OPS_WS_MAX_CONNS_PER_IP"
)

// Supported Origin-check policies for the WebSocket upgrade.
const (
	OriginPolicyStrict     = "strict"
	OriginPolicyPermissive = "permissive"
)

// opsWSProxyConfig is the process-wide proxy configuration, resolved once at startup.
var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()
// upgrader performs the HTTP -> WebSocket upgrade for ops endpoints.
// Origin checking is delegated to isAllowedOpsWSOrigin (policy-driven).
var upgrader = websocket.Upgrader{
	CheckOrigin: func(r *http.Request) bool {
		return isAllowedOpsWSOrigin(r)
	},
	// Subprotocol negotiation:
	// - The frontend passes ["sub2api-admin", "jwt.<token>"].
	// - We always select "sub2api-admin" so the token is never echoed back in the handshake response.
	Subprotocols: []string{"sub2api-admin"},
}
// Push/refresh cadence and default connection limits for the QPS WebSocket.
const (
	qpsWSPushInterval       = 2 * time.Second
	qpsWSRefreshInterval    = 5 * time.Second
	qpsWSRequestCountWindow = 1 * time.Minute
	defaultMaxWSConns       = 100
	defaultMaxWSConnsPerIP  = 20
)

// wsConnCount tracks the total number of live ops WebSocket connections.
var wsConnCount atomic.Int32

// wsConnCountByIP tracks live connections per client IP.
var wsConnCountByIP sync.Map // map[string]*atomic.Int32

// qpsWSIdleStopDelay is how long the shared QPS cache keeps running after
// the connection count drops to zero before it is stopped.
const qpsWSIdleStopDelay = 30 * time.Second

// Application-defined WebSocket close codes.
const (
	opsWSCloseRealtimeDisabled = 4001
)

// qpsWSIdleStopMu guards qpsWSIdleStopTimer.
var qpsWSIdleStopMu sync.Mutex
var qpsWSIdleStopTimer *time.Timer
// cancelQPSWSIdleStop disarms any pending idle-stop timer so the shared
// QPS cache stays alive.
func cancelQPSWSIdleStop() {
	qpsWSIdleStopMu.Lock()
	defer qpsWSIdleStopMu.Unlock()
	if t := qpsWSIdleStopTimer; t != nil {
		t.Stop()
		qpsWSIdleStopTimer = nil
	}
}
// scheduleQPSWSIdleStop arms a one-shot timer that stops the shared QPS
// cache after qpsWSIdleStopDelay, unless a client reconnects first
// (cancelQPSWSIdleStop disarms it). Calling this while a timer is already
// armed is a no-op.
func scheduleQPSWSIdleStop() {
	qpsWSIdleStopMu.Lock()
	if qpsWSIdleStopTimer != nil {
		// A stop is already scheduled; keep the existing deadline.
		qpsWSIdleStopMu.Unlock()
		return
	}
	qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() {
		// Only stop if truly idle at fire time.
		if wsConnCount.Load() == 0 {
			qpsWSCache.Stop()
		}
		// Clear the timer slot so a later schedule can arm a fresh one.
		qpsWSIdleStopMu.Lock()
		qpsWSIdleStopTimer = nil
		qpsWSIdleStopMu.Unlock()
	})
	qpsWSIdleStopMu.Unlock()
}
// opsWSRuntimeLimits bounds concurrent ops WebSocket connections.
type opsWSRuntimeLimits struct {
	MaxConns      int32 // global cap across all clients
	MaxConnsPerIP int32 // cap per client IP
}

// opsWSLimits is resolved once from environment variables at startup.
var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv()
// WebSocket I/O tuning: write deadline, pong wait, ping cadence, read cap.
const (
	qpsWSWriteTimeout = 10 * time.Second
	qpsWSPongWait     = 60 * time.Second
	qpsWSPingInterval = 30 * time.Second
	// We don't expect clients to send application messages; we only read to process control frames (Pong/Close).
	qpsWSMaxReadBytes = 1024
)
// opsWSQPSCache periodically computes a QPS/TPS snapshot and caches the
// marshaled WebSocket payload so all connections share a single refresh loop.
type opsWSQPSCache struct {
	refreshInterval     time.Duration // how often the snapshot is recomputed
	requestCountWindow  time.Duration // trailing window used to derive qps/tps
	lastUpdatedUnixNano atomic.Int64  // UnixNano of the last successful refresh
	payload             atomic.Value  // []byte
	opsService          *service.OpsService
	cancel              context.CancelFunc
	done                chan struct{} // closed when the refresh loop goroutine exits
	mu                  sync.Mutex    // guards opsService/cancel/done/running
	running             bool
}

// qpsWSCache is the process-wide shared cache instance.
var qpsWSCache = &opsWSQPSCache{
	refreshInterval:    qpsWSRefreshInterval,
	requestCountWindow: qpsWSRequestCountWindow,
}
// start launches the background refresh loop. It is idempotent: a running
// loop makes it a no-op, and if a previous loop is mid-shutdown it waits
// for that loop to exit before starting a fresh one, so at most one
// refreshLoop runs at any time.
func (c *opsWSQPSCache) start(opsService *service.OpsService) {
	if c == nil || opsService == nil {
		return
	}
	for {
		c.mu.Lock()
		if c.running {
			c.mu.Unlock()
			return
		}
		// If a previous refresh loop is currently stopping, wait for it to fully exit.
		done := c.done
		if done != nil {
			c.mu.Unlock()
			<-done
			c.mu.Lock()
			// Clear the stale channel only if no concurrent start replaced it.
			if c.done == done && !c.running {
				c.done = nil
			}
			c.mu.Unlock()
			continue
		}
		c.opsService = opsService
		ctx, cancel := context.WithCancel(context.Background())
		c.cancel = cancel
		c.done = make(chan struct{})
		done = c.done
		c.running = true
		c.mu.Unlock()
		go func() {
			// Closing done signals Stop (and racing starts) that the loop exited.
			defer close(done)
			c.refreshLoop(ctx)
		}()
		return
	}
}
// Stop stops the background refresh loop.
// It is safe to call multiple times. It blocks until the loop goroutine
// has fully exited, even when another caller already initiated shutdown.
func (c *opsWSQPSCache) Stop() {
	if c == nil {
		return
	}
	c.mu.Lock()
	if !c.running {
		// Not running: if a previous loop is still winding down, wait for it.
		done := c.done
		c.mu.Unlock()
		if done != nil {
			<-done
		}
		return
	}
	// Detach state under the lock, then cancel and wait outside it.
	cancel := c.cancel
	c.cancel = nil
	c.running = false
	c.opsService = nil
	done := c.done
	c.mu.Unlock()
	if cancel != nil {
		cancel()
	}
	if done != nil {
		<-done
	}
	c.mu.Lock()
	// Clear done only if no new start() replaced it while we were waiting.
	if c.done == done && !c.running {
		c.done = nil
	}
	c.mu.Unlock()
}
// refreshLoop refreshes the cached payload once immediately and then on
// every tick of refreshInterval until ctx is cancelled.
func (c *opsWSQPSCache) refreshLoop(ctx context.Context) {
	ticker := time.NewTicker(c.refreshInterval)
	defer ticker.Stop()
	// Prime the cache so the first client doesn't wait a full interval.
	c.refresh(ctx)
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			c.refresh(ctx)
		}
	}
}
// refresh recomputes the QPS/TPS payload from the ops window stats and
// stores the marshaled JSON in the cache. On any failure it logs (when
// there is an error) and leaves the previous payload in place, so readers
// never observe a partial update.
func (c *opsWSQPSCache) refresh(parentCtx context.Context) {
	if c == nil {
		return
	}
	// Snapshot the service pointer under the lock; Stop may nil it out concurrently.
	c.mu.Lock()
	opsService := c.opsService
	c.mu.Unlock()
	if opsService == nil {
		return
	}
	if parentCtx == nil {
		parentCtx = context.Background()
	}
	// Bound the stats query so a slow backend cannot stall the refresh loop.
	ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second)
	defer cancel()
	now := time.Now().UTC()
	stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now)
	if err != nil || stats == nil {
		if err != nil {
			log.Printf("[OpsWS] refresh: get window stats failed: %v", err)
		}
		return
	}
	requestCount := stats.SuccessCount + stats.ErrorCountTotal
	qps := 0.0
	tps := 0.0
	if c.requestCountWindow > 0 {
		// Average over the lookback window, rounded to one decimal place.
		seconds := c.requestCountWindow.Seconds()
		qps = roundTo1DP(float64(requestCount) / seconds)
		tps = roundTo1DP(float64(stats.TokenConsumed) / seconds)
	}
	payload := gin.H{
		"type":      "qps_update",
		"timestamp": now.Format(time.RFC3339),
		"data": gin.H{
			"qps":           qps,
			"tps":           tps,
			"request_count": requestCount,
		},
	}
	msg, err := json.Marshal(payload)
	if err != nil {
		log.Printf("[OpsWS] refresh: marshal payload failed: %v", err)
		return
	}
	// Publish the payload first, then the freshness timestamp.
	c.payload.Store(msg)
	c.lastUpdatedUnixNano.Store(now.UnixNano())
}
// roundTo1DP rounds v to one decimal place, rounding halves away from zero.
func roundTo1DP(v float64) float64 {
	const scale = 10.0
	return math.Round(v*scale) / scale
}
// getPayload returns the most recently cached JSON payload, or nil when no
// refresh has completed yet.
func (c *opsWSQPSCache) getPayload() []byte {
	if c == nil {
		return nil
	}
	stored := c.payload.Load()
	if stored == nil {
		return nil
	}
	buf, ok := stored.([]byte)
	if !ok {
		return nil
	}
	return buf
}
// closeWS sends a best-effort close frame with the given code and reason,
// then closes the underlying connection. Write errors are ignored on
// purpose: the peer may already be gone.
func closeWS(conn *websocket.Conn, code int, reason string) {
	if conn == nil {
		return
	}
	deadline := time.Now().Add(qpsWSWriteTimeout)
	frame := websocket.FormatCloseMessage(code, reason)
	_ = conn.WriteControl(websocket.CloseMessage, frame, deadline)
	_ = conn.Close()
}
// QPSWSHandler handles realtime QPS push via WebSocket.
// GET /api/v1/admin/ops/ws/qps
//
// Flow: resolve client IP -> feature-flag check -> reserve global slot ->
// reserve per-IP slot -> upgrade -> serve. Slots are reserved BEFORE the
// upgrade so limits are enforced strictly.
func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
	// Resolve the client IP up front so the per-IP limit and logs agree.
	clientIP := requestClientIP(c.Request)
	if h == nil || h.opsService == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"})
		return
	}
	// If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close
	// with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops.
	if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
		conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
		if err != nil {
			c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"})
			return
		}
		closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled")
		return
	}
	// A new connection is arriving: abort any pending idle shutdown of the shared cache.
	cancelQPSWSIdleStop()
	// Lazily start the background refresh loop so unit tests that never hit the
	// websocket route don't spawn goroutines that depend on DB/Redis stubs.
	qpsWSCache.start(h.opsService)
	// Reserve a global slot before upgrading the connection to keep the limit strict.
	if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) {
		log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns)
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
		return
	}
	defer func() {
		// Release the global slot; when the last connection leaves, schedule
		// a delayed stop of the shared refresh loop.
		if wsConnCount.Add(-1) == 0 {
			scheduleQPSWSIdleStop()
		}
	}()
	if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" {
		if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) {
			log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP)
			c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
			return
		}
		defer releaseOpsWSIPSlot(clientIP)
	}
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		log.Printf("[OpsWS] upgrade failed: %v", err)
		return
	}
	defer func() {
		_ = conn.Close()
	}()
	handleQPSWebSocket(c.Request.Context(), conn)
}
// tryAcquireOpsWSTotalSlot atomically reserves one global websocket slot.
// Every successful acquire must be paired by the caller with a single
// wsConnCount.Add(-1) on disconnect (QPSWSHandler does this in a defer).
//
// Fix: when limit <= 0 (unlimited) the previous version returned true
// WITHOUT incrementing wsConnCount, while the caller still decremented it
// on exit — driving the counter negative and breaking the "last connection
// left" (== 0) idle-stop check. The counter is now incremented in the
// unlimited case too, keeping acquire/release balanced.
func tryAcquireOpsWSTotalSlot(limit int32) bool {
	if limit <= 0 {
		// Unlimited: still track the connection so the paired decrement
		// and the idle-stop logic stay consistent.
		wsConnCount.Add(1)
		return true
	}
	// CAS loop: reserve a slot only while the count is below the limit.
	for {
		current := wsConnCount.Load()
		if current >= limit {
			return false
		}
		if wsConnCount.CompareAndSwap(current, current+1) {
			return true
		}
	}
}
// tryAcquireOpsWSIPSlot reserves one websocket slot for the given client IP,
// returning false when the per-IP limit is already reached. An empty IP or a
// non-positive limit disables the check (always allowed).
func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool {
	if limit <= 0 || strings.TrimSpace(clientIP) == "" {
		return true
	}
	stored, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{})
	counter, ok := stored.(*atomic.Int32)
	if !ok {
		// Unexpected value type in the map; fail closed.
		return false
	}
	// CAS loop: take a slot only while the per-IP count is below the limit.
	for {
		n := counter.Load()
		if n >= limit {
			return false
		}
		if counter.CompareAndSwap(n, n+1) {
			return true
		}
	}
}
// releaseOpsWSIPSlot releases one per-IP websocket slot previously reserved
// by tryAcquireOpsWSIPSlot, removing the map entry once the counter drains
// to zero. The removal is best-effort: a concurrent acquire may briefly hold
// a counter that was deleted from the map, which can slightly under-count
// that IP until its connections cycle.
func releaseOpsWSIPSlot(clientIP string) {
	if strings.TrimSpace(clientIP) == "" {
		return
	}
	v, ok := wsConnCountByIP.Load(clientIP)
	if !ok {
		return
	}
	counter, ok := v.(*atomic.Int32)
	if !ok {
		return
	}
	next := counter.Add(-1)
	if next <= 0 {
		// Best-effort cleanup; safe even if a new slot was acquired concurrently.
		wsConnCountByIP.Delete(clientIP)
	}
}
// handleQPSWebSocket runs the per-connection read and write loops until the
// client disconnects, a write fails, or parentCtx is cancelled.
//
// A dedicated reader goroutine consumes control frames: pongs extend the
// read deadline, and a peer close frame is forwarded via closeFrameCh so the
// writer loop can echo it back. The writer loop pushes the cached QPS
// payload, sends heartbeat pings, and performs the final close handshake.
func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) {
	if conn == nil {
		return
	}
	ctx, cancel := context.WithCancel(parentCtx)
	defer cancel()
	// closeConn may be triggered from several exit paths; make it idempotent.
	var closeOnce sync.Once
	closeConn := func() {
		closeOnce.Do(func() {
			_ = conn.Close()
		})
	}
	// Buffered so the close handler never blocks the read loop.
	closeFrameCh := make(chan []byte, 1)
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		defer cancel()
		conn.SetReadLimit(qpsWSMaxReadBytes)
		if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil {
			log.Printf("[OpsWS] set read deadline failed: %v", err)
			return
		}
		// Each pong extends the read deadline, keeping healthy clients alive.
		conn.SetPongHandler(func(string) error {
			return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait))
		})
		// Forward the peer's close frame to the writer loop so it can echo it.
		conn.SetCloseHandler(func(code int, text string) error {
			select {
			case closeFrameCh <- websocket.FormatCloseMessage(code, text):
			default:
			}
			cancel()
			return nil
		})
		// Drain the connection; only control frames (handled above) matter.
		for {
			_, _, err := conn.ReadMessage()
			if err != nil {
				if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) {
					log.Printf("[OpsWS] read failed: %v", err)
				}
				return
			}
		}
	}()
	// Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval).
	pushTicker := time.NewTicker(qpsWSPushInterval)
	defer pushTicker.Stop()
	// Heartbeat ping every 30 seconds.
	pingTicker := time.NewTicker(qpsWSPingInterval)
	defer pingTicker.Stop()
	// Every write gets a deadline so a stalled peer cannot block us forever.
	writeWithTimeout := func(messageType int, data []byte) error {
		if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil {
			return err
		}
		return conn.WriteMessage(messageType, data)
	}
	// sendClose echoes the peer's close frame when available, otherwise sends a normal closure.
	sendClose := func(closeFrame []byte) {
		if closeFrame == nil {
			closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
		}
		_ = writeWithTimeout(websocket.CloseMessage, closeFrame)
	}
	for {
		select {
		case <-pushTicker.C:
			msg := qpsWSCache.getPayload()
			if msg == nil {
				// Cache not primed yet; skip this tick rather than send an empty frame.
				continue
			}
			if err := writeWithTimeout(websocket.TextMessage, msg); err != nil {
				log.Printf("[OpsWS] write failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}
		case <-pingTicker.C:
			if err := writeWithTimeout(websocket.PingMessage, nil); err != nil {
				log.Printf("[OpsWS] ping failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}
		case closeFrame := <-closeFrameCh:
			sendClose(closeFrame)
			closeConn()
			wg.Wait()
			return
		case <-ctx.Done():
			// Context cancelled (request done or read loop exited); pick up a
			// pending peer close frame, if any, before replying.
			var closeFrame []byte
			select {
			case closeFrame = <-closeFrameCh:
			default:
			}
			sendClose(closeFrame)
			closeConn()
			wg.Wait()
			return
		}
	}
}
// isAllowedOpsWSOrigin implements a same-host Origin check for the ops
// websocket. Requests without an Origin header follow the configured origin
// policy (strict rejects, permissive accepts); when Origin is present, its
// host must match the request host. X-Forwarded-Host is honored only when
// the direct peer is a trusted proxy. Ports are ignored on both sides.
func isAllowedOpsWSOrigin(r *http.Request) bool {
	if r == nil {
		return false
	}
	origin := strings.TrimSpace(r.Header.Get("Origin"))
	if origin == "" {
		// No Origin header (typically non-browser clients): policy decides.
		switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) {
		case OriginPolicyStrict:
			return false
		case OriginPolicyPermissive, "":
			return true
		default:
			// Unknown policy value: fall back to permissive behavior.
			return true
		}
	}
	parsed, err := url.Parse(origin)
	if err != nil || parsed.Hostname() == "" {
		return false
	}
	originHost := strings.ToLower(parsed.Hostname())
	trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
	reqHost := hostWithoutPort(r.Host)
	if trustProxyHeaders {
		// Behind a trusted proxy the externally visible host is X-Forwarded-Host;
		// use the first entry when multiple proxies appended values.
		xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host"))
		if xfHost != "" {
			xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0])
			if xfHost != "" {
				reqHost = hostWithoutPort(xfHost)
			}
		}
	}
	reqHost = strings.ToLower(reqHost)
	if reqHost == "" {
		return false
	}
	return originHost == reqHost
}
// shouldTrustOpsWSProxyHeaders reports whether proxy-supplied headers
// (X-Forwarded-For / X-Forwarded-Host) may be trusted for this request:
// proxy trust must be enabled AND the direct peer must fall inside one of
// the configured trusted proxy prefixes.
func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
	if r == nil || !opsWSProxyConfig.TrustProxy {
		return false
	}
	peerIP, ok := requestPeerIP(r)
	return ok && isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies)
}
func requestPeerIP(r *http.Request) (netip.Addr, bool) {
if r == nil {
return netip.Addr{}, false
}
host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
if err != nil {
host = strings.TrimSpace(r.RemoteAddr)
}
host = strings.TrimPrefix(host, "[")
host = strings.TrimSuffix(host, "]")
if host == "" {
return netip.Addr{}, false
}
addr, err := netip.ParseAddr(host)
if err != nil {
return netip.Addr{}, false
}
return addr.Unmap(), true
}
// requestClientIP returns the best-known client IP for the request.
// When the direct peer is a trusted proxy, the left-most X-Forwarded-For
// entry wins; otherwise the direct peer address is used. Returns "" when
// nothing valid can be determined.
func requestClientIP(r *http.Request) string {
	if r == nil {
		return ""
	}
	if shouldTrustOpsWSProxyHeaders(r) {
		if xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For")); xff != "" {
			// Left-most entry is the original client; downstream proxies append their own.
			first := strings.TrimSpace(strings.Split(xff, ",")[0])
			first = strings.TrimSuffix(strings.TrimPrefix(first, "["), "]")
			if addr, err := netip.ParseAddr(first); err == nil && addr.IsValid() {
				return addr.Unmap().String()
			}
		}
	}
	if peer, ok := requestPeerIP(r); ok && peer.IsValid() {
		return peer.String()
	}
	return ""
}
func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
if !addr.IsValid() {
return false
}
for _, p := range trusted {
if p.Contains(addr) {
return true
}
}
return false
}
// loadOpsWSProxyConfigFromEnv builds the websocket proxy-trust configuration,
// starting from safe defaults (trust loopback proxies, permissive origin
// policy) and overriding each field from its environment variable when set.
// Invalid values are logged and the default is kept.
func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
	cfg := OpsWSProxyConfig{
		TrustProxy:     true,
		TrustedProxies: defaultTrustedProxies(),
		OriginPolicy:   OriginPolicyPermissive,
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); raw != "" {
		parsed, err := strconv.ParseBool(raw)
		if err != nil {
			log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, raw, cfg.TrustProxy)
		} else {
			cfg.TrustProxy = parsed
		}
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
		prefixes, invalid := parseTrustedProxyList(raw)
		if len(invalid) > 0 {
			log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
		}
		cfg.TrustedProxies = prefixes
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); raw != "" {
		switch policy := strings.ToLower(raw); policy {
		case OriginPolicyStrict, OriginPolicyPermissive:
			cfg.OriginPolicy = policy
		default:
			log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, raw, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
		}
	}
	return cfg
}
// loadOpsWSRuntimeLimitsFromEnv resolves websocket connection limits from
// the environment, falling back to compiled-in defaults. MaxConns must be
// > 0; MaxConnsPerIP accepts 0 (which disables the per-IP check). Invalid
// values are logged and the default is kept.
func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits {
	limits := opsWSRuntimeLimits{
		MaxConns:      defaultMaxWSConns,
		MaxConnsPerIP: defaultMaxWSConnsPerIP,
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); raw != "" {
		parsed, err := strconv.Atoi(raw)
		if err != nil || parsed <= 0 {
			log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, raw, limits.MaxConns)
		} else {
			limits.MaxConns = int32(parsed)
		}
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); raw != "" {
		parsed, err := strconv.Atoi(raw)
		if err != nil || parsed < 0 {
			log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, raw, limits.MaxConnsPerIP)
		} else {
			limits.MaxConnsPerIP = int32(parsed)
		}
	}
	return limits
}
// defaultTrustedProxies returns the loopback-only prefixes (IPv4 and IPv6)
// used when no explicit trusted-proxy list is configured.
func defaultTrustedProxies() []netip.Prefix {
	loopback, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
	return loopback
}
// parseTrustedProxyList parses a comma-separated list of IP addresses and/or
// CIDR prefixes. Bare addresses are widened to single-address prefixes
// (/32 for IPv4, /128 for IPv6) and every result is canonicalized with
// Masked(). Entries that fail to parse are returned in invalid; empty
// tokens are silently skipped.
func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
	parseOne := func(item string) (netip.Prefix, error) {
		if strings.Contains(item, "/") {
			return netip.ParsePrefix(item)
		}
		addr, err := netip.ParseAddr(item)
		if err != nil {
			return netip.Prefix{}, err
		}
		addr = addr.Unmap()
		bits := 128
		if addr.Is4() {
			bits = 32
		}
		return netip.PrefixFrom(addr, bits), nil
	}
	for _, token := range strings.Split(raw, ",") {
		item := strings.TrimSpace(token)
		if item == "" {
			continue
		}
		p, err := parseOne(item)
		if err != nil || !p.IsValid() {
			invalid = append(invalid, item)
			continue
		}
		prefixes = append(prefixes, p.Masked())
	}
	return prefixes, invalid
}
// hostWithoutPort extracts the host part of a "host[:port]" value, stripping
// IPv6 brackets. Inputs it handles: "host:port", "[v6]:port", "[v6]",
// bare hostnames, and bare IPv6 literals.
//
// Fix: a bare, unbracketed IPv6 literal (e.g. "::1", which appears in
// RemoteAddr-derived values) previously fell through to
// strings.Split(hostport, ":")[0] and returned "" — it is now returned
// whole. Behavior for all previously-correct inputs is unchanged.
func hostWithoutPort(hostport string) string {
	hostport = strings.TrimSpace(hostport)
	if hostport == "" {
		return ""
	}
	if host, _, err := net.SplitHostPort(hostport); err == nil {
		return host
	}
	if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
		return strings.Trim(hostport, "[]")
	}
	// Multiple colons without brackets: a bare IPv6 literal, not host:port.
	if strings.Count(hostport, ":") > 1 {
		return hostport
	}
	parts := strings.Split(hostport, ":")
	return parts[0]
}

View File

@@ -19,14 +19,16 @@ type SettingHandler struct {
settingService *service.SettingService
emailService *service.EmailService
turnstileService *service.TurnstileService
opsService *service.OpsService
}
// NewSettingHandler 创建系统设置处理器
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService) *SettingHandler {
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService, opsService *service.OpsService) *SettingHandler {
return &SettingHandler{
settingService: settingService,
emailService: emailService,
turnstileService: turnstileService,
opsService: opsService,
}
}
@@ -39,6 +41,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
return
}
// Check if ops monitoring is enabled (respects config.ops.enabled)
opsEnabled := h.opsService != nil && h.opsService.IsMonitoringEnabled(c.Request.Context())
response.Success(c, dto.SystemSettings{
RegistrationEnabled: settings.RegistrationEnabled,
EmailVerifyEnabled: settings.EmailVerifyEnabled,
@@ -72,6 +77,10 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
FallbackModelAntigravity: settings.FallbackModelAntigravity,
EnableIdentityPatch: settings.EnableIdentityPatch,
IdentityPatchPrompt: settings.IdentityPatchPrompt,
OpsMonitoringEnabled: opsEnabled && settings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: settings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
})
}
@@ -95,7 +104,7 @@ type UpdateSettingsRequest struct {
TurnstileSiteKey string `json:"turnstile_site_key"`
TurnstileSecretKey string `json:"turnstile_secret_key"`
// LinuxDo Connect OAuth 登录(终端用户 SSO
// LinuxDo Connect OAuth 登录
LinuxDoConnectEnabled bool `json:"linuxdo_connect_enabled"`
LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"`
LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"`
@@ -124,6 +133,12 @@ type UpdateSettingsRequest struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault *string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
}
// UpdateSettings 更新系统设置
@@ -208,6 +223,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
}
}
// Ops metrics collector interval validation (seconds).
if req.OpsMetricsIntervalSeconds != nil {
v := *req.OpsMetricsIntervalSeconds
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
req.OpsMetricsIntervalSeconds = &v
}
settings := &service.SystemSettings{
RegistrationEnabled: req.RegistrationEnabled,
EmailVerifyEnabled: req.EmailVerifyEnabled,
@@ -241,6 +268,30 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
FallbackModelAntigravity: req.FallbackModelAntigravity,
EnableIdentityPatch: req.EnableIdentityPatch,
IdentityPatchPrompt: req.IdentityPatchPrompt,
OpsMonitoringEnabled: func() bool {
if req.OpsMonitoringEnabled != nil {
return *req.OpsMonitoringEnabled
}
return previousSettings.OpsMonitoringEnabled
}(),
OpsRealtimeMonitoringEnabled: func() bool {
if req.OpsRealtimeMonitoringEnabled != nil {
return *req.OpsRealtimeMonitoringEnabled
}
return previousSettings.OpsRealtimeMonitoringEnabled
}(),
OpsQueryModeDefault: func() string {
if req.OpsQueryModeDefault != nil {
return *req.OpsQueryModeDefault
}
return previousSettings.OpsQueryModeDefault
}(),
OpsMetricsIntervalSeconds: func() int {
if req.OpsMetricsIntervalSeconds != nil {
return *req.OpsMetricsIntervalSeconds
}
return previousSettings.OpsMetricsIntervalSeconds
}(),
}
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
@@ -290,6 +341,10 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
})
}
@@ -411,6 +466,18 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
if before.IdentityPatchPrompt != after.IdentityPatchPrompt {
changed = append(changed, "identity_patch_prompt")
}
if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled {
changed = append(changed, "ops_monitoring_enabled")
}
if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled {
changed = append(changed, "ops_realtime_monitoring_enabled")
}
if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
changed = append(changed, "ops_query_mode_default")
}
if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
changed = append(changed, "ops_metrics_interval_seconds")
}
return changed
}

View File

@@ -43,6 +43,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
}
type PublicSettings struct {

View File

@@ -15,7 +15,6 @@ import (
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
@@ -89,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
setOpsRequestContext(c, "", false, body)
parsedReq, err := service.ParseGatewayRequest(body)
if err != nil {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
@@ -97,8 +98,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
reqModel := parsedReq.Model
reqStream := parsedReq.Stream
// 设置 Claude Code 客户端标识到 context用于分组限制检查
SetClaudeCodeClientContext(c, body)
setOpsRequestContext(c, reqModel, reqStream, body)
// 验证 model 必填
if reqModel == "" {
@@ -112,15 +112,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 获取订阅信息可能为nil- 提前获取用于后续检查
subscription, _ := middleware2.GetSubscriptionFromContext(c)
// 获取 User-Agent
userAgent := c.Request.UserAgent()
// 获取客户端 IP
clientIP := ip.GetClientIP(c)
// 0. 检查wait队列是否已满
maxWait := service.CalculateMaxWait(subject.Concurrency)
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
// On error, allow request to proceed
@@ -128,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
return
}
// 确保在函数退出时减少wait计数
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
if err == nil && canWait {
waitCounted = true
}
// Ensure we decrement if we exit before acquiring the user slot.
defer func() {
if waitCounted {
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
}
}()
// 1. 首先获取用户并发槽位
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
@@ -138,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
h.handleConcurrencyError(c, err, "user", streamStarted)
return
}
// User slot acquired: no longer waiting in the queue.
if waitCounted {
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
waitCounted = false
}
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -184,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
account := selection.Account
setOpsSelectedAccount(c, account.ID)
// 检查预热请求拦截(在账号选择后、转发前检查)
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
@@ -200,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 3. 获取账号并发槽位
accountReleaseFunc := selection.ReleaseFunc
var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -213,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
} else {
// Only set release function if increment succeeded
accountWaitRelease = func() {
}
if err == nil && canWait {
accountWaitCounted = true
}
// Ensure the wait counter is decremented if we exit before acquiring the slot.
defer func() {
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
}
}()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -229,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
&streamStarted,
)
if err != nil {
if accountWaitRelease != nil {
accountWaitRelease()
}
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
// Slot acquired: no longer waiting in queue.
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
accountWaitCounted = false
}
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 转发请求 - 根据账号平台分流
var result *service.ForwardResult
@@ -254,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
if accountWaitRelease != nil {
accountWaitRelease()
}
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -277,7 +287,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
}
// 异步记录使用量subscription已在函数开头获取
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
go func(result *service.ForwardResult, usedAccount *service.Account) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
@@ -286,12 +296,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
User: apiKey.User,
Account: usedAccount,
Subscription: subscription,
UserAgent: ua,
IPAddress: cip,
}); err != nil {
log.Printf("Record usage failed: %v", err)
}
}(result, account, userAgent, clientIP)
}(result, account)
return
}
}
@@ -313,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
account := selection.Account
setOpsSelectedAccount(c, account.ID)
// 检查预热请求拦截(在账号选择后、转发前检查)
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
@@ -329,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 3. 获取账号并发槽位
accountReleaseFunc := selection.ReleaseFunc
var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -342,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
} else {
// Only set release function if increment succeeded
accountWaitRelease = func() {
}
if err == nil && canWait {
accountWaitCounted = true
}
defer func() {
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
}
}()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -358,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
&streamStarted,
)
if err != nil {
if accountWaitRelease != nil {
accountWaitRelease()
}
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
accountWaitCounted = false
}
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 转发请求 - 根据账号平台分流
var result *service.ForwardResult
@@ -383,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
if accountWaitRelease != nil {
accountWaitRelease()
}
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -406,7 +415,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
}
// 异步记录使用量subscription已在函数开头获取
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
go func(result *service.ForwardResult, usedAccount *service.Account) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
@@ -415,12 +424,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
User: apiKey.User,
Account: usedAccount,
Subscription: subscription,
UserAgent: ua,
IPAddress: cip,
}); err != nil {
log.Printf("Record usage failed: %v", err)
}
}(result, account, userAgent, clientIP)
}(result, account)
return
}
}
@@ -686,21 +693,22 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
return
}
setOpsRequestContext(c, "", false, body)
parsedReq, err := service.ParseGatewayRequest(body)
if err != nil {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
return
}
// 设置 Claude Code 客户端标识到 context用于分组限制检查
SetClaudeCodeClientContext(c, body)
// 验证 model 必填
if parsedReq.Model == "" {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
return
}
setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body)
// 获取订阅信息可能为nil
subscription, _ := middleware2.GetSubscriptionFromContext(c)
@@ -721,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
return
}
setOpsSelectedAccount(c, account.ID)
// 转发请求(不记录使用量)
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {

View File

@@ -12,7 +12,6 @@ import (
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
"github.com/Wei-Shaw/sub2api/internal/pkg/googleapi"
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
@@ -162,28 +161,32 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
return
}
setOpsRequestContext(c, modelName, stream, body)
// Get subscription (may be nil)
subscription, _ := middleware.GetSubscriptionFromContext(c)
// 获取 User-Agent
userAgent := c.Request.UserAgent()
// 获取客户端 IP
clientIP := ip.GetClientIP(c)
// For Gemini native API, do not send Claude-style ping frames.
geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0)
// 0) wait queue check
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
} else if !canWait {
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
return
}
defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
if err == nil && canWait {
waitCounted = true
}
defer func() {
if waitCounted {
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
}
}()
// 1) user concurrency slot
streamStarted := false
@@ -192,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
googleError(c, http.StatusTooManyRequests, err.Error())
return
}
if waitCounted {
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
waitCounted = false
}
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -207,10 +214,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
// 3) select account (sticky session based on request body)
parsedReq, _ := service.ParseGatewayRequest(body)
// 设置 Claude Code 客户端标识到 context用于分组限制检查
SetClaudeCodeClientContext(c, body)
sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)
sessionKey := sessionHash
if sessionHash != "" {
@@ -232,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
return
}
account := selection.Account
setOpsSelectedAccount(c, account.ID)
// 4) account concurrency slot
accountReleaseFunc := selection.ReleaseFunc
var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
return
}
accountWaitCounted := false
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -248,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
return
} else {
// Only set release function if increment succeeded
accountWaitRelease = func() {
}
if err == nil && canWait {
accountWaitCounted = true
}
defer func() {
if accountWaitCounted {
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
}
}()
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
c,
@@ -264,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
&streamStarted,
)
if err != nil {
if accountWaitRelease != nil {
accountWaitRelease()
}
googleError(c, http.StatusTooManyRequests, err.Error())
return
}
if accountWaitCounted {
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
accountWaitCounted = false
}
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 5) forward (根据平台分流)
var result *service.ForwardResult
@@ -288,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
if accountWaitRelease != nil {
accountWaitRelease()
}
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -311,7 +315,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
}
// 6) record usage async
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
go func(result *service.ForwardResult, usedAccount *service.Account) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
@@ -320,12 +324,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
User: apiKey.User,
Account: usedAccount,
Subscription: subscription,
UserAgent: ua,
IPAddress: cip,
}); err != nil {
log.Printf("Record usage failed: %v", err)
}
}(result, account, userAgent, clientIP)
}(result, account)
return
}
}

View File

@@ -18,6 +18,7 @@ type AdminHandlers struct {
Redeem *admin.RedeemHandler
Promo *admin.PromoHandler
Setting *admin.SettingHandler
Ops *admin.OpsHandler
System *admin.SystemHandler
Subscription *admin.SubscriptionHandler
Usage *admin.UsageHandler

View File

@@ -12,7 +12,6 @@ import (
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
@@ -77,6 +76,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
return
}
setOpsRequestContext(c, "", false, body)
// Parse request body to map for potential modification
var reqBody map[string]any
if err := json.Unmarshal(body, &reqBody); err != nil {
@@ -95,10 +96,6 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
userAgent := c.GetHeader("User-Agent")
// 获取客户端 IP
clientIP := ip.GetClientIP(c)
if !openai.IsCodexCLIRequest(userAgent) {
existingInstructions, _ := reqBody["instructions"].(string)
if strings.TrimSpace(existingInstructions) == "" {
@@ -114,6 +111,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
}
setOpsRequestContext(c, reqModel, reqStream, body)
// Track if we've started streaming (for error handling)
streamStarted := false
@@ -123,6 +122,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
// 0. Check if wait queue is full
maxWait := service.CalculateMaxWait(subject.Concurrency)
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
// On error, allow request to proceed
@@ -130,8 +130,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
return
}
// Ensure wait count is decremented when function exits
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
if err == nil && canWait {
waitCounted = true
}
defer func() {
if waitCounted {
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
}
}()
// 1. First acquire user concurrency slot
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
@@ -140,6 +146,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
h.handleConcurrencyError(c, err, "user", streamStarted)
return
}
// User slot acquired: no longer waiting.
if waitCounted {
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
waitCounted = false
}
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -177,15 +188,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
account := selection.Account
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
setOpsSelectedAccount(c, account.ID)
// 3. Acquire account concurrency slot
accountReleaseFunc := selection.ReleaseFunc
var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -193,12 +205,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
} else {
// Only set release function if increment succeeded
accountWaitRelease = func() {
}
if err == nil && canWait {
accountWaitCounted = true
}
defer func() {
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
}
}()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -209,29 +224,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
&streamStarted,
)
if err != nil {
if accountWaitRelease != nil {
accountWaitRelease()
}
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
accountWaitCounted = false
}
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// Forward request
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
if accountReleaseFunc != nil {
accountReleaseFunc()
}
if accountWaitRelease != nil {
accountWaitRelease()
}
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -252,7 +264,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
// Async record usage
go func(result *service.OpenAIForwardResult, usedAccount *service.Account, ua string, cip string) {
go func(result *service.OpenAIForwardResult, usedAccount *service.Account) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{
@@ -261,12 +273,10 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
User: apiKey.User,
Account: usedAccount,
Subscription: subscription,
UserAgent: ua,
IPAddress: cip,
}); err != nil {
log.Printf("Record usage failed: %v", err)
}
}(result, account, userAgent, clientIP)
}(result, account)
return
}
}

View File

@@ -0,0 +1,954 @@
package handler
import (
"bytes"
"context"
"encoding/json"
"log"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// Gin context keys used to hand per-request gateway metadata to the ops
// error-logging middleware.
const (
	opsModelKey       = "ops_model"        // requested model name (string)
	opsStreamKey      = "ops_stream"       // whether the client requested streaming (bool)
	opsRequestBodyKey = "ops_request_body" // raw request body ([]byte); set only when non-empty
	opsAccountIDKey   = "ops_account_id"   // selected upstream account ID (int64)
)
// Tuning knobs for the asynchronous error-log worker pool.
const (
	opsErrorLogTimeout            = 5 * time.Second  // per-job persistence timeout
	opsErrorLogDrainTimeout       = 10 * time.Second // max wait for workers to drain on shutdown
	opsErrorLogMinWorkerCount     = 4                 // lower bound on worker goroutines
	opsErrorLogMaxWorkerCount     = 32                // upper bound on worker goroutines
	opsErrorLogQueueSizePerWorker = 128               // queue capacity scales with worker count
	opsErrorLogMinQueueSize       = 256               // queue capacity lower bound
	opsErrorLogMaxQueueSize       = 8192              // queue capacity upper bound
)
// opsErrorLogJob is one unit of work for the async error-log workers: a
// single error entry (plus the captured request body) to persist through the
// ops service.
type opsErrorLogJob struct {
	ops         *service.OpsService
	entry       *service.OpsInsertErrorLogInput
	requestBody []byte
}
// Package-level state for the lazily started ops error-log worker pool.
var (
	opsErrorLogOnce      sync.Once // guards one-time worker startup
	opsErrorLogQueue     chan opsErrorLogJob
	opsErrorLogStopOnce  sync.Once // guards one-time shutdown
	opsErrorLogWorkersWg sync.WaitGroup
	// opsErrorLogMu protects opsErrorLogQueue and opsErrorLogStopping so the
	// non-blocking enqueue can never race with the channel being closed.
	opsErrorLogMu       sync.RWMutex
	opsErrorLogStopping bool
	// Counters surfaced through the OpsErrorLog* getter functions.
	opsErrorLogQueueLen      atomic.Int64
	opsErrorLogEnqueued      atomic.Int64
	opsErrorLogDropped       atomic.Int64
	opsErrorLogProcessed     atomic.Int64
	opsErrorLogLastDropLogAt atomic.Int64 // unix seconds of the last "queue full" log (rate limiting)
	opsErrorLogShutdownCh    = make(chan struct{})
	opsErrorLogShutdownOnce  sync.Once
	opsErrorLogDrained       atomic.Bool // whether shutdown drained all workers in time
)
// startOpsErrorLogWorkers creates the job queue and spawns the worker
// goroutines. It is invoked exactly once (via opsErrorLogOnce) and becomes a
// no-op if shutdown has already begun.
func startOpsErrorLogWorkers() {
	opsErrorLogMu.Lock()
	defer opsErrorLogMu.Unlock()
	if opsErrorLogStopping {
		return
	}
	workerCount, queueSize := opsErrorLogConfig()
	opsErrorLogQueue = make(chan opsErrorLogJob, queueSize)
	opsErrorLogQueueLen.Store(0)
	opsErrorLogWorkersWg.Add(workerCount)
	for i := 0; i < workerCount; i++ {
		go func() {
			defer opsErrorLogWorkersWg.Done()
			// NOTE(review): the range expression reads the package variable
			// without holding opsErrorLogMu; it is evaluated once at loop
			// start, before stopOpsErrorLogWorkers can nil it in practice,
			// but a formal race is conceivable — confirm with -race.
			for job := range opsErrorLogQueue {
				opsErrorLogQueueLen.Add(-1)
				if job.ops == nil || job.entry == nil {
					continue
				}
				// Each job runs inside its own function so a panic in
				// RecordError kills only this job, not the worker.
				func() {
					defer func() {
						if r := recover(); r != nil {
							log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack())
						}
					}()
					ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
					// Best-effort persistence; errors are intentionally dropped.
					_ = job.ops.RecordError(ctx, job.entry, job.requestBody)
					cancel()
					opsErrorLogProcessed.Add(1)
				}()
			}
		}()
	}
}
// enqueueOpsErrorLog hands an error entry to the background worker pool
// without ever blocking the request path: when the queue is full the job is
// dropped (counted, and logged at most once a minute).
//
// Lock discipline: holding the read lock across the send guarantees the
// queue channel cannot be closed concurrently by stopOpsErrorLogWorkers
// (which takes the write lock before closing), so the non-blocking send can
// never panic on a closed channel.
func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) {
	if ops == nil || entry == nil {
		return
	}
	// Fast path: refuse new work once shutdown has been signalled.
	select {
	case <-opsErrorLogShutdownCh:
		return
	default:
	}
	opsErrorLogMu.RLock()
	stopping := opsErrorLogStopping
	opsErrorLogMu.RUnlock()
	if stopping {
		return
	}
	// Lazily start the worker pool on first use.
	opsErrorLogOnce.Do(startOpsErrorLogWorkers)
	opsErrorLogMu.RLock()
	defer opsErrorLogMu.RUnlock()
	if opsErrorLogStopping || opsErrorLogQueue == nil {
		return
	}
	select {
	case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}:
		opsErrorLogQueueLen.Add(1)
		opsErrorLogEnqueued.Add(1)
	default:
		// Queue is full; drop to avoid blocking request handling.
		opsErrorLogDropped.Add(1)
		maybeLogOpsErrorLogDrop()
	}
}
// StopOpsErrorLogWorkers signals shutdown, closes the queue, and waits up to
// opsErrorLogDrainTimeout for the workers to drain. It is idempotent: only
// the first call performs the shutdown, and every call returns whether the
// drain completed in time.
func StopOpsErrorLogWorkers() bool {
	opsErrorLogStopOnce.Do(func() {
		opsErrorLogShutdownOnce.Do(func() {
			close(opsErrorLogShutdownCh)
		})
		opsErrorLogDrained.Store(stopOpsErrorLogWorkers())
	})
	return opsErrorLogDrained.Load()
}
// stopOpsErrorLogWorkers performs the actual shutdown: it marks the pool as
// stopping, closes the queue channel (so workers exit their range loops) and
// waits for them to finish. Returns true when all workers drained within
// opsErrorLogDrainTimeout, false on timeout.
func stopOpsErrorLogWorkers() bool {
	opsErrorLogMu.Lock()
	opsErrorLogStopping = true
	ch := opsErrorLogQueue
	if ch != nil {
		close(ch)
	}
	opsErrorLogQueue = nil
	opsErrorLogMu.Unlock()
	if ch == nil {
		// Workers never started; nothing to drain.
		opsErrorLogQueueLen.Store(0)
		return true
	}
	done := make(chan struct{})
	go func() {
		opsErrorLogWorkersWg.Wait()
		close(done)
	}()
	select {
	case <-done:
		opsErrorLogQueueLen.Store(0)
		return true
	case <-time.After(opsErrorLogDrainTimeout):
		// NOTE(review): on timeout the waiter goroutine stays alive until
		// the workers eventually finish; acceptable at process shutdown.
		return false
	}
}
// OpsErrorLogQueueLength reports how many jobs are currently queued.
func OpsErrorLogQueueLength() int64 { return opsErrorLogQueueLen.Load() }

// OpsErrorLogQueueCapacity reports the capacity of the active job queue, or
// 0 when the worker pool has not started (or has been shut down).
func OpsErrorLogQueueCapacity() int {
	opsErrorLogMu.RLock()
	q := opsErrorLogQueue
	opsErrorLogMu.RUnlock()
	if q == nil {
		return 0
	}
	return cap(q)
}

// OpsErrorLogDroppedTotal reports how many jobs were dropped on a full queue.
func OpsErrorLogDroppedTotal() int64 { return opsErrorLogDropped.Load() }

// OpsErrorLogEnqueuedTotal reports how many jobs were accepted into the queue.
func OpsErrorLogEnqueuedTotal() int64 { return opsErrorLogEnqueued.Load() }

// OpsErrorLogProcessedTotal reports how many jobs the workers have persisted.
func OpsErrorLogProcessedTotal() int64 { return opsErrorLogProcessed.Load() }
// maybeLogOpsErrorLogDrop logs a "queue full" warning at most once every 60
// seconds. A CAS loop on the last-logged timestamp elects exactly one logger
// among concurrent droppers.
func maybeLogOpsErrorLogDrop() {
	now := time.Now().Unix()
	for {
		last := opsErrorLogLastDropLogAt.Load()
		if last != 0 && now-last < 60 {
			return // someone else logged recently enough
		}
		if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) {
			break // we won the race; proceed to log
		}
		// CAS lost: re-read and re-check the window.
	}
	queued := opsErrorLogQueueLen.Load()
	queueCap := OpsErrorLogQueueCapacity()
	log.Printf(
		"[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)",
		queued,
		queueCap,
		opsErrorLogEnqueued.Load(),
		opsErrorLogDropped.Load(),
		opsErrorLogProcessed.Load(),
	)
}
// opsErrorLogConfig derives the worker-pool size and the queue capacity from
// the available parallelism, each clamped to its configured bounds.
func opsErrorLogConfig() (workerCount int, queueSize int) {
	clamp := func(v, lo, hi int) int {
		if v < lo {
			return lo
		}
		if v > hi {
			return hi
		}
		return v
	}
	workerCount = clamp(runtime.GOMAXPROCS(0)*2, opsErrorLogMinWorkerCount, opsErrorLogMaxWorkerCount)
	queueSize = clamp(workerCount*opsErrorLogQueueSizePerWorker, opsErrorLogMinQueueSize, opsErrorLogMaxQueueSize)
	return workerCount, queueSize
}
// setOpsRequestContext stashes the request's model, stream flag and raw body
// on the gin context for later use by the ops error-logging middleware.
func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) {
	if c == nil {
		return
	}
	c.Set(opsModelKey, model)
	c.Set(opsStreamKey, stream)
	// An empty body is not stored so the middleware can distinguish "absent".
	if len(requestBody) == 0 {
		return
	}
	c.Set(opsRequestBodyKey, requestBody)
}
// setOpsSelectedAccount records the upstream account chosen for this request
// on the gin context; non-positive IDs are ignored.
func setOpsSelectedAccount(c *gin.Context, accountID int64) {
	if c == nil {
		return
	}
	if accountID <= 0 {
		return
	}
	c.Set(opsAccountIDKey, accountID)
}
// opsCaptureWriter wraps gin's ResponseWriter and mirrors up to limit bytes
// of the response body into buf — but only once the response status is an
// error (>= 400), so successful traffic pays no buffering cost.
type opsCaptureWriter struct {
	gin.ResponseWriter
	limit int          // maximum number of body bytes to retain
	buf   bytes.Buffer // captured (truncated) error body
}
// Write copies the outgoing chunk into the capture buffer (up to the
// configured limit) when the response status signals an error, then forwards
// the chunk unchanged to the underlying writer.
func (w *opsCaptureWriter) Write(b []byte) (int, error) {
	if w.Status() >= 400 && w.limit > 0 {
		if room := w.limit - w.buf.Len(); room > 0 {
			chunk := b
			if len(chunk) > room {
				chunk = chunk[:room]
			}
			_, _ = w.buf.Write(chunk)
		}
	}
	return w.ResponseWriter.Write(b)
}
// WriteString mirrors the string into the capture buffer (up to the
// configured limit) when the response status signals an error, then forwards
// it unchanged to the underlying writer.
func (w *opsCaptureWriter) WriteString(s string) (int, error) {
	if w.Status() >= 400 && w.limit > 0 {
		if room := w.limit - w.buf.Len(); room > 0 {
			chunk := s
			if len(chunk) > room {
				chunk = chunk[:room]
			}
			_, _ = w.buf.WriteString(chunk)
		}
	}
	return w.ResponseWriter.WriteString(s)
}
// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs.
//
// Notes:
//   - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic.
//   - Streaming errors after the response has started (SSE) may still need explicit logging.
//   - Persistence is asynchronous (enqueueOpsErrorLog); this middleware never blocks the response.
func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
	return func(c *gin.Context) {
		// Swap in a capturing writer so the error body can be inspected after c.Next().
		w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024}
		c.Writer = w
		c.Next()
		if ops == nil {
			return
		}
		if !ops.IsMonitoringEnabled(c.Request.Context()) {
			return
		}
		status := c.Writer.Status()
		if status < 400 {
			// Even when the client request succeeds, we still want to persist upstream error attempts
			// (retries/failover) so ops can observe upstream instability that gets "covered" by retries.
			var events []*service.OpsUpstreamErrorEvent
			if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
				if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 {
					events = arr
				}
			}
			// Also accept single upstream fields set by gateway services (rare for successful requests).
			hasUpstreamContext := len(events) > 0
			if !hasUpstreamContext {
				if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
					switch t := v.(type) {
					case int:
						hasUpstreamContext = t > 0
					case int64:
						hasUpstreamContext = t > 0
					}
				}
			}
			if !hasUpstreamContext {
				if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
					if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
						hasUpstreamContext = true
					}
				}
			}
			if !hasUpstreamContext {
				if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
					if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
						hasUpstreamContext = true
					}
				}
			}
			// No upstream error context at all: nothing worth recording for a 2xx/3xx.
			if !hasUpstreamContext {
				return
			}
			apiKey, _ := middleware2.GetAPIKeyFromContext(c)
			clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
			model, _ := c.Get(opsModelKey)
			streamV, _ := c.Get(opsStreamKey)
			accountIDV, _ := c.Get(opsAccountIDKey)
			var modelName string
			if s, ok := model.(string); ok {
				modelName = s
			}
			stream := false
			if b, ok := streamV.(bool); ok {
				stream = b
			}
			// Prefer showing the account that experienced the upstream error (if we have events),
			// otherwise fall back to the final selected account (best-effort).
			var accountID *int64
			if len(events) > 0 {
				if last := events[len(events)-1]; last != nil && last.AccountID > 0 {
					v := last.AccountID
					accountID = &v
				}
			}
			if accountID == nil {
				if v, ok := accountIDV.(int64); ok && v > 0 {
					accountID = &v
				}
			}
			fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
			platform := resolveOpsPlatform(apiKey, fallbackPlatform)
			requestID := c.Writer.Header().Get("X-Request-Id")
			if requestID == "" {
				requestID = c.Writer.Header().Get("x-request-id")
			}
			// Best-effort backfill single upstream fields from the last event (if present).
			var upstreamStatusCode *int
			var upstreamErrorMessage *string
			var upstreamErrorDetail *string
			if len(events) > 0 {
				last := events[len(events)-1]
				if last != nil {
					if last.UpstreamStatusCode > 0 {
						code := last.UpstreamStatusCode
						upstreamStatusCode = &code
					}
					if msg := strings.TrimSpace(last.Message); msg != "" {
						upstreamErrorMessage = &msg
					}
					if detail := strings.TrimSpace(last.Detail); detail != "" {
						upstreamErrorDetail = &detail
					}
				}
			}
			if upstreamStatusCode == nil {
				if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
					switch t := v.(type) {
					case int:
						if t > 0 {
							code := t
							upstreamStatusCode = &code
						}
					case int64:
						if t > 0 {
							code := int(t)
							upstreamStatusCode = &code
						}
					}
				}
			}
			if upstreamErrorMessage == nil {
				if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
					if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
						msg := strings.TrimSpace(s)
						upstreamErrorMessage = &msg
					}
				}
			}
			if upstreamErrorDetail == nil {
				if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
					if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
						detail := strings.TrimSpace(s)
						upstreamErrorDetail = &detail
					}
				}
			}
			// If we still have nothing meaningful, skip.
			if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 {
				return
			}
			effectiveUpstreamStatus := 0
			if upstreamStatusCode != nil {
				effectiveUpstreamStatus = *upstreamStatusCode
			}
			// Synthesize a human-readable summary for the recovered upstream failure.
			recoveredMsg := "Recovered upstream error"
			if effectiveUpstreamStatus > 0 {
				recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus)
			}
			if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" {
				recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage)
			}
			recoveredMsg = truncateString(recoveredMsg, 2048)
			entry := &service.OpsInsertErrorLogInput{
				RequestID:       requestID,
				ClientRequestID: clientRequestID,
				AccountID:       accountID,
				Platform:        platform,
				Model:           modelName,
				RequestPath: func() string {
					if c.Request != nil && c.Request.URL != nil {
						return c.Request.URL.Path
					}
					return ""
				}(),
				Stream:            stream,
				UserAgent:         c.GetHeader("User-Agent"),
				ErrorPhase:        "upstream",
				ErrorType:         "upstream_error",
				// Severity/retryability should reflect the upstream failure, not the final client status (200).
				Severity:             classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
				StatusCode:           status,
				IsBusinessLimited:    false,
				ErrorMessage:         recoveredMsg,
				ErrorBody:            "",
				ErrorSource:          "upstream_http",
				ErrorOwner:           "provider",
				UpstreamStatusCode:   upstreamStatusCode,
				UpstreamErrorMessage: upstreamErrorMessage,
				UpstreamErrorDetail:  upstreamErrorDetail,
				UpstreamErrors:       events,
				IsRetryable:          classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus),
				RetryCount:           0,
				CreatedAt:            time.Now(),
			}
			if apiKey != nil {
				entry.APIKeyID = &apiKey.ID
				if apiKey.User != nil {
					entry.UserID = &apiKey.User.ID
				}
				if apiKey.GroupID != nil {
					entry.GroupID = apiKey.GroupID
				}
				// Prefer group platform if present (more stable than inferring from path).
				if apiKey.Group != nil && apiKey.Group.Platform != "" {
					entry.Platform = apiKey.Group.Platform
				}
			}
			var clientIP string
			if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
				clientIP = ip
				entry.ClientIP = &clientIP
			}
			var requestBody []byte
			if v, ok := c.Get(opsRequestBodyKey); ok {
				if b, ok := v.([]byte); ok && len(b) > 0 {
					requestBody = b
				}
			}
			// Store request headers/body only when an upstream error occurred to keep overhead minimal.
			entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
			enqueueOpsErrorLog(ops, entry, requestBody)
			return
		}
		// Client-visible error path (status >= 400): parse the captured body
		// and classify the failure for ops analytics.
		body := w.buf.Bytes()
		parsed := parseOpsErrorResponse(body)
		apiKey, _ := middleware2.GetAPIKeyFromContext(c)
		clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
		model, _ := c.Get(opsModelKey)
		streamV, _ := c.Get(opsStreamKey)
		accountIDV, _ := c.Get(opsAccountIDKey)
		var modelName string
		if s, ok := model.(string); ok {
			modelName = s
		}
		stream := false
		if b, ok := streamV.(bool); ok {
			stream = b
		}
		var accountID *int64
		if v, ok := accountIDV.(int64); ok && v > 0 {
			accountID = &v
		}
		fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
		platform := resolveOpsPlatform(apiKey, fallbackPlatform)
		requestID := c.Writer.Header().Get("X-Request-Id")
		if requestID == "" {
			requestID = c.Writer.Header().Get("x-request-id")
		}
		phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code)
		isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message)
		errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
		errorSource := classifyOpsErrorSource(phase, parsed.Message)
		entry := &service.OpsInsertErrorLogInput{
			RequestID:       requestID,
			ClientRequestID: clientRequestID,
			AccountID:       accountID,
			Platform:        platform,
			Model:           modelName,
			RequestPath: func() string {
				if c.Request != nil && c.Request.URL != nil {
					return c.Request.URL.Path
				}
				return ""
			}(),
			Stream:            stream,
			UserAgent:         c.GetHeader("User-Agent"),
			ErrorPhase:        phase,
			ErrorType:         normalizeOpsErrorType(parsed.ErrorType, parsed.Code),
			Severity:          classifyOpsSeverity(parsed.ErrorType, status),
			StatusCode:        status,
			IsBusinessLimited: isBusinessLimited,
			ErrorMessage:      parsed.Message,
			// Keep the full captured error body (capture is already capped at 64KB) so the
			// service layer can sanitize JSON before truncating for storage.
			ErrorBody:   string(body),
			ErrorSource: errorSource,
			ErrorOwner:  errorOwner,
			IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status),
			RetryCount:  0,
			CreatedAt:   time.Now(),
		}
		// Capture upstream error context set by gateway services (if present).
		// This does NOT affect the client response; it enriches Ops troubleshooting data.
		{
			if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
				switch t := v.(type) {
				case int:
					if t > 0 {
						code := t
						entry.UpstreamStatusCode = &code
					}
				case int64:
					if t > 0 {
						code := int(t)
						entry.UpstreamStatusCode = &code
					}
				}
			}
			if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
				if s, ok := v.(string); ok {
					if msg := strings.TrimSpace(s); msg != "" {
						entry.UpstreamErrorMessage = &msg
					}
				}
			}
			if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
				if s, ok := v.(string); ok {
					if detail := strings.TrimSpace(s); detail != "" {
						entry.UpstreamErrorDetail = &detail
					}
				}
			}
			if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
				if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 {
					entry.UpstreamErrors = events
					// Best-effort backfill the single upstream fields from the last event when missing.
					last := events[len(events)-1]
					if last != nil {
						if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 {
							code := last.UpstreamStatusCode
							entry.UpstreamStatusCode = &code
						}
						if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" {
							msg := strings.TrimSpace(last.Message)
							entry.UpstreamErrorMessage = &msg
						}
						if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" {
							detail := strings.TrimSpace(last.Detail)
							entry.UpstreamErrorDetail = &detail
						}
					}
				}
			}
		}
		if apiKey != nil {
			entry.APIKeyID = &apiKey.ID
			if apiKey.User != nil {
				entry.UserID = &apiKey.User.ID
			}
			if apiKey.GroupID != nil {
				entry.GroupID = apiKey.GroupID
			}
			// Prefer group platform if present (more stable than inferring from path).
			if apiKey.Group != nil && apiKey.Group.Platform != "" {
				entry.Platform = apiKey.Group.Platform
			}
		}
		var clientIP string
		if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
			clientIP = ip
			entry.ClientIP = &clientIP
		}
		var requestBody []byte
		if v, ok := c.Get(opsRequestBodyKey); ok {
			if b, ok := v.([]byte); ok && len(b) > 0 {
				requestBody = b
			}
		}
		// Persist only a minimal, whitelisted set of request headers to improve retry fidelity.
		// Do NOT store Authorization/Cookie/etc.
		entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
		enqueueOpsErrorLog(ops, entry, requestBody)
	}
}
// opsRetryRequestHeaderAllowlist is the minimal set of request headers stored
// alongside an error log so failed requests can be retried faithfully.
// Sensitive headers (Authorization, Cookie, ...) are intentionally excluded.
var opsRetryRequestHeaderAllowlist = []string{
	"anthropic-beta",
	"anthropic-version",
}
// extractOpsRetryRequestHeaders collects the allowlisted request headers as a
// JSON object string. Returns nil when no allowlisted header is present (or
// on marshal failure).
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
	if c == nil || c.Request == nil {
		return nil
	}
	collected := make(map[string]string, len(opsRetryRequestHeaderAllowlist))
	for _, name := range opsRetryRequestHeaderAllowlist {
		value := strings.TrimSpace(c.GetHeader(name))
		if value == "" {
			continue
		}
		// Keep headers small even if a client sends something unexpected.
		collected[name] = truncateString(value, 512)
	}
	if len(collected) == 0 {
		return nil
	}
	raw, err := json.Marshal(collected)
	if err != nil {
		return nil
	}
	encoded := string(raw)
	return &encoded
}
// parsedOpsError is the normalized shape extracted from an error response
// body: provider error type, human-readable message and an optional
// business/status code.
type parsedOpsError struct {
	ErrorType string
	Message   string
	Code      string
}
// parseOpsErrorResponse extracts a normalized error triple (type, message,
// code) from an error response body. It understands:
//   - Claude/OpenAI-style: {"type":"error","error":{"type","message"}}
//   - Gemini googleError:  {"error":{"code","message","status"}}
//   - APIKeyAuth-style:    {"code":"INSUFFICIENT_BALANCE","message":"..."}
//
// Non-JSON bodies are returned verbatim (truncated) in Message.
func parseOpsErrorResponse(body []byte) parsedOpsError {
	if len(body) == 0 {
		return parsedOpsError{}
	}
	// Fast path: attempt to decode into a generic map.
	var m map[string]any
	if err := json.Unmarshal(body, &m); err != nil {
		return parsedOpsError{Message: truncateString(string(body), 1024)}
	}
	// Nested "error" object (Claude/OpenAI gateway errors and Gemini googleError).
	if errObj, ok := m["error"].(map[string]any); ok {
		t, _ := errObj["type"].(string)
		msg, _ := errObj["message"].(string)
		// Fix: the original re-read errObj["message"] with the identical type
		// assertion when msg was empty — a no-op; removed.
		if t == "" {
			// Gemini errors carry no "type" field.
			t = "api_error"
		}
		// For gemini errors, capture the numeric code as a string so
		// business-limited mapping can use it if needed.
		var code string
		switch n := errObj["code"].(type) {
		case float64:
			code = strconv.Itoa(int(n))
		case int:
			code = strconv.Itoa(n)
		}
		return parsedOpsError{ErrorType: t, Message: msg, Code: code}
	}
	// APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." }
	code, _ := m["code"].(string)
	msg, _ := m["message"].(string)
	if code != "" || msg != "" {
		return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code}
	}
	return parsedOpsError{Message: truncateString(string(body), 1024)}
}
// resolveOpsPlatform picks the platform from the key's group when available,
// otherwise returns the caller-provided fallback.
func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
	if apiKey == nil || apiKey.Group == nil || apiKey.Group.Platform == "" {
		return fallback
	}
	return apiKey.Group.Platform
}
// guessPlatformFromPath infers the upstream platform from the request path
// (best-effort; returns "" when the path matches no known route shape).
func guessPlatformFromPath(path string) string {
	lower := strings.ToLower(path)
	if strings.HasPrefix(lower, "/antigravity/") {
		return service.PlatformAntigravity
	}
	if strings.HasPrefix(lower, "/v1beta/") {
		return service.PlatformGemini
	}
	if strings.Contains(lower, "/responses") {
		return service.PlatformOpenAI
	}
	return ""
}
// normalizeOpsErrorType returns errType unchanged when present; otherwise it
// maps known business codes onto an error type, defaulting to "api_error".
func normalizeOpsErrorType(errType string, code string) string {
	if errType != "" {
		return errType
	}
	trimmed := strings.TrimSpace(code)
	if trimmed == "INSUFFICIENT_BALANCE" {
		return "billing_error"
	}
	if trimmed == "USAGE_LIMIT_EXCEEDED" || trimmed == "SUBSCRIPTION_NOT_FOUND" || trimmed == "SUBSCRIPTION_INVALID" {
		return "subscription_error"
	}
	return "api_error"
}
// classifyOpsPhase maps an error type/message/business code onto the request
// phase where the failure occurred (billing, auth, concurrency, upstream,
// response, scheduling or internal).
func classifyOpsPhase(errType, message, code string) string {
	lowered := strings.ToLower(message)
	trimmedCode := strings.TrimSpace(code)
	if trimmedCode == "INSUFFICIENT_BALANCE" || trimmedCode == "USAGE_LIMIT_EXCEEDED" ||
		trimmedCode == "SUBSCRIPTION_NOT_FOUND" || trimmedCode == "SUBSCRIPTION_INVALID" {
		return "billing"
	}
	switch errType {
	case "authentication_error":
		return "auth"
	case "billing_error", "subscription_error":
		return "billing"
	case "rate_limit_error":
		// Queue-related wording indicates our own concurrency limiter rather
		// than an upstream rate limit.
		for _, hint := range []string{"concurrency", "pending", "queue"} {
			if strings.Contains(lowered, hint) {
				return "concurrency"
			}
		}
		return "upstream"
	case "invalid_request_error":
		return "response"
	case "upstream_error", "overloaded_error":
		return "upstream"
	case "api_error":
		if strings.Contains(lowered, "no available accounts") {
			return "scheduling"
		}
		return "internal"
	}
	return "internal"
}
// classifyOpsSeverity assigns an incident severity: client-caused error types
// are always P3; otherwise 5xx and 429 are P1, remaining 4xx are P2, and
// anything else is P3.
func classifyOpsSeverity(errType string, status int) string {
	switch errType {
	case "invalid_request_error", "authentication_error", "billing_error", "subscription_error":
		return "P3"
	}
	switch {
	case status >= 500 || status == 429:
		return "P1"
	case status >= 400:
		return "P2"
	default:
		return "P3"
	}
}
// classifyOpsIsRetryable reports whether a retry could plausibly succeed for
// the given error type and HTTP status.
func classifyOpsIsRetryable(errType string, statusCode int) bool {
	switch errType {
	case "timeout_error", "rate_limit_error":
		// May be transient (upstream or queue); retry can help.
		return true
	case "authentication_error", "invalid_request_error", "billing_error", "subscription_error":
		// Deterministic failures: retrying the same request cannot succeed.
		return false
	case "upstream_error", "overloaded_error":
		return statusCode >= 500 || statusCode == 429 || statusCode == 529
	}
	return statusCode >= 500
}
// classifyOpsIsBusinessLimited reports whether the failure is a user-level
// business limit (balance/subscription/concurrency) so SLA and error-rate
// calculations can exclude it. The status parameter is accepted for
// interface stability but does not affect classification.
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
	_ = status
	switch strings.TrimSpace(code) {
	case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
		return true
	}
	if phase == "billing" || phase == "concurrency" {
		// SLA/错误率排除“用户级业务限制”
		return true
	}
	// Avoid treating upstream rate limits as business-limited.
	if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") {
		return false
	}
	return false
}
// classifyOpsErrorOwner attributes the failure to the upstream provider, the
// client, or this gateway ("sub2api") based on the failure phase, falling
// back to a keyword scan of the message.
func classifyOpsErrorOwner(phase string, message string) string {
	if phase == "upstream" || phase == "network" {
		return "provider"
	}
	switch phase {
	case "billing", "concurrency", "auth", "response":
		return "client"
	}
	if strings.Contains(strings.ToLower(message), "upstream") {
		return "provider"
	}
	return "sub2api"
}
// classifyOpsErrorSource maps a failure phase onto the error-source label
// recorded with the log entry, falling back to a keyword scan of the message.
func classifyOpsErrorSource(phase string, message string) string {
	if phase == "upstream" {
		return "upstream_http"
	}
	if phase == "network" {
		return "upstream_network"
	}
	if phase == "billing" {
		return "billing"
	}
	if phase == "concurrency" {
		return "concurrency"
	}
	if strings.Contains(strings.ToLower(message), "upstream") {
		return "upstream_http"
	}
	return "internal"
}
// truncateString caps s at max bytes without splitting a multi-byte UTF-8
// rune at the cut point; a rune severed by the byte cut is dropped entirely.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Back up at most utf8.UTFMax-1 bytes. The previous implementation
	// re-ran utf8.ValidString on the whole prefix per trimmed byte (O(n^2))
	// and, on input that was already invalid UTF-8 anywhere, could trim the
	// string down to empty; inspecting only the tail fixes both while giving
	// identical results for valid input.
	for i := 0; i < utf8.UTFMax-1 && len(cut) > 0; i++ {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size > 1 {
			// Tail is a complete rune (RuneError with size 3 is a literal
			// U+FFFD in the input, which is fine to keep).
			break
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
// strconvItoa is a thin local alias for strconv.Itoa, kept so call sites in
// this file stay short.
func strconvItoa(v int) string {
	return strconv.Itoa(v)
}

View File

@@ -21,6 +21,7 @@ func ProvideAdminHandlers(
redeemHandler *admin.RedeemHandler,
promoHandler *admin.PromoHandler,
settingHandler *admin.SettingHandler,
opsHandler *admin.OpsHandler,
systemHandler *admin.SystemHandler,
subscriptionHandler *admin.SubscriptionHandler,
usageHandler *admin.UsageHandler,
@@ -39,6 +40,7 @@ func ProvideAdminHandlers(
Redeem: redeemHandler,
Promo: promoHandler,
Setting: settingHandler,
Ops: opsHandler,
System: systemHandler,
Subscription: subscriptionHandler,
Usage: usageHandler,
@@ -109,6 +111,7 @@ var ProviderSet = wire.NewSet(
admin.NewRedeemHandler,
admin.NewPromoHandler,
admin.NewSettingHandler,
admin.NewOpsHandler,
ProvideSystemHandler,
admin.NewSubscriptionHandler,
admin.NewUsageHandler,

View File

@@ -7,7 +7,14 @@ type Key string
const (
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
ForcePlatform Key = "ctx_force_platform"
// IsClaudeCodeClient 是否为 Claude Code 客户端,由中间件设置
// ClientRequestID 客户端请求的唯一标识,用于追踪请求全生命周期(用于 Ops 监控与排障)。
ClientRequestID Key = "ctx_client_request_id"
// RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。
RetryCount Key = "ctx_retry_count"
// IsClaudeCodeClient 标识当前请求是否来自 Claude Code 客户端
IsClaudeCodeClient Key = "ctx_is_claude_code_client"
// Group 认证后的分组信息,由 API Key 认证中间件设置
Group Key = "ctx_group"

View File

@@ -93,7 +93,7 @@ var (
return redis.call('ZCARD', key)
`)
// incrementWaitScript - only sets TTL on first creation to avoid refreshing
// incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate
// KEYS[1] = wait queue key
// ARGV[1] = maxWait
// ARGV[2] = TTL in seconds
@@ -111,15 +111,13 @@ var (
local newVal = redis.call('INCR', KEYS[1])
-- Only set TTL on first creation to avoid refreshing zombie data
if newVal == 1 then
redis.call('EXPIRE', KEYS[1], ARGV[2])
end
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
redis.call('EXPIRE', KEYS[1], ARGV[2])
return 1
`)
// incrementAccountWaitScript - account-level wait queue count
// incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment)
incrementAccountWaitScript = redis.NewScript(`
local current = redis.call('GET', KEYS[1])
if current == false then
@@ -134,10 +132,8 @@ var (
local newVal = redis.call('INCR', KEYS[1])
-- Only set TTL on first creation to avoid refreshing zombie data
if newVal == 1 then
redis.call('EXPIRE', KEYS[1], ARGV[2])
end
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
redis.call('EXPIRE', KEYS[1], ARGV[2])
return 1
`)

View File

@@ -0,0 +1,707 @@
package repository
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/lib/pq"
)
// opsRepository is the PostgreSQL-backed implementation of service.OpsRepository.
type opsRepository struct {
	// db is the shared database handle; every method nil-checks it so a
	// miswired repository fails with an explicit error instead of a panic.
	db *sql.DB
}

// NewOpsRepository wraps a database handle in the ops repository implementation.
func NewOpsRepository(db *sql.DB) service.OpsRepository {
	return &opsRepository{db: db}
}
// InsertErrorLog persists one gateway error event into ops_error_logs and
// returns the generated row id.
//
// Nullable columns go through the opsNull* helpers so empty strings and zero
// ids are stored as SQL NULL rather than zero values. The argument order below
// must stay in lockstep with the column list and the $1..$34 placeholders —
// keep all three in sync when adding a column.
func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) {
	// Guard against an unwired repository (nil receiver or missing DB handle).
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}
	q := `
	INSERT INTO ops_error_logs (
		request_id,
		client_request_id,
		user_id,
		api_key_id,
		account_id,
		group_id,
		client_ip,
		platform,
		model,
		request_path,
		stream,
		user_agent,
		error_phase,
		error_type,
		severity,
		status_code,
		is_business_limited,
		error_message,
		error_body,
		error_source,
		error_owner,
		upstream_status_code,
		upstream_error_message,
		upstream_error_detail,
		upstream_errors,
		duration_ms,
		time_to_first_token_ms,
		request_body,
		request_body_truncated,
		request_body_bytes,
		request_headers,
		is_retryable,
		retry_count,
		created_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
	) RETURNING id`
	var id int64
	err := r.db.QueryRowContext(
		ctx,
		q,
		opsNullString(input.RequestID),
		opsNullString(input.ClientRequestID),
		opsNullInt64(input.UserID),
		opsNullInt64(input.APIKeyID),
		opsNullInt64(input.AccountID),
		opsNullInt64(input.GroupID),
		opsNullString(input.ClientIP),
		opsNullString(input.Platform),
		opsNullString(input.Model),
		opsNullString(input.RequestPath),
		input.Stream,
		opsNullString(input.UserAgent),
		input.ErrorPhase,
		input.ErrorType,
		opsNullString(input.Severity),
		opsNullInt(input.StatusCode),
		input.IsBusinessLimited,
		opsNullString(input.ErrorMessage),
		opsNullString(input.ErrorBody),
		opsNullString(input.ErrorSource),
		opsNullString(input.ErrorOwner),
		opsNullInt(input.UpstreamStatusCode),
		opsNullString(input.UpstreamErrorMessage),
		opsNullString(input.UpstreamErrorDetail),
		opsNullString(input.UpstreamErrorsJSON),
		opsNullInt(input.DurationMs),
		opsNullInt64(input.TimeToFirstTokenMs),
		opsNullString(input.RequestBodyJSON),
		input.RequestBodyTruncated,
		opsNullInt(input.RequestBodyBytes),
		opsNullString(input.RequestHeadersJSON),
		input.IsRetryable,
		input.RetryCount,
		input.CreatedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
// ListErrorLogs returns a page of error-log summaries matching filter,
// newest first, together with the total match count for pagination.
//
// Page defaults to 1; PageSize defaults to 20 and is clamped to 500.
// A nil filter is treated as "no filtering" (subject to the default
// status>=400 scoping applied by buildOpsErrorLogsWhere).
func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsErrorLogFilter{}
	}
	// Normalize pagination inputs before computing OFFSET.
	page := filter.Page
	if page <= 0 {
		page = 1
	}
	pageSize := filter.PageSize
	if pageSize <= 0 {
		pageSize = 20
	}
	if pageSize > 500 {
		pageSize = 500
	}
	// Count and select share the same WHERE clause and args so the total
	// always matches the listed rows.
	where, args := buildOpsErrorLogsWhere(filter)
	countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where
	var total int
	if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil {
		return nil, err
	}
	offset := (page - 1) * pageSize
	// LIMIT/OFFSET placeholders continue the numbering produced by
	// buildOpsErrorLogsWhere ($len+1, $len+2).
	argsWithLimit := append(args, pageSize, offset)
	selectSQL := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream
	FROM ops_error_logs
	` + where + `
	ORDER BY created_at DESC
	LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
	rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	out := make([]*service.OpsErrorLog, 0, pageSize)
	for rows.Next() {
		var item service.OpsErrorLog
		// Nullable columns are scanned into sql.Null* and copied onto the
		// struct's pointer fields only when present.
		var latency sql.NullInt64
		var statusCode sql.NullInt64
		var clientIP sql.NullString
		var userID sql.NullInt64
		var apiKeyID sql.NullInt64
		var accountID sql.NullInt64
		var groupID sql.NullInt64
		if err := rows.Scan(
			&item.ID,
			&item.CreatedAt,
			&item.Phase,
			&item.Type,
			&item.Severity,
			&statusCode,
			&item.Platform,
			&item.Model,
			&latency,
			&item.ClientRequestID,
			&item.RequestID,
			&item.Message,
			&userID,
			&apiKeyID,
			&accountID,
			&groupID,
			&clientIP,
			&item.RequestPath,
			&item.Stream,
		); err != nil {
			return nil, err
		}
		if latency.Valid {
			v := int(latency.Int64)
			item.LatencyMs = &v
		}
		item.StatusCode = int(statusCode.Int64)
		if clientIP.Valid {
			s := clientIP.String
			item.ClientIP = &s
		}
		if userID.Valid {
			v := userID.Int64
			item.UserID = &v
		}
		if apiKeyID.Valid {
			v := apiKeyID.Int64
			item.APIKeyID = &v
		}
		if accountID.Valid {
			v := accountID.Int64
			item.AccountID = &v
		}
		if groupID.Valid {
			v := groupID.Int64
			item.GroupID = &v
		}
		out = append(out, &item)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return &service.OpsErrorLogList{
		Errors:   out,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}
// GetErrorLogByID loads the full detail record for one error log, including
// request/response payloads and per-stage latency breakdown.
// Returns the driver error (sql.ErrNoRows) unchanged when the id is unknown.
func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if id <= 0 {
		return nil, fmt.Errorf("invalid id")
	}
	q := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		COALESCE(error_body, ''),
		upstream_status_code,
		COALESCE(upstream_error_message, ''),
		COALESCE(upstream_error_detail, ''),
		COALESCE(upstream_errors::text, ''),
		is_business_limited,
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream,
		COALESCE(user_agent, ''),
		auth_latency_ms,
		routing_latency_ms,
		upstream_latency_ms,
		response_latency_ms,
		time_to_first_token_ms,
		COALESCE(request_body::text, ''),
		request_body_truncated,
		request_body_bytes,
		COALESCE(request_headers::text, '')
	FROM ops_error_logs
	WHERE id = $1
	LIMIT 1`
	var out service.OpsErrorLogDetail
	// Scratch sql.Null* holders for every nullable column; copied onto the
	// struct's pointer fields below only when valid.
	var latency sql.NullInt64
	var statusCode sql.NullInt64
	var upstreamStatusCode sql.NullInt64
	var clientIP sql.NullString
	var userID sql.NullInt64
	var apiKeyID sql.NullInt64
	var accountID sql.NullInt64
	var groupID sql.NullInt64
	var authLatency sql.NullInt64
	var routingLatency sql.NullInt64
	var upstreamLatency sql.NullInt64
	var responseLatency sql.NullInt64
	var ttft sql.NullInt64
	var requestBodyBytes sql.NullInt64
	err := r.db.QueryRowContext(ctx, q, id).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.Phase,
		&out.Type,
		&out.Severity,
		&statusCode,
		&out.Platform,
		&out.Model,
		&latency,
		&out.ClientRequestID,
		&out.RequestID,
		&out.Message,
		&out.ErrorBody,
		&upstreamStatusCode,
		&out.UpstreamErrorMessage,
		&out.UpstreamErrorDetail,
		&out.UpstreamErrors,
		&out.IsBusinessLimited,
		&userID,
		&apiKeyID,
		&accountID,
		&groupID,
		&clientIP,
		&out.RequestPath,
		&out.Stream,
		&out.UserAgent,
		&authLatency,
		&routingLatency,
		&upstreamLatency,
		&responseLatency,
		&ttft,
		&out.RequestBody,
		&out.RequestBodyTruncated,
		&requestBodyBytes,
		&out.RequestHeaders,
	)
	if err != nil {
		return nil, err
	}
	out.StatusCode = int(statusCode.Int64)
	if latency.Valid {
		v := int(latency.Int64)
		out.LatencyMs = &v
	}
	if clientIP.Valid {
		s := clientIP.String
		out.ClientIP = &s
	}
	// Upstream status is only surfaced when present and positive.
	if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 {
		v := int(upstreamStatusCode.Int64)
		out.UpstreamStatusCode = &v
	}
	if userID.Valid {
		v := userID.Int64
		out.UserID = &v
	}
	if apiKeyID.Valid {
		v := apiKeyID.Int64
		out.APIKeyID = &v
	}
	if accountID.Valid {
		v := accountID.Int64
		out.AccountID = &v
	}
	if groupID.Valid {
		v := groupID.Int64
		out.GroupID = &v
	}
	if authLatency.Valid {
		v := authLatency.Int64
		out.AuthLatencyMs = &v
	}
	if routingLatency.Valid {
		v := routingLatency.Int64
		out.RoutingLatencyMs = &v
	}
	if upstreamLatency.Valid {
		v := upstreamLatency.Int64
		out.UpstreamLatencyMs = &v
	}
	if responseLatency.Valid {
		v := responseLatency.Int64
		out.ResponseLatencyMs = &v
	}
	if ttft.Valid {
		v := ttft.Int64
		out.TimeToFirstTokenMs = &v
	}
	if requestBodyBytes.Valid {
		v := int(requestBodyBytes.Int64)
		out.RequestBodyBytes = &v
	}
	// JSONB columns cast with ::text render SQL NULL-adjacent values as the
	// literal string "null"; normalize those to empty strings for callers.
	// Normalize request_body to empty string when stored as JSON null.
	out.RequestBody = strings.TrimSpace(out.RequestBody)
	if out.RequestBody == "null" {
		out.RequestBody = ""
	}
	// Normalize request_headers to empty string when stored as JSON null.
	out.RequestHeaders = strings.TrimSpace(out.RequestHeaders)
	if out.RequestHeaders == "null" {
		out.RequestHeaders = ""
	}
	// Normalize upstream_errors to empty string when stored as JSON null.
	out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors)
	if out.UpstreamErrors == "null" {
		out.UpstreamErrors = ""
	}
	return &out, nil
}
// InsertRetryAttempt records a new manual retry attempt and returns its id.
// SourceErrorID and Mode are mandatory; PinnedAccountID is optional and set
// only when the retry is pinned to a specific upstream account.
func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) {
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}
	if input.SourceErrorID <= 0 {
		return 0, fmt.Errorf("invalid source_error_id")
	}
	if strings.TrimSpace(input.Mode) == "" {
		return 0, fmt.Errorf("invalid mode")
	}
	q := `
	INSERT INTO ops_retry_attempts (
		requested_by_user_id,
		source_error_id,
		mode,
		pinned_account_id,
		status,
		started_at
	) VALUES (
		$1,$2,$3,$4,$5,$6
	) RETURNING id`
	var id int64
	// requested_by_user_id is nullable: opsNullInt64 maps a zero user id to NULL.
	err := r.db.QueryRowContext(
		ctx,
		q,
		opsNullInt64(&input.RequestedByUserID),
		input.SourceErrorID,
		strings.TrimSpace(input.Mode),
		opsNullInt64(input.PinnedAccountID),
		strings.TrimSpace(input.Status),
		input.StartedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
// UpdateRetryAttempt writes the terminal state of a retry attempt (status,
// finish time, duration and result pointers) identified by input.ID.
// A zero FinishedAt is stored as NULL via nullTime.
func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return fmt.Errorf("invalid id")
	}
	q := `
	UPDATE ops_retry_attempts
	SET
		status = $2,
		finished_at = $3,
		duration_ms = $4,
		result_request_id = $5,
		result_error_id = $6,
		error_message = $7
	WHERE id = $1`
	_, err := r.db.ExecContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Status),
		nullTime(input.FinishedAt),
		input.DurationMs,
		opsNullString(input.ResultRequestID),
		opsNullInt64(input.ResultErrorID),
		opsNullString(input.ErrorMessage),
	)
	return err
}
// GetLatestRetryAttemptForError returns the most recent retry attempt for the
// given source error id (by created_at), or the driver's sql.ErrNoRows when
// none exists.
func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if sourceErrorID <= 0 {
		return nil, fmt.Errorf("invalid source_error_id")
	}
	q := `
	SELECT
		id,
		created_at,
		COALESCE(requested_by_user_id, 0),
		source_error_id,
		COALESCE(mode, ''),
		pinned_account_id,
		COALESCE(status, ''),
		started_at,
		finished_at,
		duration_ms,
		result_request_id,
		result_error_id,
		error_message
	FROM ops_retry_attempts
	WHERE source_error_id = $1
	ORDER BY created_at DESC
	LIMIT 1`
	var out service.OpsRetryAttempt
	// Nullable columns are scanned into sql.Null* scratch vars and copied to
	// the struct's pointer fields only when present.
	var pinnedAccountID sql.NullInt64
	var requestedBy sql.NullInt64
	var startedAt sql.NullTime
	var finishedAt sql.NullTime
	var durationMs sql.NullInt64
	var resultRequestID sql.NullString
	var resultErrorID sql.NullInt64
	var errorMessage sql.NullString
	err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan(
		&out.ID,
		&out.CreatedAt,
		&requestedBy,
		&out.SourceErrorID,
		&out.Mode,
		&pinnedAccountID,
		&out.Status,
		&startedAt,
		&finishedAt,
		&durationMs,
		&resultRequestID,
		&resultErrorID,
		&errorMessage,
	)
	if err != nil {
		return nil, err
	}
	// COALESCE already maps NULL to 0, so Int64 is safe to read directly.
	out.RequestedByUserID = requestedBy.Int64
	if pinnedAccountID.Valid {
		v := pinnedAccountID.Int64
		out.PinnedAccountID = &v
	}
	if startedAt.Valid {
		t := startedAt.Time
		out.StartedAt = &t
	}
	if finishedAt.Valid {
		t := finishedAt.Time
		out.FinishedAt = &t
	}
	if durationMs.Valid {
		v := durationMs.Int64
		out.DurationMs = &v
	}
	if resultRequestID.Valid {
		s := resultRequestID.String
		out.ResultRequestID = &s
	}
	if resultErrorID.Valid {
		v := resultErrorID.Int64
		out.ResultErrorID = &v
	}
	if errorMessage.Valid {
		s := errorMessage.String
		out.ErrorMessage = &s
	}
	return &out, nil
}
func nullTime(t time.Time) sql.NullTime {
if t.IsZero() {
return sql.NullTime{}
}
return sql.NullTime{Time: t, Valid: true}
}
// buildOpsErrorLogsWhere renders the WHERE clause and positional args for
// ops_error_logs count/list queries. The returned clause always starts with
// "WHERE" and its $N placeholders line up with the returned args slice.
func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
	// Normalize a nil filter once so every field access below is safe. The
	// previous version nil-guarded only the Phase read and then dereferenced
	// filter.StartTime (and later fields) unconditionally, which panicked on
	// a nil filter.
	if filter == nil {
		filter = &service.OpsErrorLogFilter{}
	}
	clauses := make([]string, 0, 8)
	args := make([]any, 0, 8)
	clauses = append(clauses, "1=1")
	phaseFilter := strings.TrimSpace(strings.ToLower(filter.Phase))
	// ops_error_logs primarily stores client-visible error requests (status>=400),
	// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
	// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
	if phaseFilter != "upstream" {
		clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
	}
	if filter.StartTime != nil && !filter.StartTime.IsZero() {
		args = append(args, filter.StartTime.UTC())
		clauses = append(clauses, "created_at >= $"+itoa(len(args)))
	}
	if filter.EndTime != nil && !filter.EndTime.IsZero() {
		args = append(args, filter.EndTime.UTC())
		// Keep time-window semantics consistent with other ops queries: [start, end)
		clauses = append(clauses, "created_at < $"+itoa(len(args)))
	}
	if p := strings.TrimSpace(filter.Platform); p != "" {
		args = append(args, p)
		clauses = append(clauses, "platform = $"+itoa(len(args)))
	}
	if filter.GroupID != nil && *filter.GroupID > 0 {
		args = append(args, *filter.GroupID)
		clauses = append(clauses, "group_id = $"+itoa(len(args)))
	}
	if phaseFilter != "" {
		args = append(args, phaseFilter)
		clauses = append(clauses, "error_phase = $"+itoa(len(args)))
	}
	if len(filter.StatusCodes) > 0 {
		args = append(args, pq.Array(filter.StatusCodes))
		clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
	}
	if q := strings.TrimSpace(filter.Query); q != "" {
		// Free-text search across the request ids and the error message.
		like := "%" + q + "%"
		args = append(args, like)
		n := itoa(len(args))
		clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")")
	}
	return "WHERE " + strings.Join(clauses, " AND "), args
}
// Helpers for nullable args.

// opsNullString converts a string or *string into a driver argument, mapping
// nil pointers and whitespace-only values to SQL NULL; non-empty values are
// stored trimmed. Any other input type also maps to NULL.
func opsNullString(v any) any {
	var raw string
	switch s := v.(type) {
	case *string:
		if s == nil {
			return sql.NullString{}
		}
		raw = *s
	case string:
		raw = s
	default:
		// Covers nil and unexpected types alike.
		return sql.NullString{}
	}
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return sql.NullString{}
	}
	return sql.NullString{String: trimmed, Valid: true}
}
// opsNullInt64 converts an optional int64 into a driver argument; nil pointers
// and zero values are stored as SQL NULL (zero is treated as "absent" here).
func opsNullInt64(v *int64) any {
	if v != nil && *v != 0 {
		return sql.NullInt64{Int64: *v, Valid: true}
	}
	return sql.NullInt64{}
}
// opsNullInt converts int / *int / *int64 inputs into a driver argument,
// mapping nil pointers, zeroes and any other type to SQL NULL. All variants
// are widened to sql.NullInt64.
func opsNullInt(v any) any {
	switch n := v.(type) {
	case *int:
		if n != nil && *n != 0 {
			return sql.NullInt64{Int64: int64(*n), Valid: true}
		}
	case *int64:
		if n != nil && *n != 0 {
			return sql.NullInt64{Int64: *n, Valid: true}
		}
	case int:
		if n != 0 {
			return sql.NullInt64{Int64: int64(n), Valid: true}
		}
	}
	// nil, zero values, nil pointers and unexpected types all land here.
	return sql.NullInt64{}
}

View File

@@ -0,0 +1,689 @@
package repository
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// ListAlertRules returns every alert rule, newest (highest id) first.
// The JSONB filters column is decoded best-effort: undecodable or null
// payloads simply leave rule.Filters nil.
func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	q := `
	SELECT
		id,
		name,
		COALESCE(description, ''),
		enabled,
		COALESCE(severity, ''),
		metric_type,
		operator,
		threshold,
		window_minutes,
		sustained_minutes,
		cooldown_minutes,
		COALESCE(notify_email, true),
		filters,
		last_triggered_at,
		created_at,
		updated_at
	FROM ops_alert_rules
	ORDER BY id DESC`
	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	// Non-nil empty slice so callers (and JSON encoding) see [] rather than null.
	out := []*service.OpsAlertRule{}
	for rows.Next() {
		var rule service.OpsAlertRule
		var filtersRaw []byte
		var lastTriggeredAt sql.NullTime
		if err := rows.Scan(
			&rule.ID,
			&rule.Name,
			&rule.Description,
			&rule.Enabled,
			&rule.Severity,
			&rule.MetricType,
			&rule.Operator,
			&rule.Threshold,
			&rule.WindowMinutes,
			&rule.SustainedMinutes,
			&rule.CooldownMinutes,
			&rule.NotifyEmail,
			&filtersRaw,
			&lastTriggeredAt,
			&rule.CreatedAt,
			&rule.UpdatedAt,
		); err != nil {
			return nil, err
		}
		if lastTriggeredAt.Valid {
			v := lastTriggeredAt.Time
			rule.LastTriggeredAt = &v
		}
		// JSONB NULL arrives as the literal "null"; skip it like an empty payload.
		if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
				rule.Filters = decoded
			}
		}
		out = append(out, &rule)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// CreateAlertRule inserts a new alert rule and returns the stored row
// (including DB-generated id and timestamps). String fields are trimmed
// before storage; a nil Filters map is stored as SQL NULL.
func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}
	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}
	q := `
	INSERT INTO ops_alert_rules (
		name,
		description,
		enabled,
		severity,
		metric_type,
		operator,
		threshold,
		window_minutes,
		sustained_minutes,
		cooldown_minutes,
		notify_email,
		filters,
		created_at,
		updated_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW()
	)
	RETURNING
		id,
		name,
		COALESCE(description, ''),
		enabled,
		COALESCE(severity, ''),
		metric_type,
		operator,
		threshold,
		window_minutes,
		sustained_minutes,
		cooldown_minutes,
		COALESCE(notify_email, true),
		filters,
		last_triggered_at,
		created_at,
		updated_at`
	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime
	if err := r.db.QueryRowContext(
		ctx,
		q,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}
	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL arrives as the literal "null"; decode best-effort otherwise.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}
	return &out, nil
}
// UpdateAlertRule overwrites every mutable column of the rule identified by
// input.ID and returns the stored row. Returns the driver's sql.ErrNoRows
// (via Scan on an empty RETURNING set) when the id does not exist.
func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return nil, fmt.Errorf("invalid id")
	}
	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}
	q := `
	UPDATE ops_alert_rules
	SET
		name = $2,
		description = $3,
		enabled = $4,
		severity = $5,
		metric_type = $6,
		operator = $7,
		threshold = $8,
		window_minutes = $9,
		sustained_minutes = $10,
		cooldown_minutes = $11,
		notify_email = $12,
		filters = $13,
		updated_at = NOW()
	WHERE id = $1
	RETURNING
		id,
		name,
		COALESCE(description, ''),
		enabled,
		COALESCE(severity, ''),
		metric_type,
		operator,
		threshold,
		window_minutes,
		sustained_minutes,
		cooldown_minutes,
		COALESCE(notify_email, true),
		filters,
		last_triggered_at,
		created_at,
		updated_at`
	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime
	if err := r.db.QueryRowContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}
	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL arrives as the literal "null"; decode best-effort otherwise.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}
	return &out, nil
}
// DeleteAlertRule removes the alert rule with the given id. It returns
// sql.ErrNoRows when no such rule exists so callers can map that to a 404.
func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if id <= 0 {
		return fmt.Errorf("invalid id")
	}
	res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id)
	if err != nil {
		return err
	}
	switch affected, raErr := res.RowsAffected(); {
	case raErr != nil:
		return raErr
	case affected == 0:
		// Nothing matched the id: surface the standard not-found sentinel.
		return sql.ErrNoRows
	default:
		return nil
	}
}
// ListAlertEvents returns alert events matching filter, most recently fired
// first. Limit defaults to 100 and is clamped to 500. A nil filter lists the
// latest events unfiltered.
func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsAlertEventFilter{}
	}
	limit := filter.Limit
	if limit <= 0 {
		limit = 100
	}
	if limit > 500 {
		limit = 500
	}
	// The LIMIT placeholder continues the numbering from the WHERE builder.
	where, args := buildOpsAlertEventsWhere(filter)
	args = append(args, limit)
	limitArg := "$" + itoa(len(args))
	q := `
	SELECT
		id,
		COALESCE(rule_id, 0),
		COALESCE(severity, ''),
		COALESCE(status, ''),
		COALESCE(title, ''),
		COALESCE(description, ''),
		metric_value,
		threshold_value,
		dimensions,
		fired_at,
		resolved_at,
		email_sent,
		created_at
	FROM ops_alert_events
	` + where + `
	ORDER BY fired_at DESC
	LIMIT ` + limitArg
	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	// Non-nil empty slice so callers (and JSON encoding) see [] rather than null.
	out := []*service.OpsAlertEvent{}
	for rows.Next() {
		var ev service.OpsAlertEvent
		var metricValue sql.NullFloat64
		var thresholdValue sql.NullFloat64
		var dimensionsRaw []byte
		var resolvedAt sql.NullTime
		if err := rows.Scan(
			&ev.ID,
			&ev.RuleID,
			&ev.Severity,
			&ev.Status,
			&ev.Title,
			&ev.Description,
			&metricValue,
			&thresholdValue,
			&dimensionsRaw,
			&ev.FiredAt,
			&resolvedAt,
			&ev.EmailSent,
			&ev.CreatedAt,
		); err != nil {
			return nil, err
		}
		if metricValue.Valid {
			v := metricValue.Float64
			ev.MetricValue = &v
		}
		if thresholdValue.Valid {
			v := thresholdValue.Float64
			ev.ThresholdValue = &v
		}
		if resolvedAt.Valid {
			v := resolvedAt.Time
			ev.ResolvedAt = &v
		}
		// JSONB NULL arrives as the literal "null"; decode best-effort otherwise.
		if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
				ev.Dimensions = decoded
			}
		}
		out = append(out, &ev)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// GetActiveAlertEvent returns the most recent still-firing event for a rule,
// or (nil, nil) when the rule has no active event.
func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if ruleID <= 0 {
		return nil, fmt.Errorf("invalid rule id")
	}
	q := `
	SELECT
		id,
		COALESCE(rule_id, 0),
		COALESCE(severity, ''),
		COALESCE(status, ''),
		COALESCE(title, ''),
		COALESCE(description, ''),
		metric_value,
		threshold_value,
		dimensions,
		fired_at,
		resolved_at,
		email_sent,
		created_at
	FROM ops_alert_events
	WHERE rule_id = $1 AND status = $2
	ORDER BY fired_at DESC
	LIMIT 1`
	row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring)
	ev, err := scanOpsAlertEvent(row)
	if err != nil {
		// "no active event" is a normal outcome, not an error.
		if err == sql.ErrNoRows {
			return nil, nil
		}
		return nil, err
	}
	return ev, nil
}
// GetLatestAlertEvent returns the most recently fired event for a rule
// regardless of status, or (nil, nil) when the rule has never fired.
func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if ruleID <= 0 {
		return nil, fmt.Errorf("invalid rule id")
	}
	q := `
	SELECT
		id,
		COALESCE(rule_id, 0),
		COALESCE(severity, ''),
		COALESCE(status, ''),
		COALESCE(title, ''),
		COALESCE(description, ''),
		metric_value,
		threshold_value,
		dimensions,
		fired_at,
		resolved_at,
		email_sent,
		created_at
	FROM ops_alert_events
	WHERE rule_id = $1
	ORDER BY fired_at DESC
	LIMIT 1`
	row := r.db.QueryRowContext(ctx, q, ruleID)
	ev, err := scanOpsAlertEvent(row)
	if err != nil {
		// "never fired" is a normal outcome, not an error.
		if err == sql.ErrNoRows {
			return nil, nil
		}
		return nil, err
	}
	return ev, nil
}
// CreateAlertEvent inserts a new alert event row and returns the stored event
// (including DB-generated id and created_at). Nil Dimensions are stored as
// SQL NULL; a zero RuleID is also stored as NULL via opsNullInt64.
func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if event == nil {
		return nil, fmt.Errorf("nil event")
	}
	dimensionsArg, err := opsNullJSONMap(event.Dimensions)
	if err != nil {
		return nil, err
	}
	q := `
	INSERT INTO ops_alert_events (
		rule_id,
		severity,
		status,
		title,
		description,
		metric_value,
		threshold_value,
		dimensions,
		fired_at,
		resolved_at,
		email_sent,
		created_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW()
	)
	RETURNING
		id,
		COALESCE(rule_id, 0),
		COALESCE(severity, ''),
		COALESCE(status, ''),
		COALESCE(title, ''),
		COALESCE(description, ''),
		metric_value,
		threshold_value,
		dimensions,
		fired_at,
		resolved_at,
		email_sent,
		created_at`
	row := r.db.QueryRowContext(
		ctx,
		q,
		opsNullInt64(&event.RuleID),
		opsNullString(event.Severity),
		opsNullString(event.Status),
		opsNullString(event.Title),
		opsNullString(event.Description),
		opsNullFloat64(event.MetricValue),
		opsNullFloat64(event.ThresholdValue),
		dimensionsArg,
		event.FiredAt,
		opsNullTime(event.ResolvedAt),
		event.EmailSent,
	)
	// Reuse the shared scanner so the RETURNING mapping stays consistent
	// with the read paths.
	return scanOpsAlertEvent(row)
}
// UpdateAlertEventStatus sets the status (and optional resolution time) of an
// alert event. A nil resolvedAt stores SQL NULL; updating a missing id is a
// silent no-op (rows affected are not checked).
func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if eventID <= 0 {
		return fmt.Errorf("invalid event id")
	}
	if strings.TrimSpace(status) == "" {
		return fmt.Errorf("invalid status")
	}
	q := `
	UPDATE ops_alert_events
	SET status = $2,
		resolved_at = $3
	WHERE id = $1`
	_, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt))
	return err
}
// UpdateAlertEventEmailSent flips the email_sent flag for an alert event.
// Updating a missing id is a silent no-op (rows affected are not checked).
func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if eventID <= 0 {
		return fmt.Errorf("invalid event id")
	}
	const q = "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1"
	_, err := r.db.ExecContext(ctx, q, eventID, emailSent)
	return err
}
// opsAlertEventRow abstracts over *sql.Row and *sql.Rows so that
// scanOpsAlertEvent can serve both single-row and multi-row queries.
type opsAlertEventRow interface {
	Scan(dest ...any) error
}
// scanOpsAlertEvent scans one alert-event row into a service.OpsAlertEvent.
// The destination order must match the SELECT/RETURNING column lists used by
// the alert-event queries in this file. Driver errors (including
// sql.ErrNoRows) are returned unchanged.
func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) {
	var ev service.OpsAlertEvent
	// Nullable columns go through sql.Null* holders first.
	var metricValue sql.NullFloat64
	var thresholdValue sql.NullFloat64
	var dimensionsRaw []byte
	var resolvedAt sql.NullTime
	if err := row.Scan(
		&ev.ID,
		&ev.RuleID,
		&ev.Severity,
		&ev.Status,
		&ev.Title,
		&ev.Description,
		&metricValue,
		&thresholdValue,
		&dimensionsRaw,
		&ev.FiredAt,
		&resolvedAt,
		&ev.EmailSent,
		&ev.CreatedAt,
	); err != nil {
		return nil, err
	}
	if metricValue.Valid {
		v := metricValue.Float64
		ev.MetricValue = &v
	}
	if thresholdValue.Valid {
		v := thresholdValue.Float64
		ev.ThresholdValue = &v
	}
	if resolvedAt.Valid {
		v := resolvedAt.Time
		ev.ResolvedAt = &v
	}
	// JSONB NULL arrives as the literal "null"; decode best-effort otherwise
	// (undecodable payloads leave Dimensions nil rather than failing the scan).
	if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
			ev.Dimensions = decoded
		}
	}
	return &ev, nil
}
// buildOpsAlertEventsWhere renders the WHERE clause and positional args for
// ops_alert_events queries. A nil filter yields the no-op "WHERE 1=1".
//
// NOTE(review): unlike buildOpsErrorLogsWhere, the time bounds here are not
// normalized with .UTC() — presumably fired_at is timestamptz so the driver
// converts; confirm if the column is a plain timestamp.
func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) {
	clauses := []string{"1=1"}
	args := []any{}
	if filter == nil {
		return "WHERE " + strings.Join(clauses, " AND "), args
	}
	if status := strings.TrimSpace(filter.Status); status != "" {
		args = append(args, status)
		clauses = append(clauses, "status = $"+itoa(len(args)))
	}
	if severity := strings.TrimSpace(filter.Severity); severity != "" {
		args = append(args, severity)
		clauses = append(clauses, "severity = $"+itoa(len(args)))
	}
	// Time window is half-open: [start, end).
	if filter.StartTime != nil && !filter.StartTime.IsZero() {
		args = append(args, *filter.StartTime)
		clauses = append(clauses, "fired_at >= $"+itoa(len(args)))
	}
	if filter.EndTime != nil && !filter.EndTime.IsZero() {
		args = append(args, *filter.EndTime)
		clauses = append(clauses, "fired_at < $"+itoa(len(args)))
	}
	// Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes.
	if platform := strings.TrimSpace(filter.Platform); platform != "" {
		args = append(args, platform)
		clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args)))
	}
	// group_id is compared as text because ->> extracts a JSON string.
	if filter.GroupID != nil && *filter.GroupID > 0 {
		args = append(args, fmt.Sprintf("%d", *filter.GroupID))
		clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args)))
	}
	return "WHERE " + strings.Join(clauses, " AND "), args
}
// opsNullJSONMap encodes a map as a JSON driver argument; a nil map maps to
// SQL NULL. Marshal failures are returned to the caller. Note that an empty
// (non-nil) map encodes as the valid JSON "{}", not NULL.
func opsNullJSONMap(v map[string]any) (any, error) {
	if v == nil {
		return sql.NullString{}, nil
	}
	encoded, err := json.Marshal(v)
	switch {
	case err != nil:
		return nil, err
	case len(encoded) == 0:
		// Defensive: json.Marshal never returns an empty payload on success,
		// but treat it as NULL rather than storing an invalid empty string.
		return sql.NullString{}, nil
	default:
		return sql.NullString{String: string(encoded), Valid: true}, nil
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
package repository
import (
"context"
"fmt"
"strings"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// GetLatencyHistogram buckets usage_logs durations in [StartTime, EndTime)
// into the fixed latency ranges defined by latencyHistogramBuckets and
// returns every bucket (zero-filled where empty) in display order.
// StartTime and EndTime are required; rows with NULL duration_ms are excluded.
func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		return nil, fmt.Errorf("nil filter")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, fmt.Errorf("start_time/end_time required")
	}
	start := filter.StartTime.UTC()
	end := filter.EndTime.UTC()
	// buildUsageWhere (shared with other ops usage queries, defined elsewhere)
	// produces the JOIN/WHERE fragments plus their positional args.
	join, where, args, _ := buildUsageWhere(filter, start, end, 1)
	// Two CASE expressions over the same column: one yields the bucket label,
	// the other a numeric sort key so buckets come back in range order.
	rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms")
	orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms")
	q := `
	SELECT
		` + rangeExpr + ` AS range,
		COALESCE(COUNT(*), 0) AS count,
		` + orderExpr + ` AS ord
	FROM usage_logs ul
	` + join + `
	` + where + `
	AND ul.duration_ms IS NOT NULL
	GROUP BY 1, 3
	ORDER BY 3 ASC`
	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	counts := make(map[string]int64, len(latencyHistogramOrderedRanges))
	var total int64
	for rows.Next() {
		var label string
		var count int64
		var _ord int // sort key only; not used beyond ORDER BY
		if err := rows.Scan(&label, &count, &_ord); err != nil {
			return nil, err
		}
		counts[label] = count
		total += count
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	// Emit every known bucket in display order, zero-filling missing ones, so
	// the chart shape is stable regardless of which ranges had traffic.
	buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges))
	for _, label := range latencyHistogramOrderedRanges {
		buckets = append(buckets, &service.OpsLatencyHistogramBucket{
			Range: label,
			Count: counts[label],
		})
	}
	return &service.OpsLatencyHistogramResponse{
		StartTime:     start,
		EndTime:       end,
		Platform:      strings.TrimSpace(filter.Platform),
		GroupID:       filter.GroupID,
		TotalRequests: total,
		Buckets:       buckets,
	}, nil
}

View File

@@ -0,0 +1,64 @@
package repository
import (
"fmt"
"strings"
)
// latencyHistogramBucket describes one histogram bin: requests whose
// duration is strictly below upperMs (milliseconds) fall into it.
// upperMs <= 0 marks the open-ended catch-all bucket.
type latencyHistogramBucket struct {
	upperMs int
	label   string
}

// latencyHistogramBuckets lists the bins in ascending order; the entry with
// upperMs == 0 is the catch-all for everything at or above the largest bound.
var latencyHistogramBuckets = []latencyHistogramBucket{
	{upperMs: 100, label: "0-100ms"},
	{upperMs: 200, label: "100-200ms"},
	{upperMs: 500, label: "200-500ms"},
	{upperMs: 1000, label: "500-1000ms"},
	{upperMs: 2000, label: "1000-2000ms"},
	{upperMs: 0, label: "2000ms+"}, // default bucket
}

// latencyHistogramOrderedRanges is the bucket labels in display order,
// derived 1:1 from latencyHistogramBuckets.
var latencyHistogramOrderedRanges = func() []string {
	out := make([]string, 0, len(latencyHistogramBuckets))
	for _, b := range latencyHistogramBuckets {
		out = append(out, b.label)
	}
	return out
}()

// latencyHistogramDefaultLabel returns the label of the open-ended bucket
// (upperMs <= 0). Previously the code assumed the default bucket was the
// *last* slice element; deriving it from the marker keeps the CASE
// expressions correct even if the bucket table is reordered.
func latencyHistogramDefaultLabel() string {
	for _, b := range latencyHistogramBuckets {
		if b.upperMs <= 0 {
			return b.label
		}
	}
	// Fall back to the final bucket if no explicit default is configured.
	return latencyHistogramBuckets[len(latencyHistogramBuckets)-1].label
}

// latencyHistogramRangeCaseExpr builds a SQL CASE expression mapping the
// given duration column to its bucket label.
func latencyHistogramRangeCaseExpr(column string) string {
	var sb strings.Builder
	sb.WriteString("CASE\n")
	for _, b := range latencyHistogramBuckets {
		if b.upperMs <= 0 {
			continue // the open-ended bucket becomes the ELSE branch
		}
		fmt.Fprintf(&sb, "\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label)
	}
	fmt.Fprintf(&sb, "\tELSE '%s'\n", latencyHistogramDefaultLabel())
	sb.WriteString("END")
	return sb.String()
}

// latencyHistogramRangeOrderCaseExpr builds a SQL CASE expression mapping the
// given duration column to a 1-based sort key matching bucket order.
func latencyHistogramRangeOrderCaseExpr(column string) string {
	var sb strings.Builder
	sb.WriteString("CASE\n")
	order := 1
	for _, b := range latencyHistogramBuckets {
		if b.upperMs <= 0 {
			continue
		}
		fmt.Fprintf(&sb, "\tWHEN %s < %d THEN %d\n", column, b.upperMs, order)
		order++
	}
	// The catch-all bucket sorts last.
	fmt.Fprintf(&sb, "\tELSE %d\n", order)
	sb.WriteString("END")
	return sb.String()
}

View File

@@ -0,0 +1,14 @@
package repository
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestLatencyHistogramBuckets_AreConsistent verifies that the ordered label
// slice mirrors the bucket table exactly: same length, same labels, same order.
func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) {
	require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges))
	for i := range latencyHistogramBuckets {
		require.Equal(t, latencyHistogramBuckets[i].label, latencyHistogramOrderedRanges[i])
	}
}

View File

@@ -0,0 +1,422 @@
package repository
import (
"context"
"database/sql"
"fmt"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// InsertSystemMetrics appends one row to ops_system_metrics describing a
// single aggregation window: request/error counters, token totals, latency
// and TTFT percentiles, and host/runtime health gauges. Optional fields are
// stored as SQL NULL via the opsNull* helpers. WindowMinutes defaults to 1
// and CreatedAt to time.Now().UTC() when unset.
func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	window := input.WindowMinutes
	if window <= 0 {
		window = 1
	}
	createdAt := input.CreatedAt
	if createdAt.IsZero() {
		createdAt = time.Now().UTC()
	}
	// NOTE: the ExecContext argument list below must stay in exactly the same
	// order as this column list; both are grouped the same way on purpose so
	// a transposition is easy to spot in review.
	q := `
INSERT INTO ops_system_metrics (
created_at,
window_minutes,
platform,
group_id,
success_count,
error_count_total,
business_limited_count,
error_count_sla,
upstream_error_count_excl_429_529,
upstream_429_count,
upstream_529_count,
token_consumed,
qps,
tps,
duration_p50_ms,
duration_p90_ms,
duration_p95_ms,
duration_p99_ms,
duration_avg_ms,
duration_max_ms,
ttft_p50_ms,
ttft_p90_ms,
ttft_p95_ms,
ttft_p99_ms,
ttft_avg_ms,
ttft_max_ms,
cpu_usage_percent,
memory_used_mb,
memory_total_mb,
memory_usage_percent,
db_ok,
redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active,
db_conn_idle,
db_conn_waiting,
goroutine_count,
concurrency_queue_depth
) VALUES (
$1,$2,$3,$4,
$5,$6,$7,$8,
$9,$10,$11,
$12,$13,$14,
$15,$16,$17,$18,$19,$20,
$21,$22,$23,$24,$25,$26,
$27,$28,$29,$30,
$31,$32,
$33,$34,
$35,$36,$37,
$38,$39
)`
	_, err := r.db.ExecContext(
		ctx,
		q,
		createdAt,
		window,
		opsNullString(input.Platform),
		opsNullInt64(input.GroupID),
		input.SuccessCount,
		input.ErrorCountTotal,
		input.BusinessLimitedCount,
		input.ErrorCountSLA,
		input.UpstreamErrorCountExcl429529,
		input.Upstream429Count,
		input.Upstream529Count,
		input.TokenConsumed,
		opsNullFloat64(input.QPS),
		opsNullFloat64(input.TPS),
		opsNullInt(input.DurationP50Ms),
		opsNullInt(input.DurationP90Ms),
		opsNullInt(input.DurationP95Ms),
		opsNullInt(input.DurationP99Ms),
		opsNullFloat64(input.DurationAvgMs),
		opsNullInt(input.DurationMaxMs),
		opsNullInt(input.TTFTP50Ms),
		opsNullInt(input.TTFTP90Ms),
		opsNullInt(input.TTFTP95Ms),
		opsNullInt(input.TTFTP99Ms),
		opsNullFloat64(input.TTFTAvgMs),
		opsNullInt(input.TTFTMaxMs),
		opsNullFloat64(input.CPUUsagePercent),
		opsNullInt(input.MemoryUsedMB),
		opsNullInt(input.MemoryTotalMB),
		opsNullFloat64(input.MemoryUsagePercent),
		opsNullBool(input.DBOK),
		opsNullBool(input.RedisOK),
		opsNullInt(input.RedisConnTotal),
		opsNullInt(input.RedisConnIdle),
		opsNullInt(input.DBConnActive),
		opsNullInt(input.DBConnIdle),
		opsNullInt(input.DBConnWaiting),
		opsNullInt(input.GoroutineCount),
		opsNullInt(input.ConcurrencyQueueDepth),
	)
	return err
}
// GetLatestSystemMetrics returns the newest global snapshot row (the variant
// with platform IS NULL and group_id IS NULL) from ops_system_metrics for the
// given aggregation window. windowMinutes defaults to 1 when non-positive.
// Propagates sql.ErrNoRows when no snapshot has been recorded yet.
func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if windowMinutes <= 0 {
		windowMinutes = 1
	}
	q := `
SELECT
id,
created_at,
window_minutes,
cpu_usage_percent,
memory_used_mb,
memory_total_mb,
memory_usage_percent,
db_ok,
redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active,
db_conn_idle,
db_conn_waiting,
goroutine_count,
concurrency_queue_depth
FROM ops_system_metrics
WHERE window_minutes = $1
AND platform IS NULL
AND group_id IS NULL
ORDER BY created_at DESC
LIMIT 1`
	var out service.OpsSystemMetricsSnapshot
	// Nullable columns are scanned into sql.Null* locals first, then converted
	// to pointer fields below. The Scan destinations must stay in the exact
	// order of the SELECT column list above.
	var cpu sql.NullFloat64
	var memUsed sql.NullInt64
	var memTotal sql.NullInt64
	var memPct sql.NullFloat64
	var dbOK sql.NullBool
	var redisOK sql.NullBool
	var redisTotal sql.NullInt64
	var redisIdle sql.NullInt64
	var dbActive sql.NullInt64
	var dbIdle sql.NullInt64
	var dbWaiting sql.NullInt64
	var goroutines sql.NullInt64
	var queueDepth sql.NullInt64
	if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.WindowMinutes,
		&cpu,
		&memUsed,
		&memTotal,
		&memPct,
		&dbOK,
		&redisOK,
		&redisTotal,
		&redisIdle,
		&dbActive,
		&dbIdle,
		&dbWaiting,
		&goroutines,
		&queueDepth,
	); err != nil {
		return nil, err
	}
	// NULL columns stay nil pointers; valid ones get a copied value so the
	// snapshot does not alias the scratch locals.
	if cpu.Valid {
		v := cpu.Float64
		out.CPUUsagePercent = &v
	}
	if memUsed.Valid {
		v := memUsed.Int64
		out.MemoryUsedMB = &v
	}
	if memTotal.Valid {
		v := memTotal.Int64
		out.MemoryTotalMB = &v
	}
	if memPct.Valid {
		v := memPct.Float64
		out.MemoryUsagePercent = &v
	}
	if dbOK.Valid {
		v := dbOK.Bool
		out.DBOK = &v
	}
	if redisOK.Valid {
		v := redisOK.Bool
		out.RedisOK = &v
	}
	if redisTotal.Valid {
		v := int(redisTotal.Int64)
		out.RedisConnTotal = &v
	}
	if redisIdle.Valid {
		v := int(redisIdle.Int64)
		out.RedisConnIdle = &v
	}
	if dbActive.Valid {
		v := int(dbActive.Int64)
		out.DBConnActive = &v
	}
	if dbIdle.Valid {
		v := int(dbIdle.Int64)
		out.DBConnIdle = &v
	}
	if dbWaiting.Valid {
		v := int(dbWaiting.Int64)
		out.DBConnWaiting = &v
	}
	if goroutines.Valid {
		v := int(goroutines.Int64)
		out.GoroutineCount = &v
	}
	if queueDepth.Valid {
		v := int(queueDepth.Int64)
		out.ConcurrencyQueueDepth = &v
	}
	return &out, nil
}
// UpsertJobHeartbeat records a background job's latest run in
// ops_job_heartbeats, keyed by job_name. Nil input fields leave the existing
// column values untouched (COALESCE with the stored row). A non-nil
// LastSuccessAt additionally clears any previously recorded error state
// (last_error_at/last_error are reset to NULL by the CASE branches).
func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.JobName == "" {
		return fmt.Errorf("job_name required")
	}
	q := `
INSERT INTO ops_job_heartbeats (
job_name,
last_run_at,
last_success_at,
last_error_at,
last_error,
last_duration_ms,
updated_at
) VALUES (
$1,$2,$3,$4,$5,$6,NOW()
)
ON CONFLICT (job_name) DO UPDATE SET
last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at),
last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at),
last_error_at = CASE
WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at)
END,
last_error = CASE
WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error)
END,
last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms),
updated_at = NOW()`
	// opsNull* map nil/zero values to SQL NULL so COALESCE above can keep
	// the previously stored values.
	_, err := r.db.ExecContext(
		ctx,
		q,
		input.JobName,
		opsNullTime(input.LastRunAt),
		opsNullTime(input.LastSuccessAt),
		opsNullTime(input.LastErrorAt),
		opsNullString(input.LastError),
		opsNullInt(input.LastDurationMs),
	)
	return err
}
// ListJobHeartbeats returns every row of ops_job_heartbeats ordered by
// job_name, converting nullable columns into pointer fields (nil = never
// recorded). An empty table yields an empty, non-nil slice.
func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	q := `
SELECT
job_name,
last_run_at,
last_success_at,
last_error_at,
last_error,
last_duration_ms,
updated_at
FROM ops_job_heartbeats
ORDER BY job_name ASC`
	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	out := make([]*service.OpsJobHeartbeat, 0, 8)
	for rows.Next() {
		var item service.OpsJobHeartbeat
		// Scratch locals for nullable columns; scan order must match SELECT.
		var lastRun sql.NullTime
		var lastSuccess sql.NullTime
		var lastErrorAt sql.NullTime
		var lastError sql.NullString
		var lastDuration sql.NullInt64
		if err := rows.Scan(
			&item.JobName,
			&lastRun,
			&lastSuccess,
			&lastErrorAt,
			&lastError,
			&lastDuration,
			&item.UpdatedAt,
		); err != nil {
			return nil, err
		}
		// Copy valid values so the pointers don't alias loop-scoped scratch.
		if lastRun.Valid {
			v := lastRun.Time
			item.LastRunAt = &v
		}
		if lastSuccess.Valid {
			v := lastSuccess.Time
			item.LastSuccessAt = &v
		}
		if lastErrorAt.Valid {
			v := lastErrorAt.Time
			item.LastErrorAt = &v
		}
		if lastError.Valid {
			v := lastError.String
			item.LastError = &v
		}
		if lastDuration.Valid {
			v := lastDuration.Int64
			item.LastDurationMs = &v
		}
		out = append(out, &item)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
func opsNullBool(v *bool) any {
if v == nil {
return sql.NullBool{}
}
return sql.NullBool{Bool: *v, Valid: true}
}
func opsNullFloat64(v *float64) any {
if v == nil {
return sql.NullFloat64{}
}
return sql.NullFloat64{Float64: *v, Valid: true}
}
func opsNullTime(v *time.Time) any {
if v == nil || v.IsZero() {
return sql.NullTime{}
}
return sql.NullTime{Time: *v, Valid: true}
}

View File

@@ -0,0 +1,359 @@
package repository
import (
"context"
"database/sql"
"fmt"
"time"
)
// UpsertHourlyMetrics recomputes hourly rollups for [startTime, endTime)
// from usage_logs (successes, tokens, latency/TTFT percentiles) and
// ops_error_logs (error counters), then upserts the results into
// ops_metrics_hourly at three granularities via GROUPING SETS:
// overall, per-platform, and per-platform+group. A zero or inverted time
// window is a silent no-op so callers can schedule it unconditionally.
func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}
	start := startTime.UTC()
	end := endTime.UTC()
	// NOTE:
	// - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly.
	// - We emit three dimension granularities via GROUPING SETS:
	//   1) overall:  (bucket_start)
	//   2) platform: (bucket_start, platform)
	//   3) group:    (bucket_start, platform, group_id)
	//
	// IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based
	// unique index; our ON CONFLICT target must match that expression set.
	//
	// The FULL OUTER JOIN in `combined` stitches the two aggregates together
	// so hours that have only errors (or only successes) still produce a row.
	q := `
WITH usage_base AS (
SELECT
date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
g.platform AS platform,
ul.group_id AS group_id,
ul.duration_ms AS duration_ms,
ul.first_token_ms AS first_token_ms,
(ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens
FROM usage_logs ul
JOIN groups g ON g.id = ul.group_id
WHERE ul.created_at >= $1 AND ul.created_at < $2
),
usage_agg AS (
SELECT
bucket_start,
CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
COUNT(*) AS success_count,
COALESCE(SUM(tokens), 0) AS token_consumed,
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms,
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms,
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms,
AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms,
MAX(duration_ms) AS duration_max_ms,
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms,
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms,
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms,
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms,
AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms,
MAX(first_token_ms) AS ttft_max_ms
FROM usage_base
GROUP BY GROUPING SETS (
(bucket_start),
(bucket_start, platform),
(bucket_start, platform, group_id)
)
),
error_base AS (
SELECT
date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
platform AS platform,
group_id AS group_id,
is_business_limited AS is_business_limited,
error_owner AS error_owner,
status_code AS client_status_code,
COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2
),
error_agg AS (
SELECT
bucket_start,
CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
FROM error_base
GROUP BY GROUPING SETS (
(bucket_start),
(bucket_start, platform),
(bucket_start, platform, group_id)
)
HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL
),
combined AS (
SELECT
COALESCE(u.bucket_start, e.bucket_start) AS bucket_start,
COALESCE(u.platform, e.platform) AS platform,
COALESCE(u.group_id, e.group_id) AS group_id,
COALESCE(u.success_count, 0) AS success_count,
COALESCE(e.error_count_total, 0) AS error_count_total,
COALESCE(e.business_limited_count, 0) AS business_limited_count,
COALESCE(e.error_count_sla, 0) AS error_count_sla,
COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529,
COALESCE(e.upstream_429_count, 0) AS upstream_429_count,
COALESCE(e.upstream_529_count, 0) AS upstream_529_count,
COALESCE(u.token_consumed, 0) AS token_consumed,
u.duration_p50_ms,
u.duration_p90_ms,
u.duration_p95_ms,
u.duration_p99_ms,
u.duration_avg_ms,
u.duration_max_ms,
u.ttft_p50_ms,
u.ttft_p90_ms,
u.ttft_p95_ms,
u.ttft_p99_ms,
u.ttft_avg_ms,
u.ttft_max_ms
FROM usage_agg u
FULL OUTER JOIN error_agg e
ON u.bucket_start = e.bucket_start
AND COALESCE(u.platform, '') = COALESCE(e.platform, '')
AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0)
)
INSERT INTO ops_metrics_hourly (
bucket_start,
platform,
group_id,
success_count,
error_count_total,
business_limited_count,
error_count_sla,
upstream_error_count_excl_429_529,
upstream_429_count,
upstream_529_count,
token_consumed,
duration_p50_ms,
duration_p90_ms,
duration_p95_ms,
duration_p99_ms,
duration_avg_ms,
duration_max_ms,
ttft_p50_ms,
ttft_p90_ms,
ttft_p95_ms,
ttft_p99_ms,
ttft_avg_ms,
ttft_max_ms,
computed_at
)
SELECT
bucket_start,
NULLIF(platform, '') AS platform,
group_id,
success_count,
error_count_total,
business_limited_count,
error_count_sla,
upstream_error_count_excl_429_529,
upstream_429_count,
upstream_529_count,
token_consumed,
duration_p50_ms::int,
duration_p90_ms::int,
duration_p95_ms::int,
duration_p99_ms::int,
duration_avg_ms,
duration_max_ms::int,
ttft_p50_ms::int,
ttft_p90_ms::int,
ttft_p95_ms::int,
ttft_p99_ms::int,
ttft_avg_ms,
ttft_max_ms::int,
NOW()
FROM combined
WHERE bucket_start IS NOT NULL
AND (platform IS NULL OR platform <> '')
ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
success_count = EXCLUDED.success_count,
error_count_total = EXCLUDED.error_count_total,
business_limited_count = EXCLUDED.business_limited_count,
error_count_sla = EXCLUDED.error_count_sla,
upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
upstream_429_count = EXCLUDED.upstream_429_count,
upstream_529_count = EXCLUDED.upstream_529_count,
token_consumed = EXCLUDED.token_consumed,
duration_p50_ms = EXCLUDED.duration_p50_ms,
duration_p90_ms = EXCLUDED.duration_p90_ms,
duration_p95_ms = EXCLUDED.duration_p95_ms,
duration_p99_ms = EXCLUDED.duration_p99_ms,
duration_avg_ms = EXCLUDED.duration_avg_ms,
duration_max_ms = EXCLUDED.duration_max_ms,
ttft_p50_ms = EXCLUDED.ttft_p50_ms,
ttft_p90_ms = EXCLUDED.ttft_p90_ms,
ttft_p95_ms = EXCLUDED.ttft_p95_ms,
ttft_p99_ms = EXCLUDED.ttft_p99_ms,
ttft_avg_ms = EXCLUDED.ttft_avg_ms,
ttft_max_ms = EXCLUDED.ttft_max_ms,
computed_at = NOW()
`
	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
// UpsertDailyMetrics rolls hourly rows within [startTime, endTime) up into
// ops_metrics_daily. Counters and tokens are summed exactly; percentiles are
// approximated (success-count-weighted average for p50/p90, MAX for p95/p99
// as a conservative tail estimate — see the in-query comment). A zero or
// inverted window is a silent no-op.
func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}
	start := startTime.UTC()
	end := endTime.UTC()
	// The ON CONFLICT target mirrors the COALESCE-based unique index on
	// (bucket_date, platform, group_id) — NULL dimensions are folded to
	// ''/0 there because Postgres treats NULLs as distinct in UNIQUE.
	q := `
INSERT INTO ops_metrics_daily (
bucket_date,
platform,
group_id,
success_count,
error_count_total,
business_limited_count,
error_count_sla,
upstream_error_count_excl_429_529,
upstream_429_count,
upstream_529_count,
token_consumed,
duration_p50_ms,
duration_p90_ms,
duration_p95_ms,
duration_p99_ms,
duration_avg_ms,
duration_max_ms,
ttft_p50_ms,
ttft_p90_ms,
ttft_p95_ms,
ttft_p99_ms,
ttft_avg_ms,
ttft_max_ms,
computed_at
)
SELECT
(bucket_start AT TIME ZONE 'UTC')::date AS bucket_date,
platform,
group_id,
COALESCE(SUM(success_count), 0) AS success_count,
COALESCE(SUM(error_count_total), 0) AS error_count_total,
COALESCE(SUM(business_limited_count), 0) AS business_limited_count,
COALESCE(SUM(error_count_sla), 0) AS error_count_sla,
COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529,
COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count,
COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count,
COALESCE(SUM(token_consumed), 0) AS token_consumed,
-- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail).
ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms,
ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms,
MAX(duration_p95_ms) AS duration_p95_ms,
MAX(duration_p99_ms) AS duration_p99_ms,
SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms,
MAX(duration_max_ms) AS duration_max_ms,
ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms,
ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms,
MAX(ttft_p95_ms) AS ttft_p95_ms,
MAX(ttft_p99_ms) AS ttft_p99_ms,
SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL)
/ NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms,
MAX(ttft_max_ms) AS ttft_max_ms,
NOW()
FROM ops_metrics_hourly
WHERE bucket_start >= $1 AND bucket_start < $2
GROUP BY 1, 2, 3
ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
success_count = EXCLUDED.success_count,
error_count_total = EXCLUDED.error_count_total,
business_limited_count = EXCLUDED.business_limited_count,
error_count_sla = EXCLUDED.error_count_sla,
upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
upstream_429_count = EXCLUDED.upstream_429_count,
upstream_529_count = EXCLUDED.upstream_529_count,
token_consumed = EXCLUDED.token_consumed,
duration_p50_ms = EXCLUDED.duration_p50_ms,
duration_p90_ms = EXCLUDED.duration_p90_ms,
duration_p95_ms = EXCLUDED.duration_p95_ms,
duration_p99_ms = EXCLUDED.duration_p99_ms,
duration_avg_ms = EXCLUDED.duration_avg_ms,
duration_max_ms = EXCLUDED.duration_max_ms,
ttft_p50_ms = EXCLUDED.ttft_p50_ms,
ttft_p90_ms = EXCLUDED.ttft_p90_ms,
ttft_p95_ms = EXCLUDED.ttft_p95_ms,
ttft_p99_ms = EXCLUDED.ttft_p99_ms,
ttft_avg_ms = EXCLUDED.ttft_avg_ms,
ttft_max_ms = EXCLUDED.ttft_max_ms,
computed_at = NOW()
`
	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
// GetLatestHourlyBucketStart reports the most recent bucket_start present in
// ops_metrics_hourly (UTC). The boolean is false when the table is empty.
func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) {
	if r == nil || r.db == nil {
		return time.Time{}, false, fmt.Errorf("nil ops repository")
	}
	var latest sql.NullTime
	err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&latest)
	if err != nil {
		return time.Time{}, false, err
	}
	if !latest.Valid {
		// MAX over an empty table yields NULL: no buckets yet.
		return time.Time{}, false, nil
	}
	return latest.Time.UTC(), true, nil
}
// GetLatestDailyBucketDate reports the most recent bucket_date present in
// ops_metrics_daily, normalized to midnight UTC. The boolean is false when
// the table is empty.
func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) {
	if r == nil || r.db == nil {
		return time.Time{}, false, fmt.Errorf("nil ops repository")
	}
	var latest sql.NullTime
	err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&latest)
	if err != nil {
		return time.Time{}, false, err
	}
	if !latest.Valid {
		// MAX over an empty table yields NULL: no buckets yet.
		return time.Time{}, false, nil
	}
	// Drop any time-of-day component the driver attached to the DATE column.
	day := latest.Time.UTC()
	day = time.Date(day.Year(), day.Month(), day.Day(), 0, 0, 0, 0, time.UTC)
	return day, true, nil
}

View File

@@ -0,0 +1,286 @@
package repository
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// ListRequestDetails returns one page of the unified request feed: successful
// requests from usage_logs merged (UNION ALL) with failed requests
// (status >= 400) from ops_error_logs, filtered and sorted in SQL. It returns
// the page items plus the total row count matching the filter.
// NOTE(review): filter.Normalize() runs before the `filter != nil` checks
// below, so Normalize is presumably nil-receiver safe — TODO confirm.
func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) {
	if r == nil || r.db == nil {
		return nil, 0, fmt.Errorf("nil ops repository")
	}
	page, pageSize, startTime, endTime := filter.Normalize()
	offset := (page - 1) * pageSize
	conditions := make([]string, 0, 16)
	args := make([]any, 0, 24)
	// Placeholders $1/$2 reserved for time window inside the CTE.
	args = append(args, startTime.UTC(), endTime.UTC())
	// addCondition appends a predicate plus its bind values; placeholder
	// numbers are derived from len(args) at call time, so predicates must
	// compute their indexes BEFORE appending (see the Query case below).
	addCondition := func(condition string, values ...any) {
		conditions = append(conditions, condition)
		args = append(args, values...)
	}
	if filter != nil {
		if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" {
			if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) {
				return nil, 0, fmt.Errorf("invalid kind")
			}
			addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind)
		}
		if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" {
			addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform)
		}
		if filter.GroupID != nil && *filter.GroupID > 0 {
			addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID)
		}
		if filter.UserID != nil && *filter.UserID > 0 {
			addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID)
		}
		if filter.APIKeyID != nil && *filter.APIKeyID > 0 {
			addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID)
		}
		if filter.AccountID != nil && *filter.AccountID > 0 {
			addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID)
		}
		if model := strings.TrimSpace(filter.Model); model != "" {
			addCondition(fmt.Sprintf("model = $%d", len(args)+1), model)
		}
		if requestID := strings.TrimSpace(filter.RequestID); requestID != "" {
			addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID)
		}
		if q := strings.TrimSpace(filter.Query); q != "" {
			// Free-text search across request id, model and error message.
			like := "%" + strings.ToLower(q) + "%"
			startIdx := len(args) + 1
			addCondition(
				fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)",
					startIdx, startIdx+1, startIdx+2,
				),
				like, like, like,
			)
		}
		if filter.MinDurationMs != nil {
			addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs)
		}
		if filter.MaxDurationMs != nil {
			addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs)
		}
	}
	where := ""
	if len(conditions) > 0 {
		where = "WHERE " + strings.Join(conditions, " AND ")
	}
	// The CTE normalizes both sources to one column set; success rows carry
	// NULL error columns and error rows derive platform with extra fallbacks.
	cte := `
WITH combined AS (
SELECT
'success'::TEXT AS kind,
ul.created_at AS created_at,
ul.request_id AS request_id,
COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
ul.model AS model,
ul.duration_ms AS duration_ms,
NULL::INT AS status_code,
NULL::BIGINT AS error_id,
NULL::TEXT AS phase,
NULL::TEXT AS severity,
NULL::TEXT AS message,
ul.user_id AS user_id,
ul.api_key_id AS api_key_id,
ul.account_id AS account_id,
ul.group_id AS group_id,
ul.stream AS stream
FROM usage_logs ul
LEFT JOIN groups g ON g.id = ul.group_id
LEFT JOIN accounts a ON a.id = ul.account_id
WHERE ul.created_at >= $1 AND ul.created_at < $2
UNION ALL
SELECT
'error'::TEXT AS kind,
o.created_at AS created_at,
COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id,
COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
o.model AS model,
o.duration_ms AS duration_ms,
o.status_code AS status_code,
o.id AS error_id,
o.error_phase AS phase,
o.severity AS severity,
o.error_message AS message,
o.user_id AS user_id,
o.api_key_id AS api_key_id,
o.account_id AS account_id,
o.group_id AS group_id,
o.stream AS stream
FROM ops_error_logs o
LEFT JOIN groups g ON g.id = o.group_id
LEFT JOIN accounts a ON a.id = o.account_id
WHERE o.created_at >= $1 AND o.created_at < $2
AND COALESCE(o.status_code, 0) >= 400
)
`
	countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where)
	var total int64
	if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil {
		if err == sql.ErrNoRows {
			total = 0
		} else {
			return nil, 0, err
		}
	}
	// Only two sort orders are accepted; anything else is rejected rather
	// than interpolated into SQL.
	sort := "ORDER BY created_at DESC"
	if filter != nil {
		switch strings.TrimSpace(strings.ToLower(filter.Sort)) {
		case "", "created_at_desc":
			// default
		case "duration_desc":
			sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC"
		default:
			return nil, 0, fmt.Errorf("invalid sort")
		}
	}
	// LIMIT/OFFSET placeholders are appended after all filter args.
	listQuery := fmt.Sprintf(`
%s
SELECT
kind,
created_at,
request_id,
platform,
model,
duration_ms,
status_code,
error_id,
phase,
severity,
message,
user_id,
api_key_id,
account_id,
group_id,
stream
FROM combined
%s
%s
LIMIT $%d OFFSET $%d
`, cte, where, sort, len(args)+1, len(args)+2)
	listArgs := append(append([]any{}, args...), pageSize, offset)
	rows, err := r.db.QueryContext(ctx, listQuery, listArgs...)
	if err != nil {
		return nil, 0, err
	}
	defer func() { _ = rows.Close() }()
	// Small converters from nullable scan targets to the pointer fields on
	// the service struct.
	toIntPtr := func(v sql.NullInt64) *int {
		if !v.Valid {
			return nil
		}
		i := int(v.Int64)
		return &i
	}
	toInt64Ptr := func(v sql.NullInt64) *int64 {
		if !v.Valid {
			return nil
		}
		i := v.Int64
		return &i
	}
	out := make([]*service.OpsRequestDetail, 0, pageSize)
	for rows.Next() {
		var (
			kind       string
			createdAt  time.Time
			requestID  sql.NullString
			platform   sql.NullString
			model      sql.NullString
			durationMs sql.NullInt64
			statusCode sql.NullInt64
			errorID    sql.NullInt64
			phase      sql.NullString
			severity   sql.NullString
			message    sql.NullString
			userID     sql.NullInt64
			apiKeyID   sql.NullInt64
			accountID  sql.NullInt64
			groupID    sql.NullInt64
			stream     bool
		)
		if err := rows.Scan(
			&kind,
			&createdAt,
			&requestID,
			&platform,
			&model,
			&durationMs,
			&statusCode,
			&errorID,
			&phase,
			&severity,
			&message,
			&userID,
			&apiKeyID,
			&accountID,
			&groupID,
			&stream,
		); err != nil {
			return nil, 0, err
		}
		item := &service.OpsRequestDetail{
			Kind:       service.OpsRequestKind(kind),
			CreatedAt:  createdAt,
			RequestID:  strings.TrimSpace(requestID.String),
			Platform:   strings.TrimSpace(platform.String),
			Model:      strings.TrimSpace(model.String),
			DurationMs: toIntPtr(durationMs),
			StatusCode: toIntPtr(statusCode),
			ErrorID:    toInt64Ptr(errorID),
			Phase:      phase.String,
			Severity:   severity.String,
			Message:    message.String,
			UserID:     toInt64Ptr(userID),
			APIKeyID:   toInt64Ptr(apiKeyID),
			AccountID:  toInt64Ptr(accountID),
			GroupID:    toInt64Ptr(groupID),
			Stream:     stream,
		}
		// Keep the API contract of a non-empty platform label.
		if item.Platform == "" {
			item.Platform = "unknown"
		}
		out = append(out, item)
	}
	if err := rows.Err(); err != nil {
		return nil, 0, err
	}
	return out, total, nil
}

View File

@@ -0,0 +1,571 @@
package repository
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// GetThroughputTrend returns request/token throughput over the filter's time
// window, bucketed at bucketSeconds (only 60/300/3600 are supported; anything
// else falls back to 60). Successes come from usage_logs and failures
// (status >= 400) from ops_error_logs; missing buckets are zero-filled so
// charts render a continuous timeline. Depending on the filter it also adds
// a per-platform breakdown (no platform/group selected) or the top groups of
// the selected platform (platform selected, no group).
//
// Cleanups vs. the previous version: filter is nil-checked once up front, so
// the later re-checks were dead code and are removed; the per-row divisor is
// hoisted out of the scan loop (bucketSeconds is already normalized > 0).
func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		return nil, fmt.Errorf("nil filter")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, fmt.Errorf("start_time/end_time required")
	}
	if bucketSeconds <= 0 {
		bucketSeconds = 60
	}
	if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
		// Keep a small, predictable set of supported buckets for now.
		bucketSeconds = 60
	}
	start := filter.StartTime.UTC()
	end := filter.EndTime.UTC()
	// Placeholder numbering is shared: usage predicates start at $1, error
	// predicates continue from where the usage builder stopped.
	usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1)
	errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next)
	usageBucketExpr := opsBucketExprForUsage(bucketSeconds)
	errorBucketExpr := opsBucketExprForError(bucketSeconds)
	q := `
WITH usage_buckets AS (
SELECT ` + usageBucketExpr + ` AS bucket,
COUNT(*) AS success_count,
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
FROM usage_logs ul
` + usageJoin + `
` + usageWhere + `
GROUP BY 1
),
error_buckets AS (
SELECT ` + errorBucketExpr + ` AS bucket,
COUNT(*) AS error_count
FROM ops_error_logs
` + errorWhere + `
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
combined AS (
SELECT COALESCE(u.bucket, e.bucket) AS bucket,
COALESCE(u.success_count, 0) AS success_count,
COALESCE(e.error_count, 0) AS error_count,
COALESCE(u.token_consumed, 0) AS token_consumed
FROM usage_buckets u
FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
)
SELECT
bucket,
(success_count + error_count) AS request_count,
token_consumed
FROM combined
ORDER BY bucket ASC`
	args := append(usageArgs, errorArgs...)
	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	// bucketSeconds was normalized to a positive value above, so the divisor
	// is computed once instead of per row.
	denom := float64(bucketSeconds)
	points := make([]*service.OpsThroughputTrendPoint, 0, 256)
	for rows.Next() {
		var bucket time.Time
		var requests int64
		var tokens sql.NullInt64
		if err := rows.Scan(&bucket, &requests, &tokens); err != nil {
			return nil, err
		}
		tokenConsumed := int64(0)
		if tokens.Valid {
			tokenConsumed = tokens.Int64
		}
		points = append(points, &service.OpsThroughputTrendPoint{
			BucketStart:   bucket.UTC(),
			RequestCount:  requests,
			TokenConsumed: tokenConsumed,
			QPS:           roundTo1DP(float64(requests) / denom),
			TPS:           roundTo1DP(float64(tokenConsumed) / denom),
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	// Fill missing buckets with zeros so charts render continuous timelines.
	points = fillOpsThroughputBuckets(start, end, bucketSeconds, points)
	var byPlatform []*service.OpsThroughputPlatformBreakdownItem
	var topGroups []*service.OpsThroughputGroupBreakdownItem
	// filter is guaranteed non-nil here (checked at the top).
	platform := strings.TrimSpace(strings.ToLower(filter.Platform))
	groupID := filter.GroupID
	// Drilldown helpers:
	// - No platform/group: totals by platform
	// - Platform selected but no group: top groups in that platform
	if platform == "" && (groupID == nil || *groupID <= 0) {
		items, err := r.getThroughputBreakdownByPlatform(ctx, start, end)
		if err != nil {
			return nil, err
		}
		byPlatform = items
	} else if platform != "" && (groupID == nil || *groupID <= 0) {
		items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10)
		if err != nil {
			return nil, err
		}
		topGroups = items
	}
	return &service.OpsThroughputTrendResponse{
		Bucket:     opsBucketLabel(bucketSeconds),
		Points:     points,
		ByPlatform: byPlatform,
		TopGroups:  topGroups,
	}, nil
}
// getThroughputBreakdownByPlatform aggregates request and token totals per
// platform over [start, end). Successful traffic comes from usage_logs (the
// platform is resolved from the group first, falling back to the account);
// error traffic comes from ops_error_logs rows with status >= 400. The two
// sides are combined with a FULL OUTER JOIN so platforms that appear on only
// one side are still reported. Rows with an empty platform are dropped.
func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) {
	const q = `
		WITH usage_totals AS (
			SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform,
			       COUNT(*) AS success_count,
			       COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
			FROM usage_logs ul
			LEFT JOIN groups g ON g.id = ul.group_id
			LEFT JOIN accounts a ON a.id = ul.account_id
			WHERE ul.created_at >= $1 AND ul.created_at < $2
			GROUP BY 1
		),
		error_totals AS (
			SELECT platform,
			       COUNT(*) AS error_count
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
			  AND COALESCE(status_code, 0) >= 400
			GROUP BY 1
		),
		combined AS (
			SELECT COALESCE(u.platform, e.platform) AS platform,
			       COALESCE(u.success_count, 0) AS success_count,
			       COALESCE(e.error_count, 0) AS error_count,
			       COALESCE(u.token_consumed, 0) AS token_consumed
			FROM usage_totals u
			FULL OUTER JOIN error_totals e ON u.platform = e.platform
		)
		SELECT platform, (success_count + error_count) AS request_count, token_consumed
		FROM combined
		WHERE platform IS NOT NULL AND platform <> ''
		ORDER BY request_count DESC`

	rows, err := r.db.QueryContext(ctx, q, start, end)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8)
	for rows.Next() {
		var (
			plat     string
			requests int64
			tokens   sql.NullInt64
		)
		if err := rows.Scan(&plat, &requests, &tokens); err != nil {
			return nil, err
		}
		// NullInt64.Int64 is zero when the value is NULL, which is exactly
		// the default we want for token_consumed.
		out = append(out, &service.OpsThroughputPlatformBreakdownItem{
			Platform:      plat,
			RequestCount:  requests,
			TokenConsumed: tokens.Int64,
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// getThroughputTopGroupsByPlatform returns the busiest groups (ranked by
// combined success+error request volume) for a single platform over
// [start, end). Success traffic comes from usage_logs joined to groups;
// error traffic from ops_error_logs (status >= 400, group attributed).
// A FULL OUTER JOIN keeps groups that only appear on one side.
//
// A blank platform yields (nil, nil); limit is clamped to 1..100 with a
// default of 10.
func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) {
	if strings.TrimSpace(platform) == "" {
		return nil, nil
	}
	if limit <= 0 || limit > 100 {
		limit = 10
	}

	const q = `
		WITH usage_totals AS (
			SELECT ul.group_id AS group_id,
			       g.name AS group_name,
			       COUNT(*) AS success_count,
			       COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
			FROM usage_logs ul
			JOIN groups g ON g.id = ul.group_id
			WHERE ul.created_at >= $1 AND ul.created_at < $2
			  AND g.platform = $3
			GROUP BY 1, 2
		),
		error_totals AS (
			SELECT group_id,
			       COUNT(*) AS error_count
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
			  AND platform = $3
			  AND group_id IS NOT NULL
			  AND COALESCE(status_code, 0) >= 400
			GROUP BY 1
		),
		combined AS (
			SELECT COALESCE(u.group_id, e.group_id) AS group_id,
			       COALESCE(u.group_name, g2.name, '') AS group_name,
			       COALESCE(u.success_count, 0) AS success_count,
			       COALESCE(e.error_count, 0) AS error_count,
			       COALESCE(u.token_consumed, 0) AS token_consumed
			FROM usage_totals u
			FULL OUTER JOIN error_totals e ON u.group_id = e.group_id
			LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id)
		)
		SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed
		FROM combined
		WHERE group_id IS NOT NULL
		ORDER BY request_count DESC
		LIMIT $4`

	rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit)
	for rows.Next() {
		var (
			groupID   int64
			groupName sql.NullString
			requests  int64
			tokens    sql.NullInt64
		)
		if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil {
			return nil, err
		}
		// The Null* wrappers report "" / 0 when the column is NULL, which
		// matches the defaults we want for name and token count.
		out = append(out, &service.OpsThroughputGroupBreakdownItem{
			GroupID:       groupID,
			GroupName:     groupName.String,
			RequestCount:  requests,
			TokenConsumed: tokens.Int64,
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// opsBucketExprForUsage returns the SQL expression that truncates
// usage_logs timestamps (table aliased as ul) to the requested bucket
// width. Supported widths are 3600s (hour) and 300s (5 minutes); any
// other value falls back to 1-minute buckets.
func opsBucketExprForUsage(bucketSeconds int) string {
	if bucketSeconds == 3600 {
		return "date_trunc('hour', ul.created_at)"
	}
	if bucketSeconds == 300 {
		// date_trunc has no 5-minute granularity; floor the epoch instead (UTC).
		return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)"
	}
	return "date_trunc('minute', ul.created_at)"
}
// opsBucketExprForError returns the SQL expression that truncates
// ops_error_logs timestamps to the requested bucket width. Supported
// widths are 3600s (hour) and 300s (5 minutes); any other value falls
// back to 1-minute buckets.
func opsBucketExprForError(bucketSeconds int) string {
	if bucketSeconds == 3600 {
		return "date_trunc('hour', created_at)"
	}
	if bucketSeconds == 300 {
		// date_trunc has no 5-minute granularity; floor the epoch instead (UTC).
		return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)"
	}
	return "date_trunc('minute', created_at)"
}
// opsBucketLabel converts a bucket width in seconds into the compact label
// used in API responses ("1m", "5m", "1h", "2h", ...).
//
// Non-positive input collapses to "1m". Exact hour multiples render in
// hours; everything else renders in whole minutes (truncated), with
// sub-minute widths clamped up to "1m".
func opsBucketLabel(bucketSeconds int) string {
	if bucketSeconds <= 0 {
		return "1m"
	}
	if bucketSeconds%3600 == 0 {
		// bucketSeconds is a positive hour multiple here, so the quotient
		// is always >= 1 — no clamp needed (the original h<=0 check was
		// unreachable dead code).
		return fmt.Sprintf("%dh", bucketSeconds/3600)
	}
	m := bucketSeconds / 60
	if m <= 0 {
		m = 1 // sub-minute widths still read as "1m"
	}
	return fmt.Sprintf("%dm", m)
}
func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time {
t = t.UTC()
if bucketSeconds <= 0 {
bucketSeconds = 60
}
secs := t.Unix()
floored := secs - (secs % int64(bucketSeconds))
return time.Unix(floored, 0).UTC()
}
// fillOpsThroughputBuckets pads the throughput series for [start, end) with
// zero-valued points so charts can render a continuous timeline. Existing
// points are matched by their UTC bucket-start epoch and kept as-is; every
// missing bucket gets an all-zero point. An empty or inverted window returns
// the input unchanged.
func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint {
	if bucketSeconds <= 0 {
		bucketSeconds = 60
	}
	if !start.Before(end) {
		return points
	}
	// end is exclusive; the last instant inside the window decides the
	// final bucket.
	lastInstant := end.Add(-time.Nanosecond)
	if lastInstant.Before(start) {
		return points
	}

	byEpoch := make(map[int64]*service.OpsThroughputTrendPoint, len(points))
	for _, p := range points {
		if p != nil {
			byEpoch[p.BucketStart.UTC().Unix()] = p
		}
	}

	step := time.Duration(bucketSeconds) * time.Second
	first := opsFloorToBucketStart(start, bucketSeconds)
	last := opsFloorToBucketStart(lastInstant, bucketSeconds)
	filled := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1)
	for at := first; !at.After(last); at = at.Add(step) {
		if p := byEpoch[at.Unix()]; p != nil {
			filled = append(filled, p)
			continue
		}
		// Zero value covers RequestCount/TokenConsumed/QPS/TPS.
		filled = append(filled, &service.OpsThroughputTrendPoint{BucketStart: at})
	}
	return filled
}
// GetErrorTrend returns bucketed error counts over the filter window.
//
// Each point carries the total error count (status >= 400), the
// business-limited slice, the SLA-relevant remainder, and provider-owned
// upstream errors broken out by effective status (429, 529, and everything
// else). Bucket widths other than 60, 300, or 3600 seconds are coerced to
// 60 seconds, and gaps in the series are zero-filled before returning.
func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) {
	switch {
	case r == nil || r.db == nil:
		return nil, fmt.Errorf("nil ops repository")
	case filter == nil:
		return nil, fmt.Errorf("nil filter")
	case filter.StartTime.IsZero() || filter.EndTime.IsZero():
		return nil, fmt.Errorf("start_time/end_time required")
	}
	switch bucketSeconds {
	case 60, 300, 3600:
		// supported widths
	default:
		bucketSeconds = 60
	}

	start, end := filter.StartTime.UTC(), filter.EndTime.UTC()
	where, args, _ := buildErrorWhere(filter, start, end, 1)
	q := `
		SELECT
			` + opsBucketExprForError(bucketSeconds) + ` AS bucket,
			COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total,
			COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited,
			COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529
		FROM ops_error_logs
		` + where + `
		GROUP BY 1
		ORDER BY 1 ASC`

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	points := make([]*service.OpsErrorTrendPoint, 0, 256)
	for rows.Next() {
		var (
			bucket                                  time.Time
			total, limited, sla                     int64
			upstreamOther, upstream429, upstream529 int64
		)
		if err := rows.Scan(&bucket, &total, &limited, &sla, &upstreamOther, &upstream429, &upstream529); err != nil {
			return nil, err
		}
		points = append(points, &service.OpsErrorTrendPoint{
			BucketStart:                  bucket.UTC(),
			ErrorCountTotal:              total,
			BusinessLimitedCount:         limited,
			ErrorCountSLA:                sla,
			UpstreamErrorCountExcl429529: upstreamOther,
			Upstream429Count:             upstream429,
			Upstream529Count:             upstream529,
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return &service.OpsErrorTrendResponse{
		Bucket: opsBucketLabel(bucketSeconds),
		Points: fillOpsErrorTrendBuckets(start, end, bucketSeconds, points),
	}, nil
}
// fillOpsErrorTrendBuckets pads the error-trend series for [start, end)
// with zero-valued points so every bucket in the window is represented.
// Existing points are matched by their UTC bucket-start epoch and kept
// as-is; an empty or inverted window returns the input unchanged.
func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint {
	if bucketSeconds <= 0 {
		bucketSeconds = 60
	}
	if !start.Before(end) {
		return points
	}
	// end is exclusive; the last instant inside the window decides the
	// final bucket.
	lastInstant := end.Add(-time.Nanosecond)
	if lastInstant.Before(start) {
		return points
	}

	byEpoch := make(map[int64]*service.OpsErrorTrendPoint, len(points))
	for _, p := range points {
		if p != nil {
			byEpoch[p.BucketStart.UTC().Unix()] = p
		}
	}

	step := time.Duration(bucketSeconds) * time.Second
	first := opsFloorToBucketStart(start, bucketSeconds)
	last := opsFloorToBucketStart(lastInstant, bucketSeconds)
	filled := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1)
	for at := first; !at.After(last); at = at.Add(step) {
		if p := byEpoch[at.Unix()]; p != nil {
			filled = append(filled, p)
			continue
		}
		// Zero value covers every counter field.
		filled = append(filled, &service.OpsErrorTrendPoint{BucketStart: at})
	}
	return filled
}
// GetErrorDistribution groups errors (status >= 400) in the filter window by
// their effective status code — the upstream status when present, otherwise
// the gateway status — and returns the top 20 codes by volume, each split
// into SLA-relevant and business-limited counts.
//
// NOTE(review): Total sums only the returned top-20 rows, not all errors in
// the window — confirm callers expect that.
func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) {
	switch {
	case r == nil || r.db == nil:
		return nil, fmt.Errorf("nil ops repository")
	case filter == nil:
		return nil, fmt.Errorf("nil filter")
	case filter.StartTime.IsZero() || filter.EndTime.IsZero():
		return nil, fmt.Errorf("start_time/end_time required")
	}

	start, end := filter.StartTime.UTC(), filter.EndTime.UTC()
	where, args, _ := buildErrorWhere(filter, start, end, 1)
	q := `
		SELECT
			COALESCE(upstream_status_code, status_code, 0) AS status_code,
			COUNT(*) AS total,
			COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
			COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
		FROM ops_error_logs
		` + where + `
		AND COALESCE(status_code, 0) >= 400
		GROUP BY 1
		ORDER BY total DESC
		LIMIT 20`

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	grandTotal := int64(0)
	items := make([]*service.OpsErrorDistributionItem, 0, 16)
	for rows.Next() {
		var (
			code            int
			total, sla, biz int64
		)
		if err := rows.Scan(&code, &total, &sla, &biz); err != nil {
			return nil, err
		}
		grandTotal += total
		items = append(items, &service.OpsErrorDistributionItem{
			StatusCode:      code,
			Total:           total,
			SLA:             sla,
			BusinessLimited: biz,
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return &service.OpsErrorDistributionResponse{Total: grandTotal, Items: items}, nil
}

View File

@@ -0,0 +1,50 @@
package repository
import (
"context"
"fmt"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// GetWindowStats returns aggregate success, error, and token totals for the
// filter window by delegating to queryUsageCounts and queryErrorCounts.
// The window must satisfy start <= end and is capped at 24 hours to keep
// ad-hoc queries from scanning unbounded history.
func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) {
	switch {
	case r == nil || r.db == nil:
		return nil, fmt.Errorf("nil ops repository")
	case filter == nil:
		return nil, fmt.Errorf("nil filter")
	case filter.StartTime.IsZero() || filter.EndTime.IsZero():
		return nil, fmt.Errorf("start_time/end_time required")
	}

	start, end := filter.StartTime.UTC(), filter.EndTime.UTC()
	if start.After(end) {
		return nil, fmt.Errorf("start_time must be <= end_time")
	}
	// Bound excessively large windows to prevent accidental heavy queries.
	if end.Sub(start) > 24*time.Hour {
		return nil, fmt.Errorf("window too large")
	}

	successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
	if err != nil {
		return nil, err
	}
	// Only the total is needed here; the per-category error breakdown is
	// discarded.
	errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end)
	if err != nil {
		return nil, err
	}

	return &service.OpsWindowStats{
		StartTime:       start,
		EndTime:         end,
		SuccessCount:    successCount,
		ErrorCountTotal: errorTotal,
		TokenConsumed:   tokenConsumed,
	}, nil
}

View File

@@ -204,7 +204,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
userToday := mustCreateUser(s.T(), s.client, &service.User{
Email: "today@example.com",
CreatedAt: maxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
CreatedAt: testMaxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
UpdatedAt: now,
})
userOld := mustCreateUser(s.T(), s.client, &service.User{
@@ -237,7 +237,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
TotalCost: 1.5,
ActualCost: 1.2,
DurationMs: &d1,
CreatedAt: maxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
CreatedAt: testMaxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
}
_, err = s.repo.Create(s.ctx, logToday)
s.Require().NoError(err, "Create logToday")
@@ -413,9 +413,17 @@ func (s *UsageLogRepoSuite) TestGetAccountTodayStats() {
func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
now := time.Now().UTC().Truncate(time.Second)
hour1 := now.Add(-90 * time.Minute).Truncate(time.Hour)
hour2 := now.Add(-30 * time.Minute).Truncate(time.Hour)
// 使用固定的时间偏移确保 hour1 和 hour2 在同一天且都在过去
// 选择当天 02:00 和 03:00 作为测试时间点(基于 now 的日期)
dayStart := truncateToDayUTC(now)
hour1 := dayStart.Add(2 * time.Hour) // 当天 02:00
hour2 := dayStart.Add(3 * time.Hour) // 当天 03:00
// 如果当前时间早于 hour2则使用昨天的时间
if now.Before(hour2.Add(time.Hour)) {
dayStart = dayStart.Add(-24 * time.Hour)
hour1 = dayStart.Add(2 * time.Hour)
hour2 = dayStart.Add(3 * time.Hour)
}
user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"})
user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"})
@@ -473,7 +481,7 @@ func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx)
aggStart := hour1.Add(-5 * time.Minute)
aggEnd := now.Add(5 * time.Minute)
aggEnd := hour2.Add(time.Hour) // 确保覆盖 hour2 的所有数据
s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd))
type hourlyRow struct {
@@ -621,7 +629,7 @@ func (s *UsageLogRepoSuite) TestGetGlobalStats() {
s.Require().Equal(int64(45), stats.TotalOutputTokens)
}
func maxTime(a, b time.Time) time.Time {
func testMaxTime(a, b time.Time) time.Time {
if a.After(b) {
return a
}

View File

@@ -49,6 +49,7 @@ var ProviderSet = wire.NewSet(
NewUsageLogRepository,
NewDashboardAggregationRepository,
NewSettingRepository,
NewOpsRepository,
NewUserSubscriptionRepository,
NewUserAttributeDefinitionRepository,
NewUserAttributeValueRepository,

View File

@@ -262,11 +262,11 @@ func TestAPIContracts(t *testing.T) {
name: "GET /api/v1/admin/settings",
setup: func(t *testing.T, deps *contractDeps) {
t.Helper()
deps.settingRepo.SetAll(map[string]string{
service.SettingKeyRegistrationEnabled: "true",
service.SettingKeyEmailVerifyEnabled: "false",
deps.settingRepo.SetAll(map[string]string{
service.SettingKeyRegistrationEnabled: "true",
service.SettingKeyEmailVerifyEnabled: "false",
service.SettingKeySMTPHost: "smtp.example.com",
service.SettingKeySMTPHost: "smtp.example.com",
service.SettingKeySMTPPort: "587",
service.SettingKeySMTPUsername: "user",
service.SettingKeySMTPPassword: "secret",
@@ -285,10 +285,15 @@ func TestAPIContracts(t *testing.T) {
service.SettingKeyContactInfo: "support",
service.SettingKeyDocURL: "https://docs.example.com",
service.SettingKeyDefaultConcurrency: "5",
service.SettingKeyDefaultBalance: "1.25",
})
},
service.SettingKeyDefaultConcurrency: "5",
service.SettingKeyDefaultBalance: "1.25",
service.SettingKeyOpsMonitoringEnabled: "false",
service.SettingKeyOpsRealtimeMonitoringEnabled: "true",
service.SettingKeyOpsQueryModeDefault: "auto",
service.SettingKeyOpsMetricsIntervalSeconds: "60",
})
},
method: http.MethodGet,
path: "/api/v1/admin/settings",
wantStatus: http.StatusOK,
@@ -309,13 +314,17 @@ func TestAPIContracts(t *testing.T) {
"turnstile_site_key": "site-key",
"turnstile_secret_key_configured": true,
"linuxdo_connect_enabled": false,
"linuxdo_connect_client_id": "",
"linuxdo_connect_client_secret_configured": false,
"linuxdo_connect_redirect_url": "",
"site_name": "Sub2API",
"site_logo": "",
"site_subtitle": "Subtitle",
"api_base_url": "https://api.example.com",
"linuxdo_connect_client_id": "",
"linuxdo_connect_client_secret_configured": false,
"linuxdo_connect_redirect_url": "",
"ops_monitoring_enabled": false,
"ops_realtime_monitoring_enabled": true,
"ops_query_mode_default": "auto",
"ops_metrics_interval_seconds": 60,
"site_name": "Sub2API",
"site_logo": "",
"site_subtitle": "Subtitle",
"api_base_url": "https://api.example.com",
"contact_info": "support",
"doc_url": "https://docs.example.com",
"default_concurrency": 5,
@@ -430,7 +439,7 @@ func newContractDeps(t *testing.T) *contractDeps {
authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil)
apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService)
usageHandler := handler.NewUsageHandler(usageService, apiKeyService)
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil)
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil, nil)
adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil)
jwtAuth := func(c *gin.Context) {

View File

@@ -31,6 +31,7 @@ func ProvideRouter(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
settingService *service.SettingService,
redisClient *redis.Client,
) *gin.Engine {
@@ -50,7 +51,7 @@ func ProvideRouter(
}
}
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, settingService, cfg, redisClient)
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
}
// ProvideHTTPServer 提供 HTTP 服务器

View File

@@ -30,6 +30,20 @@ func adminAuth(
settingService *service.SettingService,
) gin.HandlerFunc {
return func(c *gin.Context) {
// WebSocket upgrade requests cannot set Authorization headers in browsers.
// For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via
// Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item:
// Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>
if isWebSocketUpgradeRequest(c) {
if token := extractJWTFromWebSocketSubprotocol(c); token != "" {
if !validateJWTForAdmin(c, token, authService, userService) {
return
}
c.Next()
return
}
}
// 检查 x-api-key headerAdmin API Key 认证)
apiKey := c.GetHeader("x-api-key")
if apiKey != "" {
@@ -58,6 +72,44 @@ func adminAuth(
}
}
func isWebSocketUpgradeRequest(c *gin.Context) bool {
if c == nil || c.Request == nil {
return false
}
// RFC6455 handshake uses:
// Connection: Upgrade
// Upgrade: websocket
upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade")))
if upgrade != "websocket" {
return false
}
connection := strings.ToLower(c.GetHeader("Connection"))
return strings.Contains(connection, "upgrade")
}
func extractJWTFromWebSocketSubprotocol(c *gin.Context) string {
if c == nil {
return ""
}
raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol"))
if raw == "" {
return ""
}
// The header is a comma-separated list of tokens. We reserve the prefix "jwt."
// for carrying the admin JWT.
for _, part := range strings.Split(raw, ",") {
p := strings.TrimSpace(part)
if strings.HasPrefix(p, "jwt.") {
token := strings.TrimSpace(strings.TrimPrefix(p, "jwt."))
if token != "" {
return token
}
}
}
return ""
}
// validateAdminAPIKey 验证管理员 API Key
func validateAdminAPIKey(
c *gin.Context,

View File

@@ -0,0 +1,30 @@
package middleware
import (
"context"
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
)
// ClientRequestID ensures every request has a unique client_request_id in request.Context().
//
// This is used by the Ops monitoring module for end-to-end request correlation.
func ClientRequestID() gin.HandlerFunc {
return func(c *gin.Context) {
if c.Request == nil {
c.Next()
return
}
if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil {
c.Next()
return
}
id := uuid.New().String()
c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id))
c.Next()
}
}

View File

@@ -23,6 +23,7 @@ func SetupRouter(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
settingService *service.SettingService,
cfg *config.Config,
redisClient *redis.Client,
@@ -46,7 +47,7 @@ func SetupRouter(
}
// 注册路由
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg, redisClient)
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg, redisClient)
return r
}
@@ -60,6 +61,7 @@ func registerRoutes(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
cfg *config.Config,
redisClient *redis.Client,
) {
@@ -73,5 +75,5 @@ func registerRoutes(
routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient)
routes.RegisterUserRoutes(v1, h, jwtAuth)
routes.RegisterAdminRoutes(v1, h, adminAuth)
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg)
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
}

View File

@@ -50,6 +50,9 @@ func RegisterAdminRoutes(
// 系统设置
registerSettingsRoutes(admin, h)
// 运维监控Ops
registerOpsRoutes(admin, h)
// 系统管理
registerSystemRoutes(admin, h)
@@ -64,6 +67,58 @@ func RegisterAdminRoutes(
}
}
func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
ops := admin.Group("/ops")
{
// Realtime ops signals
ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats)
ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability)
// Alerts (rules + events)
ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules)
ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule)
ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule)
ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule)
ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents)
// Email notification config (DB-backed)
ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig)
ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig)
// Runtime settings (DB-backed)
runtime := ops.Group("/runtime")
{
runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings)
runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings)
}
// Advanced settings (DB-backed)
ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings)
ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings)
// WebSocket realtime (QPS/TPS)
ws := ops.Group("/ws")
{
ws.GET("/qps", h.Admin.Ops.QPSWSHandler)
}
// Error logs (MVP-1)
ops.GET("/errors", h.Admin.Ops.GetErrorLogs)
ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID)
ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest)
// Request drilldown (success + error)
ops.GET("/requests", h.Admin.Ops.ListRequestDetails)
// Dashboard (vNext - raw path for MVP)
ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview)
ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend)
ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram)
ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend)
ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution)
}
}
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
dashboard := admin.Group("/dashboard")
{

View File

@@ -16,13 +16,18 @@ func RegisterGatewayRoutes(
apiKeyAuth middleware.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
cfg *config.Config,
) {
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
clientRequestID := middleware.ClientRequestID()
opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService)
// API网关Claude API兼容
gateway := r.Group("/v1")
gateway.Use(bodyLimit)
gateway.Use(clientRequestID)
gateway.Use(opsErrorLogger)
gateway.Use(gin.HandlerFunc(apiKeyAuth))
{
gateway.POST("/messages", h.Gateway.Messages)
@@ -36,6 +41,8 @@ func RegisterGatewayRoutes(
// Gemini 原生 API 兼容层Gemini SDK/CLI 直连)
gemini := r.Group("/v1beta")
gemini.Use(bodyLimit)
gemini.Use(clientRequestID)
gemini.Use(opsErrorLogger)
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
{
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
@@ -45,7 +52,7 @@ func RegisterGatewayRoutes(
}
// OpenAI Responses API不带v1前缀的别名
r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
// Antigravity 模型列表
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
@@ -53,6 +60,8 @@ func RegisterGatewayRoutes(
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
antigravityV1 := r.Group("/antigravity/v1")
antigravityV1.Use(bodyLimit)
antigravityV1.Use(clientRequestID)
antigravityV1.Use(opsErrorLogger)
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
{
@@ -64,6 +73,8 @@ func RegisterGatewayRoutes(
antigravityV1Beta := r.Group("/antigravity/v1beta")
antigravityV1Beta.Use(bodyLimit)
antigravityV1Beta.Use(clientRequestID)
antigravityV1Beta.Use(opsErrorLogger)
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
{

View File

@@ -564,6 +564,14 @@ urlFallbackLoop:
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
safeErr := sanitizeUpstreamErrorMessage(err.Error())
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
// 检查是否应触发 URL 降级
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
@@ -579,6 +587,7 @@ urlFallbackLoop:
continue
}
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
setOpsUpstreamError(c, 0, safeErr, "")
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries")
}
@@ -586,6 +595,26 @@ urlFallbackLoop:
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
continue urlFallbackLoop
@@ -596,6 +625,26 @@ urlFallbackLoop:
_ = resp.Body.Close()
if attempt < antigravityMaxRetries {
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
log.Printf("%s status=context_canceled_during_backoff", prefix)
@@ -628,6 +677,27 @@ urlFallbackLoop:
// Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验,
// 当历史消息携带的 signature 不合法时会直接 400去除 thinking 后可继续完成请求。
if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) {
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "signature_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
// Conservative two-stage fallback:
// 1) Disable top-level thinking + thinking->text
// 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text.
@@ -661,6 +731,13 @@ urlFallbackLoop:
}
retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency)
if retryErr != nil {
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "signature_retry_request_error",
Message: sanitizeUpstreamErrorMessage(retryErr.Error()),
})
log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr)
continue
}
@@ -674,6 +751,25 @@ urlFallbackLoop:
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
_ = retryResp.Body.Close()
kind := "signature_retry"
if strings.TrimSpace(stage.name) != "" {
kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_")
}
retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody))
retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg)
retryUpstreamDetail := ""
if logBody {
retryUpstreamDetail = truncateString(string(retryBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: retryResp.StatusCode,
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
Kind: kind,
Message: retryUpstreamMsg,
Detail: retryUpstreamDetail,
})
// If this stage fixed the signature issue, we stop; otherwise we may try the next stage.
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) {
@@ -701,10 +797,30 @@ urlFallbackLoop:
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
if s.shouldFailoverUpstreamError(resp.StatusCode) {
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody)
return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody)
}
}
@@ -1108,6 +1224,14 @@ urlFallbackLoop:
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
safeErr := sanitizeUpstreamErrorMessage(err.Error())
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
// 检查是否应触发 URL 降级
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
@@ -1123,6 +1247,7 @@ urlFallbackLoop:
continue
}
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
setOpsUpstreamError(c, 0, safeErr, "")
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries")
}
@@ -1130,6 +1255,26 @@ urlFallbackLoop:
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
continue urlFallbackLoop
@@ -1140,6 +1285,26 @@ urlFallbackLoop:
_ = resp.Body.Close()
if attempt < antigravityMaxRetries {
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries)
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
log.Printf("%s status=context_canceled_during_backoff", prefix)
@@ -1205,21 +1370,59 @@ urlFallbackLoop:
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
if s.shouldFailoverUpstreamError(resp.StatusCode) {
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
// 解包并返回错误
requestID := resp.Header.Get("x-request-id")
if requestID != "" {
c.Header("x-request-id", requestID)
}
unwrapped, _ := s.unwrapV1InternalResponse(respBody)
unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody)
unwrappedForOps := unwrapped
if unwrapErr != nil || len(unwrappedForOps) == 0 {
unwrappedForOps = respBody
}
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(unwrappedForOps), maxBytes)
}
// Always record upstream context for Ops error logs, even when we will failover.
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
if s.shouldFailoverUpstreamError(resp.StatusCode) {
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: requestID,
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
contentType = "application/json"
}
c.Data(resp.StatusCode, contentType, unwrapped)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: requestID,
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
c.Data(resp.StatusCode, contentType, unwrappedForOps)
return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode)
}
@@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int,
return fmt.Errorf("%s", message)
}
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error {
// 记录上游错误详情便于调试
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body))
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
maxBytes := 2048
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
}
upstreamDetail := ""
if logBody {
upstreamDetail = truncateString(string(body), maxBytes)
}
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: upstreamStatus,
UpstreamRequestID: upstreamRequestID,
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
// 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端)
if logBody {
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes))
}
var statusCode int
var errType, errMsg string
@@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr
"type": "error",
"error": gin.H{"type": errType, "message": errMsg},
})
return fmt.Errorf("upstream error: %d", upstreamStatus)
if upstreamMsg == "" {
return fmt.Errorf("upstream error: %d", upstreamStatus)
}
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
}
func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error {

View File

@@ -357,7 +357,7 @@ func (s *AuthService) Login(ctx context.Context, email, password string) (string
// - 如果邮箱已存在:直接登录(不需要本地密码)
// - 如果邮箱不存在:创建新用户并登录
//
// 注意:该函数用于“终端用户登录 Sub2API 本身”的场景(不同于上游账号的 OAuth例如 OpenAI/Gemini
// 注意:该函数用于 LinuxDo OAuth 登录场景(不同于上游账号的 OAuth例如 Claude/OpenAI/Gemini
// 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。
func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) {
email = strings.TrimSpace(email)
@@ -376,8 +376,8 @@ func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username
user, err := s.userRepo.GetByEmail(ctx, email)
if err != nil {
if errors.Is(err, ErrUserNotFound) {
// OAuth 首次登录视为注册
if s.settingService != nil && !s.settingService.IsRegistrationEnabled(ctx) {
// OAuth 首次登录视为注册fail-closesettingService 未配置时不允许注册)
if s.settingService == nil || !s.settingService.IsRegistrationEnabled(ctx) {
return "", nil, ErrRegDisabled
}

View File

@@ -63,6 +63,9 @@ const (
SubscriptionStatusSuspended = "suspended"
)
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀RFC 保留域名)。
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
// Setting keys
const (
// 注册设置
@@ -83,6 +86,12 @@ const (
SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key
SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key
// LinuxDo Connect OAuth 登录设置
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
// OEM设置
SettingKeySiteName = "site_name" // 网站名称
SettingKeySiteLogo = "site_logo" // 网站Logo (base64)
@@ -113,16 +122,31 @@ const (
SettingKeyEnableIdentityPatch = "enable_identity_patch"
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
// LinuxDo Connect OAuth 登录(终端用户 SSO
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
)
// =========================
// Ops Monitoring (vNext)
// =========================
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀RFC 保留域名)。
// 目的:避免第三方登录返回的用户标识与本地真实邮箱发生碰撞,进而造成账号被接管的风险。
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
// SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"
// SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"
// SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
SettingKeyOpsQueryModeDefault = "ops_query_mode_default"
// SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"
// SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"
// SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
// SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation).
SettingKeyOpsAdvancedSettings = "ops_advanced_settings"
)
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
const AdminAPIKeyPrefix = "admin-"

View File

@@ -1399,7 +1399,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
if resp != nil && resp.Body != nil {
_ = resp.Body.Close()
}
return nil, fmt.Errorf("upstream request failed: %w", err)
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
safeErr := sanitizeUpstreamErrorMessage(err.Error())
setOpsUpstreamError(c, 0, safeErr, "")
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
c.JSON(http.StatusBadGateway, gin.H{
"type": "error",
"error": gin.H{
"type": "upstream_error",
"message": "Upstream request failed",
},
})
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
}
// 优先检测thinking block签名错误400并重试一次
@@ -1409,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
_ = resp.Body.Close()
if s.isThinkingBlockSignatureError(respBody) {
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "signature_error",
Message: extractUpstreamErrorMessage(respBody),
Detail: func() string {
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
}
return ""
}(),
})
looksLikeToolSignatureError := func(msg string) bool {
m := strings.ToLower(msg)
return strings.Contains(m, "tool_use") ||
@@ -1445,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
_ = retryResp.Body.Close()
if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) {
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: retryResp.StatusCode,
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
Kind: "signature_retry_thinking",
Message: extractUpstreamErrorMessage(retryRespBody),
Detail: func() string {
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
}
return ""
}(),
})
msg2 := extractUpstreamErrorMessage(retryRespBody)
if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed {
log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID)
@@ -1459,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
if retryResp2 != nil && retryResp2.Body != nil {
_ = retryResp2.Body.Close()
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "signature_retry_tools_request_error",
Message: sanitizeUpstreamErrorMessage(retryErr2.Error()),
})
log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2)
} else {
log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2)
@@ -1508,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
break
}
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry",
Message: extractUpstreamErrorMessage(respBody),
Detail: func() string {
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
}
return ""
}(),
})
log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)",
account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed)
_ = resp.Body.Close()
if err := sleepWithContext(ctx, delay); err != nil {
return nil, err
}
@@ -1538,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
// 处理重试耗尽的情况
if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) {
if s.shouldFailoverUpstreamError(resp.StatusCode) {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
resp.Body = io.NopCloser(bytes.NewReader(respBody))
s.handleRetryExhaustedSideEffects(ctx, resp, account)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry_exhausted_failover",
Message: extractUpstreamErrorMessage(respBody),
Detail: func() string {
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
}
return ""
}(),
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
return s.handleRetryExhaustedError(ctx, resp, c, account)
@@ -1546,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
// 处理可切换账号的错误
if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
resp.Body = io.NopCloser(bytes.NewReader(respBody))
s.handleFailoverSideEffects(ctx, resp, account)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "failover",
Message: extractUpstreamErrorMessage(respBody),
Detail: func() string {
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
}
return ""
}(),
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
@@ -1563,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
resp.Body = io.NopCloser(bytes.NewReader(respBody))
if s.shouldFailoverOn400(respBody) {
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "failover_on_400",
Message: upstreamMsg,
Detail: upstreamDetail,
})
if s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
"Account %d: 400 error, attempting failover: %s",
@@ -1859,7 +1983,30 @@ func extractUpstreamErrorMessage(body []byte) string {
}
func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
body, _ := io.ReadAll(resp.Body)
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
// Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet.
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(body), maxBytes)
}
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
// 处理上游错误,标记账号状态
shouldDisable := false
@@ -1870,24 +2017,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
// 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端)
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
"Upstream error %d (account=%d platform=%s type=%s): %s",
resp.StatusCode,
account.ID,
account.Platform,
account.Type,
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
)
}
// 根据状态码返回适当的自定义错误响应(不透传上游详细信息)
var errType, errMsg string
var statusCode int
switch resp.StatusCode {
case 400:
// 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
"Upstream 400 error (account=%d platform=%s type=%s): %s",
account.ID,
account.Platform,
account.Type,
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
)
}
c.Data(http.StatusBadRequest, "application/json", body)
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
summary := upstreamMsg
if summary == "" {
summary = truncateForLog(body, 512)
}
if summary == "" {
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
}
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary)
case 401:
statusCode = http.StatusBadGateway
errType = "upstream_error"
@@ -1923,11 +2079,14 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
},
})
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
if upstreamMsg == "" {
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
}
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
}
func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) {
body, _ := io.ReadAll(resp.Body)
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
statusCode := resp.StatusCode
// OAuth/Setup Token 账号的 403标记账号异常
@@ -1941,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re
}
func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
body, _ := io.ReadAll(resp.Body)
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
}
@@ -1949,8 +2108,45 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht
// OAuth 403标记账号异常
// API Key 未配置错误码:仅返回错误,不标记账号
func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
// Capture upstream error body before side-effects consume the stream.
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
resp.Body = io.NopCloser(bytes.NewReader(respBody))
s.handleRetryExhaustedSideEffects(ctx, resp, account)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "retry_exhausted",
Message: upstreamMsg,
Detail: upstreamDetail,
})
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
"Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s",
resp.StatusCode,
account.ID,
account.Platform,
account.Type,
truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
)
}
// 返回统一的重试耗尽错误响应
c.JSON(http.StatusBadGateway, gin.H{
"type": "error",
@@ -1960,7 +2156,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht
},
})
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
if upstreamMsg == "" {
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
}
return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg)
}
// streamingResult 流式响应结果
@@ -2490,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
// 发送请求
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "")
s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed")
return fmt.Errorf("upstream request failed: %w", err)
}
@@ -2527,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
// 标记账号状态429/529等
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
// 记录上游错误摘要便于排障(不回显请求内容)
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
@@ -2548,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
errMsg = "Service overloaded"
}
s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg)
return fmt.Errorf("upstream error: %d", resp.StatusCode)
if upstreamMsg == "" {
return fmt.Errorf("upstream error: %d", resp.StatusCode)
}
return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
}
// 透传成功响应

View File

@@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
safeErr := sanitizeUpstreamErrorMessage(err.Error())
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
if attempt < geminiMaxRetries {
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
sleepGeminiBackoff(attempt)
continue
}
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
setOpsUpstreamError(c, 0, safeErr, "")
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr)
}
// Special-case: signature/thought_signature validation errors are not transient, but may be fixed by
@@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
_ = resp.Body.Close()
if isGeminiSignatureRelatedError(respBody) {
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: upstreamReqID,
Kind: "signature_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
var strippedClaudeBody []byte
stageName := ""
switch signatureRetryStage {
@@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
}
if attempt < geminiMaxRetries {
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: upstreamReqID,
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
sleepGeminiBackoff(attempt)
continue
@@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
}
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
if tempMatched {
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: upstreamReqID,
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: upstreamReqID,
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody)
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody)
}
requestID := resp.Header.Get(requestIDHeader)
@@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
safeErr := sanitizeUpstreamErrorMessage(err.Error())
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
if attempt < geminiMaxRetries {
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
sleepGeminiBackoff(attempt)
@@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
FirstTokenMs: nil,
}, nil
}
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
setOpsUpstreamError(c, 0, safeErr, "")
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr)
}
if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) {
@@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
}
if attempt < geminiMaxRetries {
upstreamReqID := resp.Header.Get(requestIDHeader)
if upstreamReqID == "" {
upstreamReqID = resp.Header.Get("x-goog-request-id")
}
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: upstreamReqID,
Kind: "retry",
Message: upstreamMsg,
Detail: upstreamDetail,
})
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
sleepGeminiBackoff(attempt)
continue
@@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
}
if tempMatched {
evBody := unwrapIfNeeded(isOAuth, respBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(evBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: requestID,
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
evBody := unwrapIfNeeded(isOAuth, respBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(evBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: requestID,
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
respBody = unwrapIfNeeded(isOAuth, respBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
}
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: requestID,
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
contentType = "application/json"
}
c.Data(resp.StatusCode, contentType, respBody)
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
if upstreamMsg == "" {
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
}
return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
}
var usage *ClaudeUsage
@@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string {
return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`)
}
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error {
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(body), maxBytes)
}
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: upstreamStatus,
UpstreamRequestID: upstreamRequestID,
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
}
var statusCode int
var errType, errMsg string
@@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups
"type": "error",
"error": gin.H{"type": errType, "message": errMsg},
})
return fmt.Errorf("upstream error: %d", upstreamStatus)
if upstreamMsg == "" {
return fmt.Errorf("upstream error: %d", upstreamStatus)
}
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
}
type claudeErrorMapping struct {

View File

@@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
existingInstructions = strings.TrimSpace(existingInstructions)
if instructions != "" {
if existingInstructions != "" && existingInstructions != instructions {
if input, ok := reqBody["input"].([]any); ok {
reqBody["input"] = prependSystemInstruction(input, existingInstructions)
result.Modified = true
}
}
if existingInstructions != instructions {
reqBody["instructions"] = instructions
result.Modified = true
@@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
if input, ok := reqBody["input"].([]any); ok {
input = filterCodexInput(input)
input = normalizeOrphanedToolOutputs(input)
reqBody["input"] = input
result.Modified = true
}
@@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any {
return filtered
}
// prependSystemInstruction builds a system-role message wrapping instructions
// as a single input_text part and returns a new slice with that message placed
// ahead of all existing input items.
func prependSystemInstruction(input []any, instructions string) []any {
	systemMsg := map[string]any{
		"role": "system",
		"content": []any{
			map[string]any{
				"type": "input_text",
				"text": instructions,
			},
		},
	}
	out := make([]any, 0, len(input)+1)
	out = append(out, systemMsg)
	return append(out, input...)
}
func normalizeCodexTools(reqBody map[string]any) bool {
rawTools, ok := reqBody["tools"]
if !ok || rawTools == nil {
@@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool {
return modified
}
// normalizeOrphanedToolOutputs rewrites tool-output items whose originating
// call is absent from the input into plain assistant messages, so the request
// does not carry outputs that reference unknown call IDs.
func normalizeOrphanedToolOutputs(input []any) []any {
	// Pass 1: index call IDs of every tool invocation, bucketed by item type.
	seen := map[string]map[string]bool{
		"function_call":    {},
		"local_shell_call": {},
		"custom_tool_call": {},
	}
	for _, item := range input {
		m, ok := item.(map[string]any)
		if !ok {
			continue
		}
		id := getCallID(m)
		if id == "" {
			continue
		}
		if kind, isStr := m["type"].(string); isStr {
			if ids, tracked := seen[kind]; tracked {
				ids[id] = true
			}
		}
	}
	// Pass 2: copy items through, converting orphaned outputs to messages.
	result := make([]any, 0, len(input))
	for _, item := range input {
		m, ok := item.(map[string]any)
		if !ok {
			result = append(result, item)
			continue
		}
		id := getCallID(m)
		orphan := false
		switch m["type"] {
		case "function_call_output":
			// function outputs may also answer local shell calls.
			orphan = id == "" || (!seen["function_call"][id] && !seen["local_shell_call"][id])
		case "custom_tool_call_output":
			orphan = id == "" || !seen["custom_tool_call"][id]
		case "local_shell_call_output":
			orphan = id == "" || !seen["local_shell_call"][id]
		}
		if orphan {
			result = append(result, convertOrphanedOutputToMessage(m, id))
			continue
		}
		result = append(result, m)
	}
	return result
}
// getCallID extracts the "call_id" field of an input item and trims it,
// returning "" when the field is absent, not a string, or blank.
func getCallID(item map[string]any) string {
	raw, present := item["call_id"]
	if !present {
		return ""
	}
	s, isString := raw.(string)
	if !isString {
		return ""
	}
	if trimmed := strings.TrimSpace(s); trimmed != "" {
		return trimmed
	}
	return ""
}
// convertOrphanedOutputToMessage turns an orphaned tool-output item into a
// plain assistant message that preserves the payload as readable text,
// truncating very large outputs to keep the request bounded.
func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any {
	name := "tool"
	if n, ok := item["name"].(string); ok && n != "" {
		name = n
	}
	id := callID
	if id == "" {
		id = "unknown"
	}
	body := stringifyOutput(item["output"])
	const maxLen = 16000
	if len(body) > maxLen {
		body = body[:maxLen] + "\n...[truncated]"
	}
	return map[string]any{
		"type":    "message",
		"role":    "assistant",
		"content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", name, id, body),
	}
}
// stringifyOutput renders a tool-output value as text: strings pass through
// unchanged, anything else is JSON-encoded, falling back to fmt formatting
// when marshaling fails.
func stringifyOutput(output any) string {
	if s, ok := output.(string); ok {
		return s
	}
	data, err := json.Marshal(output)
	if err != nil {
		return fmt.Sprintf("%v", output)
	}
	return string(data)
}
func codexCachePath(filename string) string {
home, err := os.UserHomeDir()
if err != nil {

View File

@@ -12,7 +12,6 @@ import (
"io"
"log"
"net/http"
"os"
"regexp"
"sort"
"strconv"
@@ -513,7 +512,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool
}
func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
body, _ := io.ReadAll(resp.Body)
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
}
@@ -594,13 +593,53 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
// Send request
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
if err != nil {
return nil, fmt.Errorf("upstream request failed: %w", err)
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
safeErr := sanitizeUpstreamErrorMessage(err.Error())
setOpsUpstreamError(c, 0, safeErr, "")
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: 0,
Kind: "request_error",
Message: safeErr,
})
c.JSON(http.StatusBadGateway, gin.H{
"error": gin.H{
"type": "upstream_error",
"message": "Upstream request failed",
},
})
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
}
defer func() { _ = resp.Body.Close() }()
// Handle error response
if resp.StatusCode >= 400 {
if s.shouldFailoverUpstreamError(resp.StatusCode) {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
_ = resp.Body.Close()
resp.Body = io.NopCloser(bytes.NewReader(respBody))
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(respBody), maxBytes)
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "failover",
Message: upstreamMsg,
Detail: upstreamDetail,
})
s.handleFailoverSideEffects(ctx, resp, account)
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
@@ -724,18 +763,52 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin.
}
func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) {
body, _ := io.ReadAll(resp.Body)
logUpstreamErrorBody(account.ID, resp.StatusCode, body)
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
upstreamDetail := ""
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
if maxBytes <= 0 {
maxBytes = 2048
}
upstreamDetail = truncateString(string(body), maxBytes)
}
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
log.Printf(
"OpenAI upstream error %d (account=%d platform=%s type=%s): %s",
resp.StatusCode,
account.ID,
account.Platform,
account.Type,
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
)
}
// Check custom error codes
if !account.ShouldHandleErrorCode(resp.StatusCode) {
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: "http_error",
Message: upstreamMsg,
Detail: upstreamDetail,
})
c.JSON(http.StatusInternalServerError, gin.H{
"error": gin.H{
"type": "upstream_error",
"message": "Upstream gateway error",
},
})
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
if upstreamMsg == "" {
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
}
return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg)
}
// Handle upstream error (mark account status)
@@ -743,6 +816,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
if s.rateLimitService != nil {
shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
}
kind := "http_error"
if shouldDisable {
kind = "failover"
}
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
Platform: account.Platform,
AccountID: account.ID,
UpstreamStatusCode: resp.StatusCode,
UpstreamRequestID: resp.Header.Get("x-request-id"),
Kind: kind,
Message: upstreamMsg,
Detail: upstreamDetail,
})
if shouldDisable {
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
}
@@ -781,25 +867,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
},
})
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
}
func logUpstreamErrorBody(accountID int64, statusCode int, body []byte) {
if strings.ToLower(strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY"))) != "true" {
return
if upstreamMsg == "" {
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
}
maxBytes := 2048
if rawMax := strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY_MAX_BYTES")); rawMax != "" {
if parsed, err := strconv.Atoi(rawMax); err == nil && parsed > 0 {
maxBytes = parsed
}
}
if len(body) > maxBytes {
body = body[:maxBytes]
}
log.Printf("Upstream error body: account=%d status=%d body=%q", accountID, statusCode, string(body))
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
}
// openaiStreamingResult streaming response result

View File

@@ -0,0 +1,194 @@
package service
import (
"context"
"errors"
"time"
)
// GetAccountAvailabilityStats returns current account availability stats.
//
// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
//
// Returns, in order: per-platform rollups, per-group rollups, per-account
// detail records keyed by account ID, the snapshot timestamp, and an error.
func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
	map[string]*PlatformAvailability,
	map[int64]*GroupAvailability,
	map[int64]*AccountAvailability,
	*time.Time,
	error,
) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, nil, nil, nil, err
	}
	accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	// Group filtering happens in memory: keep only accounts that belong to
	// the requested group.
	if groupIDFilter != nil && *groupIDFilter > 0 {
		filtered := make([]Account, 0, len(accounts))
		for _, acc := range accounts {
			for _, grp := range acc.Groups {
				if grp != nil && grp.ID == *groupIDFilter {
					filtered = append(filtered, acc)
					break
				}
			}
		}
		accounts = filtered
	}
	now := time.Now()
	collectedAt := now
	platform := make(map[string]*PlatformAvailability)
	group := make(map[int64]*GroupAvailability)
	account := make(map[int64]*AccountAvailability)
	for _, acc := range accounts {
		if acc.ID <= 0 {
			continue
		}
		// Transient unavailability flags are derived from their expiry
		// timestamps: a flag holds only while the deadline is in the future.
		isTempUnsched := false
		if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
			isTempUnsched = true
		}
		isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
		isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
		hasError := acc.Status == StatusError
		// Normalize exclusive status flags so the UI doesn't show conflicting badges.
		if hasError {
			isRateLimited = false
			isOverloaded = false
		}
		// Available = active, schedulable, and free of every transient penalty state.
		isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched
		// Per-platform rollup (accounts without a platform are skipped here).
		if acc.Platform != "" {
			if _, ok := platform[acc.Platform]; !ok {
				platform[acc.Platform] = &PlatformAvailability{
					Platform: acc.Platform,
				}
			}
			p := platform[acc.Platform]
			p.TotalAccounts++
			if isAvailable {
				p.AvailableCount++
			}
			if isRateLimited {
				p.RateLimitCount++
			}
			if hasError {
				p.ErrorCount++
			}
		}
		// Per-group rollup; an account in N groups contributes to all N.
		for _, grp := range acc.Groups {
			if grp == nil || grp.ID <= 0 {
				continue
			}
			if _, ok := group[grp.ID]; !ok {
				group[grp.ID] = &GroupAvailability{
					GroupID:   grp.ID,
					GroupName: grp.Name,
					Platform:  grp.Platform,
				}
			}
			g := group[grp.ID]
			g.TotalAccounts++
			if isAvailable {
				g.AvailableCount++
			}
			if isRateLimited {
				g.RateLimitCount++
			}
			if hasError {
				g.ErrorCount++
			}
		}
		// For per-account display only the first group is shown.
		displayGroupID := int64(0)
		displayGroupName := ""
		if len(acc.Groups) > 0 && acc.Groups[0] != nil {
			displayGroupID = acc.Groups[0].ID
			displayGroupName = acc.Groups[0].Name
		}
		item := &AccountAvailability{
			AccountID:     acc.ID,
			AccountName:   acc.Name,
			Platform:      acc.Platform,
			GroupID:       displayGroupID,
			GroupName:     displayGroupName,
			Status:        acc.Status,
			IsAvailable:   isAvailable,
			IsRateLimited: isRateLimited,
			IsOverloaded:  isOverloaded,
			HasError:      hasError,
			ErrorMessage:  acc.ErrorMessage,
		}
		// Remaining seconds are attached only while still positive.
		if isRateLimited && acc.RateLimitResetAt != nil {
			item.RateLimitResetAt = acc.RateLimitResetAt
			remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
			if remainingSec > 0 {
				item.RateLimitRemainingSec = &remainingSec
			}
		}
		if isOverloaded && acc.OverloadUntil != nil {
			item.OverloadUntil = acc.OverloadUntil
			remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
			if remainingSec > 0 {
				item.OverloadRemainingSec = &remainingSec
			}
		}
		if isTempUnsched && acc.TempUnschedulableUntil != nil {
			item.TempUnschedulableUntil = acc.TempUnschedulableUntil
		}
		account[acc.ID] = item
	}
	return platform, group, account, &collectedAt, nil
}
// OpsAccountAvailability bundles the availability result for one dashboard
// query: the requested group's rollup (if any), per-account detail records,
// and the time the snapshot was collected.
type OpsAccountAvailability struct {
	Group       *GroupAvailability             // rollup for the requested group; nil when no group filter matched
	Accounts    map[int64]*AccountAvailability // per-account details keyed by account ID (never nil from GetAccountAvailability)
	CollectedAt *time.Time                     // snapshot timestamp
}
// GetAccountAvailability returns availability details scoped by platform and
// (optionally) group. When the s.getAccountAvailability override is set it is
// used verbatim; otherwise the result is assembled from
// GetAccountAvailabilityStats. Accounts is always non-nil on success.
func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) {
	if s == nil {
		return nil, errors.New("ops service is nil")
	}
	if hook := s.getAccountAvailability; hook != nil {
		return hook(ctx, platformFilter, groupIDFilter)
	}
	_, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter)
	if err != nil {
		return nil, err
	}
	result := &OpsAccountAvailability{
		Accounts:    accountStats,
		CollectedAt: collectedAt,
	}
	if result.Accounts == nil {
		result.Accounts = map[int64]*AccountAvailability{}
	}
	if groupIDFilter != nil && *groupIDFilter > 0 {
		result.Group = groupStats[*groupIDFilter]
	}
	return result, nil
}

View File

@@ -0,0 +1,46 @@
package service
import (
"context"
"database/sql"
"hash/fnv"
"time"
)
// hashAdvisoryLockID maps a string key onto a signed 64-bit lock ID using
// FNV-1a, matching the bigint parameter expected by Postgres advisory locks.
func hashAdvisoryLockID(key string) int64 {
	hasher := fnv.New64a()
	_, _ = hasher.Write([]byte(key)) // fnv.Write never returns an error
	return int64(hasher.Sum64())
}
// tryAcquireDBAdvisoryLock attempts to take a PostgreSQL session-level
// advisory lock (pg_try_advisory_lock) for lockID. On success it returns a
// release func that the caller MUST invoke to unlock and free the connection.
// It returns (nil, false) when db is nil, no connection can be obtained, the
// query fails, or another session already holds the lock.
func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
	if db == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Advisory locks are scoped to the database session, so a dedicated
	// connection is pinned for the lock's lifetime; the unlock must run on
	// this same connection.
	conn, err := db.Conn(ctx)
	if err != nil {
		return nil, false
	}
	acquired := false
	if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil {
		_ = conn.Close()
		return nil, false
	}
	if !acquired {
		_ = conn.Close()
		return nil, false
	}
	release := func() {
		// Use a fresh short-lived context: the acquiring ctx may already be
		// canceled by the time the caller releases the lock.
		unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
		_ = conn.Close()
	}
	return release, true
}

View File

@@ -0,0 +1,443 @@
package service
import (
"context"
"database/sql"
"errors"
"log"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
)
const (
opsAggHourlyJobName = "ops_preaggregation_hourly"
opsAggDailyJobName = "ops_preaggregation_daily"
opsAggHourlyInterval = 10 * time.Minute
opsAggDailyInterval = 1 * time.Hour
// Keep in sync with ops retention target (vNext default 30d).
opsAggBackfillWindow = 30 * 24 * time.Hour
// Recompute overlap to absorb late-arriving rows near boundaries.
opsAggHourlyOverlap = 2 * time.Hour
opsAggDailyOverlap = 48 * time.Hour
opsAggHourlyChunk = 24 * time.Hour
opsAggDailyChunk = 7 * 24 * time.Hour
// Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
// that may still receive late inserts.
opsAggSafeDelay = 5 * time.Minute
opsAggMaxQueryTimeout = 3 * time.Second
opsAggHourlyTimeout = 5 * time.Minute
opsAggDailyTimeout = 2 * time.Minute
opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
opsAggHourlyLeaderLockTTL = 15 * time.Minute
opsAggDailyLeaderLockTTL = 10 * time.Minute
)
// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
// for stable long-window dashboard queries.
//
// It is safe to run in multi-replica deployments when Redis is available (leader lock).
type OpsAggregationService struct {
	opsRepo     OpsRepository     // metrics upserts + job heartbeats
	settingRepo SettingRepository // runtime monitoring on/off switch
	cfg         *config.Config    // static ops/aggregation config (may be nil)
	db          *sql.DB           // advisory-lock fallback when Redis is unavailable
	redisClient *redis.Client     // preferred leader-lock backend (may be nil)
	instanceID  string            // unique per process; identifies the Redis lock owner
	stopCh      chan struct{}     // closed by Stop to end both loops
	startOnce   sync.Once         // guards one-time Start
	stopOnce    sync.Once         // guards one-time Stop
	hourlyMu    sync.Mutex        // serializes hourly passes within this process
	dailyMu     sync.Mutex        // serializes daily passes within this process
	skipLogMu   sync.Mutex        // guards skipLogAt
	skipLogAt   time.Time         // last time a "lock held elsewhere" skip was logged
}
// NewOpsAggregationService wires an aggregation service; a fresh random
// instance ID is generated to identify this process as Redis lock owner.
func NewOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	svc := &OpsAggregationService{
		opsRepo:     opsRepo,
		settingRepo: settingRepo,
		cfg:         cfg,
		db:          db,
		redisClient: redisClient,
	}
	svc.instanceID = uuid.NewString()
	return svc
}
// Start launches the hourly and daily aggregation loops exactly once.
// Calling Start on a nil receiver or more than once is a no-op.
func (s *OpsAggregationService) Start() {
	if s == nil {
		return
	}
	s.startOnce.Do(func() {
		ch := s.stopCh
		if ch == nil {
			ch = make(chan struct{})
			s.stopCh = ch
		}
		go s.hourlyLoop()
		go s.dailyLoop()
	})
}
// Stop signals both aggregation loops to exit. Safe to call multiple times
// and on a nil receiver; it does not wait for in-flight passes to finish.
func (s *OpsAggregationService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if ch := s.stopCh; ch != nil {
			close(ch)
		}
	})
}
// hourlyLoop runs one aggregation pass immediately, then repeats on a fixed
// ticker until Stop closes stopCh.
func (s *OpsAggregationService) hourlyLoop() {
	s.aggregateHourly()
	ticker := time.NewTicker(opsAggHourlyInterval)
	defer ticker.Stop()
	for {
		select {
		case <-s.stopCh:
			return
		case <-ticker.C:
			s.aggregateHourly()
		}
	}
}
// dailyLoop runs one aggregation pass immediately, then repeats on a fixed
// ticker until Stop closes stopCh.
func (s *OpsAggregationService) dailyLoop() {
	s.aggregateDaily()
	ticker := time.NewTicker(opsAggDailyInterval)
	defer ticker.Stop()
	for {
		select {
		case <-s.stopCh:
			return
		case <-ticker.C:
			s.aggregateDaily()
		}
	}
}
// aggregateHourly backfills the hourly rollup table. It no-ops when ops or
// aggregation is disabled, takes a cluster-wide leader lock, then upserts
// hourly buckets in chunks from the resume point up to the last stable hour.
// A job heartbeat (success or error) is recorded at the end of every pass.
func (s *OpsAggregationService) aggregateHourly() {
	if s == nil || s.opsRepo == nil {
		return
	}
	if s.cfg != nil {
		if !s.cfg.Ops.Enabled {
			return
		}
		if !s.cfg.Ops.Aggregation.Enabled {
			return
		}
	}
	ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
	defer cancel()
	if !s.isMonitoringEnabled(ctx) {
		return
	}
	// Only one instance cluster-wide runs the hourly pass at a time.
	release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}
	s.hourlyMu.Lock()
	defer s.hourlyMu.Unlock()
	startedAt := time.Now().UTC()
	runAt := startedAt
	// Aggregate stable full hours only.
	end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
	start := end.Add(-opsAggBackfillWindow)
	// Resume from the latest bucket with overlap.
	{
		ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
		latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
		cancelMax()
		if err != nil {
			// Non-fatal: fall back to the full backfill window.
			log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
		} else if ok {
			candidate := latest.Add(-opsAggHourlyOverlap)
			if candidate.After(start) {
				start = candidate
			}
		}
	}
	start = utcFloorToHour(start)
	if !start.Before(end) {
		return
	}
	// Upsert in bounded chunks; stop at the first failure so the next run
	// resumes from the last persisted bucket.
	var aggErr error
	for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
		chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
		if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
			aggErr = err
			log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
			break
		}
	}
	finishedAt := time.Now().UTC()
	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	dur := durationMs
	if aggErr != nil {
		// Record an error heartbeat (message truncated for storage); heartbeat
		// failures are intentionally ignored (best-effort).
		msg := truncateString(aggErr.Error(), 2048)
		errAt := finishedAt
		hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer hbCancel()
		_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
			JobName:        opsAggHourlyJobName,
			LastRunAt:      &runAt,
			LastErrorAt:    &errAt,
			LastError:      &msg,
			LastDurationMs: &dur,
		})
		return
	}
	// Record a success heartbeat; failures are intentionally ignored.
	successAt := finishedAt
	hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer hbCancel()
	_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsAggHourlyJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &dur,
	})
}
// aggregateDaily backfills the daily rollup table. Mirrors aggregateHourly:
// skips when disabled, takes the daily leader lock, upserts day buckets in
// chunks from the resume point up to (but excluding) the current UTC day, and
// records a job heartbeat at the end of every pass.
func (s *OpsAggregationService) aggregateDaily() {
	if s == nil || s.opsRepo == nil {
		return
	}
	if s.cfg != nil {
		if !s.cfg.Ops.Enabled {
			return
		}
		if !s.cfg.Ops.Aggregation.Enabled {
			return
		}
	}
	ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
	defer cancel()
	if !s.isMonitoringEnabled(ctx) {
		return
	}
	// Only one instance cluster-wide runs the daily pass at a time.
	release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}
	s.dailyMu.Lock()
	defer s.dailyMu.Unlock()
	startedAt := time.Now().UTC()
	runAt := startedAt
	// The current (incomplete) UTC day is excluded.
	end := utcFloorToDay(time.Now().UTC())
	start := end.Add(-opsAggBackfillWindow)
	// Resume from the latest bucket with overlap to absorb late rows.
	{
		ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
		latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
		cancelMax()
		if err != nil {
			// Non-fatal: fall back to the full backfill window.
			log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
		} else if ok {
			candidate := latest.Add(-opsAggDailyOverlap)
			if candidate.After(start) {
				start = candidate
			}
		}
	}
	start = utcFloorToDay(start)
	if !start.Before(end) {
		return
	}
	// Upsert in bounded chunks; stop at the first failure so the next run
	// resumes from the last persisted bucket.
	var aggErr error
	for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
		chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
		if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
			aggErr = err
			log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
			break
		}
	}
	finishedAt := time.Now().UTC()
	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	dur := durationMs
	if aggErr != nil {
		// Record an error heartbeat (message truncated for storage); heartbeat
		// failures are intentionally ignored (best-effort).
		msg := truncateString(aggErr.Error(), 2048)
		errAt := finishedAt
		hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer hbCancel()
		_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
			JobName:        opsAggDailyJobName,
			LastRunAt:      &runAt,
			LastErrorAt:    &errAt,
			LastError:      &msg,
			LastDurationMs: &dur,
		})
		return
	}
	// Record a success heartbeat; failures are intentionally ignored.
	successAt := finishedAt
	hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer hbCancel()
	_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsAggDailyJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &dur,
	})
}
// isMonitoringEnabled reports whether ops monitoring is on, combining the
// static config flag with the runtime setting stored in settingRepo.
// It fails open: a missing repo, a missing setting, or a read error all
// yield true; only an explicit "false"-like setting value disables it.
func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
	if s == nil {
		return false
	}
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return false
	}
	if s.settingRepo == nil {
		return true
	}
	if ctx == nil {
		ctx = context.Background()
	}
	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		// NOTE(review): this branch duplicates the not-found case above — both
		// fail open. If transient read errors should instead pause aggregation,
		// this is the branch to change.
		return true
	}
	// Accept common "off" spellings; anything else counts as enabled.
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}
var opsAggReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// tryAcquireLeaderLock takes a cluster-wide leader lock for a periodic job.
// Redis SET NX (value = instanceID, expiry = ttl) is tried first; when the
// Redis call itself errors, it falls back to a Postgres advisory lock derived
// from key. On success the returned release func must be called by the caller.
// Losing the race logs a rate-limited skip message and returns (nil, false).
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
	if s == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Prefer Redis leader lock when available (multi-instance), but avoid stampeding
	// the DB when Redis is flaky by falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				s.maybeLogSkip(logPrefix)
				return nil, false
			}
			release := func() {
				// Compare-and-delete via Lua so we never delete a lock that
				// has expired and been re-acquired by another instance.
				ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
				defer cancel()
				_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
			}
			return release, true
		}
		// Redis error: fall through to DB advisory lock.
	}
	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		s.maybeLogSkip(logPrefix)
		return nil, false
	}
	return release, true
}
// maybeLogSkip logs a "leader lock held elsewhere" message at most once per
// minute so periodic lock contention does not flood the logs.
func (s *OpsAggregationService) maybeLogSkip(prefix string) {
	s.skipLogMu.Lock()
	defer s.skipLogMu.Unlock()
	now := time.Now()
	if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
		return
	}
	s.skipLogAt = now
	label := prefix
	if label == "" {
		label = "[OpsAggregation]"
	}
	log.Printf("%s leader lock held by another instance; skipping", label)
}
func utcFloorToHour(t time.Time) time.Time {
return t.UTC().Truncate(time.Hour)
}
func utcFloorToDay(t time.Time) time.Time {
u := t.UTC()
y, m, d := u.Date()
return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
}
func minTime(a, b time.Time) time.Time {
if a.Before(b) {
return a
}
return b
}

View File

@@ -0,0 +1,913 @@
package service
import (
"context"
"fmt"
"log"
"math"
"strconv"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
)
const (
opsAlertEvaluatorJobName = "ops_alert_evaluator"
opsAlertEvaluatorTimeout = 45 * time.Second
opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
)
var opsAlertEvaluatorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// OpsAlertEvaluatorService periodically evaluates ops alert rules against
// recent metrics, fires/resolves alert events, and sends notification emails
// (rate-limited, severity-filtered, silencing-aware). One instance cluster-wide
// runs per cycle when the Redis leader lock is enabled.
type OpsAlertEvaluatorService struct {
	opsService   *OpsService   // runtime settings, monitoring switch, account availability
	opsRepo      OpsRepository // rules/events persistence and dashboard aggregates
	emailService *EmailService // outbound notification delivery
	redisClient  *redis.Client // optional: leader election across instances
	cfg          *config.Config

	instanceID string // per-process unique id, used as the leader-lock owner token

	stopCh    chan struct{}
	startOnce sync.Once
	stopOnce  sync.Once
	wg        sync.WaitGroup

	mu         sync.Mutex // guards ruleStates
	ruleStates map[int64]*opsAlertRuleState

	emailLimiter *slidingWindowLimiter // caps alert emails per rolling hour

	skipLogMu sync.Mutex // guards skipLogAt
	skipLogAt time.Time  // last time a lock-skip line was logged

	warnNoRedisOnce sync.Once // one-time warning about missing/failing Redis
}
// opsAlertRuleState is per-rule, in-memory evaluation state. It is not
// persisted: it resets on restart and after evaluation gaps.
type opsAlertRuleState struct {
	LastEvaluatedAt     time.Time // when the rule was last evaluated
	ConsecutiveBreaches int       // current streak of breached cycles
}
// NewOpsAlertEvaluatorService wires the evaluator. The email limiter starts at
// limit 0 (unlimited) and is re-configured from admin email settings on every
// send attempt.
func NewOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	return &OpsAlertEvaluatorService{
		opsService:   opsService,
		opsRepo:      opsRepo,
		emailService: emailService,
		redisClient:  redisClient,
		cfg:          cfg,
		instanceID:   uuid.NewString(),
		ruleStates:   map[int64]*opsAlertRuleState{},
		emailLimiter: newSlidingWindowLimiter(0, time.Hour),
	}
}
// Start launches the background evaluation loop. Idempotent (sync.Once); a nil
// receiver is tolerated.
func (s *OpsAlertEvaluatorService) Start() {
	if s == nil {
		return
	}
	s.startOnce.Do(func() {
		if s.stopCh == nil {
			s.stopCh = make(chan struct{})
		}
		// NOTE(review): run() does wg.Add(1) inside the goroutine, so a Stop()
		// immediately after Start() could see wg.Wait() return before Add runs.
		// Confirm callers never Stop right after Start, or move the Add here.
		go s.run()
	})
}
// Stop signals the loop to exit and waits for the in-flight pass to finish.
// Idempotent; safe on a nil receiver or when Start was never called.
func (s *OpsAlertEvaluatorService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.stopCh != nil {
			close(s.stopCh)
		}
	})
	s.wg.Wait()
}
// run is the evaluator loop: it fires once immediately, then re-arms the timer
// with the (admin-tunable) interval after each pass, until stopCh closes.
func (s *OpsAlertEvaluatorService) run() {
	s.wg.Add(1)
	defer s.wg.Done()
	// Start immediately to produce early feedback in ops dashboard.
	timer := time.NewTimer(0)
	defer timer.Stop()
	for {
		select {
		case <-timer.C:
			// Interval is re-read each cycle so runtime changes apply without restart.
			interval := s.getInterval()
			s.evaluateOnce(interval)
			timer.Reset(interval)
		case <-s.stopCh:
			return
		}
	}
}
// getInterval returns the evaluation cadence from runtime settings, clamped to
// [1s, 24h]. Any lookup failure or out-of-range value falls back to 60s.
//
// Fix: the original checked EvaluationIntervalSeconds twice (`<= 0` and then
// `< 1`), which is redundant for an int; the bounds are now a single range check.
func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
	const defaultInterval = 60 * time.Second
	if s == nil || s.opsService == nil {
		return defaultInterval
	}
	// Short timeout: this runs on every cycle and must not stall the loop.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
	if err != nil || cfg == nil {
		return defaultInterval
	}
	secs := cfg.EvaluationIntervalSeconds
	if secs < 1 || secs > int((24*time.Hour).Seconds()) {
		return defaultInterval
	}
	return time.Duration(secs) * time.Second
}
// evaluateOnce performs one alert-evaluation pass: load all rules, compute each
// rule's metric over its window, track consecutive breaches, then fire new
// events (respecting cooldown) or resolve active ones. interval is the loop
// cadence, used to translate "sustained minutes" into breached-cycle counts.
func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
	if s == nil || s.opsRepo == nil {
		return
	}
	// Static config kill-switch.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
	defer cancel()
	// Runtime (admin-toggled) monitoring switch.
	if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}
	runtimeCfg := defaultOpsAlertRuntimeSettings()
	if s.opsService != nil {
		if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
			runtimeCfg = loaded
		}
	}
	// Only one instance cluster-wide evaluates per cycle (best-effort).
	release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}
	startedAt := time.Now().UTC()
	runAt := startedAt
	rules, err := s.opsRepo.ListAlertRules(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
		return
	}
	now := time.Now().UTC()
	// Align windows to whole minutes so the in-progress minute does not skew rates.
	safeEnd := now.Truncate(time.Minute)
	if safeEnd.IsZero() {
		safeEnd = now
	}
	// Best-effort: latest system snapshot feeds the CPU/memory/queue gauges.
	systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)
	// Cleanup stale state for removed rules.
	s.pruneRuleStates(rules)
	for _, rule := range rules {
		if rule == nil || !rule.Enabled || rule.ID <= 0 {
			continue
		}
		scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
		windowMinutes := rule.WindowMinutes
		if windowMinutes <= 0 {
			windowMinutes = 1
		}
		windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
		windowEnd := safeEnd
		metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
		if !ok {
			// Metric unavailable: drop the breach streak rather than alert on no data.
			s.resetRuleState(rule.ID, now)
			continue
		}
		breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
		required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
		consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)
		activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
		if err != nil {
			log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
			continue
		}
		if breachedNow && consecutive >= required {
			// Already firing: nothing to do.
			if activeEvent != nil {
				continue
			}
			latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
				continue
			}
			// Cooldown suppresses re-firing too soon after the previous event.
			if latestEvent != nil && rule.CooldownMinutes > 0 {
				cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
				if now.Sub(latestEvent.FiredAt) < cooldown {
					continue
				}
			}
			firedEvent := &OpsAlertEvent{
				RuleID:         rule.ID,
				Severity:       strings.TrimSpace(rule.Severity),
				Status:         OpsAlertStatusFiring,
				Title:          fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
				Description:    buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
				MetricValue:    float64Ptr(metricValue),
				ThresholdValue: float64Ptr(rule.Threshold),
				Dimensions:     buildOpsAlertDimensions(scopePlatform, scopeGroupID),
				FiredAt:        now,
				CreatedAt:      now,
			}
			created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
				continue
			}
			if created != nil && created.ID > 0 {
				s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
			}
			continue
		}
		// Not breached: resolve active event if present.
		if activeEvent != nil {
			resolvedAt := now
			if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
				log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
			}
		}
	}
	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
// pruneRuleStates drops in-memory breach counters for rules that no longer exist.
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
	s.mu.Lock()
	defer s.mu.Unlock()
	active := make(map[int64]struct{}, len(rules))
	for _, rule := range rules {
		if rule != nil && rule.ID > 0 {
			active[rule.ID] = struct{}{}
		}
	}
	for id := range s.ruleStates {
		if _, keep := active[id]; !keep {
			delete(s.ruleStates, id)
		}
	}
}
// resetRuleState clears a rule's breach streak and stamps its last-evaluated time.
func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
	if ruleID <= 0 {
		return
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	st := s.ruleStates[ruleID]
	if st == nil {
		st = &opsAlertRuleState{}
		s.ruleStates[ruleID] = st
	}
	st.LastEvaluatedAt = now
	st.ConsecutiveBreaches = 0
}
// updateRuleBreaches records the outcome of this cycle for a rule and returns
// the updated count of consecutive breached cycles.
func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
	if ruleID <= 0 {
		return 0
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	state, ok := s.ruleStates[ruleID]
	if !ok {
		state = &opsAlertRuleState{}
		s.ruleStates[ruleID] = state
	}
	// If more than two intervals passed since the last evaluation, the streak is
	// no longer contiguous (evaluator stalled or lost leadership); restart it.
	if !state.LastEvaluatedAt.IsZero() && interval > 0 {
		if now.Sub(state.LastEvaluatedAt) > interval*2 {
			state.ConsecutiveBreaches = 0
		}
	}
	state.LastEvaluatedAt = now
	if breached {
		state.ConsecutiveBreaches++
	} else {
		state.ConsecutiveBreaches = 0
	}
	return state.ConsecutiveBreaches
}
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
if sustainedMinutes <= 0 {
return 1
}
if interval <= 0 {
return sustainedMinutes
}
required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
if required < 1 {
return 1
}
return required
}
// parseOpsAlertRuleScope extracts the optional platform and group_id scope from
// a rule's filters map. group_id may arrive as a JSON number (float64), an
// int/int64, or a numeric string; non-positive values are ignored.
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
	if filters == nil {
		return "", nil
	}
	if raw, ok := filters["platform"]; ok {
		if str, isStr := raw.(string); isStr {
			platform = strings.TrimSpace(str)
		}
	}
	raw, ok := filters["group_id"]
	if !ok {
		return platform, nil
	}
	switch v := raw.(type) {
	case float64:
		if v > 0 {
			id := int64(v)
			groupID = &id
		}
	case int64:
		if v > 0 {
			id := v
			groupID = &id
		}
	case int:
		if v > 0 {
			id := int64(v)
			groupID = &id
		}
	case string:
		if parsed, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64); err == nil && parsed > 0 {
			groupID = &parsed
		}
	}
	return platform, groupID
}
// computeRuleMetric resolves the current value of a rule's metric. The second
// return is false when the metric cannot be computed (missing snapshot field,
// missing scope, repo error, or insufficient request volume), in which case the
// caller resets the rule's breach streak instead of alerting on no data.
//
// System gauges read the latest snapshot; account/group metrics query live
// availability; the remaining rate/latency metrics query the dashboard overview
// over [start, end) in raw mode.
func (s *OpsAlertEvaluatorService) computeRuleMetric(
	ctx context.Context,
	rule *OpsAlertRule,
	systemMetrics *OpsSystemMetricsSnapshot,
	start time.Time,
	end time.Time,
	platform string,
	groupID *int64,
) (float64, bool) {
	if rule == nil {
		return 0, false
	}
	switch strings.TrimSpace(rule.MetricType) {
	case "cpu_usage_percent":
		if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
			return *systemMetrics.CPUUsagePercent, true
		}
		return 0, false
	case "memory_usage_percent":
		if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
			return *systemMetrics.MemoryUsagePercent, true
		}
		return 0, false
	case "concurrency_queue_depth":
		if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
			return float64(*systemMetrics.ConcurrencyQueueDepth), true
		}
		return 0, false
	case "group_available_accounts":
		// Requires an explicit group scope.
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// A known group with no availability data counts as zero available.
		if availability.Group == nil {
			return 0, true
		}
		return float64(availability.Group.AvailableCount), true
	case "group_available_ratio":
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return computeGroupAvailableRatio(availability.Group), true
	case "account_rate_limited_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})), true
	case "account_error_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// Errored accounts that are temporarily unschedulable are excluded: they
		// are already being handled by the scheduler's backoff.
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})), true
	}
	overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
		StartTime: start,
		EndTime:   end,
		Platform:  platform,
		GroupID:   groupID,
		QueryMode: OpsQueryModeRaw,
	})
	if err != nil {
		return 0, false
	}
	if overview == nil {
		return 0, false
	}
	switch strings.TrimSpace(rule.MetricType) {
	case "success_rate":
		// Require at least one SLA-counted request to avoid 0/0 noise.
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.SLA * 100, true
	case "error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.ErrorRate * 100, true
	case "upstream_error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.UpstreamErrorRate * 100, true
	case "p95_latency_ms":
		if overview.Duration.P95 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P95), true
	case "p99_latency_ms":
		if overview.Duration.P99 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P99), true
	default:
		return 0, false
	}
}
// compareMetric reports whether `value <operator> threshold` holds. Unknown
// operators evaluate to false. Note that "==" and "!=" are exact float64
// comparisons, as configured by the rule author.
func compareMetric(value float64, operator string, threshold float64) bool {
	op := strings.TrimSpace(operator)
	if op == ">" {
		return value > threshold
	}
	if op == ">=" {
		return value >= threshold
	}
	if op == "<" {
		return value < threshold
	}
	if op == "<=" {
		return value <= threshold
	}
	if op == "==" {
		return value == threshold
	}
	if op == "!=" {
		return value != threshold
	}
	return false
}
// buildOpsAlertDimensions packs the optional platform/group scope into the
// event's dimensions map, or returns nil when no scope is set.
func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
	out := make(map[string]any, 2)
	if p := strings.TrimSpace(platform); p != "" {
		out["platform"] = p
	}
	if groupID != nil && *groupID > 0 {
		out["group_id"] = *groupID
	}
	if len(out) == 0 {
		return nil
	}
	return out
}
// buildOpsAlertDescription renders a one-line human-readable summary of the
// breach, e.g. `error_rate > 5.00 (current 7.31) over last 5m (platform=openai)`.
func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
	if rule == nil {
		return ""
	}
	scope := "overall"
	if p := strings.TrimSpace(platform); p != "" {
		scope = fmt.Sprintf("platform=%s", p)
	}
	if groupID != nil && *groupID > 0 {
		scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
	}
	if windowMinutes <= 0 {
		windowMinutes = 1
	}
	return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
		strings.TrimSpace(rule.MetricType),
		strings.TrimSpace(rule.Operator),
		rule.Threshold,
		value,
		windowMinutes,
		strings.TrimSpace(scope),
	)
}
// maybeSendAlertEmail delivers a firing-alert email to configured recipients,
// subject to: the rule opting into email, alert emails being enabled, the
// minimum-severity filter, any active silencing window, and a per-hour sliding
// rate limit. Delivery is best-effort; a success to any recipient marks the
// event as emailed.
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
	if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
		return
	}
	if event.EmailSent {
		return
	}
	if !rule.NotifyEmail {
		return
	}
	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
		return
	}
	if len(emailCfg.Alert.Recipients) == 0 {
		return
	}
	if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
		return
	}
	if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
		if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
			return
		}
	}
	// Apply/update rate limiter.
	s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)
	subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
	body := buildOpsAlertEmailBody(rule, event)
	anySent := false
	for _, to := range emailCfg.Alert.Recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		// NOTE(review): the limiter token is consumed before SendEmail, so a
		// failed send still counts against the hourly budget — confirm intended.
		if !s.emailLimiter.Allow(time.Now().UTC()) {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
		anySent = true
	}
	if anySent {
		// Fresh context: ctx may be near its deadline after sending emails.
		_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
	}
}
// buildOpsAlertEmailBody renders the HTML email body for a fired alert. All
// interpolated values are HTML-escaped except the RFC3339 timestamp, which
// contains no HTML-special characters.
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
	if rule == nil || event == nil {
		return ""
	}
	metric := strings.TrimSpace(rule.MetricType)
	// Fall back to "-" when the event carries no sampled metric value.
	value := "-"
	threshold := fmt.Sprintf("%.2f", rule.Threshold)
	if event.MetricValue != nil {
		value = fmt.Sprintf("%.2f", *event.MetricValue)
	}
	if event.ThresholdValue != nil {
		threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
	}
	return fmt.Sprintf(`
<h2>Ops Alert</h2>
<p><b>Rule</b>: %s</p>
<p><b>Severity</b>: %s</p>
<p><b>Status</b>: %s</p>
<p><b>Metric</b>: %s %s %s</p>
<p><b>Fired at</b>: %s</p>
<p><b>Description</b>: %s</p>
`,
		htmlEscape(rule.Name),
		htmlEscape(rule.Severity),
		htmlEscape(event.Status),
		htmlEscape(metric),
		htmlEscape(rule.Operator),
		htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
		event.FiredAt.Format(time.RFC3339),
		htmlEscape(event.Description),
	)
}
// shouldSendOpsAlertEmailByMinSeverity reports whether a rule's severity clears
// the configured minimum email severity. An empty minimum means "send
// everything"; unrecognized severities rank lowest (0) on the
// critical(3) > warning(2) > info(1) scale.
func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
	floor := strings.ToLower(strings.TrimSpace(minSeverity))
	if floor == "" {
		return true
	}
	severityRank := map[string]int{
		"critical": 3,
		"warning":  2,
		"info":     1,
	}
	return severityRank[opsEmailSeverityForOps(ruleSeverity)] >= severityRank[floor]
}
// opsEmailSeverityForOps maps ops rule severities (P0/P1/...) onto the email
// severity scale: P0 is critical, P1 is warning, anything else is info.
func opsEmailSeverityForOps(severity string) string {
	normalized := strings.ToUpper(strings.TrimSpace(severity))
	if normalized == "P0" {
		return "critical"
	}
	if normalized == "P1" {
		return "warning"
	}
	return "info"
}
// isOpsAlertSilenced reports whether an alert email should be suppressed by the
// silencing configuration: either a global "silence everything until T" window,
// or a matching entry (by rule id and/or severity) whose window has not passed.
// Entries with no rule/severity constraints silence all alerts until expiry.
func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
	if !silencing.Enabled {
		return false
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
		if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
			if now.Before(t) {
				return true
			}
		}
	}
	for _, entry := range silencing.Entries {
		// Malformed or missing "until" timestamps disable the entry.
		untilRaw := strings.TrimSpace(entry.UntilRFC3339)
		if untilRaw == "" {
			continue
		}
		until, err := time.Parse(time.RFC3339, untilRaw)
		if err != nil {
			continue
		}
		if now.After(until) {
			continue
		}
		if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
			continue
		}
		if len(entry.Severities) > 0 {
			// Match against either the event's or the rule's severity, case-insensitive.
			// NOTE(review): event (and rule) are dereferenced here without a nil
			// check — callers currently always pass both non-nil; confirm.
			match := false
			for _, s := range entry.Severities {
				if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
					match = true
					break
				}
			}
			if !match {
				continue
			}
		}
		return true
	}
	return false
}
// tryAcquireLeaderLock attempts to become the single active evaluator across
// instances via a Redis SETNX lock. It returns (release, true) when this
// instance may run; release is nil when no lock was taken (lock disabled or
// Redis absent). On Redis errors it fails closed and skips the cycle.
//
// Fix: the release closure previously ran the unlock script with the caller's
// ctx. Since release is deferred to the end of a pass bounded by a 45s timeout,
// ctx could already be cancelled/expired, silently failing the unlock and
// leaving the lock held until TTL expiry. It now uses a fresh short-lived
// background context, matching the aggregation service's release path.
func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
	if !lock.Enabled {
		return nil, true
	}
	if s.redisClient == nil {
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
		})
		return nil, true
	}
	key := strings.TrimSpace(lock.Key)
	if key == "" {
		key = opsAlertEvaluatorLeaderLockKey
	}
	ttl := time.Duration(lock.TTLSeconds) * time.Second
	if ttl <= 0 {
		ttl = opsAlertEvaluatorLeaderLockTTL
	}
	ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
	if err != nil {
		// Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
		// Single-node deployments can disable the distributed lock via runtime settings.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
		})
		return nil, false
	}
	if !ok {
		s.maybeLogSkip(key)
		return nil, false
	}
	release := func() {
		// Fresh context: the caller's ctx may be cancelled or past its deadline
		// by the time this deferred release runs.
		ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = opsAlertEvaluatorReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
	}
	return release, true
}
// maybeLogSkip logs a "leader lock held elsewhere" line, throttled so repeated
// skipped cycles emit at most one line per opsAlertEvaluatorSkipLogInterval.
func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
	s.skipLogMu.Lock()
	defer s.skipLogMu.Unlock()
	at := time.Now()
	throttled := !s.skipLogAt.IsZero() && at.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval
	if throttled {
		return
	}
	s.skipLogAt = at
	log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
}
// recordHeartbeatSuccess upserts the job heartbeat after a successful pass.
// Best-effort: persistence errors are deliberately ignored.
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
	if s == nil || s.opsRepo == nil {
		return
	}
	successAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsAlertEvaluatorJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &elapsedMs,
	})
}
// recordHeartbeatError upserts the job heartbeat after a failed pass, storing a
// truncated error message. Best-effort; a nil error is a no-op.
func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || s.opsRepo == nil || err == nil {
		return
	}
	failedAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	msg := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsAlertEvaluatorJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &failedAt,
		LastError:      &msg,
		LastDurationMs: &elapsedMs,
	})
}
// opsAlertHTMLReplacer escapes the five HTML-special characters. Built once at
// package init rather than on every call (the original rebuilt the Replacer per
// invocation; strings.Replacer is safe for concurrent use by multiple goroutines).
var opsAlertHTMLReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// htmlEscape returns s with HTML-special characters replaced by entities, for
// safe interpolation into the alert email body.
func htmlEscape(s string) string {
	return opsAlertHTMLReplacer.Replace(s)
}
type slidingWindowLimiter struct {
mu sync.Mutex
limit int
window time.Duration
sent []time.Time
}
func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
if window <= 0 {
window = time.Hour
}
return &slidingWindowLimiter{
limit: limit,
window: window,
sent: []time.Time{},
}
}
func (l *slidingWindowLimiter) SetLimit(limit int) {
l.mu.Lock()
defer l.mu.Unlock()
l.limit = limit
}
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
l.mu.Lock()
defer l.mu.Unlock()
if l.limit <= 0 {
return true
}
cutoff := now.Add(-l.window)
keep := l.sent[:0]
for _, t := range l.sent {
if t.After(cutoff) {
keep = append(keep, t)
}
}
l.sent = keep
if len(l.sent) >= l.limit {
return false
}
l.sent = append(l.sent, now)
return true
}
// computeGroupAvailableRatio returns the available percentage for a group.
// Formula: (AvailableCount / TotalAccounts) * 100.
// Returns 0 when TotalAccounts is 0.
func computeGroupAvailableRatio(group *GroupAvailability) float64 {
if group == nil || group.TotalAccounts <= 0 {
return 0
}
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
}
// countAccountsByCondition counts accounts that satisfy the given condition.
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
if len(accounts) == 0 || condition == nil {
return 0
}
var count int64
for _, account := range accounts {
if account != nil && condition(account) {
count++
}
}
return count
}

View File

@@ -0,0 +1,210 @@
//go:build unit
package service
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// stubOpsRepo is a test double: it satisfies OpsRepository via embedding (all
// unoverridden methods panic if called) and overrides only GetDashboardOverview.
type stubOpsRepo struct {
	OpsRepository
	overview *OpsDashboardOverview // response to return; nil yields an empty value
	err      error                 // when set, returned instead of any overview
}

// GetDashboardOverview returns the configured error or overview; a nil
// configured overview degrades to an empty (zero-value) overview, never nil.
func (s *stubOpsRepo) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
	if s.err != nil {
		return nil, s.err
	}
	if s.overview != nil {
		return s.overview, nil
	}
	return &OpsDashboardOverview{}, nil
}
// TestComputeGroupAvailableRatio covers the happy path (8/10 = 80%) and both
// zero-denominator / zero-numerator boundaries.
func TestComputeGroupAvailableRatio(t *testing.T) {
	t.Parallel()
	t.Run("正常情况: 10个账号, 8个可用 = 80%", func(t *testing.T) {
		t.Parallel()
		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 8,
		})
		require.InDelta(t, 80.0, got, 0.0001)
	})
	t.Run("边界情况: TotalAccounts = 0 应返回 0", func(t *testing.T) {
		t.Parallel()
		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  0,
			AvailableCount: 8,
		})
		require.Equal(t, 0.0, got)
	})
	t.Run("边界情况: AvailableCount = 0 应返回 0%", func(t *testing.T) {
		t.Parallel()
		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 0,
		})
		require.Equal(t, 0.0, got)
	})
}
// TestCountAccountsByCondition exercises the rate-limited and errored-account
// predicates used by the evaluator, plus the empty-map boundary.
func TestCountAccountsByCondition(t *testing.T) {
	t.Parallel()
	t.Run("测试限流账号统计: acc.IsRateLimited", func(t *testing.T) {
		t.Parallel()
		accounts := map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: false},
			3: {IsRateLimited: true},
		}
		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(2), got)
	})
	t.Run("测试错误账号统计(排除临时不可调度): acc.HasError && acc.TempUnschedulableUntil == nil", func(t *testing.T) {
		t.Parallel()
		until := time.Now().UTC().Add(5 * time.Minute)
		// Account 2 has an error but is temporarily unschedulable, so it is excluded.
		accounts := map[int64]*AccountAvailability{
			1: {HasError: true},
			2: {HasError: true, TempUnschedulableUntil: &until},
			3: {HasError: false},
		}
		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})
		require.Equal(t, int64(1), got)
	})
	t.Run("边界情况: 空 map 应返回 0", func(t *testing.T) {
		t.Parallel()
		got := countAccountsByCondition(map[int64]*AccountAvailability{}, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(0), got)
	})
}
// TestComputeRuleMetricNewIndicators verifies the account/group availability
// metric types against a fixed availability snapshot (10 accounts, 8 available;
// 2 rate-limited; 1 countable errored account), including the "group metric
// without group_id" failure cases.
func TestComputeRuleMetricNewIndicators(t *testing.T) {
	t.Parallel()
	groupID := int64(101)
	platform := "openai"
	availability := &OpsAccountAvailability{
		Group: &GroupAvailability{
			GroupID:        groupID,
			TotalAccounts:  10,
			AvailableCount: 8,
		},
		Accounts: map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: true},
			3: {HasError: true},
			// Errored but temporarily unschedulable: excluded from account_error_count.
			4: {HasError: true, TempUnschedulableUntil: timePtr(time.Now().UTC().Add(2 * time.Minute))},
			5: {HasError: false, IsRateLimited: false},
		},
	}
	// Inject the availability snapshot via the service's test hook.
	opsService := &OpsService{
		getAccountAvailability: func(_ context.Context, _ string, _ *int64) (*OpsAccountAvailability, error) {
			return availability, nil
		},
	}
	svc := &OpsAlertEvaluatorService{
		opsService: opsService,
		opsRepo:    &stubOpsRepo{overview: &OpsDashboardOverview{}},
	}
	start := time.Now().UTC().Add(-5 * time.Minute)
	end := time.Now().UTC()
	ctx := context.Background()
	tests := []struct {
		name       string
		metricType string
		groupID    *int64
		wantValue  float64
		wantOK     bool
	}{
		{
			name:       "group_available_accounts",
			metricType: "group_available_accounts",
			groupID:    &groupID,
			wantValue:  8,
			wantOK:     true,
		},
		{
			name:       "group_available_ratio",
			metricType: "group_available_ratio",
			groupID:    &groupID,
			wantValue:  80.0,
			wantOK:     true,
		},
		{
			name:       "account_rate_limited_count",
			metricType: "account_rate_limited_count",
			groupID:    nil,
			wantValue:  2,
			wantOK:     true,
		},
		{
			name:       "account_error_count",
			metricType: "account_error_count",
			groupID:    nil,
			wantValue:  1,
			wantOK:     true,
		},
		{
			name:       "group_available_accounts without group_id returns false",
			metricType: "group_available_accounts",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
		{
			name:       "group_available_ratio without group_id returns false",
			metricType: "group_available_ratio",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
	}
	for _, tt := range tests {
		tt := tt
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			rule := &OpsAlertRule{
				MetricType: tt.metricType,
			}
			gotValue, gotOK := svc.computeRuleMetric(ctx, rule, nil, start, end, platform, tt.groupID)
			require.Equal(t, tt.wantOK, gotOK)
			if !tt.wantOK {
				return
			}
			require.InDelta(t, tt.wantValue, gotValue, 0.0001)
		})
	}
}

View File

@@ -0,0 +1,74 @@
package service
import "time"
// Ops alert rule/event models.
//
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
// with the existing ops dashboard frontend (backup style).

// Alert event lifecycle states: an event is "firing" from creation until the
// evaluator observes the metric back in range and marks it "resolved".
const (
	OpsAlertStatusFiring   = "firing"
	OpsAlertStatusResolved = "resolved"
)
// OpsAlertRule is an admin-configured alert definition: a metric, a threshold
// condition, timing parameters, and optional scope filters.
type OpsAlertRule struct {
	ID          int64  `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
	Enabled     bool   `json:"enabled"`
	Severity    string `json:"severity"`    // e.g. P0/P1; mapped to email severity
	MetricType  string `json:"metric_type"` // see computeRuleMetric for supported values
	Operator    string `json:"operator"`    // one of > >= < <= == !=
	Threshold   float64 `json:"threshold"`
	// WindowMinutes is the metric aggregation window; SustainedMinutes is how
	// long the breach must persist before firing; CooldownMinutes suppresses
	// re-firing after a previous event.
	WindowMinutes    int            `json:"window_minutes"`
	SustainedMinutes int            `json:"sustained_minutes"`
	CooldownMinutes  int            `json:"cooldown_minutes"`
	NotifyEmail      bool           `json:"notify_email"`
	Filters          map[string]any `json:"filters,omitempty"` // optional scope: platform, group_id
	LastTriggeredAt  *time.Time     `json:"last_triggered_at,omitempty"`
	CreatedAt        time.Time      `json:"created_at"`
	UpdatedAt        time.Time      `json:"updated_at"`
}
// OpsAlertEvent is a single firing/resolved occurrence of a rule, with the
// sampled metric value and scope dimensions captured at fire time.
type OpsAlertEvent struct {
	ID             int64          `json:"id"`
	RuleID         int64          `json:"rule_id"`
	Severity       string         `json:"severity"`
	Status         string         `json:"status"` // OpsAlertStatusFiring or OpsAlertStatusResolved
	Title          string         `json:"title"`
	Description    string         `json:"description"`
	MetricValue    *float64       `json:"metric_value,omitempty"`    // observed value at fire time
	ThresholdValue *float64       `json:"threshold_value,omitempty"` // rule threshold at fire time
	Dimensions     map[string]any `json:"dimensions,omitempty"`      // scope: platform, group_id
	FiredAt        time.Time      `json:"fired_at"`
	ResolvedAt     *time.Time     `json:"resolved_at,omitempty"`
	EmailSent      bool           `json:"email_sent"` // set once any recipient was notified
	CreatedAt      time.Time      `json:"created_at"`
}
// OpsAlertEventFilter narrows alert-event listings; all fields are optional.
type OpsAlertEventFilter struct {
	Limit int
	// Optional filters.
	Status    string
	Severity  string
	StartTime *time.Time
	EndTime   *time.Time
	// Dimensions filters (best-effort).
	Platform string
	GroupID  *int64
}

View File

@@ -0,0 +1,162 @@
package service
import (
"context"
"database/sql"
"errors"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// ListAlertRules returns all configured alert rules. Requires monitoring to be
// enabled; a missing repo degrades to an empty list rather than an error.
func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return []*OpsAlertRule{}, nil
	}
	return s.opsRepo.ListAlertRules(ctx)
}

// CreateAlertRule persists a new alert rule and returns the stored copy.
// Requires monitoring enabled, an available repo, and a non-nil rule.
func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if rule == nil {
		return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
	}
	created, err := s.opsRepo.CreateAlertRule(ctx, rule)
	if err != nil {
		return nil, err
	}
	return created, nil
}
// UpdateAlertRule replaces an existing rule (matched by rule.ID) and returns
// the stored copy. sql.ErrNoRows from the repo is mapped to a NotFound error.
func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if rule == nil || rule.ID <= 0 {
		return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
	}
	updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
	if err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
		}
		return nil, err
	}
	return updated, nil
}

// DeleteAlertRule removes a rule by id. sql.ErrNoRows maps to NotFound.
func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return err
	}
	if s.opsRepo == nil {
		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if id <= 0 {
		return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
	}
	if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
		}
		return err
	}
	return nil
}
// ListAlertEvents returns alert events matching the (optional) filter; a
// missing repo degrades to an empty list.
func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return []*OpsAlertEvent{}, nil
	}
	return s.opsRepo.ListAlertEvents(ctx, filter)
}

// GetActiveAlertEvent returns the currently-firing event for a rule, if any.
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if ruleID <= 0 {
		return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
	}
	return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
}

// GetLatestAlertEvent returns the most recent event for a rule regardless of
// status (used for cooldown checks).
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if ruleID <= 0 {
		return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
	}
	return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
}
// CreateAlertEvent persists a new alert event and returns the stored copy.
func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if event == nil {
		return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
	}
	created, err := s.opsRepo.CreateAlertEvent(ctx, event)
	if err != nil {
		return nil, err
	}
	return created, nil
}

// UpdateAlertEventStatus transitions an event's status (e.g. firing ->
// resolved), optionally stamping resolvedAt.
func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return err
	}
	if s.opsRepo == nil {
		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if eventID <= 0 {
		return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
	}
	if strings.TrimSpace(status) == "" {
		return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
	}
	return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
}

// UpdateAlertEventEmailSent records whether a notification email went out for
// the event (prevents duplicate sends).
func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return err
	}
	if s.opsRepo == nil {
		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if eventID <= 0 {
		return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
	}
	return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
}

View File

@@ -0,0 +1,365 @@
package service
import (
"context"
"database/sql"
"fmt"
"log"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/robfig/cron/v3"
)
// Cleanup job identity and leader-lock defaults.
const (
	opsCleanupJobName              = "ops_cleanup"
	opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
	opsCleanupLeaderLockTTLDefault = 30 * time.Minute
)

// opsCleanupCronParser accepts standard 5-field cron specs
// (minute hour dom month dow) — no seconds field.
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsCleanupReleaseScript deletes the leader lock only when still owned by this
// instance (compare-and-delete).
var opsCleanupReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
//
// - Scheduling: 5-field cron spec (minute hour dom month dow).
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
// - Safety: deletes in batches to avoid long transactions.
type OpsCleanupService struct {
	opsRepo     OpsRepository  // retention deletes
	db          *sql.DB        // raw access for batch deletes / advisory locks
	redisClient *redis.Client  // optional leader election
	cfg         *config.Config // schedule, timezone, enable flags

	instanceID string     // leader-lock owner token
	cron       *cron.Cron // nil until Start succeeds

	startOnce       sync.Once
	stopOnce        sync.Once
	warnNoRedisOnce sync.Once
}
// NewOpsCleanupService wires an OpsCleanupService. Each instance receives a
// unique ID that serves as its leader-lock ownership token.
func NewOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	svc := &OpsCleanupService{
		opsRepo:     opsRepo,
		db:          db,
		redisClient: redisClient,
		cfg:         cfg,
	}
	svc.instanceID = uuid.NewString()
	return svc
}
// Start registers the cleanup job with a cron scheduler and begins running it.
//
// Start is a no-op when ops monitoring or cleanup is disabled, when required
// dependencies are missing, or on repeated calls (guarded by startOnce). An
// invalid cron spec is logged and leaves the service stopped instead of
// panicking.
func (s *OpsCleanupService) Start() {
	if s == nil {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
		log.Printf("[OpsCleanup] not started (disabled)")
		return
	}
	if s.opsRepo == nil || s.db == nil {
		log.Printf("[OpsCleanup] not started (missing deps)")
		return
	}
	s.startOnce.Do(func() {
		// Default: 02:00 every day; overridable via config.
		schedule := "0 2 * * *"
		if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
			schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
		}
		// Resolve the scheduler timezone; fall back to the host's local zone
		// when the configured zone is empty or fails to load.
		loc := time.Local
		if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
			if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
				loc = parsed
			}
		}
		c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
		_, err := c.AddFunc(schedule, func() { s.runScheduled() })
		if err != nil {
			log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
			return
		}
		s.cron = c
		s.cron.Start()
		log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
	})
}
// Stop shuts the cron scheduler down at most once and waits briefly for any
// in-flight cleanup run to finish.
func (s *OpsCleanupService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.cron == nil {
			return
		}
		done := s.cron.Stop().Done()
		timeout := time.NewTimer(3 * time.Second)
		defer timeout.Stop()
		select {
		case <-done:
		case <-timeout.C:
			log.Printf("[OpsCleanup] cron stop timed out")
		}
	})
}
// runScheduled is the cron entry point: it elects a leader, runs one cleanup
// pass under a 30-minute budget, and records the outcome as a job heartbeat.
func (s *OpsCleanupService) runScheduled() {
	if s == nil || s.db == nil || s.opsRepo == nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()
	release, isLeader := s.tryAcquireLeaderLock(ctx)
	if !isLeader {
		return
	}
	if release != nil {
		defer release()
	}
	begin := time.Now().UTC()
	counts, err := s.runCleanupOnce(ctx)
	if err != nil {
		s.recordHeartbeatError(begin, time.Since(begin), err)
		log.Printf("[OpsCleanup] cleanup failed: %v", err)
		return
	}
	s.recordHeartbeatSuccess(begin, time.Since(begin))
	log.Printf("[OpsCleanup] cleanup complete: %s", counts)
}
// opsCleanupDeletedCounts records how many rows one cleanup pass removed from
// each ops table.
type opsCleanupDeletedCounts struct {
	errorLogs     int64 // ops_error_logs
	retryAttempts int64 // ops_retry_attempts
	alertEvents   int64 // ops_alert_events
	systemMetrics int64 // ops_system_metrics (minute-level snapshots)
	hourlyPreagg  int64 // ops_metrics_hourly
	dailyPreagg   int64 // ops_metrics_daily
}
// String renders the per-table deletion counts as one log-friendly line.
func (c opsCleanupDeletedCounts) String() string {
	fields := []string{
		fmt.Sprintf("error_logs=%d", c.errorLogs),
		fmt.Sprintf("retry_attempts=%d", c.retryAttempts),
		fmt.Sprintf("alert_events=%d", c.alertEvents),
		fmt.Sprintf("system_metrics=%d", c.systemMetrics),
		fmt.Sprintf("hourly_preagg=%d", c.hourlyPreagg),
		fmt.Sprintf("daily_preagg=%d", c.dailyPreagg),
	}
	return strings.Join(fields, " ")
}
// runCleanupOnce executes a single full cleanup pass and returns per-table
// deletion counts. Retention windows come from cfg.Ops.Cleanup; a retention of
// <= 0 days disables cleanup for that table group. On error, the counts
// accumulated so far are returned alongside the error.
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
	out := opsCleanupDeletedCounts{}
	if s == nil || s.db == nil || s.cfg == nil {
		return out, nil
	}
	// Each DELETE statement removes at most batchSize rows (see deleteOldRowsByID).
	batchSize := 5000
	now := time.Now().UTC()
	// Error-like tables: error logs / retry attempts / alert events.
	if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.errorLogs = n
		n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.retryAttempts = n
		n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.alertEvents = n
	}
	// Minute-level metrics snapshots.
	if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.systemMetrics = n
	}
	// Pre-aggregation tables (hourly/daily). Note: both share one retention
	// knob (HourlyMetricsRetentionDays); the daily table's time column is a
	// date, hence the cast flag.
	if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.hourlyPreagg = n
		n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
		if err != nil {
			return out, err
		}
		out.dailyPreagg = n
	}
	return out, nil
}
func deleteOldRowsByID(
ctx context.Context,
db *sql.DB,
table string,
timeColumn string,
cutoff time.Time,
batchSize int,
castCutoffToDate bool,
) (int64, error) {
if db == nil {
return 0, nil
}
if batchSize <= 0 {
batchSize = 5000
}
where := fmt.Sprintf("%s < $1", timeColumn)
if castCutoffToDate {
where = fmt.Sprintf("%s < $1::date", timeColumn)
}
q := fmt.Sprintf(`
WITH batch AS (
SELECT id FROM %s
WHERE %s
ORDER BY id
LIMIT $2
)
DELETE FROM %s
WHERE id IN (SELECT id FROM batch)
`, table, where, table)
var total int64
for {
res, err := db.ExecContext(ctx, q, cutoff, batchSize)
if err != nil {
// If ops tables aren't present yet (partial deployments), treat as no-op.
if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
return total, nil
}
return total, err
}
affected, err := res.RowsAffected()
if err != nil {
return total, err
}
total += affected
if affected == 0 {
break
}
}
return total, nil
}
// tryAcquireLeaderLock attempts to make this instance the single node that
// runs cleanup.
//
// Returns (release, true) when this instance may proceed; release may be nil
// when there is nothing to undo (simple run mode, or the DB advisory lock path
// handles its own release). Returns (nil, false) when another instance holds
// the lock.
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil {
		return nil, false
	}
	// In simple run mode, assume single instance.
	if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
		return nil, true
	}
	key := opsCleanupLeaderLockKeyDefault
	ttl := opsCleanupLeaderLockTTLDefault
	// Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
	// falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				// Another instance currently owns the lock.
				return nil, false
			}
			// Release only if we still own the lock (compare-and-delete Lua script).
			return func() {
				_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
			}, true
		}
		// Redis error: fall back to DB advisory lock.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
		})
	} else {
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
		})
	}
	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		return nil, false
	}
	return release, true
}
// recordHeartbeatSuccess best-effort persists a successful run in the job
// heartbeat table; persistence failures are intentionally ignored.
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
	if s == nil || s.opsRepo == nil {
		return
	}
	successAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	input := &OpsUpsertJobHeartbeatInput{
		JobName:        opsCleanupJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &elapsedMs,
	}
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, input)
}
// recordHeartbeatError best-effort persists a failed run, including a
// truncated error message, in the job heartbeat table.
func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || s.opsRepo == nil || err == nil {
		return
	}
	errAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	// Cap the stored message so one giant error cannot bloat the heartbeat row.
	msg := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	input := &OpsUpsertJobHeartbeatInput{
		JobName:        opsCleanupJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &errAt,
		LastError:      &msg,
		LastDurationMs: &elapsedMs,
	}
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, input)
}

View File

@@ -0,0 +1,257 @@
package service
import (
"context"
"log"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
)
const (
	// opsAccountsPageSize is the page size used when walking the full account list.
	opsAccountsPageSize = 100
	// opsConcurrencyBatchChunkSize caps how many accounts are sent per
	// GetAccountsLoadBatch call.
	opsConcurrencyBatchChunkSize = 200
)
// listAllAccountsForOps pages through the account repository and returns every
// account, optionally restricted to one platform. Intended for ops-side
// aggregation, so it favors completeness over latency.
func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
	if s == nil || s.accountRepo == nil {
		return []Account{}, nil
	}
	out := make([]Account, 0, 128)
	page := 1
	for {
		accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
			Page:     page,
			PageSize: opsAccountsPageSize,
		}, platformFilter, "", "", "")
		if err != nil {
			return nil, err
		}
		if len(accounts) == 0 {
			break
		}
		out = append(out, accounts...)
		// Stop once the repository-reported total is reached, or when a short
		// page signals the final page.
		if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
			break
		}
		if len(accounts) < opsAccountsPageSize {
			break
		}
		page++
		// Hard safety cap against a repository that keeps returning full pages.
		if page > 10_000 {
			log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
			break
		}
	}
	return out, nil
}
// getAccountsLoadMapBestEffort returns per-account realtime load keyed by
// account ID. Lookups are chunked, and failures are logged rather than
// returned so the ops UI never hard-fails on concurrency data.
func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
	result := make(map[int64]*AccountLoadInfo)
	if s == nil || s.concurrencyService == nil || len(accounts) == 0 {
		return result
	}
	// De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
	maxConcByID := make(map[int64]int, len(accounts))
	for _, acc := range accounts {
		if acc.ID <= 0 {
			continue
		}
		if prev, seen := maxConcByID[acc.ID]; !seen || acc.Concurrency > prev {
			maxConcByID[acc.ID] = acc.Concurrency
		}
	}
	queries := make([]AccountWithConcurrency, 0, len(maxConcByID))
	for id, conc := range maxConcByID {
		queries = append(queries, AccountWithConcurrency{
			ID:             id,
			MaxConcurrency: conc,
		})
	}
	for start := 0; start < len(queries); start += opsConcurrencyBatchChunkSize {
		end := start + opsConcurrencyBatchChunkSize
		if end > len(queries) {
			end = len(queries)
		}
		part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, queries[start:end])
		if err != nil {
			// Best-effort: return zeros rather than failing the ops UI.
			log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
			continue
		}
		for id, info := range part {
			result[id] = info
		}
	}
	return result
}
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
//
// Optional filters:
// - platformFilter: only include accounts in that platform (best-effort reduces DB load)
// - groupIDFilter: only include accounts that belong to that group
//
// Returns the three aggregation maps plus the collection timestamp. Load
// numbers come from the concurrency service best-effort: when that lookup
// fails, entries still render with zero in-use/waiting counts.
func (s *OpsService) GetConcurrencyStats(
	ctx context.Context,
	platformFilter string,
	groupIDFilter *int64,
) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, nil, nil, nil, err
	}
	accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
	if err != nil {
		return nil, nil, nil, nil, err
	}
	collectedAt := time.Now()
	loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
	platform := make(map[string]*PlatformConcurrencyInfo)
	group := make(map[int64]*GroupConcurrencyInfo)
	account := make(map[int64]*AccountConcurrencyInfo)
	for _, acc := range accounts {
		if acc.ID <= 0 {
			continue
		}
		// Resolve the account's membership in the filtered group, if any.
		var matchedGroup *Group
		if groupIDFilter != nil && *groupIDFilter > 0 {
			for _, grp := range acc.Groups {
				if grp == nil || grp.ID <= 0 {
					continue
				}
				if grp.ID == *groupIDFilter {
					matchedGroup = grp
					break
				}
			}
			// Group filter provided: skip accounts not in that group.
			if matchedGroup == nil {
				continue
			}
		}
		// Missing load info degrades to zero in-use/waiting counts.
		load := loadMap[acc.ID]
		currentInUse := int64(0)
		waiting := int64(0)
		if load != nil {
			currentInUse = int64(load.CurrentConcurrency)
			waiting = int64(load.WaitingCount)
		}
		// Account-level view picks one display group (the first group).
		displayGroupID := int64(0)
		displayGroupName := ""
		if matchedGroup != nil {
			displayGroupID = matchedGroup.ID
			displayGroupName = matchedGroup.Name
		} else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
			displayGroupID = acc.Groups[0].ID
			displayGroupName = acc.Groups[0].Name
		}
		if _, ok := account[acc.ID]; !ok {
			info := &AccountConcurrencyInfo{
				AccountID:      acc.ID,
				AccountName:    acc.Name,
				Platform:       acc.Platform,
				GroupID:        displayGroupID,
				GroupName:      displayGroupName,
				CurrentInUse:   currentInUse,
				MaxCapacity:    int64(acc.Concurrency),
				WaitingInQueue: waiting,
			}
			if info.MaxCapacity > 0 {
				info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
			}
			account[acc.ID] = info
		}
		// Platform aggregation.
		if acc.Platform != "" {
			if _, ok := platform[acc.Platform]; !ok {
				platform[acc.Platform] = &PlatformConcurrencyInfo{
					Platform: acc.Platform,
				}
			}
			p := platform[acc.Platform]
			p.MaxCapacity += int64(acc.Concurrency)
			p.CurrentInUse += currentInUse
			p.WaitingInQueue += waiting
		}
		// Group aggregation (one account may contribute to multiple groups).
		if matchedGroup != nil {
			grp := matchedGroup
			if _, ok := group[grp.ID]; !ok {
				group[grp.ID] = &GroupConcurrencyInfo{
					GroupID:   grp.ID,
					GroupName: grp.Name,
					Platform:  grp.Platform,
				}
			}
			g := group[grp.ID]
			if g.GroupName == "" && grp.Name != "" {
				g.GroupName = grp.Name
			}
			if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
				// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
				g.Platform = ""
			}
			g.MaxCapacity += int64(acc.Concurrency)
			g.CurrentInUse += currentInUse
			g.WaitingInQueue += waiting
		} else {
			for _, grp := range acc.Groups {
				if grp == nil || grp.ID <= 0 {
					continue
				}
				if _, ok := group[grp.ID]; !ok {
					group[grp.ID] = &GroupConcurrencyInfo{
						GroupID:   grp.ID,
						GroupName: grp.Name,
						Platform:  grp.Platform,
					}
				}
				g := group[grp.ID]
				if g.GroupName == "" && grp.Name != "" {
					g.GroupName = grp.Name
				}
				if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
					// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
					g.Platform = ""
				}
				g.MaxCapacity += int64(acc.Concurrency)
				g.CurrentInUse += currentInUse
				g.WaitingInQueue += waiting
			}
		}
	}
	// Finalize load percentages once totals are known.
	for _, info := range platform {
		if info.MaxCapacity > 0 {
			info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
		}
	}
	for _, info := range group {
		if info.MaxCapacity > 0 {
			info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
		}
	}
	return platform, group, account, &collectedAt, nil
}

View File

@@ -0,0 +1,90 @@
package service
import (
"context"
"database/sql"
"errors"
"log"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetDashboardOverview returns the aggregated ops dashboard metrics for the
// requested time window, enriched with the latest system snapshot, job
// heartbeats, and a backend-computed health score.
//
// The filter is mandatory and must carry a valid start/end pair. System
// metrics and heartbeats are attached best-effort: their failures are logged
// but never block the dashboard from rendering.
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	// Resolve query mode (requested via query param, or DB default).
	filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
	overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
	if err != nil {
		if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
			return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
		}
		return nil, err
	}
	// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
	// (The previous `else if err != nil && ...` carried a redundant nil check;
	// a switch keeps the three outcomes explicit.)
	metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1)
	switch {
	case err == nil:
		// Attach config-derived limits so the UI can show "current / max" for connection pools.
		// These are best-effort and should never block the dashboard rendering.
		if s.cfg != nil {
			if s.cfg.Database.MaxOpenConns > 0 {
				metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
			}
			if s.cfg.Redis.PoolSize > 0 {
				metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
			}
		}
		overview.SystemMetrics = metrics
	case !errors.Is(err, sql.ErrNoRows):
		// sql.ErrNoRows just means no snapshot has been collected yet; anything
		// else is worth logging.
		log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
	}
	if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
		overview.JobHeartbeats = heartbeats
	} else {
		log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
	}
	overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
	return overview, nil
}
// resolveOpsQueryMode decides which query mode dashboard queries should use.
//
// A valid requested mode wins; otherwise the DB-stored default applies. In
// either case "auto" downgrades to "raw" while pre-aggregated tables are
// disabled via config (a forced "preagg" still passes through).
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
	preaggDisabled := s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables
	downgrade := func(mode OpsQueryMode) OpsQueryMode {
		// Allow "auto" to be disabled via config until preagg is proven stable
		// in production.
		if mode == OpsQueryModeAuto && preaggDisabled {
			return OpsQueryModeRaw
		}
		return mode
	}
	if requested.IsValid() {
		return downgrade(requested)
	}
	mode := OpsQueryModeAuto
	if s != nil && s.settingRepo != nil {
		if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
			mode = ParseOpsQueryMode(raw)
		}
	}
	return downgrade(mode)
}

View File

@@ -0,0 +1,87 @@
package service
import "time"
// OpsDashboardFilter scopes dashboard queries to a time window and optional
// platform/group dimensions.
type OpsDashboardFilter struct {
	StartTime time.Time // window start (required)
	EndTime   time.Time // window end (required)
	Platform  string    // optional platform filter ("" = all platforms)
	GroupID   *int64    // optional group filter (nil = all groups)
	// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
	// Expected values: auto/raw/preagg (see OpsQueryMode).
	QueryMode OpsQueryMode
}

// OpsRateSummary summarizes a rate metric (e.g. QPS/TPS) over the window.
type OpsRateSummary struct {
	Current float64 `json:"current"`
	Peak    float64 `json:"peak"`
	Avg     float64 `json:"avg"`
}

// OpsPercentiles carries latency statistics in milliseconds; a nil field means
// that statistic could not be computed for the window.
type OpsPercentiles struct {
	P50 *int `json:"p50_ms"`
	P90 *int `json:"p90_ms"`
	P95 *int `json:"p95_ms"`
	P99 *int `json:"p99_ms"`
	Avg *int `json:"avg_ms"`
	Max *int `json:"max_ms"`
}

// OpsDashboardOverview is the aggregated payload backing the ops dashboard.
type OpsDashboardOverview struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`
	Platform  string    `json:"platform"`
	GroupID   *int64    `json:"group_id"`
	// HealthScore is a backend-computed overall health score (0-100).
	// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
	HealthScore int `json:"health_score"`
	// Latest system-level snapshot (window=1m, global).
	SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
	// Background jobs health (heartbeats).
	JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`

	// Request/token counters for the window.
	SuccessCount         int64 `json:"success_count"`
	ErrorCountTotal      int64 `json:"error_count_total"`
	BusinessLimitedCount int64 `json:"business_limited_count"`
	ErrorCountSLA        int64 `json:"error_count_sla"`
	RequestCountTotal    int64 `json:"request_count_total"`
	RequestCountSLA      int64 `json:"request_count_sla"`
	TokenConsumed        int64 `json:"token_consumed"`

	// Derived ratios plus upstream error breakdown.
	SLA                          float64 `json:"sla"`
	ErrorRate                    float64 `json:"error_rate"`
	UpstreamErrorRate            float64 `json:"upstream_error_rate"`
	UpstreamErrorCountExcl429529 int64   `json:"upstream_error_count_excl_429_529"`
	Upstream429Count             int64   `json:"upstream_429_count"`
	Upstream529Count             int64   `json:"upstream_529_count"`

	// Rate and latency summaries.
	QPS      OpsRateSummary `json:"qps"`
	TPS      OpsRateSummary `json:"tps"`
	Duration OpsPercentiles `json:"duration"`
	TTFT     OpsPercentiles `json:"ttft"`
}

// OpsLatencyHistogramBucket is one bar of the latency histogram.
type OpsLatencyHistogramBucket struct {
	Range string `json:"range"`
	Count int64  `json:"count"`
}

// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
// It is used by the Ops dashboard to quickly identify tail latency regressions.
type OpsLatencyHistogramResponse struct {
	StartTime     time.Time                    `json:"start_time"`
	EndTime       time.Time                    `json:"end_time"`
	Platform      string                       `json:"platform"`
	GroupID       *int64                       `json:"group_id"`
	TotalRequests int64                        `json:"total_requests"`
	Buckets       []*OpsLatencyHistogramBucket `json:"buckets"`
}

View File

@@ -0,0 +1,45 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// opsRequireErrorAnalyticsFilter validates the preconditions shared by the
// error-analytics queries: monitoring enabled, repository wired, and a
// well-formed time range on the filter. (Previously duplicated verbatim in
// GetErrorTrend and GetErrorDistribution.)
func (s *OpsService) opsRequireErrorAnalyticsFilter(ctx context.Context, filter *OpsDashboardFilter) error {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return err
	}
	if s.opsRepo == nil {
		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return nil
}

// GetErrorTrend returns time-bucketed error counts for the filtered window;
// bucketSeconds controls the bucket width and is forwarded to the repository.
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
	if err := s.opsRequireErrorAnalyticsFilter(ctx, filter); err != nil {
		return nil, err
	}
	return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
}

// GetErrorDistribution returns the error breakdown for the filtered window.
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
	if err := s.opsRequireErrorAnalyticsFilter(ctx, filter); err != nil {
		return nil, err
	}
	return s.opsRepo.GetErrorDistribution(ctx, filter)
}

View File

@@ -0,0 +1,154 @@
package service
import (
"math"
"time"
)
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
	if overview == nil {
		return 0
	}
	// Idle/no-data: avoid showing a "bad" score when there is no traffic.
	// UI can still render a gray/idle state based on QPS + error rate.
	idle := overview.RequestCountSLA <= 0 &&
		overview.RequestCountTotal <= 0 &&
		overview.ErrorCountTotal <= 0
	if idle {
		return 100
	}
	// Weighted combination: 70% business + 30% infrastructure.
	weighted := computeBusinessHealth(overview)*0.7 + computeInfraHealth(now, overview)*0.3
	return int(math.Round(clampFloat64(weighted, 0, 100)))
}
// computeBusinessHealth calculates the business health score (0-100).
// Components: SLA (50%) + Error Rate (30%) + Latency (20%).
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
	// SLA score: 99.5% → 100, 95% → 0 (linear in between).
	slaPct := clampFloat64(overview.SLA*100, 0, 100)
	var slaScore float64
	switch {
	case slaPct >= 99.5:
		slaScore = 100.0
	case slaPct >= 95:
		slaScore = (slaPct - 95) / 4.5 * 100
	default:
		slaScore = 0
	}
	// Error rate score: 0.5% → 100, 5% → 0 (linear).
	// Combines request errors and upstream errors; the worse one dominates.
	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
	combinedErrorPct := math.Max(errorPct, upstreamPct)
	var errorScore float64
	switch {
	case combinedErrorPct <= 0.5:
		errorScore = 100.0
	case combinedErrorPct <= 5:
		errorScore = (5 - combinedErrorPct) / 4.5 * 100
	default:
		errorScore = 0
	}
	// Latency score: 1s → 100, 10s → 0 (linear).
	// Uses P99 of duration (TTFT is less critical for overall health); a
	// missing percentile leaves the full score.
	latencyScore := 100.0
	if p := overview.Duration.P99; p != nil {
		p99 := float64(*p)
		switch {
		case p99 <= 1000:
			// Fast enough; keep the full score.
		case p99 <= 10000:
			latencyScore = (10000 - p99) / 9000 * 100
		default:
			latencyScore = 0
		}
	}
	// Weighted combination.
	return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
}
// computeInfraHealth calculates the infrastructure health score (0-100).
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%).
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
	// Storage score: DB critical, Redis less critical.
	storageScore := 100.0
	if sm := overview.SystemMetrics; sm != nil {
		if sm.DBOK != nil && !*sm.DBOK {
			storageScore = 0 // DB failure is critical
		} else if sm.RedisOK != nil && !*sm.RedisOK {
			storageScore = 50 // Redis failure is degraded but not critical
		}
	}
	// Compute resources score: average of CPU and memory sub-scores.
	// CPU ramps 100→0 over 80..100% usage; memory over 85..100%. Because the
	// percentages are clamped to [0,100] first, the former unreachable
	// "above 100%" branches have been removed.
	computeScore := 100.0
	if sm := overview.SystemMetrics; sm != nil {
		cpuScore := 100.0
		if sm.CPUUsagePercent != nil {
			cpuPct := clampFloat64(*sm.CPUUsagePercent, 0, 100)
			if cpuPct > 80 {
				cpuScore = (100 - cpuPct) / 20 * 100
			}
		}
		memScore := 100.0
		if sm.MemoryUsagePercent != nil {
			memPct := clampFloat64(*sm.MemoryUsagePercent, 0, 100)
			if memPct > 85 {
				memScore = (100 - memPct) / 15 * 100
			}
		}
		computeScore = (cpuScore + memScore) / 2
	}
	// Background jobs score: fraction of healthy heartbeats. A job counts as
	// failed when its last error is newer than its last success, or when its
	// last success is older than 15 minutes.
	jobScore := 100.0
	failedJobs := 0
	totalJobs := 0
	for _, hb := range overview.JobHeartbeats {
		if hb == nil {
			continue
		}
		totalJobs++
		if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
			failedJobs++
		} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
			failedJobs++
		}
	}
	if totalJobs > 0 && failedJobs > 0 {
		jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
	}
	// Weighted combination.
	return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
}
// clampFloat64 limits v to the inclusive range [min, max].
func clampFloat64(v float64, min float64, max float64) float64 {
	switch {
	case v < min:
		return min
	case v > max:
		return max
	default:
		return v
	}
}

View File

@@ -0,0 +1,431 @@
//go:build unit
package service
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
require.Equal(t, 100, score)
}
// TestComputeDashboardHealthScore_DegradesOnBadSignals feeds a heavily
// degraded overview (low SLA, high error rates, slow P99, DB+Redis down,
// saturated CPU/memory, one failing job) and checks the composite score drops
// well below healthy while remaining within [0, 100].
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
	t.Parallel()
	ov := &OpsDashboardOverview{
		RequestCountTotal: 100,
		RequestCountSLA:   100,
		SuccessCount:      90,
		ErrorCountTotal:   10,
		ErrorCountSLA:     10,
		SLA:               0.90,
		ErrorRate:         0.10,
		UpstreamErrorRate: 0.08,
		Duration:          OpsPercentiles{P99: intPtr(20_000)},
		TTFT:              OpsPercentiles{P99: intPtr(2_000)},
		SystemMetrics: &OpsSystemMetricsSnapshot{
			DBOK:                  boolPtr(false),
			RedisOK:               boolPtr(false),
			CPUUsagePercent:       float64Ptr(98.0),
			MemoryUsagePercent:    float64Ptr(97.0),
			DBConnWaiting:         intPtr(3),
			ConcurrencyQueueDepth: intPtr(10),
		},
		JobHeartbeats: []*OpsJobHeartbeat{
			{
				JobName:     "job-a",
				LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
				LastError:   stringPtr("boom"),
			},
		},
	}
	score := computeDashboardHealthScore(time.Now().UTC(), ov)
	require.Less(t, score, 80)
	require.GreaterOrEqual(t, score, 0)
}
// TestComputeDashboardHealthScore_Comprehensive sweeps representative
// scenarios (perfect health, degraded SLA, storage/compute failures, combined
// business+infra degradations) and asserts the composite score lands inside
// an expected band for each, and always within [0, 100].
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name     string
		overview *OpsDashboardOverview
		wantMin  int // inclusive lower bound for the expected score band
		wantMax  int // inclusive upper bound for the expected score band
	}{
		{
			name:     "nil overview returns 0",
			overview: nil,
			wantMin:  0,
			wantMax:  0,
		},
		{
			name: "perfect health",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               1.0,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
				TTFT:              OpsPercentiles{P99: intPtr(100)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			name: "good health - SLA 99.8%",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.998,
				ErrorRate:         0.003,
				UpstreamErrorRate: 0.001,
				Duration:          OpsPercentiles{P99: intPtr(800)},
				TTFT:              OpsPercentiles{P99: intPtr(200)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(50),
					MemoryUsagePercent: float64Ptr(60),
				},
			},
			wantMin: 95,
			wantMax: 100,
		},
		{
			name: "medium health - SLA 96%",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.96,
				ErrorRate:         0.02,
				UpstreamErrorRate: 0.01,
				Duration:          OpsPercentiles{P99: intPtr(3000)},
				TTFT:              OpsPercentiles{P99: intPtr(600)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(70),
					MemoryUsagePercent: float64Ptr(75),
				},
			},
			wantMin: 60,
			wantMax: 85,
		},
		{
			// Storage weight caps the damage: business stays healthy.
			name: "DB failure",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.995,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(false),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 70,
			wantMax: 90,
		},
		{
			// Redis outage is only a partial storage degradation.
			name: "Redis failure",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.995,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(false),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 85,
			wantMax: 95,
		},
		{
			name: "high CPU usage",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.995,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(95),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 85,
			wantMax: 100,
		},
		{
			name: "combined failures - business degraded + infra healthy",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.90,
				ErrorRate:         0.05,
				UpstreamErrorRate: 0.02,
				Duration:          OpsPercentiles{P99: intPtr(10000)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(20),
					MemoryUsagePercent: float64Ptr(30),
				},
			},
			wantMin: 25,
			wantMax: 50,
		},
		{
			name: "combined failures - business healthy + infra degraded",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				RequestCountSLA:   1000,
				SLA:               0.998,
				ErrorRate:         0.001,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(600)},
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(false),
					RedisOK:            boolPtr(false),
					CPUUsagePercent:    float64Ptr(95),
					MemoryUsagePercent: float64Ptr(95),
				},
			},
			wantMin: 70,
			wantMax: 90,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
			require.GreaterOrEqual(t, score, 0, "score must be >= 0")
			require.LessOrEqual(t, score, 100, "score must be <= 100")
		})
	}
}
// TestComputeBusinessHealth exercises the business sub-score in isolation,
// pinning its behavior at the documented boundaries (SLA 99.5%/95%, error
// rate 0.5%, latency 1000ms) and checking that the worse of request vs.
// upstream error rate dominates.
func TestComputeBusinessHealth(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name     string
		overview *OpsDashboardOverview
		wantMin  float64 // inclusive lower bound for the expected score band
		wantMax  float64 // inclusive upper bound for the expected score band
	}{
		{
			name: "perfect metrics",
			overview: &OpsDashboardOverview{
				SLA:               1.0,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			// Exactly at the 99.5% threshold: still a full SLA score.
			name: "SLA boundary 99.5%",
			overview: &OpsDashboardOverview{
				SLA:               0.995,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			// At 95% the SLA component bottoms out; the other components keep
			// the total in the 50s.
			name: "SLA boundary 95%",
			overview: &OpsDashboardOverview{
				SLA:               0.95,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
			},
			wantMin: 50,
			wantMax: 60,
		},
		{
			name: "error rate boundary 0.5%",
			overview: &OpsDashboardOverview{
				SLA:               0.995,
				ErrorRate:         0.005,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(500)},
			},
			wantMin: 95,
			wantMax: 100,
		},
		{
			name: "latency boundary 1000ms",
			overview: &OpsDashboardOverview{
				SLA:               0.995,
				ErrorRate:         0,
				UpstreamErrorRate: 0,
				Duration:          OpsPercentiles{P99: intPtr(1000)},
			},
			wantMin: 95,
			wantMax: 100,
		},
		{
			// Upstream 3% outweighs the 0.1% request error rate.
			name: "upstream error dominates",
			overview: &OpsDashboardOverview{
				SLA:               0.995,
				ErrorRate:         0.001,
				UpstreamErrorRate: 0.03,
				Duration:          OpsPercentiles{P99: intPtr(500)},
			},
			wantMin: 75,
			wantMax: 90,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := computeBusinessHealth(tt.overview)
			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
		})
	}
}
// TestComputeInfraHealth exercises the infrastructure-health sub-score:
// DB/Redis availability, CPU pressure, and failed background jobs should each
// reduce the score by a bounded amount. Bounds are inclusive, and every
// result must also stay clamped within [0, 100].
func TestComputeInfraHealth(t *testing.T) {
	t.Parallel()
	now := time.Now().UTC()
	tests := []struct {
		name     string
		overview *OpsDashboardOverview
		// Inclusive bounds for the expected score.
		wantMin float64
		wantMax float64
	}{
		{
			name: "all infrastructure healthy",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			// A database outage is the most severe single penalty.
			name: "DB down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(false),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 50,
			wantMax: 70,
		},
		{
			// Redis is less critical than the database.
			name: "Redis down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(false),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 80,
			wantMax: 95,
		},
		{
			name: "CPU at 90%",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(90),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 85,
			wantMax: 95,
		},
		{
			// A job that errored recently (LastErrorAt == now) should cost points.
			name: "failed background job",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
				JobHeartbeats: []*OpsJobHeartbeat{
					{
						JobName:     "test-job",
						LastErrorAt: &now,
					},
				},
			},
			wantMin: 70,
			wantMax: 90,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := computeInfraHealth(now, tt.overview)
			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
			// Invariant: the score is always clamped to [0, 100].
			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
		})
	}
}
func timePtr(v time.Time) *time.Time { return &v }
// stringPtr returns a pointer to a copy of v, for populating optional fields.
func stringPtr(v string) *string {
	// v is already a copy of the caller's value, so its address is safe to share.
	return &v
}

View File

@@ -0,0 +1,26 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetLatencyHistogram returns the latency distribution for the requested
// dashboard window. It verifies that monitoring is enabled, that the ops
// repository is wired, and that the filter carries a complete, ordered time
// range before delegating to the repository.
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	// Validate preconditions in order of severity; the first violation wins.
	switch {
	case s.opsRepo == nil:
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	case filter == nil:
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	case filter.StartTime.IsZero() || filter.EndTime.IsZero():
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	case filter.StartTime.After(filter.EndTime):
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetLatencyHistogram(ctx, filter)
}

View File

@@ -0,0 +1,920 @@
package service
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"math"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/mem"
)
const (
	// opsMetricsCollectorJobName identifies this job in ops job heartbeats.
	opsMetricsCollectorJobName = "ops_metrics_collector"
	// Collection interval bounds; the configured value is clamped into this range.
	opsMetricsCollectorMinInterval = 60 * time.Second
	opsMetricsCollectorMaxInterval = 1 * time.Hour
	// Budget for one full collect+persist cycle.
	opsMetricsCollectorTimeout = 10 * time.Second
	// Redis leader-lock key/TTL; TTL exceeds the collection timeout so the
	// lock cannot expire mid-cycle.
	opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
	opsMetricsCollectorLeaderLockTTL = 90 * time.Second
	// Budget for heartbeat writes, which run on their own context.
	opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
	bytesPerMB                          = 1024 * 1024
)

// opsMetricsCollectorAdvisoryLockID is the DB advisory-lock key used as a
// fallback leader lock when Redis is unavailable.
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
// OpsMetricsCollector periodically samples request/error/system metrics and
// persists one aggregated row per minute via the ops repository. Exactly one
// instance collects per cycle, coordinated through a Redis (or DB advisory)
// leader lock.
type OpsMetricsCollector struct {
	opsRepo            OpsRepository
	settingRepo        SettingRepository
	cfg                *config.Config
	accountRepo        AccountRepository
	concurrencyService *ConcurrencyService
	db                 *sql.DB
	redisClient        *redis.Client
	// instanceID is the unique owner token written into the leader lock.
	instanceID string
	// Previous cgroup CPU counter sample, used to compute usage deltas.
	// Only touched by the collector goroutine.
	lastCgroupCPUUsageNanos uint64
	lastCgroupCPUSampleAt   time.Time
	stopCh                  chan struct{}
	startOnce               sync.Once
	stopOnce                sync.Once
	// Throttles the "leader lock held elsewhere" log line.
	skipLogMu sync.Mutex
	skipLogAt time.Time
}
// NewOpsMetricsCollector wires a collector. Dependencies may be nil; the
// collector degrades gracefully (skipping the affected samples) instead of
// failing hard. Each instance gets a unique ID used as its leader-lock owner
// token.
func NewOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	collector := &OpsMetricsCollector{
		opsRepo:            opsRepo,
		settingRepo:        settingRepo,
		cfg:                cfg,
		accountRepo:        accountRepo,
		concurrencyService: concurrencyService,
		db:                 db,
		redisClient:        redisClient,
		instanceID:         uuid.NewString(),
	}
	return collector
}
// Start launches the background collection loop. It is a no-op on a nil
// receiver and is idempotent: only the first call spawns the goroutine.
func (c *OpsMetricsCollector) Start() {
	if c == nil {
		return
	}
	c.startOnce.Do(func() {
		// Lazily create the stop channel so Stop can close it later.
		if c.stopCh == nil {
			c.stopCh = make(chan struct{})
		}
		go c.run()
	})
}
// Stop signals the collection loop to exit. It is a no-op on a nil receiver
// and safe to call multiple times (the channel is closed at most once).
func (c *OpsMetricsCollector) Stop() {
	if c == nil {
		return
	}
	c.stopOnce.Do(func() {
		if c.stopCh != nil {
			close(c.stopCh)
		}
	})
}
// run is the collector's main loop: an immediate first collection, then one
// collection per interval. The interval is re-read before each sleep so a
// settings change takes effect without a restart. The loop exits when Stop
// closes stopCh.
func (c *OpsMetricsCollector) run() {
	// First run immediately so the dashboard has data soon after startup.
	c.collectOnce()
	for {
		interval := c.getInterval()
		timer := time.NewTimer(interval)
		select {
		case <-timer.C:
			c.collectOnce()
		case <-c.stopCh:
			timer.Stop()
			return
		}
	}
}
// getInterval resolves the collection interval from settings, clamped to
// [opsMetricsCollectorMinInterval, opsMetricsCollectorMaxInterval]. Any
// lookup, empty-value, or parse failure falls back to the minimum interval.
func (c *OpsMetricsCollector) getInterval() time.Duration {
	fallback := opsMetricsCollectorMinInterval
	if c.settingRepo == nil {
		return fallback
	}
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
	if err != nil {
		return fallback
	}
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return fallback
	}
	seconds, err := strconv.Atoi(trimmed)
	if err != nil {
		return fallback
	}
	minSeconds := int(opsMetricsCollectorMinInterval.Seconds())
	maxSeconds := int(opsMetricsCollectorMaxInterval.Seconds())
	switch {
	case seconds < minSeconds:
		seconds = minSeconds
	case seconds > maxSeconds:
		seconds = maxSeconds
	}
	return time.Duration(seconds) * time.Second
}
// collectOnce performs a single guarded collection cycle: it checks the
// static (config) and dynamic (settings) kill-switches, takes the
// cross-instance leader lock, collects and persists one metrics window, and
// records a job heartbeat with the run duration and outcome. Heartbeat writes
// use fresh short-lived contexts so they can still be recorded when the main
// collection context has already expired.
func (c *OpsMetricsCollector) collectOnce() {
	if c == nil {
		return
	}
	if c.cfg != nil && !c.cfg.Ops.Enabled {
		return
	}
	// Both the repository and raw DB handle are required to do useful work.
	if c.opsRepo == nil {
		return
	}
	if c.db == nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
	defer cancel()
	if !c.isMonitoringEnabled(ctx) {
		return
	}
	// Ensure only one instance collects this cycle.
	release, ok := c.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}
	startedAt := time.Now().UTC()
	err := c.collectAndPersist(ctx)
	finishedAt := time.Now().UTC()
	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	dur := durationMs
	runAt := startedAt
	if err != nil {
		// Record the failure on the heartbeat; the message is truncated to
		// a bounded size before persisting. Heartbeat errors are ignored —
		// the collector must never fail because of its own bookkeeping.
		msg := truncateString(err.Error(), 2048)
		errAt := finishedAt
		hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
		defer hbCancel()
		_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
			JobName:        opsMetricsCollectorJobName,
			LastRunAt:      &runAt,
			LastErrorAt:    &errAt,
			LastError:      &msg,
			LastDurationMs: &dur,
		})
		log.Printf("[OpsMetricsCollector] collect failed: %v", err)
		return
	}
	// Success heartbeat.
	successAt := finishedAt
	hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
	defer hbCancel()
	_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsMetricsCollectorJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &dur,
	})
}
// isMonitoringEnabled reports whether ops monitoring is currently switched
// on. The config flag is authoritative when disabled; otherwise the dynamic
// setting is consulted. Lookup failures (including a missing setting) fail
// open so the collector never becomes a hard dependency; only an explicit
// "false"/"0"/"off"/"disabled" value turns monitoring off.
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
	if c == nil {
		return false
	}
	if c.cfg != nil && !c.cfg.Ops.Enabled {
		return false
	}
	if c.settingRepo == nil {
		return true
	}
	if ctx == nil {
		ctx = context.Background()
	}
	value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		// Fail-open on any error, including "setting not found".
		return true
	}
	normalized := strings.ToLower(strings.TrimSpace(value))
	switch normalized {
	case "false", "0", "off", "disabled":
		return false
	}
	return true
}
// collectAndPersist gathers one minute worth of usage, error and system
// metrics and stores a single aggregated row via the ops repository.
//
// The window is the last fully elapsed minute [windowEnd-1m, windowEnd),
// aligned on minute boundaries. System stats and the concurrency queue depth
// are best-effort (recorded as NULL when unavailable); the usage/error SQL
// aggregations are mandatory and abort the run on failure.
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
	if ctx == nil {
		ctx = context.Background()
	}
	// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
	now := time.Now().UTC()
	windowEnd := now.Truncate(time.Minute)
	windowStart := windowEnd.Add(-1 * time.Minute)
	sys, err := c.collectSystemStats(ctx)
	if err != nil {
		// Continue; system stats are best-effort.
		log.Printf("[OpsMetricsCollector] system stats error: %v", err)
	}
	// Infrastructure health probes and pool statistics.
	dbOK := c.checkDB(ctx)
	redisOK := c.checkRedis(ctx)
	active, idle := c.dbPoolStats()
	redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
	// Aggregations over the one-minute window.
	successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage counts: %w", err)
	}
	duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage latency: %w", err)
	}
	errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query error counts: %w", err)
	}
	// Derive QPS/TPS from the window length (defensively defaulting to 60s).
	windowSeconds := windowEnd.Sub(windowStart).Seconds()
	if windowSeconds <= 0 {
		windowSeconds = 60
	}
	requestTotal := successCount + errorTotal
	qps := float64(requestTotal) / windowSeconds
	tps := float64(tokenConsumed) / windowSeconds
	goroutines := runtime.NumGoroutine()
	concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)
	input := &OpsInsertSystemMetricsInput{
		CreatedAt:                    windowEnd,
		WindowMinutes:                1,
		SuccessCount:                 successCount,
		ErrorCountTotal:              errorTotal,
		BusinessLimitedCount:         businessLimited,
		ErrorCountSLA:                errorSLA,
		UpstreamErrorCountExcl429529: upstreamExcl,
		Upstream429Count:             upstream429,
		Upstream529Count:             upstream529,
		TokenConsumed:                tokenConsumed,
		QPS:                          float64Ptr(roundTo1DP(qps)),
		TPS:                          float64Ptr(roundTo1DP(tps)),
		DurationP50Ms:                duration.p50,
		DurationP90Ms:                duration.p90,
		DurationP95Ms:                duration.p95,
		DurationP99Ms:                duration.p99,
		DurationAvgMs:                duration.avg,
		DurationMaxMs:                duration.max,
		TTFTP50Ms:                    ttft.p50,
		TTFTP90Ms:                    ttft.p90,
		TTFTP95Ms:                    ttft.p95,
		TTFTP99Ms:                    ttft.p99,
		TTFTAvgMs:                    ttft.avg,
		TTFTMaxMs:                    ttft.max,
		CPUUsagePercent:              sys.cpuUsagePercent,
		MemoryUsedMB:                 sys.memoryUsedMB,
		MemoryTotalMB:                sys.memoryTotalMB,
		MemoryUsagePercent:           sys.memoryUsagePercent,
		DBOK:                         boolPtr(dbOK),
		RedisOK:                      boolPtr(redisOK),
		// Redis pool stats are only recorded when a snapshot was obtained.
		RedisConnTotal: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisTotal)
		}(),
		RedisConnIdle: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisIdle)
		}(),
		DBConnActive:          intPtr(active),
		DBConnIdle:            intPtr(idle),
		GoroutineCount:        intPtr(goroutines),
		ConcurrencyQueueDepth: concurrencyQueueDepth,
	}
	return c.opsRepo.InsertSystemMetrics(ctx, input)
}
// collectConcurrencyQueueDepth samples the total number of requests queued
// behind per-account concurrency limits across all schedulable accounts.
// It returns nil when sampling is unavailable (missing dependencies or
// lookup errors) so the metric is stored as NULL rather than a misleading 0,
// and a pointer to 0 when there are genuinely no accounts/waiters.
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
	if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
		return nil
	}
	if parentCtx == nil {
		parentCtx = context.Background()
	}
	// Best-effort: never let concurrency sampling break the metrics collector.
	ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
	defer cancel()
	accounts, err := c.accountRepo.ListSchedulable(ctx)
	if err != nil {
		return nil
	}
	if len(accounts) == 0 {
		zero := 0
		return &zero
	}
	// Build the batch of accounts to query, skipping invalid IDs and
	// clamping negative concurrency limits to zero.
	batch := make([]AccountWithConcurrency, 0, len(accounts))
	for _, acc := range accounts {
		if acc.ID <= 0 {
			continue
		}
		maxConc := acc.Concurrency
		if maxConc < 0 {
			maxConc = 0
		}
		batch = append(batch, AccountWithConcurrency{
			ID:             acc.ID,
			MaxConcurrency: maxConc,
		})
	}
	if len(batch) == 0 {
		zero := 0
		return &zero
	}
	loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
	if err != nil {
		return nil
	}
	// Sum waiting counts across accounts.
	var total int64
	for _, info := range loadMap {
		if info == nil || info.WaitingCount <= 0 {
			continue
		}
		total += int64(info.WaitingCount)
	}
	if total < 0 {
		total = 0
	}
	// Clamp to the platform's max int before converting.
	maxInt := int64(^uint(0) >> 1)
	if total > maxInt {
		total = maxInt
	}
	v := int(total)
	return &v
}
// opsCollectedPercentiles holds one latency distribution sampled from SQL.
// Fields are nil when the window contained no rows (SQL NULL aggregates).
type opsCollectedPercentiles struct {
	p50 *int
	p90 *int
	p95 *int
	p99 *int
	avg *float64
	max *int
}
// queryUsageCounts returns the number of usage_logs rows (labelled
// success_count) and the total tokens consumed (input + output + cache
// creation/read) recorded in [start, end). A NULL token sum (empty window)
// is reported as 0.
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
	q := `
SELECT
COALESCE(COUNT(*), 0) AS success_count,
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2`
	var tokens sql.NullInt64
	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
		return 0, 0, err
	}
	if tokens.Valid {
		tokenConsumed = tokens.Int64
	}
	return successCount, tokenConsumed, nil
}
// queryUsageLatency computes total-duration and time-to-first-token latency
// distributions (p50/p90/p95/p99, average, max) over usage_logs rows in
// [start, end), skipping rows where the respective column is NULL. The
// percentile_cont aggregate is PostgreSQL-specific. Empty windows yield
// all-nil percentile structs.
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
	// Total request duration distribution.
	{
		q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
AVG(duration_ms) AS avg_ms,
MAX(duration_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND duration_ms IS NOT NULL`
		var p50, p90, p95, p99 sql.NullFloat64
		var avg sql.NullFloat64
		var max sql.NullInt64
		if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
			return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
		}
		duration.p50 = floatToIntPtr(p50)
		duration.p90 = floatToIntPtr(p90)
		duration.p95 = floatToIntPtr(p95)
		duration.p99 = floatToIntPtr(p99)
		if avg.Valid {
			v := roundTo1DP(avg.Float64)
			duration.avg = &v
		}
		if max.Valid {
			v := int(max.Int64)
			duration.max = &v
		}
	}
	// Time-to-first-token distribution (streaming requests).
	{
		q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
AVG(first_token_ms) AS avg_ms,
MAX(first_token_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND first_token_ms IS NOT NULL`
		var p50, p90, p95, p99 sql.NullFloat64
		var avg sql.NullFloat64
		var max sql.NullInt64
		if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
			return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
		}
		ttft.p50 = floatToIntPtr(p50)
		ttft.p90 = floatToIntPtr(p90)
		ttft.p95 = floatToIntPtr(p95)
		ttft.p99 = floatToIntPtr(p99)
		if avg.Valid {
			v := roundTo1DP(avg.Float64)
			ttft.avg = &v
		}
		if max.Valid {
			v := int(max.Int64)
			ttft.max = &v
		}
	}
	return duration, ttft, nil
}
// queryErrorCounts aggregates ops_error_logs rows in [start, end) into the
// counters used for SLA math:
//   - errorTotal: all rows with status code >= 400
//   - businessLimited: >=400 rows flagged as business limiting (excluded from SLA)
//   - errorSLA: >=400 rows that do count against SLA
//   - upstream*: provider-owned, non-business-limited errors split into 429,
//     529, and everything else (status falls back from upstream_status_code
//     to status_code).
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
	errorTotal int64,
	businessLimited int64,
	errorSLA int64,
	upstreamExcl429529 int64,
	upstream429 int64,
	upstream529 int64,
	err error,
) {
	q := `
SELECT
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2`
	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
		&errorTotal,
		&businessLimited,
		&errorSLA,
		&upstreamExcl429529,
		&upstream429,
		&upstream529,
	); err != nil {
		return 0, 0, 0, 0, 0, 0, err
	}
	return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
// opsCollectedSystemStats holds best-effort host/container resource
// readings; a nil field means the value could not be determined.
type opsCollectedSystemStats struct {
	cpuUsagePercent    *float64
	memoryUsedMB       *int64
	memoryTotalMB      *int64
	memoryUsagePercent *float64
}
// collectSystemStats samples CPU and memory usage, preferring cgroup
// (container) readings and falling back to host-level gopsutil readings for
// any value the cgroup files could not provide. All values are best-effort;
// missing readings are simply left nil. The error return is currently always
// nil but kept for interface stability.
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
	out := &opsCollectedSystemStats{}
	if ctx == nil {
		ctx = context.Background()
	}
	sampleAt := time.Now().UTC()
	// Prefer cgroup (container) metrics when available.
	if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
		out.cpuUsagePercent = cpuPct
	}
	cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
	if cgroupOK {
		usedMB := int64(cgroupUsed / bytesPerMB)
		out.memoryUsedMB = &usedMB
		// Total may be 0 when the cgroup has no memory limit.
		if cgroupTotal > 0 {
			totalMB := int64(cgroupTotal / bytesPerMB)
			out.memoryTotalMB = &totalMB
			pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
			out.memoryUsagePercent = &pct
		}
	}
	// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
	if out.cpuUsagePercent == nil {
		if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
			v := roundTo1DP(cpuPercents[0])
			out.cpuUsagePercent = &v
		}
	}
	// If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host.
	if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
		if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
			if out.memoryUsedMB == nil {
				usedMB := int64(vm.Used / bytesPerMB)
				out.memoryUsedMB = &usedMB
			}
			if out.memoryTotalMB == nil {
				totalMB := int64(vm.Total / bytesPerMB)
				out.memoryTotalMB = &totalMB
			}
			if out.memoryUsagePercent == nil {
				// Mixing cgroup used with host total is a deliberate
				// approximation; otherwise use the host's own percentage.
				if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
					pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
					out.memoryUsagePercent = &pct
				} else {
					pct := roundTo1DP(vm.UsedPercent)
					out.memoryUsagePercent = &pct
				}
			}
		}
	}
	return out, nil
}
// tryCgroupCPUPercent derives CPU usage as a percentage of the cgroup CPU
// limit from the delta between the current and previous cumulative usage
// counters. It returns nil on the first call (baseline sample only), when no
// cgroup counter or CPU limit is available, or on counter resets — letting
// the caller fall back to host-level sampling. The result is clamped to
// [0, 100]. Not safe for concurrent use; only the collector loop calls it.
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
	usageNanos, ok := readCgroupCPUUsageNanos()
	if !ok {
		return nil
	}
	// Initialize baseline sample.
	if c.lastCgroupCPUSampleAt.IsZero() {
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}
	elapsed := now.Sub(c.lastCgroupCPUSampleAt)
	if elapsed <= 0 {
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}
	// Advance the stored sample before any early return below.
	prev := c.lastCgroupCPUUsageNanos
	c.lastCgroupCPUUsageNanos = usageNanos
	c.lastCgroupCPUSampleAt = now
	if usageNanos < prev {
		// Counter reset (container restarted).
		return nil
	}
	deltaUsageSec := float64(usageNanos-prev) / 1e9
	elapsedSec := elapsed.Seconds()
	if elapsedSec <= 0 {
		return nil
	}
	cores := readCgroupCPULimitCores()
	if cores <= 0 {
		// Can't reliably normalize; skip and fall back to gopsutil.
		return nil
	}
	pct := (deltaUsageSec / (elapsedSec * cores)) * 100
	if pct < 0 {
		pct = 0
	}
	// Clamp to avoid noise/jitter showing impossible values.
	if pct > 100 {
		pct = 100
	}
	v := roundTo1DP(pct)
	return &v
}
// readCgroupMemoryBytes reads the container's memory usage from cgroup v2,
// falling back to cgroup v1. totalBytes is 0 when the cgroup is unlimited
// (v2 memory.max == "max", or a v1 sentinel "huge" limit); ok reports
// whether any usage reading was found at all.
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
	// cgroup v2 (most common in modern containers)
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
		usedBytes = used
		rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
		if err == nil {
			s := strings.TrimSpace(string(rawMax))
			if s != "" && s != "max" {
				if v, err := strconv.ParseUint(s, 10, 64); err == nil {
					totalBytes = v
				}
			}
		}
		return usedBytes, totalBytes, true
	}
	// cgroup v1 fallback
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
		usedBytes = used
		if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
			// Some environments report a very large number when unlimited.
			if limit > 0 && limit < (1<<60) {
				totalBytes = limit
			}
		}
		return usedBytes, totalBytes, true
	}
	return 0, 0, false
}
// readCgroupCPUUsageNanos returns the cgroup's cumulative CPU time in
// nanoseconds, from cgroup v2 cpu.stat (usage_usec, converted) or cgroup v1
// cpuacct.usage. ok is false when neither source is readable.
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
	// cgroup v2: cpu.stat has usage_usec
	if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
		lines := strings.Split(string(raw), "\n")
		for _, line := range lines {
			fields := strings.Fields(line)
			if len(fields) != 2 {
				continue
			}
			if fields[0] != "usage_usec" {
				continue
			}
			v, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				continue
			}
			// usage_usec is microseconds; convert to nanoseconds.
			return v * 1000, true
		}
	}
	// cgroup v1: cpuacct.usage is in nanoseconds
	if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
		return v, true
	}
	return 0, false
}
// readCgroupCPULimitCores returns the cgroup CPU limit expressed in cores
// (quota / period), checking cgroup v2 then v1. It returns 0 when no limit
// is configured or the files are unreadable.
func readCgroupCPULimitCores() float64 {
	// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
	if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
		fields := strings.Fields(string(raw))
		if len(fields) >= 2 && fields[0] != "max" {
			quota, err1 := strconv.ParseFloat(fields[0], 64)
			period, err2 := strconv.ParseFloat(fields[1], 64)
			if err1 == nil && err2 == nil && quota > 0 && period > 0 {
				return quota / period
			}
		}
	}
	// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
	quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
	period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
	if okQuota && okPeriod && quota > 0 && period > 0 {
		return float64(quota) / float64(period)
	}
	return 0
}
func readUintFile(path string) (uint64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func readIntFile(path string) (int64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
// checkDB probes database liveness with a trivial query. It returns false
// when the collector or handle is nil, or when the probe fails.
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
	if c == nil || c.db == nil {
		return false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	var probe int
	err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&probe)
	return err == nil && probe == 1
}
// checkRedis probes Redis liveness with PING. It returns false when the
// collector or client is nil, or when the ping fails.
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
	if c == nil || c.redisClient == nil {
		return false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	err := c.redisClient.Ping(ctx).Err()
	return err == nil
}
// redisPoolStats returns the Redis connection-pool totals. ok is false when
// no client is configured or no stats snapshot is available.
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
	if c == nil || c.redisClient == nil {
		return 0, 0, false
	}
	snapshot := c.redisClient.PoolStats()
	if snapshot == nil {
		return 0, 0, false
	}
	total = int(snapshot.TotalConns)
	idle = int(snapshot.IdleConns)
	return total, idle, true
}
// dbPoolStats returns the database pool's in-use and idle connection counts,
// or zeros when no handle is configured.
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
	if c == nil || c.db == nil {
		return 0, 0
	}
	poolStats := c.db.Stats()
	return poolStats.InUse, poolStats.Idle
}
// opsMetricsCollectorReleaseScript releases the Redis leader lock only when
// it is still owned by this instance (atomic compare-and-delete), so a slow
// cycle cannot delete a lock that has already expired and been re-acquired
// by another instance.
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// tryAcquireLeaderLock tries to become the collection leader for this cycle.
// It returns (release, true) when this instance may collect; release may be
// nil (no Redis configured — collection proceeds unguarded, or the DB
// advisory-lock path supplied its own releaser). When the lock is held by
// another instance it returns (nil, false) and throttles the skip log.
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	// Without Redis there is nothing to coordinate on; allow collection.
	if c == nil || c.redisClient == nil {
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}
	ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
	if err != nil {
		// Prefer fail-closed to avoid stampeding the database when Redis is flaky.
		// Fallback to a DB advisory lock when Redis is present but unavailable.
		release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
		if !ok {
			c.maybeLogSkip()
			return nil, false
		}
		return release, true
	}
	if !ok {
		// Another instance holds the Redis lock.
		c.maybeLogSkip()
		return nil, false
	}
	// Release deletes the lock only if we still own it; uses a fresh context
	// so release works even after the collection context expired.
	release := func() {
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
	}
	return release, true
}
// maybeLogSkip logs the "leader lock held elsewhere" message at most once
// per minute so skipped cycles do not flood the logs.
func (c *OpsMetricsCollector) maybeLogSkip() {
	c.skipLogMu.Lock()
	defer c.skipLogMu.Unlock()
	if !c.skipLogAt.IsZero() && time.Since(c.skipLogAt) < time.Minute {
		return
	}
	c.skipLogAt = time.Now()
	log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
}
func floatToIntPtr(v sql.NullFloat64) *int {
if !v.Valid {
return nil
}
n := int(math.Round(v.Float64))
return &n
}
// roundTo1DP rounds v to one decimal place (half away from zero, per
// math.Round).
func roundTo1DP(v float64) float64 {
	const scale = 10
	return math.Round(v*scale) / scale
}
// truncateString returns s shortened to at most max bytes. When the cut
// lands inside a multi-byte UTF-8 rune, the trailing partial rune is dropped
// (at most utf8.UTFMax-1 bytes) so the result never ends mid-rune. max <= 0
// yields the empty string.
//
// Unlike the previous full-string validity rescan — which was quadratic and
// could strip a long message down to almost nothing when an invalid byte
// appeared early in s — this only ever trims the tail, so messages carrying
// stray binary bytes are preserved. For valid UTF-8 input the result is
// identical to the old behavior.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Drop trailing bytes that form an incomplete rune. DecodeLastRuneInString
	// reports (RuneError, 1) for a dangling continuation/lead byte; a rune is
	// at most utf8.UTFMax bytes, so the loop is bounded.
	for i := 0; i < utf8.UTFMax-1 && len(cut) > 0; i++ {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size != 1 {
			break
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
// boolPtr returns a pointer to a copy of v.
func boolPtr(v bool) *bool {
	// The parameter is already a copy, so its address is safe to return.
	return &v
}
// intPtr returns a pointer to a copy of v.
func intPtr(v int) *int {
	// The parameter is already a copy, so its address is safe to return.
	return &v
}
// float64Ptr returns a pointer to a copy of v.
func float64Ptr(v float64) *float64 {
	// The parameter is already a copy, so its address is safe to return.
	return &v
}

View File

@@ -0,0 +1,124 @@
package service
import "time"
// OpsErrorLog is one request-failure record as rendered in the ops error
// list (summary fields only; OpsErrorLogDetail carries the full record).
type OpsErrorLog struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`
	// Phase/Type/Severity classify where and how the request failed.
	Phase      string `json:"phase"`
	Type       string `json:"type"`
	Severity   string `json:"severity"`
	StatusCode int    `json:"status_code"`
	Platform   string `json:"platform"`
	Model      string `json:"model"`
	LatencyMs  *int   `json:"latency_ms"`
	// Correlation identifiers for tracing the request across systems.
	ClientRequestID string `json:"client_request_id"`
	RequestID       string `json:"request_id"`
	Message         string `json:"message"`
	// Optional ownership/context references (nil when unknown).
	UserID      *int64  `json:"user_id"`
	APIKeyID    *int64  `json:"api_key_id"`
	AccountID   *int64  `json:"account_id"`
	GroupID     *int64  `json:"group_id"`
	ClientIP    *string `json:"client_ip"`
	RequestPath string  `json:"request_path"`
	Stream      bool    `json:"stream"`
}
// OpsErrorLogDetail extends OpsErrorLog with the full payloads, upstream
// context, per-phase timings and retry context for the detail view.
type OpsErrorLogDetail struct {
	OpsErrorLog
	ErrorBody string `json:"error_body"`
	UserAgent string `json:"user_agent"`
	// Upstream context (optional)
	UpstreamStatusCode   *int   `json:"upstream_status_code,omitempty"`
	UpstreamErrorMessage string `json:"upstream_error_message,omitempty"`
	UpstreamErrorDetail  string `json:"upstream_error_detail,omitempty"`
	UpstreamErrors       string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing
	// Timings (optional)
	AuthLatencyMs      *int64 `json:"auth_latency_ms"`
	RoutingLatencyMs   *int64 `json:"routing_latency_ms"`
	UpstreamLatencyMs  *int64 `json:"upstream_latency_ms"`
	ResponseLatencyMs  *int64 `json:"response_latency_ms"`
	TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
	// Retry context
	RequestBody          string `json:"request_body"`
	RequestBodyTruncated bool   `json:"request_body_truncated"`
	RequestBodyBytes     *int   `json:"request_body_bytes"`
	RequestHeaders       string `json:"request_headers,omitempty"`
	// vNext metric semantics
	IsBusinessLimited bool `json:"is_business_limited"`
}
// OpsErrorLogFilter narrows an error-log listing. Zero-valued fields are
// presumably treated as "no filter" by the repository implementation —
// confirm against the repository query builder.
type OpsErrorLogFilter struct {
	StartTime   *time.Time
	EndTime     *time.Time
	Platform    string
	GroupID     *int64
	AccountID   *int64
	StatusCodes []int
	Phase       string
	// Query is a free-text search term.
	Query string
	// Pagination.
	Page     int
	PageSize int
}
// OpsErrorLogList is one page of error logs plus pagination metadata.
type OpsErrorLogList struct {
	Errors   []*OpsErrorLog `json:"errors"`
	Total    int            `json:"total"`
	Page     int            `json:"page"`
	PageSize int            `json:"page_size"`
}
// OpsRetryAttempt records one operator-initiated retry of a failed request.
type OpsRetryAttempt struct {
	ID                int64     `json:"id"`
	CreatedAt         time.Time `json:"created_at"`
	RequestedByUserID int64     `json:"requested_by_user_id"`
	// SourceErrorID links back to the ops error log that was retried.
	SourceErrorID   int64  `json:"source_error_id"`
	Mode            string `json:"mode"`
	PinnedAccountID *int64 `json:"pinned_account_id"`
	Status          string `json:"status"`
	// Execution timing; nil until the attempt starts/finishes.
	StartedAt  *time.Time `json:"started_at"`
	FinishedAt *time.Time `json:"finished_at"`
	DurationMs *int64     `json:"duration_ms"`
	// Correlation with the retry's own outcome.
	ResultRequestID *string `json:"result_request_id"`
	ResultErrorID   *int64  `json:"result_error_id"`
	ErrorMessage    *string `json:"error_message"`
}
// OpsRetryResult is the synchronous outcome of a retry attempt as returned
// to the operator UI.
type OpsRetryResult struct {
	AttemptID       int64  `json:"attempt_id"`
	Mode            string `json:"mode"`
	Status          string `json:"status"`
	PinnedAccountID *int64 `json:"pinned_account_id"`
	UsedAccountID   *int64 `json:"used_account_id"`
	// Upstream response summary.
	HTTPStatusCode    int    `json:"http_status_code"`
	UpstreamRequestID string `json:"upstream_request_id"`
	// ResponsePreview is a bounded excerpt of the upstream response body.
	ResponsePreview   string    `json:"response_preview"`
	ResponseTruncated bool      `json:"response_truncated"`
	ErrorMessage      string    `json:"error_message"`
	StartedAt         time.Time `json:"started_at"`
	FinishedAt        time.Time `json:"finished_at"`
	DurationMs        int64     `json:"duration_ms"`
}

View File

@@ -0,0 +1,242 @@
package service
import (
"context"
"time"
)
// OpsRepository is the persistence boundary for the ops/monitoring domain:
// error logs, manual retries, dashboard aggregates, system metrics, job
// heartbeats, alerting, and pre-aggregation bookkeeping.
type OpsRepository interface {
	// Error logs and operator-initiated retries.
	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
	// Lightweight window stats (for realtime WS / quick sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
	// Dashboard aggregates.
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
	// System metrics and background-job heartbeats.
	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
	// Alerts (rules + events)
	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error
	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
	// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
	UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
// OpsInsertErrorLogInput is the write model for one ops error-log row.
// Pointer fields are optional and persist as NULL when nil.
type OpsInsertErrorLogInput struct {
	// Correlation identifiers.
	RequestID       string
	ClientRequestID string
	// Optional ownership/context references.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string
	// Request context.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string
	// Error classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool
	ErrorMessage      string
	ErrorBody         string
	ErrorSource       string
	ErrorOwner        string
	// Upstream error context.
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string
	// UpstreamErrors captures all upstream error attempts observed during handling this request.
	// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent
	// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
	// It is set by OpsService.RecordError before persisting.
	UpstreamErrorsJSON *string
	// Timings.
	DurationMs         *int
	TimeToFirstTokenMs *int64
	// Retry context.
	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string
	IsRetryable          bool
	RetryCount           int
	CreatedAt            time.Time
}
// OpsInsertRetryAttemptInput describes a new manual retry attempt row,
// recorded before the retry is actually executed.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	PinnedAccountID   *int64
	// running|queued etc.
	Status    string
	StartedAt time.Time
}

// OpsUpdateRetryAttemptInput finalizes a retry attempt row once execution
// completes (successfully or not).
type OpsUpdateRetryAttemptInput struct {
	ID int64
	// succeeded|failed
	Status     string
	FinishedAt time.Time
	DurationMs int64
	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64
	ErrorMessage    *string
}
// OpsInsertSystemMetricsInput is one aggregated metrics row covering a fixed
// time window, optionally scoped by platform and/or group (nil = overall).
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int
	// Optional scope; nil means "all platforms" / "all groups".
	Platform *string
	GroupID  *int64
	// Request outcome counters for the window.
	SuccessCount                 int64
	ErrorCountTotal              int64
	BusinessLimitedCount         int64
	ErrorCountSLA                int64
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64
	TokenConsumed                int64
	// Throughput figures; nil when not computed.
	QPS *float64
	TPS *float64
	// Request duration distribution in milliseconds.
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int
	// Time-to-first-token distribution in milliseconds.
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int
	// Host/runtime health snapshot captured alongside the window.
	CPUUsagePercent       *float64
	MemoryUsedMB          *int64
	MemoryTotalMB         *int64
	MemoryUsagePercent    *float64
	DBOK                  *bool
	RedisOK               *bool
	RedisConnTotal        *int
	RedisConnIdle         *int
	DBConnActive          *int
	DBConnIdle            *int
	DBConnWaiting         *int
	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
// OpsSystemMetricsSnapshot is the API-facing view of one stored system
// metrics row, restricted to resource/health columns.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`
	// Host resource usage; nil when the value was not collected.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`
	// Dependency health probes.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`
	// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`
	// Connection pool and runtime counters.
	RedisConnTotal        *int `json:"redis_conn_total"`
	RedisConnIdle         *int `json:"redis_conn_idle"`
	DBConnActive          *int `json:"db_conn_active"`
	DBConnIdle            *int `json:"db_conn_idle"`
	DBConnWaiting         *int `json:"db_conn_waiting"`
	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
// OpsUpsertJobHeartbeatInput records one background-job heartbeat update;
// nil pointer fields leave the corresponding stored column untouched.
type OpsUpsertJobHeartbeatInput struct {
	JobName        string
	LastRunAt      *time.Time
	LastSuccessAt  *time.Time
	LastErrorAt    *time.Time
	LastError      *string
	LastDurationMs *int64
}

// OpsJobHeartbeat is the stored heartbeat state for a background job.
type OpsJobHeartbeat struct {
	JobName        string     `json:"job_name"`
	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`
	UpdatedAt      time.Time  `json:"updated_at"`
}

// OpsWindowStats summarizes request outcomes over the StartTime–EndTime window.
type OpsWindowStats struct {
	StartTime       time.Time `json:"start_time"`
	EndTime         time.Time `json:"end_time"`
	SuccessCount    int64     `json:"success_count"`
	ErrorCountTotal int64     `json:"error_count_total"`
	TokenConsumed   int64     `json:"token_consumed"`
}

View File

@@ -0,0 +1,40 @@
package service
import (
"errors"
"strings"
)
// OpsQueryMode selects which data source backs an ops dashboard query:
// raw logs, pre-aggregated tables, or automatic selection.
type OpsQueryMode string

const (
	OpsQueryModeAuto   OpsQueryMode = "auto"
	OpsQueryModeRaw    OpsQueryMode = "raw"
	OpsQueryModePreagg OpsQueryMode = "preagg"
)

// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")

// ParseOpsQueryMode interprets a user-supplied mode string, ignoring case and
// surrounding whitespace. Anything that is not an explicit "raw" or "preagg"
// falls back to automatic mode.
func ParseOpsQueryMode(raw string) OpsQueryMode {
	candidate := OpsQueryMode(strings.ToLower(strings.TrimSpace(raw)))
	if candidate == OpsQueryModeRaw || candidate == OpsQueryModePreagg {
		return candidate
	}
	return OpsQueryModeAuto
}

// IsValid reports whether m is one of the three recognized query modes.
func (m OpsQueryMode) IsValid() bool {
	return m == OpsQueryModeAuto || m == OpsQueryModeRaw || m == OpsQueryModePreagg
}

View File

@@ -0,0 +1,36 @@
package service
import (
"context"
"errors"
"strings"
)
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is also gated by the hard switch/soft switch of overall ops monitoring.
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
	// Overall ops monitoring must be on before the realtime sub-switch matters.
	if !s.IsMonitoringEnabled(ctx) {
		return false
	}
	// No settings repository wired: nothing to consult, default to enabled.
	if s.settingRepo == nil {
		return true
	}
	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
	if err != nil {
		// Default enabled when key is missing; fail-open on transient errors.
		// NOTE(review): both branches return true, so the errors.Is check is
		// redundant at runtime; it is kept only to document the two cases.
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		return true
	}
	// Only an explicit "off"-style value disables the feature.
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}

View File

@@ -0,0 +1,81 @@
package service
import "time"
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}

// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}

// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
	AccountID      int64   `json:"account_id"`
	AccountName    string  `json:"account_name"`
	Platform       string  `json:"platform"`
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}

// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}

// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
	GroupID        int64  `json:"group_id"`
	GroupName      string `json:"group_name"`
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}

// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
	AccountID   int64  `json:"account_id"`
	AccountName string `json:"account_name"`
	Platform    string `json:"platform"`
	GroupID     int64  `json:"group_id"`
	GroupName   string `json:"group_name"`
	Status      string `json:"status"`
	// Derived status flags.
	IsAvailable   bool `json:"is_available"`
	IsRateLimited bool `json:"is_rate_limited"`
	IsOverloaded  bool `json:"is_overloaded"`
	HasError      bool `json:"has_error"`
	// Rate-limit / overload expiry details; nil when not applicable.
	RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
	RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
	OverloadUntil          *time.Time `json:"overload_until"`
	OverloadRemainingSec   *int64     `json:"overload_remaining_sec"`
	ErrorMessage           string     `json:"error_message"`
	TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}

View File

@@ -0,0 +1,151 @@
package service
import (
"context"
"time"
)
// OpsRequestKind distinguishes the two row sources of the request drilldown:
// successful requests and error-log entries.
type OpsRequestKind string

const (
	OpsRequestKindSuccess OpsRequestKind = "success"
	OpsRequestKindError   OpsRequestKind = "error"
)

// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
	Kind       OpsRequestKind `json:"kind"`
	CreatedAt  time.Time      `json:"created_at"`
	RequestID  string         `json:"request_id"`
	Platform   string         `json:"platform,omitempty"`
	Model      string         `json:"model,omitempty"`
	DurationMs *int           `json:"duration_ms,omitempty"`
	StatusCode *int           `json:"status_code,omitempty"`
	// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
	ErrorID  *int64 `json:"error_id,omitempty"`
	Phase    string `json:"phase,omitempty"`
	Severity string `json:"severity,omitempty"`
	Message  string `json:"message,omitempty"`
	// Ownership/routing context; omitted when unknown.
	UserID    *int64 `json:"user_id,omitempty"`
	APIKeyID  *int64 `json:"api_key_id,omitempty"`
	AccountID *int64 `json:"account_id,omitempty"`
	GroupID   *int64 `json:"group_id,omitempty"`
	Stream    bool   `json:"stream"`
}
type OpsRequestDetailFilter struct {
StartTime *time.Time
EndTime *time.Time
// kind: success|error|all
Kind string
Platform string
GroupID *int64
UserID *int64
APIKeyID *int64
AccountID *int64
Model string
RequestID string
Query string
MinDurationMs *int
MaxDurationMs *int
// Sort: created_at_desc (default) or duration_desc.
Sort string
Page int
PageSize int
}
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
page = 1
pageSize = 50
endTime = time.Now()
startTime = endTime.Add(-1 * time.Hour)
if f == nil {
return page, pageSize, startTime, endTime
}
if f.Page > 0 {
page = f.Page
}
if f.PageSize > 0 {
pageSize = f.PageSize
}
if pageSize > 100 {
pageSize = 100
}
if f.EndTime != nil {
endTime = *f.EndTime
}
if f.StartTime != nil {
startTime = *f.StartTime
} else if f.EndTime != nil {
startTime = endTime.Add(-1 * time.Hour)
}
if startTime.After(endTime) {
startTime, endTime = endTime, startTime
}
return page, pageSize, startTime, endTime
}
// OpsRequestDetailList is one page of request drilldown results.
type OpsRequestDetailList struct {
	Items    []*OpsRequestDetail `json:"items"`
	Total    int64               `json:"total"`
	Page     int                 `json:"page"`
	PageSize int                 `json:"page_size"`
}
// ListRequestDetails returns one page of the request-level drilldown view,
// merging successful and failed requests. Monitoring must be enabled; without
// a repository the call degrades to an empty page.
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return &OpsRequestDetailList{
			Items:    []*OpsRequestDetail{},
			Page:     1,
			PageSize: 50,
		}, nil
	}
	page, pageSize, startTime, endTime := filter.Normalize()
	// Query through a defensive copy so the caller's filter is never mutated.
	var normalized OpsRequestDetailFilter
	if filter != nil {
		normalized = *filter
	}
	normalized.Page = page
	normalized.PageSize = pageSize
	normalized.StartTime = &startTime
	normalized.EndTime = &endTime
	items, total, err := s.opsRepo.ListRequestDetails(ctx, &normalized)
	if err != nil {
		return nil, err
	}
	// Guarantee a non-nil slice so JSON renders [] instead of null.
	if items == nil {
		items = []*OpsRequestDetail{}
	}
	return &OpsRequestDetailList{
		Items:    items,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}

View File

@@ -0,0 +1,632 @@
package service
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
"github.com/gin-gonic/gin"
"github.com/lib/pq"
)
// Retry modes exposed by the API: "client" re-selects an account like a
// normal client request; "upstream" replays against a pinned account.
const (
	OpsRetryModeClient   = "client"
	OpsRetryModeUpstream = "upstream"
)

// Lifecycle states persisted on a retry attempt row.
const (
	opsRetryStatusRunning   = "running"
	opsRetryStatusSucceeded = "succeeded"
	opsRetryStatusFailed    = "failed"
)

// Execution limits for a single manual retry.
const (
	opsRetryTimeout             = 60 * time.Second // wall-clock cap per retry execution
	opsRetryCaptureBytesLimit   = 64 * 1024        // max response bytes buffered in memory
	opsRetryResponsePreviewMax  = 8 * 1024         // max bytes surfaced in the API preview
	opsRetryMinIntervalPerError = 10 * time.Second // per-error cooldown between attempts
	opsRetryMaxAccountSwitches  = 3                // failover budget in client mode
)

// opsRetryRequestHeaderAllowlist is the minimal set of stored request headers
// replayed on retry; auth credentials are never replayed.
var opsRetryRequestHeaderAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}

// opsRetryRequestType classifies the original request's API surface so the
// retry can be routed through the matching gateway.
type opsRetryRequestType string

const (
	opsRetryTypeMessages  opsRetryRequestType = "messages"
	opsRetryTypeOpenAI    opsRetryRequestType = "openai_responses"
	opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
// limitedResponseWriter is an http.ResponseWriter that captures at most
// `limit` bytes of the response body while reporting every write as fully
// successful. It is handed to gateway forwarding code during a retry so a
// bounded preview of the upstream response can be surfaced.
type limitedResponseWriter struct {
	header       http.Header
	wroteHeader  bool
	limit        int
	totalWritten int64
	buf          bytes.Buffer
}

// newLimitedResponseWriter builds a writer capturing at most limit bytes;
// a non-positive limit is coerced to 1 so at least one byte is kept.
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
	capped := limit
	if capped <= 0 {
		capped = 1
	}
	return &limitedResponseWriter{
		header: make(http.Header),
		limit:  capped,
	}
}

// Header returns the mutable response header map.
func (w *limitedResponseWriter) Header() http.Header { return w.header }

// WriteHeader satisfies http.ResponseWriter. The status code itself is not
// recorded here; we only latch that headers have been sent.
func (w *limitedResponseWriter) WriteHeader(_ int) {
	w.wroteHeader = true
}

// Write buffers up to the configured limit and reports full success so the
// forwarding pipeline never observes a short write.
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.WriteHeader(http.StatusOK)
	}
	w.totalWritten += int64(len(p))
	if room := w.limit - w.buf.Len(); room > 0 {
		chunk := p
		if len(chunk) > room {
			chunk = chunk[:room]
		}
		_, _ = w.buf.Write(chunk)
	}
	// Pretend we wrote everything to avoid upstream/client code treating it as an error.
	return len(p), nil
}

// Flush satisfies http.Flusher; there is nothing to flush.
func (w *limitedResponseWriter) Flush() {}

// bodyBytes returns the captured (possibly truncated) body prefix.
func (w *limitedResponseWriter) bodyBytes() []byte { return w.buf.Bytes() }

// truncated reports whether more bytes were written than were captured.
func (w *limitedResponseWriter) truncated() bool { return w.totalWritten > int64(w.limit) }
// RetryError re-executes a previously failed request, either through normal
// account selection ("client" mode) or against a pinned account ("upstream"
// mode). An attempt row is inserted before executing and finalized with the
// outcome; an in-progress check and a per-error cooldown guard against
// duplicate or rapid-fire retries.
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	mode = strings.ToLower(strings.TrimSpace(mode))
	switch mode {
	case OpsRetryModeClient, OpsRetryModeUpstream:
	default:
		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
	}
	// Reject when another retry for this error is still running or ran too recently.
	latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
	}
	if latest != nil {
		if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}
		// Prefer the most precise timestamp available for cooldown math:
		// finished > started > created.
		lastAttemptAt := latest.CreatedAt
		if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
			lastAttemptAt = *latest.FinishedAt
		} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
			lastAttemptAt = *latest.StartedAt
		}
		if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
			return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
		}
	}
	errorLog, err := s.GetErrorLogByID(ctx, errorID)
	if err != nil {
		return nil, err
	}
	if strings.TrimSpace(errorLog.RequestBody) == "" {
		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
	}
	// Upstream mode requires a pinned account: the explicit parameter wins,
	// otherwise fall back to the account that served the original request.
	var pinned *int64
	if mode == OpsRetryModeUpstream {
		if pinnedAccountID != nil && *pinnedAccountID > 0 {
			pinned = pinnedAccountID
		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
			pinned = errorLog.AccountID
		} else {
			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
		}
	}
	startedAt := time.Now()
	// Record the attempt before executing so concurrent requests collide here.
	attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
		RequestedByUserID: requestedByUserID,
		SourceErrorID:     errorID,
		Mode:              mode,
		PinnedAccountID:   pinned,
		Status:            opsRetryStatusRunning,
		StartedAt:         startedAt,
	})
	if err != nil {
		// 23505 = Postgres unique_violation: another running attempt exists.
		var pqErr *pq.Error
		if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}
		return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
	}
	result := &OpsRetryResult{
		AttemptID:         attemptID,
		Mode:              mode,
		Status:            opsRetryStatusFailed,
		PinnedAccountID:   pinned,
		HTTPStatusCode:    0,
		UpstreamRequestID: "",
		ResponsePreview:   "",
		ResponseTruncated: false,
		ErrorMessage:      "",
		StartedAt:         startedAt,
	}
	// Bound the actual replay with a hard timeout.
	execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
	defer cancel()
	execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
	finishedAt := time.Now()
	result.FinishedAt = finishedAt
	result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
	if execRes != nil {
		result.Status = execRes.status
		result.UsedAccountID = execRes.usedAccountID
		result.HTTPStatusCode = execRes.httpStatusCode
		result.UpstreamRequestID = execRes.upstreamRequestID
		result.ResponsePreview = execRes.responsePreview
		result.ResponseTruncated = execRes.responseTruncated
		result.ErrorMessage = execRes.errorMessage
	}
	// Finalize the attempt row on a fresh context so a cancelled request
	// context cannot leave the row stuck in "running".
	updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer updateCancel()
	var updateErrMsg *string
	if strings.TrimSpace(result.ErrorMessage) != "" {
		msg := result.ErrorMessage
		updateErrMsg = &msg
	}
	var resultRequestID *string
	if strings.TrimSpace(result.UpstreamRequestID) != "" {
		v := result.UpstreamRequestID
		resultRequestID = &v
	}
	finalStatus := result.Status
	if strings.TrimSpace(finalStatus) == "" {
		finalStatus = opsRetryStatusFailed
	}
	if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
		ID:              attemptID,
		Status:          finalStatus,
		FinishedAt:      finishedAt,
		DurationMs:      result.DurationMs,
		ResultRequestID: resultRequestID,
		ErrorMessage:    updateErrMsg,
	}); err != nil {
		// Best-effort: retry itself already executed; do not fail the API response.
		log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
	}
	return result, nil
}
// opsRetryExecution is the internal outcome of one retry execution, later
// copied onto the public OpsRetryResult.
type opsRetryExecution struct {
	status            string // succeeded|failed
	usedAccountID     *int64 // account that actually served the retry, when known
	httpStatusCode    int
	upstreamRequestID string
	responsePreview   string // bounded body preview captured during the replay
	responseTruncated bool
	errorMessage      string
}
// executeRetry prepares the stored request body and dispatches to pinned
// (upstream) or re-selecting (client) execution based on mode.
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
	failed := func(msg string) *opsRetryExecution {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: msg}
	}
	if errorLog == nil {
		return failed("missing error log")
	}
	reqType := detectOpsRetryType(errorLog.RequestPath)
	payload := []byte(errorLog.RequestBody)
	// Only the messages surface needs thinking blocks stripped before replay.
	if reqType == opsRetryTypeMessages {
		payload = FilterThinkingBlocksForRetry(payload)
	}
	switch strings.ToLower(strings.TrimSpace(mode)) {
	case OpsRetryModeUpstream:
		if pinnedAccountID == nil || *pinnedAccountID <= 0 {
			return failed("pinned_account_id required for upstream retry")
		}
		return s.executePinnedRetry(ctx, reqType, errorLog, payload, *pinnedAccountID)
	case OpsRetryModeClient:
		return s.executeClientRetry(ctx, reqType, errorLog, payload)
	default:
		return failed("invalid retry mode")
	}
}
// detectOpsRetryType infers the API surface of the original request from its
// path; anything unrecognized is treated as an Anthropic messages request.
func detectOpsRetryType(path string) opsRetryRequestType {
	normalized := strings.ToLower(strings.TrimSpace(path))
	if strings.Contains(normalized, "/responses") {
		return opsRetryTypeOpenAI
	}
	if strings.Contains(normalized, "/v1beta/") {
		return opsRetryTypeGeminiV1B
	}
	return opsRetryTypeMessages
}
// executePinnedRetry replays the request against one specific account. The
// account must exist, be schedulable, and (when the original request carried
// a group) belong to that same group. When a concurrency service is wired, an
// account slot is acquired for the duration of the replay and always released.
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
	if s.accountRepo == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
	}
	account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
	if err != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
	}
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
	}
	if !account.IsSchedulable() {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
	}
	// Group safety: never replay a grouped request through a foreign account.
	if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
		if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
		}
	}
	var release func()
	if s.concurrencyService != nil {
		acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
		if err != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
		}
		if acq == nil || !acq.Acquired {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
		}
		release = acq.ReleaseFunc
	}
	if release != nil {
		defer release()
	}
	usedID := account.ID
	exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
	exec.usedAccountID = &usedID
	if exec.status == "" {
		exec.status = opsRetryStatusFailed
	}
	return exec
}
// executeClientRetry replays the request the way a normal client request is
// served: the scheduler picks an account from the original group, and on
// failover-class errors it excludes that account and tries another one, up to
// opsRetryMaxAccountSwitches attempts.
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
	groupID := errorLog.GroupID
	if groupID == nil || *groupID <= 0 {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
	}
	model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
	if parsedErr != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
	}
	// Stream flag is parsed for validation but not needed for selection.
	_ = stream
	excluded := make(map[int64]struct{})
	switches := 0
	for {
		if switches >= opsRetryMaxAccountSwitches {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
		}
		selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
		if selErr != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
		}
		if selection == nil || selection.Account == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
		}
		account := selection.Account
		// Selected but no slot acquired: treat as busy and move on.
		if !selection.Acquired || selection.ReleaseFunc == nil {
			excluded[account.ID] = struct{}{}
			switches++
			continue
		}
		// Execute inside a closure so the slot is released per iteration,
		// not deferred to the end of the whole loop.
		exec := func() *opsRetryExecution {
			defer selection.ReleaseFunc()
			return s.executeWithAccount(ctx, reqType, errorLog, body, account)
		}()
		if exec != nil {
			if exec.status == opsRetryStatusSucceeded {
				usedID := account.ID
				exec.usedAccountID = &usedID
				return exec
			}
			// If the gateway services ask for failover, try another account.
			if s.isFailoverError(exec.errorMessage) {
				excluded[account.ID] = struct{}{}
				switches++
				continue
			}
			usedID := account.ID
			exec.usedAccountID = &usedID
			return exec
		}
		excluded[account.ID] = struct{}{}
		switches++
	}
}
// selectAccountForRetry picks a schedulable account via the gateway matching
// the request surface, skipping any accounts in excludedIDs.
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
	if reqType == opsRetryTypeOpenAI {
		if s.openAIGatewayService == nil {
			return nil, fmt.Errorf("openai gateway service not available")
		}
		return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
	}
	// Messages and Gemini v1beta requests both go through the main gateway.
	if reqType == opsRetryTypeGeminiV1B || reqType == opsRetryTypeMessages {
		if s.gatewayService == nil {
			return nil, fmt.Errorf("gateway service not available")
		}
		return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
	}
	return nil, fmt.Errorf("unsupported retry type: %s", reqType)
}
// extractRetryModelAndStream recovers the model name and streaming flag
// needed for account selection, parsing the stored body for messages/OpenAI
// requests and falling back to the error log itself for Gemini v1beta.
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
	switch reqType {
	case opsRetryTypeMessages:
		parsed, parseErr := ParseGatewayRequest(body)
		if parseErr != nil {
			return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
		}
		return parsed.Model, parsed.Stream, nil
	case opsRetryTypeOpenAI:
		var payload struct {
			Model  string `json:"model"`
			Stream bool   `json:"stream"`
		}
		if unmarshalErr := json.Unmarshal(body, &payload); unmarshalErr != nil {
			return "", false, fmt.Errorf("failed to parse openai request body: %w", unmarshalErr)
		}
		return strings.TrimSpace(payload.Model), payload.Stream, nil
	case opsRetryTypeGeminiV1B:
		// The v1beta body carries no model field; rely on what was logged.
		model = strings.TrimSpace(errorLog.Model)
		if model == "" {
			return "", false, fmt.Errorf("missing model for gemini v1beta retry")
		}
		return model, errorLog.Stream, nil
	default:
		return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
	}
}
// executeWithAccount forwards the stored request through the gateway that
// matches the request surface and the account's platform, capturing a bounded
// response preview via a synthetic gin context. Success requires both a nil
// forwarding error and an HTTP status below 400.
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
	}
	c, w := newOpsRetryContext(ctx, errorLog)
	var err error
	switch reqType {
	case opsRetryTypeOpenAI:
		if s.openAIGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
		}
		_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
	case opsRetryTypeGeminiV1B:
		if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
		}
		modelName := strings.TrimSpace(errorLog.Model)
		// v1beta uses a different action name for streaming responses.
		action := "generateContent"
		if errorLog.Stream {
			action = "streamGenerateContent"
		}
		if account.Platform == PlatformAntigravity {
			_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
		} else {
			_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
		}
	case opsRetryTypeMessages:
		// Messages requests are routed per account platform.
		switch account.Platform {
		case PlatformAntigravity:
			if s.antigravityGatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
			}
			_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
		case PlatformGemini:
			if s.geminiCompatService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
			}
			_, err = s.geminiCompatService.Forward(ctx, c, account, body)
		default:
			if s.gatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
			}
			parsedReq, parseErr := ParseGatewayRequest(body)
			if parseErr != nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
			}
			_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
		}
	default:
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
	}
	// Collect the observable outcome from the synthetic response writer.
	statusCode := http.StatusOK
	if c != nil && c.Writer != nil {
		statusCode = c.Writer.Status()
	}
	upstreamReqID := extractUpstreamRequestID(c)
	preview, truncated := extractResponsePreview(w)
	exec := &opsRetryExecution{
		status:            opsRetryStatusFailed,
		httpStatusCode:    statusCode,
		upstreamRequestID: upstreamReqID,
		responsePreview:   preview,
		responseTruncated: truncated,
		errorMessage:      "",
	}
	if err == nil && statusCode < 400 {
		exec.status = opsRetryStatusSucceeded
		return exec
	}
	if err != nil {
		exec.errorMessage = err.Error()
	} else {
		exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
	}
	return exec
}
// newOpsRetryContext builds a synthetic gin context whose response writer
// captures a bounded preview of whatever the gateway writes. The request is
// reconstructed from the stored error log: POST to the original path with a
// JSON content type, the original user agent, and an allowlisted subset of
// stored headers. Auth credentials are never restored.
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
	w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
	c, _ := gin.CreateTestContext(w)
	path := "/"
	if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
		path = errorLog.RequestPath
	}
	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
	req.Header.Set("content-type", "application/json")
	if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
		req.Header.Set("user-agent", errorLog.UserAgent)
	}
	// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
	// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
	if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
		var stored map[string]string
		if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
			for k, v := range stored {
				key := strings.TrimSpace(k)
				if key == "" {
					continue
				}
				if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
					continue
				}
				val := strings.TrimSpace(v)
				if val == "" {
					continue
				}
				req.Header.Set(key, val)
			}
		}
	}
	c.Request = req
	return c, w
}
// extractUpstreamRequestID returns the upstream request id recorded on the
// response headers, or "" when absent.
//
// http.Header.Get canonicalizes its key via textproto.CanonicalMIMEHeaderKey,
// so the original loop over "x-request-id" / "X-Request-Id" / "X-Request-ID"
// performed the exact same lookup three times; a single Get is equivalent.
func extractUpstreamRequestID(c *gin.Context) string {
	if c == nil || c.Writer == nil {
		return ""
	}
	h := c.Writer.Header()
	if h == nil {
		return ""
	}
	return strings.TrimSpace(h.Get("X-Request-Id"))
}
// extractResponsePreview converts the captured response bytes into a bounded
// preview string; truncated is true when the preview omits any bytes, either
// because the writer dropped some or the preview cap cut the capture short.
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
	if w == nil {
		return "", false
	}
	body := bytes.TrimSpace(w.bodyBytes())
	switch {
	case len(body) == 0:
		return "", w.truncated()
	case len(body) > opsRetryResponsePreviewMax:
		return string(body[:opsRetryResponsePreviewMax]), true
	default:
		return string(body), w.truncated()
	}
}
// containsInt64 reports whether needle occurs in items.
func containsInt64(items []int64, needle int64) bool {
	for i := range items {
		if items[i] == needle {
			return true
		}
	}
	return false
}
// isFailoverError reports whether an execution error message indicates the
// gateway wants the caller to fail over to a different account.
func (s *OpsService) isFailoverError(message string) bool {
	msg := strings.ToLower(strings.TrimSpace(message))
	return msg != "" &&
		strings.Contains(msg, "upstream error:") &&
		strings.Contains(msg, "failover")
}

View File

@@ -0,0 +1,705 @@
package service
import (
"context"
"fmt"
"log"
"strconv"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/robfig/cron/v3"
)
const (
	// Job name recorded on the ops heartbeat for observability.
	opsScheduledReportJobName = "ops_scheduled_reports"
	// Leader-election lock key and TTL shared across instances.
	opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader"
	opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute
	// Per-report last-run bookkeeping key prefix.
	opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:"
	// Scheduler tick granularity.
	opsScheduledReportTickInterval = 1 * time.Minute
)

// opsScheduledReportCronParser accepts standard 5-field cron expressions
// (minute hour day-of-month month day-of-week).
var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsScheduledReportReleaseScript deletes the leader lock only when its value
// still matches this instance's id (compare-and-delete), so an expired lock
// re-acquired by another instance is never released by mistake.
var opsScheduledReportReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// OpsScheduledReportService periodically emails operational reports. In
// multi-instance deployments a Redis leader lock ensures only one instance
// sends each report.
type OpsScheduledReportService struct {
	opsService   *OpsService
	userService  *UserService
	emailService *EmailService
	redisClient  *redis.Client
	cfg          *config.Config
	// instanceID is the value stored under the leader lock to identify this process.
	instanceID string
	// loc is the timezone used when evaluating report schedules.
	loc *time.Location
	// distributedLockOn is false in simple (single-instance) run mode.
	distributedLockOn bool
	// warnNoRedisOnce limits the missing-Redis warning to a single log line.
	warnNoRedisOnce sync.Once
	// Lifecycle guards: start/stop are each effective exactly once.
	startOnce sync.Once
	stopOnce  sync.Once
	stopCtx   context.Context
	stop      context.CancelFunc
	wg        sync.WaitGroup
}
// NewOpsScheduledReportService wires the scheduled-report scheduler. The
// distributed leader lock is enabled unless the process runs in "simple"
// (single-node) mode, and schedule evaluation uses the configured timezone,
// falling back to the process-local zone when absent or unparsable.
func NewOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	location := time.Local
	if cfg != nil {
		if tz := strings.TrimSpace(cfg.Timezone); tz != "" {
			if parsed, err := time.LoadLocation(tz); err == nil && parsed != nil {
				location = parsed
			}
		}
	}
	useDistributedLock := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple
	return &OpsScheduledReportService{
		opsService:        opsService,
		userService:       userService,
		emailService:      emailService,
		redisClient:       redisClient,
		cfg:               cfg,
		instanceID:        uuid.NewString(),
		loc:               location,
		distributedLockOn: useDistributedLock,
	}
}
// Start launches the scheduler with a background root context.
func (s *OpsScheduledReportService) Start() {
	s.StartWithContext(context.Background())
}

// StartWithContext launches the background scheduler goroutine once. It is a
// no-op when ops is disabled in config or when required collaborators are
// missing; safe to call multiple times.
func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) {
	if s == nil {
		return
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Hard switch: ops disabled in config means no scheduler at all.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.opsService == nil || s.emailService == nil {
		return
	}
	s.startOnce.Do(func() {
		s.stopCtx, s.stop = context.WithCancel(ctx)
		s.wg.Add(1)
		go s.run()
	})
}

// Stop cancels the scheduler and blocks until the worker goroutine exits.
// Safe to call multiple times, and before Start (Wait returns immediately).
func (s *OpsScheduledReportService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.stop != nil {
			s.stop()
		}
	})
	s.wg.Wait()
}
// run is the scheduler worker loop: one immediate pass, then one pass per
// tick until the stop context is cancelled.
func (s *OpsScheduledReportService) run() {
	defer s.wg.Done()
	ticker := time.NewTicker(opsScheduledReportTickInterval)
	defer ticker.Stop()
	// Fire immediately so a fresh deploy does not wait a full tick.
	s.runOnce()
	for {
		select {
		case <-s.stopCtx.Done():
			return
		case <-ticker.C:
			s.runOnce()
		}
	}
}
// runOnce performs a single scheduler tick: verify monitoring is enabled,
// take the cross-instance leader lock, then send every due report in order.
// The first report failure aborts the tick and is recorded on the job
// heartbeat; a fully successful pass records a success heartbeat.
func (s *OpsScheduledReportService) runOnce() {
	if s == nil || s.opsService == nil || s.emailService == nil {
		return
	}
	startedAt := time.Now().UTC()
	runAt := startedAt
	// Bound the whole tick so a slow dependency cannot wedge the loop.
	ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second)
	defer cancel()
	// Respect ops monitoring enabled switch.
	if !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}
	// Only the leader instance sends reports; others return silently.
	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}
	// Schedule evaluation happens in the configured timezone.
	now := time.Now()
	if s.loc != nil {
		now = now.In(s.loc)
	}
	reports := s.listScheduledReports(ctx, now)
	if len(reports) == 0 {
		return
	}
	for _, report := range reports {
		if report == nil || !report.Enabled {
			continue
		}
		// Skip reports that are not due yet.
		if report.NextRunAt.After(now) {
			continue
		}
		if err := s.runReport(ctx, report, now); err != nil {
			s.recordHeartbeatError(runAt, time.Since(startedAt), err)
			return
		}
	}
	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
// opsScheduledReport is a materialized scheduled-report definition derived
// from the email notification config plus persisted run state.
type opsScheduledReport struct {
	Name       string        // human-readable name used in the email subject
	ReportType string        // report kind key: daily_summary, weekly_summary, error_digest, account_health
	Schedule   string        // cron spec the next run was computed from
	Enabled    bool          // always true for entries returned by listScheduledReports
	TimeRange  time.Duration // reporting window ending at "now"
	Recipients []string      // normalized recipient list (may be empty)
	// ErrorDigestMinCount suppresses the digest when total errors fall below it.
	ErrorDigestMinCount int
	// AccountHealthErrorRateThreshold is reserved for a future per-account error rate report.
	AccountHealthErrorRateThreshold float64
	LastRunAt                       *time.Time // nil when the report never ran
	NextRunAt                       time.Time  // next due time per cron schedule
}
// listScheduledReports materializes the enabled scheduled-report definitions
// from the email notification config, computing each report's next due time
// from its cron schedule and the persisted last-run timestamp.
//
// Definitions that are disabled, have an empty schedule, or carry an
// unparsable cron spec are skipped (the last with a log line). When a report
// has never run, the cron base time is now-1m so a schedule matching the
// current minute can fire right after startup.
func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport {
	if s == nil || s.opsService == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil {
		return nil
	}
	if !emailCfg.Report.Enabled {
		return nil
	}
	recipients := normalizeEmails(emailCfg.Report.Recipients)
	// reportDef is the static shape of each configurable report kind.
	type reportDef struct {
		enabled   bool
		name      string
		kind      string
		timeRange time.Duration
		schedule  string
	}
	defs := []reportDef{
		{enabled: emailCfg.Report.DailySummaryEnabled, name: "日报", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule},
		{enabled: emailCfg.Report.WeeklySummaryEnabled, name: "周报", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule},
		{enabled: emailCfg.Report.ErrorDigestEnabled, name: "错误摘要", kind: "error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule},
		{enabled: emailCfg.Report.AccountHealthEnabled, name: "账号健康", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule},
	}
	out := make([]*opsScheduledReport, 0, len(defs))
	for _, d := range defs {
		if !d.enabled {
			continue
		}
		spec := strings.TrimSpace(d.schedule)
		if spec == "" {
			continue
		}
		sched, err := opsScheduledReportCronParser.Parse(spec)
		if err != nil {
			log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err)
			continue
		}
		lastRun := s.getLastRunAt(ctx, d.kind)
		base := lastRun
		if base.IsZero() {
			// Allow a schedule matching the current minute to trigger right after startup.
			base = now.Add(-1 * time.Minute)
		}
		next := sched.Next(base)
		if next.IsZero() {
			continue
		}
		var lastRunPtr *time.Time
		if !lastRun.IsZero() {
			// Copy before taking the address so the pointer does not alias the loop state.
			lastCopy := lastRun
			lastRunPtr = &lastCopy
		}
		out = append(out, &opsScheduledReport{
			Name:                            d.name,
			ReportType:                      d.kind,
			Schedule:                        spec,
			Enabled:                         true,
			TimeRange:                       d.timeRange,
			Recipients:                      recipients,
			ErrorDigestMinCount:             emailCfg.Report.ErrorDigestMinCount,
			AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold,
			LastRunAt:                       lastRunPtr,
			NextRunAt:                       next,
		})
	}
	return out
}
// runReport executes one due report: it advances the last-run marker, renders
// the HTML body, resolves recipients (falling back to the first admin's email
// when none are configured), and sends the email to each recipient.
//
// The last-run timestamp is written *before* rendering/sending so that a
// persistently broken SMTP setup cannot cause a retry storm every minute.
// Per-recipient send failures are deliberately ignored (best-effort).
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
	if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute.
	s.setLastRunAt(ctx, report.ReportType, now)
	content, err := s.generateReportHTML(ctx, report, now)
	if err != nil {
		return err
	}
	if strings.TrimSpace(content) == "" {
		// Skip sending when the report decides not to emit content (e.g., digest below min count).
		return nil
	}
	recipients := report.Recipients
	if len(recipients) == 0 && s.userService != nil {
		admin, err := s.userService.GetFirstAdmin(ctx)
		if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" {
			recipients = []string{strings.TrimSpace(admin.Email)}
		}
	}
	if len(recipients) == 0 {
		return nil
	}
	subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))
	for _, to := range recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
	}
	return nil
}
// generateReportHTML renders the HTML email body for a scheduled report over
// the window [now-report.TimeRange, now] (UTC).
//
// It returns "" with a nil error when the report intentionally produces no
// email (e.g. an error digest whose total is below the configured minimum).
// Unknown report types yield an error.
//
// Fix: the original retry branch re-checked the report type inside the
// daily/weekly case, a condition that was always true there; the dead check
// is removed and the raw-mode fallback reuses the same filter.
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
	if s == nil || s.opsService == nil || report == nil {
		return "", fmt.Errorf("service not initialized")
	}
	if report.TimeRange <= 0 {
		return "", fmt.Errorf("invalid time range")
	}
	end := now.UTC()
	start := end.Add(-report.TimeRange)
	switch strings.TrimSpace(report.ReportType) {
	case "daily_summary", "weekly_summary":
		filter := &OpsDashboardFilter{
			StartTime: start,
			EndTime:   end,
			Platform:  "",
			GroupID:   nil,
			QueryMode: OpsQueryModeAuto,
		}
		overview, err := s.opsService.GetDashboardOverview(ctx, filter)
		if err != nil {
			// If pre-aggregation isn't ready but the report is requested, fall back to raw.
			filter.QueryMode = OpsQueryModeRaw
			overview, err = s.opsService.GetDashboardOverview(ctx, filter)
			if err != nil {
				return "", err
			}
		}
		return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil
	case "error_digest":
		// Lightweight digest: list recent errors (status>=400) and breakdown by type.
		startTime := start
		endTime := end
		out, err := s.opsService.GetErrorLogs(ctx, &OpsErrorLogFilter{
			StartTime: &startTime,
			EndTime:   &endTime,
			Page:      1,
			PageSize:  100,
		})
		if err != nil {
			return "", err
		}
		// Suppress the digest entirely when the error volume is below the minimum.
		if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount {
			return "", nil
		}
		return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil
	case "account_health":
		// Best-effort: use account availability (not error rate yet).
		avail, err := s.opsService.GetAccountAvailability(ctx, "", nil)
		if err != nil {
			return "", err
		}
		_ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report
		return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil
	default:
		return "", fmt.Errorf("unknown report type: %s", report.ReportType)
	}
}
// buildOpsSummaryEmailHTML renders the daily/weekly summary email body from a
// dashboard overview. All dynamic values are HTML-escaped; a nil overview
// produces a short "No data." page. Percentile latencies render as "-" when
// the overview has no sample for them.
func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string {
	if overview == nil {
		return fmt.Sprintf("<h2>%s</h2><p>No data.</p>", htmlEscape(title))
	}
	// "-" placeholders are shown when a percentile was not computed.
	latP50 := "-"
	latP99 := "-"
	if overview.Duration.P50 != nil {
		latP50 = fmt.Sprintf("%dms", *overview.Duration.P50)
	}
	if overview.Duration.P99 != nil {
		latP99 = fmt.Sprintf("%dms", *overview.Duration.P99)
	}
	ttftP50 := "-"
	ttftP99 := "-"
	if overview.TTFT.P50 != nil {
		ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50)
	}
	if overview.TTFT.P99 != nil {
		ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99)
	}
	return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Requests</b>: %d</li>
<li><b>Success</b>: %d</li>
<li><b>Errors (SLA)</b>: %d</li>
<li><b>Business Limited</b>: %d</li>
<li><b>SLA</b>: %.2f%%</li>
<li><b>Error Rate</b>: %.2f%%</li>
<li><b>Upstream Error Rate (excl 429/529)</b>: %.2f%%</li>
<li><b>Upstream Errors</b>: excl429/529=%d, 429=%d, 529=%d</li>
<li><b>Latency</b>: p50=%s, p99=%s</li>
<li><b>TTFT</b>: p50=%s, p99=%s</li>
<li><b>Tokens</b>: %d</li>
<li><b>QPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
<li><b>TPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
</ul>
`,
		htmlEscape(strings.TrimSpace(title)),
		htmlEscape(start.UTC().Format(time.RFC3339)),
		htmlEscape(end.UTC().Format(time.RFC3339)),
		overview.RequestCountTotal,
		overview.SuccessCount,
		overview.ErrorCountSLA,
		overview.BusinessLimitedCount,
		overview.SLA*100,
		overview.ErrorRate*100,
		overview.UpstreamErrorRate*100,
		overview.UpstreamErrorCountExcl429529,
		overview.Upstream429Count,
		overview.Upstream529Count,
		htmlEscape(latP50),
		htmlEscape(latP99),
		htmlEscape(ttftP50),
		htmlEscape(ttftP99),
		overview.TokenConsumed,
		overview.QPS.Current,
		overview.QPS.Peak,
		overview.QPS.Avg,
		overview.TPS.Current,
		overview.TPS.Peak,
		overview.TPS.Avg,
	)
}
// buildOpsErrorDigestEmailHTML renders the error-digest report email: the
// total error count for the period plus a table of up to 10 of the most
// recent errors. All dynamic values are HTML-escaped and messages are
// truncated to 180 characters.
//
// Fix: row markup is now accumulated with strings.Builder instead of string
// += in a loop (idiomatic, avoids quadratic concatenation); output is
// unchanged byte-for-byte.
func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string {
	total := 0
	recent := []*OpsErrorLog{}
	if list != nil {
		total = list.Total
		recent = list.Errors
	}
	if len(recent) > 10 {
		recent = recent[:10]
	}
	var rowsBuilder strings.Builder
	for _, item := range recent {
		if item == nil {
			continue
		}
		fmt.Fprintf(&rowsBuilder,
			"<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>",
			htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)),
			htmlEscape(item.Platform),
			item.StatusCode,
			htmlEscape(truncateString(item.Message, 180)),
		)
	}
	rows := rowsBuilder.String()
	if rows == "" {
		rows = "<tr><td colspan=\"4\">No recent errors.</td></tr>"
	}
	return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<p><b>Total Errors</b>: %d</p>
<h3>Recent</h3>
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;">
<thead><tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr></thead>
<tbody>%s</tbody>
</table>
`,
		htmlEscape(strings.TrimSpace(title)),
		htmlEscape(start.UTC().Format(time.RFC3339)),
		htmlEscape(end.UTC().Format(time.RFC3339)),
		total,
		rows,
	)
}
// buildOpsAccountHealthEmailHTML renders the account-health report email with
// aggregate availability counters. It currently reflects account availability
// status only (no per-account error rates). All dynamic values are
// HTML-escaped; nil input yields all-zero counters.
func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string {
	var total, availableCount, rateLimitedCount, errorCount int
	if avail != nil {
		for _, account := range avail.Accounts {
			if account == nil {
				continue
			}
			total++
			if account.IsAvailable {
				availableCount++
			}
			if account.IsRateLimited {
				rateLimitedCount++
			}
			if account.HasError {
				errorCount++
			}
		}
	}
	return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Accounts</b>: %d</li>
<li><b>Available</b>: %d</li>
<li><b>Rate Limited</b>: %d</li>
<li><b>Error</b>: %d</li>
</ul>
<p>Note: This report currently reflects account availability status only.</p>
`,
		htmlEscape(strings.TrimSpace(title)),
		htmlEscape(start.UTC().Format(time.RFC3339)),
		htmlEscape(end.UTC().Format(time.RFC3339)),
		total,
		availableCount,
		rateLimitedCount,
		errorCount,
	)
}
// tryAcquireLeaderLock attempts to take the Redis leader lock that ensures
// only one instance sends scheduled reports. It returns an optional release
// func (nil when no lock was taken) and whether this instance may proceed.
//
// Behavior matrix: lock feature disabled -> proceed without a lock; Redis not
// configured -> warn once and proceed; Redis SetNX error -> fail closed (skip
// the cycle) so flaky Redis cannot cause duplicate emails; lock held by
// another instance -> skip. The release func runs a compare-and-delete script
// keyed on this instance's ID so it never deletes a lock it no longer owns.
func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil || !s.distributedLockOn {
		return nil, true
	}
	if s.redisClient == nil {
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock")
		})
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}
	key := opsScheduledReportLeaderLockKeyDefault
	ttl := opsScheduledReportLeaderLockTTLDefault
	// Defensive fallbacks in case the defaults are ever misconfigured.
	if strings.TrimSpace(key) == "" {
		key = "ops:scheduled_reports:leader"
	}
	if ttl <= 0 {
		ttl = 5 * time.Minute
	}
	ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
	if err != nil {
		// Prefer fail-closed to avoid duplicate report sends when Redis is flaky.
		log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err)
		return nil, false
	}
	if !ok {
		return nil, false
	}
	return func() {
		_, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
	}, true
}
// getLastRunAt reads the persisted last-run timestamp for a report type from
// Redis. The zero time is returned when Redis is unavailable, the report type
// is blank, the key is missing, or the stored value is not a positive
// unix-seconds integer.
//
// The result is converted into the service's configured timezone: cron
// schedules are interpreted in s.loc, so the base time handed to cron.Next()
// must use the same location or runs would drift by the zone offset after the
// first execution.
func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time {
	var zero time.Time
	if s == nil || s.redisClient == nil {
		return zero
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return zero
	}
	raw, err := s.redisClient.Get(ctx, opsScheduledReportLastRunKeyPrefix+kind).Result()
	if err != nil {
		return zero
	}
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return zero
	}
	sec, err := strconv.ParseInt(raw, 10, 64)
	if err != nil || sec <= 0 {
		return zero
	}
	last := time.Unix(sec, 0)
	if s.loc == nil {
		return last.UTC()
	}
	return last.In(s.loc)
}
// setLastRunAt persists the last-run timestamp (unix seconds, UTC) for a
// report type in Redis with a 14-day TTL. A zero time is replaced with "now";
// write errors are deliberately ignored (best-effort bookkeeping).
func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) {
	if s == nil || s.redisClient == nil {
		return
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return
	}
	stamp := t
	if stamp.IsZero() {
		stamp = time.Now().UTC()
	}
	value := strconv.FormatInt(stamp.UTC().Unix(), 10)
	_ = s.redisClient.Set(ctx, opsScheduledReportLastRunKeyPrefix+kind, value, 14*24*time.Hour).Err()
}
// recordHeartbeatSuccess reports a successful scheduler cycle to the ops job
// heartbeat table. Best-effort: it uses a short independent timeout and
// ignores persistence errors.
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
	if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
		return
	}
	successAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &elapsedMs,
	})
}
// recordHeartbeatError reports a failed scheduler cycle (with a truncated
// error message) to the ops job heartbeat table. Best-effort: no-op when err
// is nil or dependencies are missing, and persistence errors are ignored.
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || err == nil || s.opsService == nil || s.opsService.opsRepo == nil {
		return
	}
	errorAt := time.Now().UTC()
	elapsedMs := duration.Milliseconds()
	message := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &errorAt,
		LastError:      &message,
		LastDurationMs: &elapsedMs,
	})
}
// normalizeEmails lowercases, trims, and de-duplicates email addresses while
// preserving first-seen order. Blank entries are dropped. A nil result is
// returned only when the input slice itself is empty.
func normalizeEmails(in []string) []string {
	if len(in) == 0 {
		return nil
	}
	out := make([]string, 0, len(in))
	seen := make(map[string]struct{}, len(in))
	for _, candidate := range in {
		email := strings.ToLower(strings.TrimSpace(candidate))
		if email == "" {
			continue
		}
		if _, dup := seen[email]; dup {
			continue
		}
		seen[email] = struct{}{}
		out = append(out, email)
	}
	return out
}

View File

@@ -0,0 +1,537 @@
package service
import (
"context"
"database/sql"
"encoding/json"
"errors"
"log"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// ErrOpsDisabled is returned by ops query APIs when monitoring is switched
// off (via config or the runtime setting).
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
const (
	// opsMaxStoredRequestBodyBytes caps the sanitized request-body JSON
	// persisted with an error log entry.
	opsMaxStoredRequestBodyBytes = 10 * 1024
	// opsMaxStoredErrorBodyBytes caps stored (upstream) error payloads.
	opsMaxStoredErrorBodyBytes = 20 * 1024
)
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
	opsRepo     OpsRepository     // persistence for error logs, metrics, job heartbeats
	settingRepo SettingRepository // runtime enable switch and notification config
	cfg         *config.Config    // static config: hard ops enable switch
	accountRepo AccountRepository
	// getAccountAvailability is a unit-test hook for overriding account availability lookup.
	getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)
	concurrencyService     *ConcurrencyService
	// Gateway services, one per upstream protocol family.
	gatewayService            *GatewayService
	openAIGatewayService      *OpenAIGatewayService
	geminiCompatService       *GeminiMessagesCompatService
	antigravityGatewayService *AntigravityGatewayService
}
// NewOpsService constructs an OpsService with its repository, settings, and
// gateway dependencies. All parameters are stored as-is; nil dependencies are
// tolerated and guarded at each call site.
func NewOpsService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	cfg *config.Config,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	gatewayService *GatewayService,
	openAIGatewayService *OpenAIGatewayService,
	geminiCompatService *GeminiMessagesCompatService,
	antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
	return &OpsService{
		opsRepo:                   opsRepo,
		settingRepo:               settingRepo,
		cfg:                       cfg,
		accountRepo:               accountRepo,
		concurrencyService:        concurrencyService,
		gatewayService:            gatewayService,
		openAIGatewayService:      openAIGatewayService,
		geminiCompatService:       geminiCompatService,
		antigravityGatewayService: antigravityGatewayService,
	}
}
// RequireMonitoringEnabled returns ErrOpsDisabled when ops monitoring is
// switched off, nil otherwise.
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
	if !s.IsMonitoringEnabled(ctx) {
		return ErrOpsDisabled
	}
	return nil
}
// IsMonitoringEnabled reports whether the ops monitoring module is active.
//
// Precedence:
//  1. the hard config switch (cfg.Ops.Enabled=false) disables ops entirely;
//  2. the runtime setting SettingKeyOpsMonitoringEnabled may disable it when
//     set to "false"/"0"/"off"/"disabled" (case-insensitive);
//  3. otherwise enabled. A missing key or a transient settings-store error
//     fails open, because ops must never block gateway traffic.
//
// Fix: the original error handling branched on errors.Is(err,
// ErrSettingNotFound) but returned true on both paths; the dead branch is
// collapsed into a single fail-open return.
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
	// Hard switch: disable ops entirely.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return false
	}
	if s.settingRepo == nil {
		return true
	}
	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		// Fail open: default enabled when the key is missing, and on transient
		// errors (ops should never block gateway traffic).
		return true
	}
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}
// RecordError persists a gateway error event for the ops module.
//
// It is strictly best-effort: it returns nil early when the entry is nil,
// monitoring is disabled, or no repository is wired, and an insert failure is
// logged (the error is still returned to the caller, which ignores it at the
// gateway layer). Before insertion the entry is normalized (timestamp,
// required phase/type fields) and every free-form payload — request body,
// error body, upstream error message/detail, and the upstream error event
// list — is credential-redacted and size-bounded.
//
// rawRequestBody is the original client request payload; it is only ever
// stored in redacted, size-limited JSON form.
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
	if entry == nil {
		return nil
	}
	if !s.IsMonitoringEnabled(ctx) {
		return nil
	}
	if s.opsRepo == nil {
		return nil
	}
	// Ensure timestamps are always populated.
	if entry.CreatedAt.IsZero() {
		entry.CreatedAt = time.Now()
	}
	// Ensure required fields exist (DB has NOT NULL constraints).
	entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
	entry.ErrorType = strings.TrimSpace(entry.ErrorType)
	if entry.ErrorPhase == "" {
		entry.ErrorPhase = "internal"
	}
	if entry.ErrorType == "" {
		entry.ErrorType = "api_error"
	}
	// Sanitize + trim request body (errors only).
	if len(rawRequestBody) > 0 {
		sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
		if sanitized != "" {
			entry.RequestBodyJSON = &sanitized
		}
		entry.RequestBodyTruncated = truncated
		entry.RequestBodyBytes = &bytesLen
	}
	// Sanitize + truncate error_body to avoid storing sensitive data.
	if strings.TrimSpace(entry.ErrorBody) != "" {
		sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
		entry.ErrorBody = sanitized
	}
	// Sanitize upstream error context if provided by gateway services.
	if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 {
		entry.UpstreamStatusCode = nil
	}
	if entry.UpstreamErrorMessage != nil {
		msg := strings.TrimSpace(*entry.UpstreamErrorMessage)
		msg = sanitizeUpstreamErrorMessage(msg)
		msg = truncateString(msg, 2048)
		if strings.TrimSpace(msg) == "" {
			entry.UpstreamErrorMessage = nil
		} else {
			entry.UpstreamErrorMessage = &msg
		}
	}
	if entry.UpstreamErrorDetail != nil {
		detail := strings.TrimSpace(*entry.UpstreamErrorDetail)
		if detail == "" {
			entry.UpstreamErrorDetail = nil
		} else {
			sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
			if strings.TrimSpace(sanitized) == "" {
				entry.UpstreamErrorDetail = nil
			} else {
				entry.UpstreamErrorDetail = &sanitized
			}
		}
	}
	// Sanitize + serialize upstream error events list.
	if len(entry.UpstreamErrors) > 0 {
		// Keep only the most recent events to bound storage.
		const maxEvents = 32
		events := entry.UpstreamErrors
		if len(events) > maxEvents {
			events = events[len(events)-maxEvents:]
		}
		sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events))
		for _, ev := range events {
			if ev == nil {
				continue
			}
			// Work on a copy so the caller's event struct is never mutated.
			out := *ev
			out.Platform = strings.TrimSpace(out.Platform)
			out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128)
			out.Kind = truncateString(strings.TrimSpace(out.Kind), 64)
			if out.AccountID < 0 {
				out.AccountID = 0
			}
			if out.UpstreamStatusCode < 0 {
				out.UpstreamStatusCode = 0
			}
			if out.AtUnixMs < 0 {
				out.AtUnixMs = 0
			}
			msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message))
			msg = truncateString(msg, 2048)
			out.Message = msg
			detail := strings.TrimSpace(out.Detail)
			if detail != "" {
				// Keep upstream detail small; request bodies are not stored here, only upstream error payloads.
				sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
				out.Detail = sanitizedDetail
			} else {
				out.Detail = ""
			}
			// Drop fully-empty events (can happen if only status code was known).
			if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
				continue
			}
			evCopy := out
			sanitized = append(sanitized, &evCopy)
		}
		entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized)
		entry.UpstreamErrors = nil
	}
	if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Never bubble up to gateway; best-effort logging.
		log.Printf("[Ops] RecordError failed: %v", err)
		return err
	}
	return nil
}
// GetErrorLogs returns a filtered, paginated page of ops error logs. It
// fails with ErrOpsDisabled when monitoring is off and degrades to an empty
// first page when no repository is wired.
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo != nil {
		return s.opsRepo.ListErrorLogs(ctx, filter)
	}
	return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
}
// GetErrorLogByID loads a single ops error log with full detail. A missing
// row (or a missing repository) maps to a NotFound error; any other
// repository failure is wrapped as an internal error with its cause attached.
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	notFound := infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
	if s.opsRepo == nil {
		return nil, notFound
	}
	detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
	if err == nil {
		return detail, nil
	}
	if errors.Is(err, sql.ErrNoRows) {
		return nil, notFound
	}
	return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
}
// sanitizeAndTrimRequestBody redacts credentials from a JSON request payload
// and shrinks it until it fits maxBytes.
//
// It returns the sanitized JSON string ("" when the payload is not valid
// JSON or cannot be re-encoded), whether content had to be cut, and the
// original payload size in bytes. Shrinking is progressive: first drop the
// oldest conversation turns, then keep only essential fields, and finally
// fall back to a minimal placeholder object (still valid JSON).
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
	bytesLen = len(raw)
	if len(raw) == 0 {
		return "", false, 0
	}
	var decoded any
	if err := json.Unmarshal(raw, &decoded); err != nil {
		// If it's not valid JSON, don't store (retry would not be reliable anyway).
		return "", false, bytesLen
	}
	decoded = redactSensitiveJSON(decoded)
	encoded, err := json.Marshal(decoded)
	if err != nil {
		return "", false, bytesLen
	}
	if len(encoded) <= maxBytes {
		return string(encoded), false, bytesLen
	}
	// Trim conversation history to keep the most recent context.
	if root, ok := decoded.(map[string]any); ok {
		if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
			encoded2, err2 := json.Marshal(trimmed)
			if err2 == nil && len(encoded2) <= maxBytes {
				return string(encoded2), true, bytesLen
			}
			// Fallthrough: keep shrinking.
			decoded = trimmed
		}
		essential := shrinkToEssentials(root)
		encoded3, err3 := json.Marshal(essential)
		if err3 == nil && len(encoded3) <= maxBytes {
			return string(encoded3), true, bytesLen
		}
	}
	// Last resort: store a minimal placeholder (still valid JSON).
	placeholder := map[string]any{
		"request_body_truncated": true,
	}
	if model := extractString(decoded, "model"); model != "" {
		placeholder["model"] = model
	}
	encoded4, err4 := json.Marshal(placeholder)
	if err4 != nil {
		return "", true, bytesLen
	}
	return string(encoded4), true, bytesLen
}
// redactSensitiveJSON walks a decoded JSON value and replaces the values of
// credential-looking keys (per isSensitiveKey) with "[REDACTED]". The input
// is not mutated; a transformed copy is returned.
func redactSensitiveJSON(v any) any {
	if obj, ok := v.(map[string]any); ok {
		clean := make(map[string]any, len(obj))
		for key, val := range obj {
			if isSensitiveKey(key) {
				clean[key] = "[REDACTED]"
				continue
			}
			clean[key] = redactSensitiveJSON(val)
		}
		return clean
	}
	if arr, ok := v.([]any); ok {
		clean := make([]any, len(arr))
		for i, item := range arr {
			clean[i] = redactSensitiveJSON(item)
		}
		return clean
	}
	// Scalars (string/number/bool/nil) pass through unchanged.
	return v
}
// isSensitiveKey reports whether a JSON key is likely to carry credentials or
// other secrets and must therefore be redacted before storage. Matching is
// case-insensitive and intentionally aggressive: exact credential field
// names, sensitive suffixes, and sensitive substrings all count.
func isSensitiveKey(key string) bool {
	k := strings.ToLower(strings.TrimSpace(key))
	if k == "" {
		return false
	}
	// Exact matches (common credential fields).
	exact := map[string]struct{}{
		"authorization":       {},
		"proxy-authorization": {},
		"x-api-key":           {},
		"api_key":             {},
		"apikey":              {},
		"access_token":        {},
		"refresh_token":       {},
		"id_token":            {},
		"session_token":       {},
		"token":               {},
		"password":            {},
		"passwd":              {},
		"passphrase":          {},
		"secret":              {},
		"client_secret":       {},
		"private_key":         {},
		"jwt":                 {},
		"signature":           {},
		"accesskeyid":         {},
		"secretaccesskey":     {},
	}
	if _, ok := exact[k]; ok {
		return true
	}
	// Suffix matches.
	for _, suffix := range []string{
		"_secret", "_token", "_id_token", "_session_token",
		"_password", "_passwd", "_passphrase", "_key",
		"secret_key", "private_key",
	} {
		if strings.HasSuffix(k, suffix) {
			return true
		}
	}
	// Substring matches (conservative, but errs on the side of privacy).
	for _, sub := range []string{
		"secret", "token", "password", "passwd", "passphrase",
		"privatekey", "private_key", "apikey", "api_key",
		"accesskeyid", "secretaccesskey", "bearer", "cookie",
		"credential", "session", "jwt", "signature",
	} {
		if strings.Contains(k, sub) {
			return true
		}
	}
	return false
}
// trimConversationArrays drops the oldest conversation turns so the payload
// fits maxBytes. It knows the two conversation container fields used by the
// supported protocols: "messages" (Anthropic/OpenAI) and "contents" (Gemini).
// The bool reports whether a trimming path was taken.
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
	for _, field := range []string{"messages", "contents"} {
		if trimmed, ok := trimArrayField(root, field, maxBytes); ok {
			return trimmed, true
		}
	}
	return root, false
}
// trimArrayField drops elements from the *front* of root[field] (oldest
// context first) until the re-encoded object fits maxBytes, keeping at least
// the last element. A binary search over the drop point avoids marshalling
// O(n) times.
//
// Returns (trimmedCopy, true) when the field exists and is a non-empty array
// — even when nothing fits, in which case the single-element slice is
// returned and the caller falls back to shrinkToEssentials(). Returns
// (nil, false) when the field is absent or not a non-empty array.
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
	raw, ok := root[field]
	if !ok {
		return nil, false
	}
	arr, ok := raw.([]any)
	if !ok || len(arr) == 0 {
		return nil, false
	}
	// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
	// We are dropping from the *front* of the array (oldest context first).
	lo := 0
	hi := len(arr) - 1 // inclusive; hi ensures at least one item remains
	var best map[string]any
	found := false
	for lo <= hi {
		mid := (lo + hi) / 2
		candidateArr := arr[mid:]
		if len(candidateArr) == 0 {
			lo = mid + 1
			continue
		}
		// Shallow-copy the root so each candidate is encoded independently.
		next := shallowCopyMap(root)
		next[field] = candidateArr
		encoded, err := json.Marshal(next)
		if err != nil {
			// If marshal fails, try dropping more.
			lo = mid + 1
			continue
		}
		if len(encoded) <= maxBytes {
			best = next
			found = true
			// Try to keep more context by dropping fewer items.
			hi = mid - 1
			continue
		}
		// Need to drop more.
		lo = mid + 1
	}
	if found {
		return best, true
	}
	// Nothing fit (even with only one element); return the smallest slice and let the
	// caller fall back to shrinkToEssentials().
	next := shallowCopyMap(root)
	next[field] = arr[len(arr)-1:]
	return next, true
}
// shrinkToEssentials builds a minimal request snapshot: the generation
// parameters plus only the final "messages"/"contents" entry. Used when the
// payload cannot fit the storage budget even after history trimming.
func shrinkToEssentials(root map[string]any) map[string]any {
	essentials := make(map[string]any)
	for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
		if value, present := root[key]; present {
			essentials[key] = value
		}
	}
	// Keep only the last element of each conversation array.
	keepLast := func(field string) {
		if arr, ok := root[field].([]any); ok && len(arr) > 0 {
			essentials[field] = []any{arr[len(arr)-1]}
		}
	}
	keepLast("messages")
	keepLast("contents")
	return essentials
}
// shallowCopyMap returns a new map holding the same key/value pairs; the
// values themselves are shared with the original (shallow copy).
func shallowCopyMap(m map[string]any) map[string]any {
	dup := make(map[string]any, len(m))
	for key, value := range m {
		dup[key] = value
	}
	return dup
}
// sanitizeErrorBodyForStorage prepares an (upstream) error body for
// persistence: JSON payloads are redacted and shrunk via
// sanitizeAndTrimRequestBody; non-JSON text is best-effort truncated to
// maxBytes. The bool reports whether content was cut.
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
	body := strings.TrimSpace(raw)
	if body == "" {
		return "", false
	}
	// Prefer JSON-safe sanitization when possible.
	if jsonOut, trunc, _ := sanitizeAndTrimRequestBody([]byte(body), maxBytes); jsonOut != "" {
		return jsonOut, trunc
	}
	// Non-JSON: best-effort truncate.
	if maxBytes > 0 && len(body) > maxBytes {
		return truncateString(body, maxBytes), true
	}
	return body, false
}
// extractString returns the trimmed string stored under key when v is a JSON
// object holding a string there; in every other case it returns "".
func extractString(v any, key string) string {
	if m, ok := v.(map[string]any); ok {
		if s, ok := m[key].(string); ok {
			return strings.TrimSpace(s)
		}
	}
	return ""
}

View File

@@ -0,0 +1,465 @@
package service
import (
"context"
"encoding/json"
"errors"
"strings"
"time"
)
const (
	// opsAlertEvaluatorLeaderLockKeyDefault is the Redis key for the alert
	// evaluator's leader-election lock.
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	// opsAlertEvaluatorLeaderLockTTLDefault bounds how long a crashed leader
	// can keep holding the lock.
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
// GetEmailNotificationConfig loads the ops email notification settings from
// the settings store. A missing key yields the defaults (persisted back
// best-effort), and corrupted JSON falls back to the defaults rather than
// failing the ops UI; only a genuine store error is propagated.
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
	fallback := defaultOpsEmailNotificationConfig()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			// First read: seed the store with the defaults (best-effort).
			if encoded, encErr := json.Marshal(fallback); encErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(encoded))
			}
			return fallback, nil
		}
		return nil, err
	}
	cfg := &OpsEmailNotificationConfig{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		// Corrupted JSON must not break the ops UI; serve defaults instead.
		return fallback, nil
	}
	normalizeOpsEmailNotificationConfig(cfg)
	return cfg, nil
}
// UpdateEmailNotificationConfig applies a partial update on top of the
// stored config, validates and normalizes the merged result, and persists it.
//
// Nil request sections (Alert/Report) leave that section untouched, and
// recipient slices are only replaced when non-nil so callers can flip flags
// without resending the recipient lists. The merged config is returned.
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if req == nil {
		return nil, errors.New("invalid request")
	}
	// Start from the currently stored (or default) config and merge on top.
	cfg, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}
	if req.Alert != nil {
		cfg.Alert.Enabled = req.Alert.Enabled
		if req.Alert.Recipients != nil {
			cfg.Alert.Recipients = req.Alert.Recipients
		}
		cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
		cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
		cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
		cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
	}
	if req.Report != nil {
		cfg.Report.Enabled = req.Report.Enabled
		if req.Report.Recipients != nil {
			cfg.Report.Recipients = req.Report.Recipients
		}
		cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
		cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
		cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
		cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
		cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
		cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
		cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
		cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
		cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
		cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
	}
	// Validate first, then normalize (fills blank schedules with defaults).
	if err := validateOpsEmailNotificationConfig(cfg); err != nil {
		return nil, err
	}
	normalizeOpsEmailNotificationConfig(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
		return nil, err
	}
	return cfg, nil
}
// defaultOpsEmailNotificationConfig returns the shipped defaults: alert
// emails enabled (with no recipients yet), all scheduled reports disabled,
// and 09:00 cron schedules (Mondays for the weekly summary).
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
	return &OpsEmailNotificationConfig{
		Alert: OpsEmailAlertConfig{
			Enabled:               true,
			Recipients:            []string{},
			MinSeverity:           "",
			RateLimitPerHour:      0,
			BatchingWindowSeconds: 0,
			IncludeResolvedAlerts: false,
		},
		Report: OpsEmailReportConfig{
			Enabled:                         false,
			Recipients:                      []string{},
			DailySummaryEnabled:             false,
			DailySummarySchedule:            "0 9 * * *",
			WeeklySummaryEnabled:            false,
			WeeklySummarySchedule:           "0 9 * * 1",
			ErrorDigestEnabled:              false,
			ErrorDigestSchedule:             "0 9 * * *",
			ErrorDigestMinCount:             10,
			AccountHealthEnabled:            false,
			AccountHealthSchedule:           "0 9 * * *",
			AccountHealthErrorRateThreshold: 10.0,
		},
	}
}
// normalizeOpsEmailNotificationConfig canonicalizes a config in place: nil
// recipient slices become empty, severity/schedule strings are trimmed, and
// blank cron schedules are reset to the shipped defaults so downstream cron
// parsing never sees an empty spec. No-op on nil.
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
	if cfg == nil {
		return
	}
	if cfg.Alert.Recipients == nil {
		cfg.Alert.Recipients = []string{}
	}
	if cfg.Report.Recipients == nil {
		cfg.Report.Recipients = []string{}
	}
	cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
	// Trim each schedule, falling back to its default when blank so clients
	// sending empty strings cannot break the cron logic.
	trimOrDefault := func(spec *string, fallback string) {
		*spec = strings.TrimSpace(*spec)
		if *spec == "" {
			*spec = fallback
		}
	}
	trimOrDefault(&cfg.Report.DailySummarySchedule, "0 9 * * *")
	trimOrDefault(&cfg.Report.WeeklySummarySchedule, "0 9 * * 1")
	trimOrDefault(&cfg.Report.ErrorDigestSchedule, "0 9 * * *")
	trimOrDefault(&cfg.Report.AccountHealthSchedule, "0 9 * * *")
}
// validateOpsEmailNotificationConfig rejects nil configs and out-of-range or
// unknown field values before the config is persisted.
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
	if cfg == nil {
		return errors.New("invalid config")
	}
	if cfg.Alert.RateLimitPerHour < 0 {
		return errors.New("alert.rate_limit_per_hour must be >= 0")
	}
	if cfg.Alert.BatchingWindowSeconds < 0 {
		return errors.New("alert.batching_window_seconds must be >= 0")
	}
	severity := strings.TrimSpace(cfg.Alert.MinSeverity)
	if severity != "" && severity != "critical" && severity != "warning" && severity != "info" {
		return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
	}
	if cfg.Report.ErrorDigestMinCount < 0 {
		return errors.New("report.error_digest_min_count must be >= 0")
	}
	threshold := cfg.Report.AccountHealthErrorRateThreshold
	if threshold < 0 || threshold > 100 {
		return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
	}
	return nil
}
// =========================
// Alert runtime settings
// =========================
// defaultOpsAlertRuntimeSettings returns the built-in alert evaluator runtime
// configuration: evaluate every 60 seconds, acquire the distributed leader
// lock using the package-level default key/TTL, and keep silencing disabled
// with empty silence metadata.
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
	return &OpsAlertRuntimeSettings{
		EvaluationIntervalSeconds: 60,
		DistributedLock: OpsDistributedLockSettings{
			Enabled:    true,
			Key:        opsAlertEvaluatorLeaderLockKeyDefault,
			TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
		},
		Silencing: OpsAlertSilencingSettings{
			Enabled:            false,
			GlobalUntilRFC3339: "",
			GlobalReason:       "",
			Entries:            []OpsAlertSilenceEntry{},
		},
	}
}
// normalizeOpsDistributedLockSettings trims the lock key and substitutes the
// provided defaults for an empty key or a non-positive TTL. A nil settings
// pointer is left untouched.
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
	if s == nil {
		return
	}
	if key := strings.TrimSpace(s.Key); key != "" {
		s.Key = key
	} else {
		s.Key = defaultKey
	}
	if s.TTLSeconds < 1 {
		s.TTLSeconds = defaultTTLSeconds
	}
}
// normalizeOpsAlertSilencingSettings trims all free-form strings in the
// silencing settings (global fields and every entry) and guarantees Entries is
// a non-nil slice. A nil settings pointer is left untouched.
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
	if s == nil {
		return
	}
	s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
	s.GlobalReason = strings.TrimSpace(s.GlobalReason)
	if s.Entries == nil {
		s.Entries = []OpsAlertSilenceEntry{}
	}
	for i := range s.Entries {
		entry := &s.Entries[i]
		entry.UntilRFC3339 = strings.TrimSpace(entry.UntilRFC3339)
		entry.Reason = strings.TrimSpace(entry.Reason)
	}
}
// validateOpsDistributedLockSettings checks that a non-blank lock key is
// present and the TTL is within (0, 24h] expressed in seconds.
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
	if strings.TrimSpace(s.Key) == "" {
		return errors.New("distributed_lock.key is required")
	}
	maxTTLSeconds := int((24 * time.Hour).Seconds()) // 86400
	if s.TTLSeconds <= 0 || s.TTLSeconds > maxTTLSeconds {
		return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
	}
	return nil
}
// validateOpsAlertSilencingSettings verifies that every configured silence
// timestamp parses as RFC3339. The global timestamp may be empty (no global
// silence); each entry's timestamp is mandatory.
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
	if strings.TrimSpace(s.GlobalUntilRFC3339) != "" {
		if _, err := time.Parse(time.RFC3339, s.GlobalUntilRFC3339); err != nil {
			return errors.New("silencing time must be RFC3339")
		}
	}
	for _, entry := range s.Entries {
		if strings.TrimSpace(entry.UntilRFC3339) == "" {
			return errors.New("silencing.entries.until_rfc3339 is required")
		}
		if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
			return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
		}
	}
	return nil
}
// GetOpsAlertRuntimeSettings loads the alert evaluator runtime settings from
// the settings repository, seeding the store with defaults on first read.
//
// Fail-open behavior:
//   - nil receiver or missing repo: return defaults
//   - key not found: best-effort persist defaults, then return them
//   - corrupt stored JSON: return defaults
//
// Any other repository error is returned to the caller. The loaded value is
// normalized (lock key/TTL, silencing strings) before being returned.
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
	defaultCfg := defaultOpsAlertRuntimeSettings()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			// First read: seed the setting with defaults. The write error is
			// ignored because returning the defaults is still correct.
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}
	cfg := &OpsAlertRuntimeSettings{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		// Corrupt stored value: fall back to defaults rather than failing.
		return defaultCfg, nil
	}
	if cfg.EvaluationIntervalSeconds <= 0 {
		cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
	}
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)
	return cfg, nil
}
// UpdateOpsAlertRuntimeSettings validates, normalizes, and persists the alert
// evaluator runtime settings, returning the stored value.
//
// Validation rules:
//   - evaluation_interval_seconds must be in [1, 86400]
//   - the distributed-lock and silencing sub-configs are validated only when
//     their Enabled flag is set
//
// Returns an error when the repository is unavailable, cfg is nil, validation
// fails, or persistence fails.
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}
	if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if cfg.DistributedLock.Enabled {
		if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
			return nil, err
		}
	}
	if cfg.Silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
			return nil, err
		}
	}
	defaultCfg := defaultOpsAlertRuntimeSettings()
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
		return nil, err
	}
	// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
	updated := &OpsAlertRuntimeSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
// =========================
// Advanced settings
// =========================
// defaultOpsAdvancedSettings returns the built-in advanced ops configuration:
// scheduled cleanup disabled (daily at 02:00 once enabled), 30-day retention
// for error logs and minute/hourly metrics, and pre-aggregation disabled.
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
	return &OpsAdvancedSettings{
		DataRetention: OpsDataRetentionSettings{
			CleanupEnabled:             false,
			CleanupSchedule:            "0 2 * * *",
			ErrorLogRetentionDays:      30,
			MinuteMetricsRetentionDays: 30,
			HourlyMetricsRetentionDays: 30,
		},
		Aggregation: OpsAggregationSettings{
			AggregationEnabled: false,
		},
	}
}
// normalizeOpsAdvancedSettings fills empty or non-positive data-retention
// fields in place with their defaults: cleanup schedule "0 2 * * *" and
// 30-day retention windows. A nil config is left untouched.
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
	if cfg == nil {
		return
	}
	retention := &cfg.DataRetention
	retention.CleanupSchedule = strings.TrimSpace(retention.CleanupSchedule)
	if retention.CleanupSchedule == "" {
		retention.CleanupSchedule = "0 2 * * *"
	}
	// All three retention windows share the same 30-day default.
	for _, days := range []*int{
		&retention.ErrorLogRetentionDays,
		&retention.MinuteMetricsRetentionDays,
		&retention.HourlyMetricsRetentionDays,
	} {
		if *days <= 0 {
			*days = 30
		}
	}
}
// validateOpsAdvancedSettings ensures each retention window is within
// [1, 365] days. A nil config is invalid.
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
	if cfg == nil {
		return errors.New("invalid config")
	}
	// Checked in declaration order so the first out-of-range field wins.
	checks := []struct {
		days int
		msg  string
	}{
		{cfg.DataRetention.ErrorLogRetentionDays, "error_log_retention_days must be between 1 and 365"},
		{cfg.DataRetention.MinuteMetricsRetentionDays, "minute_metrics_retention_days must be between 1 and 365"},
		{cfg.DataRetention.HourlyMetricsRetentionDays, "hourly_metrics_retention_days must be between 1 and 365"},
	}
	for _, check := range checks {
		if check.days < 1 || check.days > 365 {
			return errors.New(check.msg)
		}
	}
	return nil
}
// GetOpsAdvancedSettings loads the advanced ops settings (data retention,
// aggregation) from the settings repository, seeding the store with defaults
// on first read.
//
// Fail-open behavior mirrors GetOpsAlertRuntimeSettings: a nil receiver/repo,
// a missing key, or corrupt stored JSON all yield the defaults; only other
// repository errors are propagated. The result is normalized before return.
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
	defaultCfg := defaultOpsAdvancedSettings()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			// First read: seed the store with defaults; write errors are
			// ignored because the defaults are returned either way.
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}
	cfg := &OpsAdvancedSettings{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		// Corrupt stored value: fall back to defaults rather than failing.
		return defaultCfg, nil
	}
	normalizeOpsAdvancedSettings(cfg)
	return cfg, nil
}
// UpdateOpsAdvancedSettings validates, normalizes, and persists the advanced
// ops settings, returning the stored value.
//
// Returns an error when the repository is unavailable, cfg is nil, a retention
// window is outside [1, 365] days, or persistence fails. A fresh copy decoded
// from the persisted JSON is returned so callers never hold pointers into the
// input value.
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}
	if err := validateOpsAdvancedSettings(cfg); err != nil {
		return nil, err
	}
	normalizeOpsAdvancedSettings(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
		return nil, err
	}
	updated := &OpsAdvancedSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}

View File

@@ -0,0 +1,87 @@
package service
// Ops settings models stored in DB `settings` table (JSON blobs).
// OpsEmailNotificationConfig is the root email-notification config blob,
// combining alert emails and scheduled report emails.
type OpsEmailNotificationConfig struct {
	Alert  OpsEmailAlertConfig  `json:"alert"`
	Report OpsEmailReportConfig `json:"report"`
}

// OpsEmailAlertConfig controls alert email delivery.
type OpsEmailAlertConfig struct {
	Enabled    bool     `json:"enabled"`
	Recipients []string `json:"recipients"`
	// MinSeverity is one of "critical", "warning", "info", or empty
	// (validated elsewhere; empty means the default).
	MinSeverity           string `json:"min_severity"`
	RateLimitPerHour      int    `json:"rate_limit_per_hour"`
	BatchingWindowSeconds int    `json:"batching_window_seconds"`
	IncludeResolvedAlerts bool   `json:"include_resolved_alerts"`
}

// OpsEmailReportConfig controls scheduled report emails. Schedule fields hold
// cron expressions; empty schedules are replaced with defaults on normalize.
type OpsEmailReportConfig struct {
	Enabled                         bool     `json:"enabled"`
	Recipients                      []string `json:"recipients"`
	DailySummaryEnabled             bool     `json:"daily_summary_enabled"`
	DailySummarySchedule            string   `json:"daily_summary_schedule"`
	WeeklySummaryEnabled            bool     `json:"weekly_summary_enabled"`
	WeeklySummarySchedule           string   `json:"weekly_summary_schedule"`
	ErrorDigestEnabled              bool     `json:"error_digest_enabled"`
	ErrorDigestSchedule             string   `json:"error_digest_schedule"`
	ErrorDigestMinCount             int      `json:"error_digest_min_count"`
	AccountHealthEnabled            bool     `json:"account_health_enabled"`
	AccountHealthSchedule           string   `json:"account_health_schedule"`
	AccountHealthErrorRateThreshold float64  `json:"account_health_error_rate_threshold"`
}

// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
// frontend can still send the full config shape. Nil sections are untouched.
type OpsEmailNotificationConfigUpdateRequest struct {
	Alert  *OpsEmailAlertConfig  `json:"alert"`
	Report *OpsEmailReportConfig `json:"report"`
}

// OpsDistributedLockSettings configures the leader lock used by background
// ops services (key and TTL in seconds).
type OpsDistributedLockSettings struct {
	Enabled    bool   `json:"enabled"`
	Key        string `json:"key"`
	TTLSeconds int    `json:"ttl_seconds"`
}

// OpsAlertSilenceEntry silences matching alerts until the given RFC3339 time.
// A nil RuleID / empty Severities matches any rule / severity respectively —
// TODO confirm matching semantics against the evaluator.
type OpsAlertSilenceEntry struct {
	RuleID       *int64   `json:"rule_id,omitempty"`
	Severities   []string `json:"severities,omitempty"`
	UntilRFC3339 string   `json:"until_rfc3339"`
	Reason       string   `json:"reason"`
}

// OpsAlertSilencingSettings holds a global silence window plus per-rule
// silence entries.
type OpsAlertSilencingSettings struct {
	Enabled            bool                   `json:"enabled"`
	GlobalUntilRFC3339 string                 `json:"global_until_rfc3339"`
	GlobalReason       string                 `json:"global_reason"`
	Entries            []OpsAlertSilenceEntry `json:"entries,omitempty"`
}

// OpsAlertRuntimeSettings is the persisted runtime config for the alert
// evaluator loop.
type OpsAlertRuntimeSettings struct {
	EvaluationIntervalSeconds int                        `json:"evaluation_interval_seconds"`
	DistributedLock           OpsDistributedLockSettings `json:"distributed_lock"`
	Silencing                 OpsAlertSilencingSettings  `json:"silencing"`
}

// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
	DataRetention OpsDataRetentionSettings `json:"data_retention"`
	Aggregation   OpsAggregationSettings   `json:"aggregation"`
}

// OpsDataRetentionSettings configures scheduled cleanup of ops data
// (cron schedule plus per-table retention windows in days).
type OpsDataRetentionSettings struct {
	CleanupEnabled             bool   `json:"cleanup_enabled"`
	CleanupSchedule            string `json:"cleanup_schedule"`
	ErrorLogRetentionDays      int    `json:"error_log_retention_days"`
	MinuteMetricsRetentionDays int    `json:"minute_metrics_retention_days"`
	HourlyMetricsRetentionDays int    `json:"hourly_metrics_retention_days"`
}

// OpsAggregationSettings toggles hourly/daily pre-aggregation.
type OpsAggregationSettings struct {
	AggregationEnabled bool `json:"aggregation_enabled"`
}

View File

@@ -0,0 +1,65 @@
package service
import "time"
// OpsThroughputTrendPoint is one time bucket of throughput data with derived
// per-second rates.
type OpsThroughputTrendPoint struct {
	BucketStart   time.Time `json:"bucket_start"`
	RequestCount  int64     `json:"request_count"`
	TokenConsumed int64     `json:"token_consumed"`
	QPS           float64   `json:"qps"`
	TPS           float64   `json:"tps"`
}

// OpsThroughputPlatformBreakdownItem totals throughput for one platform.
type OpsThroughputPlatformBreakdownItem struct {
	Platform      string `json:"platform"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputGroupBreakdownItem totals throughput for one account group.
type OpsThroughputGroupBreakdownItem struct {
	GroupID       int64  `json:"group_id"`
	GroupName     string `json:"group_name"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputTrendResponse is the bucketed throughput series plus optional
// drilldown breakdowns.
type OpsThroughputTrendResponse struct {
	Bucket string                     `json:"bucket"`
	Points []*OpsThroughputTrendPoint `json:"points"`
	// Optional drilldown helpers:
	// - When no platform/group is selected: returns totals by platform.
	// - When platform is selected but group is not: returns top groups in that platform.
	ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
	TopGroups  []*OpsThroughputGroupBreakdownItem    `json:"top_groups,omitempty"`
}

// OpsErrorTrendPoint is one time bucket of error counts, split by error class
// and upstream status code.
type OpsErrorTrendPoint struct {
	BucketStart                  time.Time `json:"bucket_start"`
	ErrorCountTotal              int64     `json:"error_count_total"`
	BusinessLimitedCount         int64     `json:"business_limited_count"`
	ErrorCountSLA                int64     `json:"error_count_sla"`
	UpstreamErrorCountExcl429529 int64     `json:"upstream_error_count_excl_429_529"`
	Upstream429Count             int64     `json:"upstream_429_count"`
	Upstream529Count             int64     `json:"upstream_529_count"`
}

// OpsErrorTrendResponse is the bucketed error-count series.
type OpsErrorTrendResponse struct {
	Bucket string                `json:"bucket"`
	Points []*OpsErrorTrendPoint `json:"points"`
}

// OpsErrorDistributionItem is the error count for one HTTP status code.
type OpsErrorDistributionItem struct {
	StatusCode      int   `json:"status_code"`
	Total           int64 `json:"total"`
	SLA             int64 `json:"sla"`
	BusinessLimited int64 `json:"business_limited"`
}

// OpsErrorDistributionResponse is the error breakdown by status code.
type OpsErrorDistributionResponse struct {
	Total int64                       `json:"total"`
	Items []*OpsErrorDistributionItem `json:"items"`
}

View File

@@ -0,0 +1,26 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetThroughputTrend returns bucketed request/token throughput for the window
// described by filter. It requires monitoring to be enabled and a well-formed
// filter (non-nil, both endpoints set, start <= end) before delegating to the
// ops repository.
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	switch {
	case s.opsRepo == nil:
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	case filter == nil:
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	case filter.StartTime.IsZero() || filter.EndTime.IsZero():
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	case filter.StartTime.After(filter.EndTime):
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
}

View File

@@ -0,0 +1,94 @@
package service
import (
"encoding/json"
"strings"
"time"
"github.com/gin-gonic/gin"
)
// Gin context keys used by Ops error logger for capturing upstream error details.
// These keys are set by gateway services and consumed by handler/ops_error_logger.go.
const (
	// OpsUpstreamStatusCodeKey holds the final upstream HTTP status code (int).
	OpsUpstreamStatusCodeKey = "ops_upstream_status_code"
	// OpsUpstreamErrorMessageKey holds the trimmed upstream error message (string).
	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
	// OpsUpstreamErrorDetailKey holds the trimmed upstream error detail (string).
	OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
	// OpsUpstreamErrorsKey holds the []*OpsUpstreamErrorEvent accumulated per request.
	OpsUpstreamErrorsKey = "ops_upstream_errors"
)
// setOpsUpstreamError records the final upstream failure (status code,
// message, detail) on the gin context so the ops error logger can persist it.
// A nil context, non-positive status code, and blank strings are all ignored.
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
	if c == nil {
		return
	}
	if upstreamStatusCode > 0 {
		c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode)
	}
	// Each string field is stored under its own key only when non-blank.
	for key, raw := range map[string]string{
		OpsUpstreamErrorMessageKey: upstreamMessage,
		OpsUpstreamErrorDetailKey:  upstreamDetail,
	} {
		if value := strings.TrimSpace(raw); value != "" {
			c.Set(key, value)
		}
	}
}
// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request.
// It is stored in ops_error_logs.upstream_errors as a JSON array.
type OpsUpstreamErrorEvent struct {
	// AtUnixMs is the event time in Unix milliseconds (filled with "now" when unset).
	AtUnixMs int64 `json:"at_unix_ms,omitempty"`

	// Context
	Platform  string `json:"platform,omitempty"`
	AccountID int64  `json:"account_id,omitempty"`

	// Outcome
	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`

	// Kind: http_error | request_error | retry_exhausted | failover
	Kind    string `json:"kind,omitempty"`
	Message string `json:"message,omitempty"`
	Detail  string `json:"detail,omitempty"`
}
// appendOpsUpstreamError appends one upstream error event to the per-request
// list stored on the gin context under OpsUpstreamErrorsKey. String fields are
// trimmed, the timestamp defaults to now, and non-empty messages are passed
// through sanitizeUpstreamErrorMessage. A nil context is ignored.
func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
	if c == nil {
		return
	}
	if ev.AtUnixMs <= 0 {
		ev.AtUnixMs = time.Now().UnixMilli()
	}
	ev.Platform = strings.TrimSpace(ev.Platform)
	ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
	ev.Kind = strings.TrimSpace(ev.Kind)
	ev.Detail = strings.TrimSpace(ev.Detail)
	ev.Message = strings.TrimSpace(ev.Message)
	if ev.Message != "" {
		ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
	}

	// Load any previously recorded events for this request; tolerate a
	// missing key or an unexpected value type by starting fresh.
	var events []*OpsUpstreamErrorEvent
	if v, exists := c.Get(OpsUpstreamErrorsKey); exists {
		if prior, ok := v.([]*OpsUpstreamErrorEvent); ok {
			events = prior
		}
	}
	stored := ev // copy so the stored pointer is independent of the parameter
	c.Set(OpsUpstreamErrorsKey, append(events, &stored))
}
// marshalOpsUpstreamErrors serializes the event list to a JSON array string.
// It returns nil (stored as SQL NULL) for an empty list or a marshal failure,
// so the column only ever holds a valid JSON value.
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
	if len(events) == 0 {
		return nil
	}
	raw, err := json.Marshal(events)
	if err != nil || len(raw) == 0 {
		return nil
	}
	out := string(raw)
	return &out
}

View File

@@ -0,0 +1,24 @@
package service
import (
"context"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetWindowStats returns lightweight request/token counts for the
// [startTime, endTime] window. It is intended for realtime sampling
// (e.g. WebSocket QPS push) and skips percentile/peak computation.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	return s.opsRepo.GetWindowStats(ctx, &OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
	})
}

View File

@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
}
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
if upstreamMsg != "" {
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
}
switch statusCode {
case 401:
// 认证失败:停止调度,记录错误
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials")
msg := "Authentication failed (401): invalid or expired credentials"
if upstreamMsg != "" {
msg = "Authentication failed (401): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true
case 402:
// 支付要求:余额不足或计费问题,停止调度
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue")
msg := "Payment required (402): insufficient balance or billing issue"
if upstreamMsg != "" {
msg = "Payment required (402): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true
case 403:
// 禁止访问:停止调度,记录错误
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions")
msg := "Access forbidden (403): account may be suspended or lack permissions"
if upstreamMsg != "" {
msg = "Access forbidden (403): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true
case 429:
s.handle429(ctx, account, headers)

View File

@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
}
// LinuxDo Connect OAuth 登录(终端用户 SSO
// LinuxDo Connect OAuth 登录
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
// Ops monitoring (vNext)
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
err := s.settingRepo.SetMultiple(ctx, updates)
if err == nil && s.onUpdate != nil {
s.onUpdate() // Invalidate cache after settings update
@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults
SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "",
// Ops monitoring defaults (vNext)
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
}
return s.settingRepo.SetMultiple(ctx, defaults)
@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
}
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
// Ops monitoring settings (default: enabled, fail-open)
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
result.OpsMetricsIntervalSeconds = 60
if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
if v, err := strconv.Atoi(raw); err == nil {
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
result.OpsMetricsIntervalSeconds = v
}
}
return result
}
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。
//
// 优先级:
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值
// - 否则回退到 config.yaml/env 的值
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
if s == nil || s.cfg == nil {
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
}
effective := s.cfg.LinuxDo
keys := []string{
SettingKeyLinuxDoConnectEnabled,
SettingKeyLinuxDoConnectClientID,
SettingKeyLinuxDoConnectClientSecret,
SettingKeyLinuxDoConnectRedirectURL,
}
settings, err := s.settingRepo.GetMultiple(ctx, keys)
if err != nil {
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
}
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
effective.Enabled = raw == "true"
}
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
effective.ClientID = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
effective.ClientSecret = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
effective.RedirectURL = strings.TrimSpace(v)
}
if !effective.Enabled {
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
}
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
if strings.TrimSpace(effective.ClientID) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
}
if strings.TrimSpace(effective.AuthorizeURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
}
if strings.TrimSpace(effective.TokenURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
}
if strings.TrimSpace(effective.UserInfoURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
}
if strings.TrimSpace(effective.RedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
}
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
}
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
}
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
}
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
switch method {
case "", "client_secret_post", "client_secret_basic":
if strings.TrimSpace(effective.ClientSecret) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
}
case "none":
if !effective.UsePKCE {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
}
func isFalseSettingValue(value string) bool {
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return true
default:
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
return false
}
return effective, nil
}
// getStringOrDefault 获取字符串值或默认值
@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
}
return value
}
// GetLinuxDoConnectOAuthConfig returns the effective LinuxDo Connect
// configuration used for login.
//
// Precedence:
//   - if the corresponding system-setting key exists, it overrides the value
//     from config.yaml/env
//   - otherwise the config.yaml/env value is used
//
// Returns an error when the service/config is not ready, the settings lookup
// fails, OAuth login is disabled, or the merged configuration is incomplete
// or invalid.
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
	if s == nil || s.cfg == nil {
		return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
	}
	// Start from the file/env config, then overlay DB-stored settings.
	effective := s.cfg.LinuxDo
	keys := []string{
		SettingKeyLinuxDoConnectEnabled,
		SettingKeyLinuxDoConnectClientID,
		SettingKeyLinuxDoConnectClientSecret,
		SettingKeyLinuxDoConnectRedirectURL,
	}
	settings, err := s.settingRepo.GetMultiple(ctx, keys)
	if err != nil {
		return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
	}
	if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
		effective.Enabled = raw == "true"
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
		effective.ClientID = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
		effective.ClientSecret = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
		effective.RedirectURL = strings.TrimSpace(v)
	}
	if !effective.Enabled {
		return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
	}
	// Basic sanity validation (avoid redirecting the user into an OAuth flow
	// that is bound to fail or is unsafe).
	if strings.TrimSpace(effective.ClientID) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
	}
	if strings.TrimSpace(effective.AuthorizeURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
	}
	if strings.TrimSpace(effective.TokenURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
	}
	if strings.TrimSpace(effective.UserInfoURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
	}
	if strings.TrimSpace(effective.RedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
	}
	if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
	}
	if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
	}
	method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
	switch method {
	case "", "client_secret_post", "client_secret_basic":
		if strings.TrimSpace(effective.ClientSecret) == "" {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
		}
	case "none":
		// token_auth_method=none is only acceptable with PKCE enabled.
		if !effective.UsePKCE {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
		}
	default:
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
	}
	return effective, nil
}

View File

@@ -18,7 +18,7 @@ type SystemSettings struct {
TurnstileSecretKey string
TurnstileSecretKeyConfigured bool
// LinuxDo Connect OAuth 登录(终端用户 SSO
// LinuxDo Connect OAuth 登录
LinuxDoConnectEnabled bool
LinuxDoConnectClientID string
LinuxDoConnectClientSecret string
@@ -46,6 +46,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
}
type PublicSettings struct {

View File

@@ -1,10 +1,12 @@
package service
import (
"database/sql"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire"
"github.com/redis/go-redis/v9"
)
// BuildInfo contains build information
@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc
}
// ProvideOpsMetricsCollector wires up an OpsMetricsCollector and starts its
// background collection loop before handing it to the injector.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	c := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
	c.Start()
	return c
}
// ProvideOpsAggregationService wires up the hourly/daily pre-aggregation
// service and starts its background loop before handing it to the injector.
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	aggregation := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	aggregation.Start()
	return aggregation
}
// ProvideOpsAlertEvaluatorService wires up the alert evaluator and starts its
// background evaluation loop before handing it to the injector.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	evaluator := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	evaluator.Start()
	return evaluator
}
// ProvideOpsCleanupService wires up the cron-scheduled ops data cleanup
// service and starts it before handing it to the injector.
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	cleanup := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	cleanup.Start()
	return cleanup
}
// ProvideOpsScheduledReportService assembles the scheduled ops-report sender
// and launches its background schedule; stopped via the cleanup chain.
func ProvideOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	reporter := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
	reporter.Start()
	return reporter
}
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
return apiKeyService
@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService,
NewAccountTestService,
NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
ProvideOpsScheduledReportService,
NewEmailService,
ProvideEmailQueueService,
NewTurnstileService,

View File

@@ -0,0 +1,717 @@
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
-- Bound lock waits and total runtime so a stuck migration fails fast instead of
-- blocking other sessions. SET LOCAL only takes effect inside a transaction.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
-- (recreated from scratch below; ops_* data is intentionally discarded)
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
-- No foreign keys by design (see file header): user_id/account_id/group_id are
-- soft references resolved at query time.
CREATE TABLE IF NOT EXISTS ops_error_logs (
    id BIGSERIAL PRIMARY KEY,
    -- Correlation / identities
    request_id VARCHAR(64),
    client_request_id VARCHAR(64),
    user_id BIGINT,
    api_key_id BIGINT,
    account_id BIGINT,
    group_id BIGINT,
    client_ip inet,
    -- Dimensions for global filtering
    platform VARCHAR(32),
    -- Request metadata
    model VARCHAR(100),
    request_path VARCHAR(256),
    stream BOOLEAN NOT NULL DEFAULT false,
    user_agent TEXT,
    -- Core error classification
    error_phase VARCHAR(32) NOT NULL,
    error_type VARCHAR(64) NOT NULL,
    severity VARCHAR(8) NOT NULL DEFAULT 'P2',
    status_code INT,
    -- vNext metric semantics: business-limited errors are excluded from SLA counts
    is_business_limited BOOLEAN NOT NULL DEFAULT false,
    -- Error details (sanitized/truncated at ingest time)
    error_message TEXT,
    error_body TEXT,
    -- Provider/upstream details (optional; useful for trends & account health)
    error_source VARCHAR(64),
    error_owner VARCHAR(32),
    account_status VARCHAR(50),
    upstream_status_code INT,
    upstream_error_message TEXT,
    upstream_error_detail TEXT,
    provider_error_code VARCHAR(64),
    provider_error_type VARCHAR(64),
    network_error_type VARCHAR(50),
    retry_after_seconds INT,
    -- Timings (ms) - optional
    duration_ms INT,
    time_to_first_token_ms BIGINT,
    auth_latency_ms BIGINT,
    routing_latency_ms BIGINT,
    upstream_latency_ms BIGINT,
    response_latency_ms BIGINT,
    -- Retry context (only stored for error requests)
    request_body JSONB,
    request_headers JSONB,
    request_body_truncated BOOLEAN NOT NULL DEFAULT false,
    request_body_bytes INT,
    -- Retryability flags (best-effort classification)
    is_retryable BOOLEAN NOT NULL DEFAULT false,
    retry_count INT NOT NULL DEFAULT 0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
-- source_error_id soft-references ops_error_logs.id; a partial unique index in
-- the index section below prevents concurrent active retries per error row.
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    requested_by_user_id BIGINT,
    source_error_id BIGINT,
    -- client|upstream
    mode VARCHAR(16) NOT NULL,
    pinned_account_id BIGINT,
    -- queued|running|succeeded|failed
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    started_at TIMESTAMPTZ,
    finished_at TIMESTAMPTZ,
    duration_ms BIGINT,
    -- Optional result correlation
    result_request_id VARCHAR(64),
    result_error_id BIGINT,
    result_usage_request_id VARCHAR(64),
    error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
CREATE TABLE IF NOT EXISTS ops_system_metrics (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    window_minutes INT NOT NULL DEFAULT 1,
    -- Optional dimensions (only if collector chooses to write per-dimension snapshots)
    platform VARCHAR(32),
    group_id BIGINT,
    -- Core counts
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,
    token_consumed BIGINT NOT NULL DEFAULT 0,
    -- Rates
    qps DOUBLE PRECISION,
    tps DOUBLE PRECISION,
    -- Duration percentiles (ms) - success requests
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    duration_avg_ms DOUBLE PRECISION,
    duration_max_ms INT,
    -- TTFT percentiles (ms) - success requests (streaming)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    ttft_avg_ms DOUBLE PRECISION,
    ttft_max_ms INT,
    -- System resources
    cpu_usage_percent DOUBLE PRECISION,
    memory_used_mb BIGINT,
    memory_total_mb BIGINT,
    memory_usage_percent DOUBLE PRECISION,
    -- Dependency health (best-effort)
    db_ok BOOLEAN,
    redis_ok BOOLEAN,
    -- DB pool & runtime
    db_conn_active INT,
    db_conn_idle INT,
    db_conn_waiting INT,
    goroutine_count INT,
    -- Queue / concurrency
    concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per job, upserted by job name; keeps only the latest run state.
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
    job_name VARCHAR(64) PRIMARY KEY,
    last_run_at TIMESTAMPTZ,
    last_success_at TIMESTAMPTZ,
    last_error_at TIMESTAMPTZ,
    last_error TEXT,
    last_duration_ms BIGINT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
-- NOTE(review): the column default is 'warning' but the seed migration below
-- inserts 'P0'/'P1'/'P2' severities — confirm the intended severity vocabulary.
CREATE TABLE IF NOT EXISTS ops_alert_rules (
    id BIGSERIAL PRIMARY KEY,
    name VARCHAR(128) NOT NULL,
    description TEXT,
    enabled BOOLEAN NOT NULL DEFAULT true,
    severity VARCHAR(16) NOT NULL DEFAULT 'warning',
    -- Metric definition
    metric_type VARCHAR(64) NOT NULL,
    operator VARCHAR(8) NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,
    window_minutes INT NOT NULL DEFAULT 5,
    sustained_minutes INT NOT NULL DEFAULT 5,
    cooldown_minutes INT NOT NULL DEFAULT 10,
    -- Optional scoping: platform/group filters etc.
    filters JSONB,
    last_triggered_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Name is the natural key; the seed migration relies on ON CONFLICT (name).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
    ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
    ON ops_alert_rules (enabled);
CREATE TABLE IF NOT EXISTS ops_alert_events (
    id BIGSERIAL PRIMARY KEY,
    rule_id BIGINT,
    severity VARCHAR(16) NOT NULL,
    status VARCHAR(16) NOT NULL DEFAULT 'firing',
    title VARCHAR(200),
    description TEXT,
    metric_value DOUBLE PRECISION,
    threshold_value DOUBLE PRECISION,
    dimensions JSONB,
    fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    resolved_at TIMESTAMPTZ,
    email_sent BOOLEAN NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
    ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
    ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1-24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
-- Dimension encoding: overall rows have platform IS NULL AND group_id IS NULL;
-- platform rows set platform only; group rows set group_id only.
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
    id BIGSERIAL PRIMARY KEY,
    bucket_start TIMESTAMPTZ NOT NULL,
    platform VARCHAR(32),
    group_id BIGINT,
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,
    token_consumed BIGINT NOT NULL DEFAULT 0,
    -- Duration percentiles (ms)
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    -- TTFT percentiles (ms)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three “dimension modes” (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
-- ('' and 0 act as sentinels; real rows must not use empty platform / group_id 0.)
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
    ON ops_metrics_hourly (
        bucket_start,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
    ON ops_metrics_hourly (bucket_start DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
    ON ops_metrics_hourly (platform, bucket_start DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
    ON ops_metrics_hourly (group_id, bucket_start DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same layout as ops_metrics_hourly, keyed by calendar date instead of hour.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
    id BIGSERIAL PRIMARY KEY,
    bucket_date DATE NOT NULL,
    platform VARCHAR(32),
    group_id BIGINT,
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,
    token_consumed BIGINT NOT NULL DEFAULT 0,
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
    ON ops_metrics_daily (
        bucket_date,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
    ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
    ON ops_metrics_daily (platform, bucket_date DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
    ON ops_metrics_daily (group_id, bucket_date DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- ops_error_logs: every secondary key is paired with created_at DESC so the
-- common "filter + newest first" query is index-only ordered.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
    ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
    ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
    ON ops_error_logs (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
    ON ops_error_logs (account_id, created_at DESC)
    WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
    ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
    ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
    ON ops_error_logs (error_type, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
    ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
    ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
    ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
    ON ops_system_metrics (platform, created_at DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
    ON ops_system_metrics (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
    ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
    ON ops_retry_attempts (source_error_id, created_at DESC)
    WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
    ON ops_retry_attempts (source_error_id)
    WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Extension creation may require superuser; failures are downgraded to NOTICE
-- and the trigram indexes are only built when the extension actually exists.
DO $$
BEGIN
    BEGIN
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
    EXCEPTION WHEN OTHERS THEN
        -- Missing privileges or extension package should not block migrations.
        RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
    END;
    IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
        -- request_id / client_request_id fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
            ON ops_error_logs USING gin (request_id gin_trgm_ops)';
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
            ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
        -- error_message fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
            ON ops_error_logs USING gin (error_message gin_trgm_ops)';
    END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table: add avg/max so the dashboard can answer them from preagg data.
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table: same four columns.
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Default true: existing rules keep sending email notifications after upgrade.
ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
-- - upstream_error_rate excludes 429/529.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Seed the default alert rule set in a single idempotent statement.
-- Rows whose name already exists are skipped via ON CONFLICT (name) DO NOTHING
-- (backed by idx_ops_alert_rules_name_unique), so re-running is a no-op and
-- operator-tuned thresholds are never overwritten.
-- Fix vs. previous revision: the memory-rule description was missing its
-- closing fullwidth parenthesis.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES
    -- 1) High error rate (P1)
    ('错误率过高', '当错误率超过 5% 且持续 5 分钟时触发告警',
     true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()),
    -- 2) Low success rate (P0)
    ('成功率过低', '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
     true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()),
    -- 3) P99 latency too high (P2)
    ('P99延迟过高', '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
     true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()),
    -- 4) P95 latency too high (P2)
    ('P95延迟过高', '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
     true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()),
    -- 5) CPU usage too high (P2)
    ('CPU使用率过高', '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
     true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()),
    -- 6) Memory usage too high (P1)
    ('内存使用率过高', '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
     true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()),
    -- 7) Concurrency queue buildup (P1)
    ('并发队列积压', '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
     true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()),
    -- 8) Extremely high error rate (P0)
    ('错误率极高', '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
     true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW())
ON CONFLICT (name) DO NOTHING;
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
-- This migration is intentionally idempotent.
-- Nullable: older snapshot rows simply have no Redis pool data.
ALTER TABLE ops_system_metrics
    ADD COLUMN IF NOT EXISTS redis_conn_total INT,
    ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';

View File

@@ -0,0 +1,9 @@
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
--
-- This is intentionally idempotent.
-- Nullable by design: only requests with recorded upstream events populate it.
ALTER TABLE ops_error_logs
    ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
    'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';