Merge branch 'main' of https://github.com/mt21625457/aicodex2api
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -126,6 +126,4 @@ backend/cmd/server/server
|
|||||||
deploy/docker-compose.override.yml
|
deploy/docker-compose.override.yml
|
||||||
.gocache/
|
.gocache/
|
||||||
vite.config.js
|
vite.config.js
|
||||||
!docs/
|
|
||||||
docs/*
|
docs/*
|
||||||
!docs/dependency-security.md
|
|
||||||
|
|||||||
2
backend/.dockerignore
Normal file
2
backend/.dockerignore
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.cache/
|
||||||
|
.DS_Store
|
||||||
@@ -18,6 +18,12 @@ linters:
|
|||||||
list-mode: original
|
list-mode: original
|
||||||
files:
|
files:
|
||||||
- "**/internal/service/**"
|
- "**/internal/service/**"
|
||||||
|
- "!**/internal/service/ops_aggregation_service.go"
|
||||||
|
- "!**/internal/service/ops_alert_evaluator_service.go"
|
||||||
|
- "!**/internal/service/ops_cleanup_service.go"
|
||||||
|
- "!**/internal/service/ops_metrics_collector.go"
|
||||||
|
- "!**/internal/service/ops_scheduled_report_service.go"
|
||||||
|
- "!**/internal/service/wire.go"
|
||||||
deny:
|
deny:
|
||||||
- pkg: github.com/Wei-Shaw/sub2api/internal/repository
|
- pkg: github.com/Wei-Shaw/sub2api/internal/repository
|
||||||
desc: "service must not import repository"
|
desc: "service must not import repository"
|
||||||
|
|||||||
@@ -62,6 +62,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
|
|||||||
func provideCleanup(
|
func provideCleanup(
|
||||||
entClient *ent.Client,
|
entClient *ent.Client,
|
||||||
rdb *redis.Client,
|
rdb *redis.Client,
|
||||||
|
opsMetricsCollector *service.OpsMetricsCollector,
|
||||||
|
opsAggregation *service.OpsAggregationService,
|
||||||
|
opsAlertEvaluator *service.OpsAlertEvaluatorService,
|
||||||
|
opsCleanup *service.OpsCleanupService,
|
||||||
|
opsScheduledReport *service.OpsScheduledReportService,
|
||||||
tokenRefresh *service.TokenRefreshService,
|
tokenRefresh *service.TokenRefreshService,
|
||||||
accountExpiry *service.AccountExpiryService,
|
accountExpiry *service.AccountExpiryService,
|
||||||
pricing *service.PricingService,
|
pricing *service.PricingService,
|
||||||
@@ -81,6 +86,36 @@ func provideCleanup(
|
|||||||
name string
|
name string
|
||||||
fn func() error
|
fn func() error
|
||||||
}{
|
}{
|
||||||
|
{"OpsScheduledReportService", func() error {
|
||||||
|
if opsScheduledReport != nil {
|
||||||
|
opsScheduledReport.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsCleanupService", func() error {
|
||||||
|
if opsCleanup != nil {
|
||||||
|
opsCleanup.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAlertEvaluatorService", func() error {
|
||||||
|
if opsAlertEvaluator != nil {
|
||||||
|
opsAlertEvaluator.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAggregationService", func() error {
|
||||||
|
if opsAggregation != nil {
|
||||||
|
opsAggregation.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsMetricsCollector", func() error {
|
||||||
|
if opsMetricsCollector != nil {
|
||||||
|
opsMetricsCollector.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
{"TokenRefreshService", func() error {
|
{"TokenRefreshService", func() error {
|
||||||
tokenRefresh.Stop()
|
tokenRefresh.Stop()
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,22 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
|
|||||||
proxyHandler := admin.NewProxyHandler(adminService)
|
proxyHandler := admin.NewProxyHandler(adminService)
|
||||||
adminRedeemHandler := admin.NewRedeemHandler(adminService)
|
adminRedeemHandler := admin.NewRedeemHandler(adminService)
|
||||||
promoHandler := admin.NewPromoHandler(promoService)
|
promoHandler := admin.NewPromoHandler(promoService)
|
||||||
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService)
|
opsRepository := repository.NewOpsRepository(db)
|
||||||
|
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
|
||||||
|
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
billingService := service.NewBillingService(configConfig, pricingService)
|
||||||
|
identityCache := repository.NewIdentityCache(redisClient)
|
||||||
|
identityService := service.NewIdentityService(identityCache)
|
||||||
|
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
|
||||||
|
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
|
||||||
|
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
|
||||||
|
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
|
||||||
|
opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService)
|
||||||
|
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService, opsService)
|
||||||
|
opsHandler := admin.NewOpsHandler(opsService)
|
||||||
updateCache := repository.NewUpdateCache(redisClient)
|
updateCache := repository.NewUpdateCache(redisClient)
|
||||||
gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig)
|
gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig)
|
||||||
serviceBuildInfo := provideServiceBuildInfo(buildInfo)
|
serviceBuildInfo := provideServiceBuildInfo(buildInfo)
|
||||||
@@ -132,31 +147,24 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
|
|||||||
userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
|
userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
|
||||||
userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
|
userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
|
||||||
userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
|
userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
|
||||||
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
|
adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, promoHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
|
||||||
pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig)
|
|
||||||
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
billingService := service.NewBillingService(configConfig, pricingService)
|
|
||||||
identityCache := repository.NewIdentityCache(redisClient)
|
|
||||||
identityService := service.NewIdentityService(identityCache)
|
|
||||||
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
|
|
||||||
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
|
|
||||||
geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
|
|
||||||
gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
|
gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
|
||||||
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
|
|
||||||
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
|
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
|
||||||
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
|
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
|
||||||
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
|
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
|
||||||
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
|
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
|
||||||
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
|
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
|
||||||
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
|
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
|
||||||
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, settingService, redisClient)
|
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient)
|
||||||
httpServer := server.ProvideHTTPServer(configConfig, engine)
|
httpServer := server.ProvideHTTPServer(configConfig, engine)
|
||||||
|
opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig)
|
||||||
|
opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)
|
||||||
|
opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig)
|
||||||
|
opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig)
|
||||||
|
opsScheduledReportService := service.ProvideOpsScheduledReportService(opsService, userService, emailService, redisClient, configConfig)
|
||||||
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
|
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
|
||||||
accountExpiryService := service.ProvideAccountExpiryService(accountRepository)
|
accountExpiryService := service.ProvideAccountExpiryService(accountRepository)
|
||||||
v := provideCleanup(client, redisClient, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
|
v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, opsScheduledReportService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
|
||||||
application := &Application{
|
application := &Application{
|
||||||
Server: httpServer,
|
Server: httpServer,
|
||||||
Cleanup: v,
|
Cleanup: v,
|
||||||
@@ -181,6 +189,11 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
|
|||||||
func provideCleanup(
|
func provideCleanup(
|
||||||
entClient *ent.Client,
|
entClient *ent.Client,
|
||||||
rdb *redis.Client,
|
rdb *redis.Client,
|
||||||
|
opsMetricsCollector *service.OpsMetricsCollector,
|
||||||
|
opsAggregation *service.OpsAggregationService,
|
||||||
|
opsAlertEvaluator *service.OpsAlertEvaluatorService,
|
||||||
|
opsCleanup *service.OpsCleanupService,
|
||||||
|
opsScheduledReport *service.OpsScheduledReportService,
|
||||||
tokenRefresh *service.TokenRefreshService,
|
tokenRefresh *service.TokenRefreshService,
|
||||||
accountExpiry *service.AccountExpiryService,
|
accountExpiry *service.AccountExpiryService,
|
||||||
pricing *service.PricingService,
|
pricing *service.PricingService,
|
||||||
@@ -199,6 +212,36 @@ func provideCleanup(
|
|||||||
name string
|
name string
|
||||||
fn func() error
|
fn func() error
|
||||||
}{
|
}{
|
||||||
|
{"OpsScheduledReportService", func() error {
|
||||||
|
if opsScheduledReport != nil {
|
||||||
|
opsScheduledReport.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsCleanupService", func() error {
|
||||||
|
if opsCleanup != nil {
|
||||||
|
opsCleanup.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAlertEvaluatorService", func() error {
|
||||||
|
if opsAlertEvaluator != nil {
|
||||||
|
opsAlertEvaluator.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsAggregationService", func() error {
|
||||||
|
if opsAggregation != nil {
|
||||||
|
opsAggregation.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
|
{"OpsMetricsCollector", func() error {
|
||||||
|
if opsMetricsCollector != nil {
|
||||||
|
opsMetricsCollector.Stop()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}},
|
||||||
{"TokenRefreshService", func() error {
|
{"TokenRefreshService", func() error {
|
||||||
tokenRefresh.Stop()
|
tokenRefresh.Stop()
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -8,9 +8,11 @@ require (
|
|||||||
github.com/golang-jwt/jwt/v5 v5.2.2
|
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||||
github.com/google/uuid v1.6.0
|
github.com/google/uuid v1.6.0
|
||||||
github.com/google/wire v0.7.0
|
github.com/google/wire v0.7.0
|
||||||
|
github.com/gorilla/websocket v1.5.3
|
||||||
github.com/imroc/req/v3 v3.57.0
|
github.com/imroc/req/v3 v3.57.0
|
||||||
github.com/lib/pq v1.10.9
|
github.com/lib/pq v1.10.9
|
||||||
github.com/redis/go-redis/v9 v9.17.2
|
github.com/redis/go-redis/v9 v9.17.2
|
||||||
|
github.com/shirou/gopsutil/v4 v4.25.6
|
||||||
github.com/spf13/viper v1.18.2
|
github.com/spf13/viper v1.18.2
|
||||||
github.com/stretchr/testify v1.11.1
|
github.com/stretchr/testify v1.11.1
|
||||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
|
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
|
||||||
@@ -106,9 +108,9 @@ require (
|
|||||||
github.com/quic-go/quic-go v0.57.1 // indirect
|
github.com/quic-go/quic-go v0.57.1 // indirect
|
||||||
github.com/refraction-networking/utls v1.8.1 // indirect
|
github.com/refraction-networking/utls v1.8.1 // indirect
|
||||||
github.com/rivo/uniseg v0.2.0 // indirect
|
github.com/rivo/uniseg v0.2.0 // indirect
|
||||||
|
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||||
github.com/sagikazarmark/locafero v0.4.0 // indirect
|
github.com/sagikazarmark/locafero v0.4.0 // indirect
|
||||||
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
|
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
|
||||||
github.com/shirou/gopsutil/v4 v4.25.6 // indirect
|
|
||||||
github.com/sirupsen/logrus v1.9.3 // indirect
|
github.com/sirupsen/logrus v1.9.3 // indirect
|
||||||
github.com/sourcegraph/conc v0.3.0 // indirect
|
github.com/sourcegraph/conc v0.3.0 // indirect
|
||||||
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
github.com/spaolacci/murmur3 v1.1.0 // indirect
|
||||||
|
|||||||
@@ -117,6 +117,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
|||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
|
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
|
||||||
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
|
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
|
||||||
|
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||||
|
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
|
||||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
|
||||||
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
|
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
|
||||||
@@ -224,6 +226,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr
|
|||||||
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
|
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
|
||||||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
||||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||||
|
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||||
|
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
|
||||||
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
||||||
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
||||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ type Config struct {
|
|||||||
Turnstile TurnstileConfig `mapstructure:"turnstile"`
|
Turnstile TurnstileConfig `mapstructure:"turnstile"`
|
||||||
Database DatabaseConfig `mapstructure:"database"`
|
Database DatabaseConfig `mapstructure:"database"`
|
||||||
Redis RedisConfig `mapstructure:"redis"`
|
Redis RedisConfig `mapstructure:"redis"`
|
||||||
|
Ops OpsConfig `mapstructure:"ops"`
|
||||||
JWT JWTConfig `mapstructure:"jwt"`
|
JWT JWTConfig `mapstructure:"jwt"`
|
||||||
LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"`
|
LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"`
|
||||||
Default DefaultConfig `mapstructure:"default"`
|
Default DefaultConfig `mapstructure:"default"`
|
||||||
@@ -60,14 +61,6 @@ type Config struct {
|
|||||||
Update UpdateConfig `mapstructure:"update"`
|
Update UpdateConfig `mapstructure:"update"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateConfig 在线更新相关配置
|
|
||||||
type UpdateConfig struct {
|
|
||||||
// ProxyURL 用于访问 GitHub 的代理地址
|
|
||||||
// 支持 http/https/socks5/socks5h 协议
|
|
||||||
// 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080"
|
|
||||||
ProxyURL string `mapstructure:"proxy_url"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type GeminiConfig struct {
|
type GeminiConfig struct {
|
||||||
OAuth GeminiOAuthConfig `mapstructure:"oauth"`
|
OAuth GeminiOAuthConfig `mapstructure:"oauth"`
|
||||||
Quota GeminiQuotaConfig `mapstructure:"quota"`
|
Quota GeminiQuotaConfig `mapstructure:"quota"`
|
||||||
@@ -90,6 +83,33 @@ type GeminiTierQuotaConfig struct {
|
|||||||
CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"`
|
CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type UpdateConfig struct {
|
||||||
|
// ProxyURL 用于访问 GitHub 的代理地址
|
||||||
|
// 支持 http/https/socks5/socks5h 协议
|
||||||
|
// 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080"
|
||||||
|
ProxyURL string `mapstructure:"proxy_url"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LinuxDoConnectConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
ClientID string `mapstructure:"client_id"`
|
||||||
|
ClientSecret string `mapstructure:"client_secret"`
|
||||||
|
AuthorizeURL string `mapstructure:"authorize_url"`
|
||||||
|
TokenURL string `mapstructure:"token_url"`
|
||||||
|
UserInfoURL string `mapstructure:"userinfo_url"`
|
||||||
|
Scopes string `mapstructure:"scopes"`
|
||||||
|
RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记)
|
||||||
|
FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback)
|
||||||
|
TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none
|
||||||
|
UsePKCE bool `mapstructure:"use_pkce"`
|
||||||
|
|
||||||
|
// 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。
|
||||||
|
// 为空时,服务端会尝试一组常见字段名。
|
||||||
|
UserInfoEmailPath string `mapstructure:"userinfo_email_path"`
|
||||||
|
UserInfoIDPath string `mapstructure:"userinfo_id_path"`
|
||||||
|
UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
|
||||||
|
}
|
||||||
|
|
||||||
// TokenRefreshConfig OAuth token自动刷新配置
|
// TokenRefreshConfig OAuth token自动刷新配置
|
||||||
type TokenRefreshConfig struct {
|
type TokenRefreshConfig struct {
|
||||||
// 是否启用自动刷新
|
// 是否启用自动刷新
|
||||||
@@ -332,6 +352,47 @@ func (r *RedisConfig) Address() string {
|
|||||||
return fmt.Sprintf("%s:%d", r.Host, r.Port)
|
return fmt.Sprintf("%s:%d", r.Host, r.Port)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type OpsConfig struct {
|
||||||
|
// Enabled controls whether ops features should run.
|
||||||
|
//
|
||||||
|
// NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off.
|
||||||
|
// This config flag is the "hard switch" for deployments that want to disable ops completely.
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
|
||||||
|
// UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries.
|
||||||
|
UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"`
|
||||||
|
|
||||||
|
// Cleanup controls periodic deletion of old ops data to prevent unbounded growth.
|
||||||
|
Cleanup OpsCleanupConfig `mapstructure:"cleanup"`
|
||||||
|
|
||||||
|
// MetricsCollectorCache controls Redis caching for expensive per-window collector queries.
|
||||||
|
MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"`
|
||||||
|
|
||||||
|
// Pre-aggregation configuration.
|
||||||
|
Aggregation OpsAggregationConfig `mapstructure:"aggregation"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsCleanupConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
Schedule string `mapstructure:"schedule"`
|
||||||
|
|
||||||
|
// Retention days (0 disables that cleanup target).
|
||||||
|
//
|
||||||
|
// vNext requirement: default 30 days across ops datasets.
|
||||||
|
ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"`
|
||||||
|
MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"`
|
||||||
|
HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsAggregationConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsMetricsCollectorCacheConfig struct {
|
||||||
|
Enabled bool `mapstructure:"enabled"`
|
||||||
|
TTL time.Duration `mapstructure:"ttl"`
|
||||||
|
}
|
||||||
|
|
||||||
type JWTConfig struct {
|
type JWTConfig struct {
|
||||||
Secret string `mapstructure:"secret"`
|
Secret string `mapstructure:"secret"`
|
||||||
ExpireHour int `mapstructure:"expire_hour"`
|
ExpireHour int `mapstructure:"expire_hour"`
|
||||||
@@ -341,30 +402,6 @@ type TurnstileConfig struct {
|
|||||||
Required bool `mapstructure:"required"`
|
Required bool `mapstructure:"required"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// LinuxDoConnectConfig 用于 LinuxDo Connect OAuth 登录(终端用户 SSO)。
|
|
||||||
//
|
|
||||||
// 注意:这与上游账号的 OAuth(例如 OpenAI/Gemini 账号接入)不是一回事。
|
|
||||||
// 这里是用于登录 Sub2API 本身的用户体系。
|
|
||||||
type LinuxDoConnectConfig struct {
|
|
||||||
Enabled bool `mapstructure:"enabled"`
|
|
||||||
ClientID string `mapstructure:"client_id"`
|
|
||||||
ClientSecret string `mapstructure:"client_secret"`
|
|
||||||
AuthorizeURL string `mapstructure:"authorize_url"`
|
|
||||||
TokenURL string `mapstructure:"token_url"`
|
|
||||||
UserInfoURL string `mapstructure:"userinfo_url"`
|
|
||||||
Scopes string `mapstructure:"scopes"`
|
|
||||||
RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记)
|
|
||||||
FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback)
|
|
||||||
TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none
|
|
||||||
UsePKCE bool `mapstructure:"use_pkce"`
|
|
||||||
|
|
||||||
// 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。
|
|
||||||
// 为空时,服务端会尝试一组常见字段名。
|
|
||||||
UserInfoEmailPath string `mapstructure:"userinfo_email_path"`
|
|
||||||
UserInfoIDPath string `mapstructure:"userinfo_id_path"`
|
|
||||||
UserInfoUsernamePath string `mapstructure:"userinfo_username_path"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type DefaultConfig struct {
|
type DefaultConfig struct {
|
||||||
AdminEmail string `mapstructure:"admin_email"`
|
AdminEmail string `mapstructure:"admin_email"`
|
||||||
AdminPassword string `mapstructure:"admin_password"`
|
AdminPassword string `mapstructure:"admin_password"`
|
||||||
@@ -531,81 +568,6 @@ func Load() (*Config, error) {
|
|||||||
return &cfg, nil
|
return &cfg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ValidateAbsoluteHTTPURL 校验一个绝对 http(s) URL(禁止 fragment)。
|
|
||||||
func ValidateAbsoluteHTTPURL(raw string) error {
|
|
||||||
raw = strings.TrimSpace(raw)
|
|
||||||
if raw == "" {
|
|
||||||
return fmt.Errorf("empty url")
|
|
||||||
}
|
|
||||||
u, err := url.Parse(raw)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if !u.IsAbs() {
|
|
||||||
return fmt.Errorf("must be absolute")
|
|
||||||
}
|
|
||||||
if !isHTTPScheme(u.Scheme) {
|
|
||||||
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(u.Host) == "" {
|
|
||||||
return fmt.Errorf("missing host")
|
|
||||||
}
|
|
||||||
if u.Fragment != "" {
|
|
||||||
return fmt.Errorf("must not include fragment")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ValidateFrontendRedirectURL 校验前端回调地址:
|
|
||||||
// - 允许同源相对路径(以 / 开头)
|
|
||||||
// - 或绝对 http(s) URL(禁止 fragment)
|
|
||||||
func ValidateFrontendRedirectURL(raw string) error {
|
|
||||||
raw = strings.TrimSpace(raw)
|
|
||||||
if raw == "" {
|
|
||||||
return fmt.Errorf("empty url")
|
|
||||||
}
|
|
||||||
if strings.ContainsAny(raw, "\r\n") {
|
|
||||||
return fmt.Errorf("contains invalid characters")
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(raw, "/") {
|
|
||||||
if strings.HasPrefix(raw, "//") {
|
|
||||||
return fmt.Errorf("must not start with //")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
u, err := url.Parse(raw)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if !u.IsAbs() {
|
|
||||||
return fmt.Errorf("must be absolute http(s) url or relative path")
|
|
||||||
}
|
|
||||||
if !isHTTPScheme(u.Scheme) {
|
|
||||||
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(u.Host) == "" {
|
|
||||||
return fmt.Errorf("missing host")
|
|
||||||
}
|
|
||||||
if u.Fragment != "" {
|
|
||||||
return fmt.Errorf("must not include fragment")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func isHTTPScheme(scheme string) bool {
|
|
||||||
return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
|
|
||||||
}
|
|
||||||
|
|
||||||
func warnIfInsecureURL(field, raw string) {
|
|
||||||
u, err := url.Parse(strings.TrimSpace(raw))
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if strings.EqualFold(u.Scheme, "http") {
|
|
||||||
log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func setDefaults() {
|
func setDefaults() {
|
||||||
viper.SetDefault("run_mode", RunModeStandard)
|
viper.SetDefault("run_mode", RunModeStandard)
|
||||||
|
|
||||||
@@ -655,7 +617,7 @@ func setDefaults() {
|
|||||||
// Turnstile
|
// Turnstile
|
||||||
viper.SetDefault("turnstile.required", false)
|
viper.SetDefault("turnstile.required", false)
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
viper.SetDefault("linuxdo_connect.enabled", false)
|
viper.SetDefault("linuxdo_connect.enabled", false)
|
||||||
viper.SetDefault("linuxdo_connect.client_id", "")
|
viper.SetDefault("linuxdo_connect.client_id", "")
|
||||||
viper.SetDefault("linuxdo_connect.client_secret", "")
|
viper.SetDefault("linuxdo_connect.client_secret", "")
|
||||||
@@ -694,6 +656,20 @@ func setDefaults() {
|
|||||||
viper.SetDefault("redis.pool_size", 128)
|
viper.SetDefault("redis.pool_size", 128)
|
||||||
viper.SetDefault("redis.min_idle_conns", 10)
|
viper.SetDefault("redis.min_idle_conns", 10)
|
||||||
|
|
||||||
|
// Ops (vNext)
|
||||||
|
viper.SetDefault("ops.enabled", true)
|
||||||
|
viper.SetDefault("ops.use_preaggregated_tables", false)
|
||||||
|
viper.SetDefault("ops.cleanup.enabled", true)
|
||||||
|
viper.SetDefault("ops.cleanup.schedule", "0 2 * * *")
|
||||||
|
// Retention days: vNext defaults to 30 days across ops datasets.
|
||||||
|
viper.SetDefault("ops.cleanup.error_log_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30)
|
||||||
|
viper.SetDefault("ops.aggregation.enabled", true)
|
||||||
|
viper.SetDefault("ops.metrics_collector_cache.enabled", true)
|
||||||
|
// TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits.
|
||||||
|
viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second)
|
||||||
|
|
||||||
// JWT
|
// JWT
|
||||||
viper.SetDefault("jwt.secret", "")
|
viper.SetDefault("jwt.secret", "")
|
||||||
viper.SetDefault("jwt.expire_hour", 24)
|
viper.SetDefault("jwt.expire_hour", 24)
|
||||||
@@ -750,7 +726,7 @@ func setDefaults() {
|
|||||||
|
|
||||||
// Gateway
|
// Gateway
|
||||||
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久
|
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久
|
||||||
viper.SetDefault("gateway.log_upstream_error_body", false)
|
viper.SetDefault("gateway.log_upstream_error_body", true)
|
||||||
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
|
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
|
||||||
viper.SetDefault("gateway.inject_beta_for_apikey", false)
|
viper.SetDefault("gateway.inject_beta_for_apikey", false)
|
||||||
viper.SetDefault("gateway.failover_on_400", false)
|
viper.SetDefault("gateway.failover_on_400", false)
|
||||||
@@ -766,7 +742,7 @@ func setDefaults() {
|
|||||||
viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求)
|
viper.SetDefault("gateway.concurrency_slot_ttl_minutes", 30) // 并发槽位过期时间(支持超长请求)
|
||||||
viper.SetDefault("gateway.stream_data_interval_timeout", 180)
|
viper.SetDefault("gateway.stream_data_interval_timeout", 180)
|
||||||
viper.SetDefault("gateway.stream_keepalive_interval", 10)
|
viper.SetDefault("gateway.stream_keepalive_interval", 10)
|
||||||
viper.SetDefault("gateway.max_line_size", 40*1024*1024)
|
viper.SetDefault("gateway.max_line_size", 10*1024*1024)
|
||||||
viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3)
|
viper.SetDefault("gateway.scheduling.sticky_session_max_waiting", 3)
|
||||||
viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second)
|
viper.SetDefault("gateway.scheduling.sticky_session_wait_timeout", 45*time.Second)
|
||||||
viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second)
|
viper.SetDefault("gateway.scheduling.fallback_wait_timeout", 30*time.Second)
|
||||||
@@ -789,10 +765,6 @@ func setDefaults() {
|
|||||||
viper.SetDefault("gemini.oauth.client_secret", "")
|
viper.SetDefault("gemini.oauth.client_secret", "")
|
||||||
viper.SetDefault("gemini.oauth.scopes", "")
|
viper.SetDefault("gemini.oauth.scopes", "")
|
||||||
viper.SetDefault("gemini.quota.policy", "")
|
viper.SetDefault("gemini.quota.policy", "")
|
||||||
|
|
||||||
// Update - 在线更新配置
|
|
||||||
// 代理地址为空表示直连 GitHub(适用于海外服务器)
|
|
||||||
viper.SetDefault("update.proxy_url", "")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Config) Validate() error {
|
func (c *Config) Validate() error {
|
||||||
@@ -833,7 +805,8 @@ func (c *Config) Validate() error {
|
|||||||
if method == "none" && !c.LinuxDo.UsePKCE {
|
if method == "none" && !c.LinuxDo.UsePKCE {
|
||||||
return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none")
|
return fmt.Errorf("linuxdo_connect.use_pkce must be true when linuxdo_connect.token_auth_method=none")
|
||||||
}
|
}
|
||||||
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") && strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
|
if (method == "" || method == "client_secret_post" || method == "client_secret_basic") &&
|
||||||
|
strings.TrimSpace(c.LinuxDo.ClientSecret) == "" {
|
||||||
return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic")
|
return fmt.Errorf("linuxdo_connect.client_secret is required when linuxdo_connect.enabled=true and token_auth_method is client_secret_post/client_secret_basic")
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" {
|
if strings.TrimSpace(c.LinuxDo.FrontendRedirectURL) == "" {
|
||||||
@@ -1048,6 +1021,21 @@ func (c *Config) Validate() error {
|
|||||||
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
|
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
|
||||||
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
|
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
|
||||||
}
|
}
|
||||||
|
if c.Ops.MetricsCollectorCache.TTL < 0 {
|
||||||
|
return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.ErrorLogRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 {
|
||||||
|
return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative")
|
||||||
|
}
|
||||||
|
if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" {
|
||||||
|
return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true")
|
||||||
|
}
|
||||||
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
|
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
|
||||||
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
|
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
|
||||||
}
|
}
|
||||||
@@ -1124,3 +1112,77 @@ func GetServerAddress() string {
|
|||||||
port := v.GetInt("server.port")
|
port := v.GetInt("server.port")
|
||||||
return fmt.Sprintf("%s:%d", host, port)
|
return fmt.Sprintf("%s:%d", host, port)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ValidateAbsoluteHTTPURL 验证是否为有效的绝对 HTTP(S) URL
|
||||||
|
func ValidateAbsoluteHTTPURL(raw string) error {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return fmt.Errorf("empty url")
|
||||||
|
}
|
||||||
|
u, err := url.Parse(raw)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !u.IsAbs() {
|
||||||
|
return fmt.Errorf("must be absolute")
|
||||||
|
}
|
||||||
|
if !isHTTPScheme(u.Scheme) {
|
||||||
|
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(u.Host) == "" {
|
||||||
|
return fmt.Errorf("missing host")
|
||||||
|
}
|
||||||
|
if u.Fragment != "" {
|
||||||
|
return fmt.Errorf("must not include fragment")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateFrontendRedirectURL 验证前端重定向 URL(可以是绝对 URL 或相对路径)
|
||||||
|
func ValidateFrontendRedirectURL(raw string) error {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return fmt.Errorf("empty url")
|
||||||
|
}
|
||||||
|
if strings.ContainsAny(raw, "\r\n") {
|
||||||
|
return fmt.Errorf("contains invalid characters")
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(raw, "/") {
|
||||||
|
if strings.HasPrefix(raw, "//") {
|
||||||
|
return fmt.Errorf("must not start with //")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
u, err := url.Parse(raw)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !u.IsAbs() {
|
||||||
|
return fmt.Errorf("must be absolute http(s) url or relative path")
|
||||||
|
}
|
||||||
|
if !isHTTPScheme(u.Scheme) {
|
||||||
|
return fmt.Errorf("unsupported scheme: %s", u.Scheme)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(u.Host) == "" {
|
||||||
|
return fmt.Errorf("missing host")
|
||||||
|
}
|
||||||
|
if u.Fragment != "" {
|
||||||
|
return fmt.Errorf("must not include fragment")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isHTTPScheme 检查是否为 HTTP 或 HTTPS 协议
|
||||||
|
func isHTTPScheme(scheme string) bool {
|
||||||
|
return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
|
||||||
|
}
|
||||||
|
|
||||||
|
func warnIfInsecureURL(field, raw string) {
|
||||||
|
u, err := url.Parse(strings.TrimSpace(raw))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.EqualFold(u.Scheme, "http") {
|
||||||
|
log.Printf("Warning: %s uses http scheme; use https in production to avoid token leakage.", field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
432
backend/internal/handler/admin/ops_alerts_handler.go
Normal file
432
backend/internal/handler/admin/ops_alerts_handler.go
Normal file
@@ -0,0 +1,432 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/gin-gonic/gin/binding"
|
||||||
|
)
|
||||||
|
|
||||||
|
var validOpsAlertMetricTypes = []string{
|
||||||
|
"success_rate",
|
||||||
|
"error_rate",
|
||||||
|
"upstream_error_rate",
|
||||||
|
"p95_latency_ms",
|
||||||
|
"p99_latency_ms",
|
||||||
|
"cpu_usage_percent",
|
||||||
|
"memory_usage_percent",
|
||||||
|
"concurrency_queue_depth",
|
||||||
|
}
|
||||||
|
|
||||||
|
var validOpsAlertMetricTypeSet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertMetricTypes))
|
||||||
|
for _, v := range validOpsAlertMetricTypes {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="}
|
||||||
|
|
||||||
|
var validOpsAlertOperatorSet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertOperators))
|
||||||
|
for _, v := range validOpsAlertOperators {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"}
|
||||||
|
|
||||||
|
var validOpsAlertSeveritySet = func() map[string]struct{} {
|
||||||
|
set := make(map[string]struct{}, len(validOpsAlertSeverities))
|
||||||
|
for _, v := range validOpsAlertSeverities {
|
||||||
|
set[v] = struct{}{}
|
||||||
|
}
|
||||||
|
return set
|
||||||
|
}()
|
||||||
|
|
||||||
|
type opsAlertRuleValidatedInput struct {
|
||||||
|
Name string
|
||||||
|
MetricType string
|
||||||
|
Operator string
|
||||||
|
Threshold float64
|
||||||
|
|
||||||
|
Severity string
|
||||||
|
|
||||||
|
WindowMinutes int
|
||||||
|
SustainedMinutes int
|
||||||
|
CooldownMinutes int
|
||||||
|
|
||||||
|
Enabled bool
|
||||||
|
NotifyEmail bool
|
||||||
|
|
||||||
|
WindowProvided bool
|
||||||
|
SustainedProvided bool
|
||||||
|
CooldownProvided bool
|
||||||
|
SeverityProvided bool
|
||||||
|
EnabledProvided bool
|
||||||
|
NotifyProvided bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func isPercentOrRateMetric(metricType string) bool {
|
||||||
|
switch metricType {
|
||||||
|
case "success_rate",
|
||||||
|
"error_rate",
|
||||||
|
"upstream_error_rate",
|
||||||
|
"cpu_usage_percent",
|
||||||
|
"memory_usage_percent":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) {
|
||||||
|
if raw == nil {
|
||||||
|
return nil, fmt.Errorf("invalid request body")
|
||||||
|
}
|
||||||
|
|
||||||
|
requiredFields := []string{"name", "metric_type", "operator", "threshold"}
|
||||||
|
for _, field := range requiredFields {
|
||||||
|
if _, ok := raw[field]; !ok {
|
||||||
|
return nil, fmt.Errorf("%s is required", field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var name string
|
||||||
|
if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" {
|
||||||
|
return nil, fmt.Errorf("name is required")
|
||||||
|
}
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
|
||||||
|
var metricType string
|
||||||
|
if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" {
|
||||||
|
return nil, fmt.Errorf("metric_type is required")
|
||||||
|
}
|
||||||
|
metricType = strings.TrimSpace(metricType)
|
||||||
|
if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok {
|
||||||
|
return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
var operator string
|
||||||
|
if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" {
|
||||||
|
return nil, fmt.Errorf("operator is required")
|
||||||
|
}
|
||||||
|
operator = strings.TrimSpace(operator)
|
||||||
|
if _, ok := validOpsAlertOperatorSet[operator]; !ok {
|
||||||
|
return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
var threshold float64
|
||||||
|
if err := json.Unmarshal(raw["threshold"], &threshold); err != nil {
|
||||||
|
return nil, fmt.Errorf("threshold must be a number")
|
||||||
|
}
|
||||||
|
if math.IsNaN(threshold) || math.IsInf(threshold, 0) {
|
||||||
|
return nil, fmt.Errorf("threshold must be a finite number")
|
||||||
|
}
|
||||||
|
if isPercentOrRateMetric(metricType) {
|
||||||
|
if threshold < 0 || threshold > 100 {
|
||||||
|
return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType)
|
||||||
|
}
|
||||||
|
} else if threshold < 0 {
|
||||||
|
return nil, fmt.Errorf("threshold must be >= 0")
|
||||||
|
}
|
||||||
|
|
||||||
|
validated := &opsAlertRuleValidatedInput{
|
||||||
|
Name: name,
|
||||||
|
MetricType: metricType,
|
||||||
|
Operator: operator,
|
||||||
|
Threshold: threshold,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["severity"]; ok {
|
||||||
|
validated.SeverityProvided = true
|
||||||
|
var sev string
|
||||||
|
if err := json.Unmarshal(v, &sev); err != nil {
|
||||||
|
return nil, fmt.Errorf("severity must be a string")
|
||||||
|
}
|
||||||
|
sev = strings.ToUpper(strings.TrimSpace(sev))
|
||||||
|
if sev != "" {
|
||||||
|
if _, ok := validOpsAlertSeveritySet[sev]; !ok {
|
||||||
|
return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", "))
|
||||||
|
}
|
||||||
|
validated.Severity = sev
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if validated.Severity == "" {
|
||||||
|
validated.Severity = "P2"
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["enabled"]; ok {
|
||||||
|
validated.EnabledProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.Enabled); err != nil {
|
||||||
|
return nil, fmt.Errorf("enabled must be a boolean")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.Enabled = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["notify_email"]; ok {
|
||||||
|
validated.NotifyProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil {
|
||||||
|
return nil, fmt.Errorf("notify_email must be a boolean")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.NotifyEmail = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["window_minutes"]; ok {
|
||||||
|
validated.WindowProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("window_minutes must be an integer")
|
||||||
|
}
|
||||||
|
switch validated.WindowMinutes {
|
||||||
|
case 1, 5, 60:
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.WindowMinutes = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["sustained_minutes"]; ok {
|
||||||
|
validated.SustainedProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("sustained_minutes must be an integer")
|
||||||
|
}
|
||||||
|
if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 {
|
||||||
|
return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.SustainedMinutes = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := raw["cooldown_minutes"]; ok {
|
||||||
|
validated.CooldownProvided = true
|
||||||
|
if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil {
|
||||||
|
return nil, fmt.Errorf("cooldown_minutes must be an integer")
|
||||||
|
}
|
||||||
|
if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 {
|
||||||
|
return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
validated.CooldownMinutes = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
return validated, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertRules returns all ops alert rules.
|
||||||
|
// GET /api/v1/admin/ops/alert-rules
|
||||||
|
func (h *OpsHandler) ListAlertRules(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rules, err := h.opsService.ListAlertRules(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, rules)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateAlertRule creates an ops alert rule.
|
||||||
|
// POST /api/v1/admin/ops/alert-rules
|
||||||
|
func (h *OpsHandler) CreateAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw map[string]json.RawMessage
|
||||||
|
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
validated, err := validateOpsAlertRulePayload(raw)
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var rule service.OpsAlertRule
|
||||||
|
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rule.Name = validated.Name
|
||||||
|
rule.MetricType = validated.MetricType
|
||||||
|
rule.Operator = validated.Operator
|
||||||
|
rule.Threshold = validated.Threshold
|
||||||
|
rule.WindowMinutes = validated.WindowMinutes
|
||||||
|
rule.SustainedMinutes = validated.SustainedMinutes
|
||||||
|
rule.CooldownMinutes = validated.CooldownMinutes
|
||||||
|
rule.Severity = validated.Severity
|
||||||
|
rule.Enabled = validated.Enabled
|
||||||
|
rule.NotifyEmail = validated.NotifyEmail
|
||||||
|
|
||||||
|
created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, created)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAlertRule updates an existing ops alert rule.
|
||||||
|
// PUT /api/v1/admin/ops/alert-rules/:id
|
||||||
|
func (h *OpsHandler) UpdateAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid rule ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw map[string]json.RawMessage
|
||||||
|
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
validated, err := validateOpsAlertRulePayload(raw)
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var rule service.OpsAlertRule
|
||||||
|
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rule.ID = id
|
||||||
|
rule.Name = validated.Name
|
||||||
|
rule.MetricType = validated.MetricType
|
||||||
|
rule.Operator = validated.Operator
|
||||||
|
rule.Threshold = validated.Threshold
|
||||||
|
rule.WindowMinutes = validated.WindowMinutes
|
||||||
|
rule.SustainedMinutes = validated.SustainedMinutes
|
||||||
|
rule.CooldownMinutes = validated.CooldownMinutes
|
||||||
|
rule.Severity = validated.Severity
|
||||||
|
rule.Enabled = validated.Enabled
|
||||||
|
rule.NotifyEmail = validated.NotifyEmail
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteAlertRule deletes an ops alert rule.
|
||||||
|
// DELETE /api/v1/admin/ops/alert-rules/:id
|
||||||
|
func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid rule ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, gin.H{"deleted": true})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertEvents lists recent ops alert events.
|
||||||
|
// GET /api/v1/admin/ops/alert-events
|
||||||
|
func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
limit := 100
|
||||||
|
if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
|
||||||
|
n, err := strconv.Atoi(raw)
|
||||||
|
if err != nil || n <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid limit")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
limit = n
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsAlertEventFilter{
|
||||||
|
Limit: limit,
|
||||||
|
Status: strings.TrimSpace(c.Query("status")),
|
||||||
|
Severity: strings.TrimSpace(c.Query("severity")),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optional global filter support (platform/group/time range).
|
||||||
|
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||||
|
filter.Platform = platform
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil {
|
||||||
|
// Only apply when explicitly provided to avoid surprising default narrowing.
|
||||||
|
if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" {
|
||||||
|
filter.StartTime = &startTime
|
||||||
|
filter.EndTime = &endTime
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, events)
|
||||||
|
}
|
||||||
243
backend/internal/handler/admin/ops_dashboard_handler.go
Normal file
243
backend/internal/handler/admin/ops_dashboard_handler.go
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetDashboardOverview returns vNext ops dashboard overview (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/overview
|
||||||
|
func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardThroughputTrend returns throughput time series (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/throughput-trend
|
||||||
|
func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
|
||||||
|
data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/latency-histogram
|
||||||
|
func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardErrorTrend returns error counts time series (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/error-trend
|
||||||
|
func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
|
||||||
|
data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDashboardErrorDistribution returns error distribution by status code (raw path).
|
||||||
|
// GET /api/v1/admin/ops/dashboard/error-distribution
|
||||||
|
func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
Platform: strings.TrimSpace(c.Query("platform")),
|
||||||
|
QueryMode: parseOpsQueryMode(c),
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func pickThroughputBucketSeconds(window time.Duration) int {
|
||||||
|
// Keep buckets predictable and avoid huge responses.
|
||||||
|
switch {
|
||||||
|
case window <= 2*time.Hour:
|
||||||
|
return 60
|
||||||
|
case window <= 24*time.Hour:
|
||||||
|
return 300
|
||||||
|
default:
|
||||||
|
return 3600
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode {
|
||||||
|
if c == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(c.Query("mode"))
|
||||||
|
if raw == "" {
|
||||||
|
// Empty means "use server default" (DB setting ops_query_mode_default).
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return service.ParseOpsQueryMode(raw)
|
||||||
|
}
|
||||||
364
backend/internal/handler/admin/ops_handler.go
Normal file
364
backend/internal/handler/admin/ops_handler.go
Normal file
@@ -0,0 +1,364 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
type OpsHandler struct {
|
||||||
|
opsService *service.OpsService
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
|
||||||
|
return &OpsHandler{opsService: opsService}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetErrorLogs lists ops error logs.
|
||||||
|
// GET /api/v1/admin/ops/errors
|
||||||
|
func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize := response.ParsePagination(c)
|
||||||
|
// Ops list can be larger than standard admin tables.
|
||||||
|
if pageSize > 500 {
|
||||||
|
pageSize = 500
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsErrorLogFilter{
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
}
|
||||||
|
if !startTime.IsZero() {
|
||||||
|
filter.StartTime = &startTime
|
||||||
|
}
|
||||||
|
if !endTime.IsZero() {
|
||||||
|
filter.EndTime = &endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||||
|
filter.Platform = platform
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid account_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.AccountID = &id
|
||||||
|
}
|
||||||
|
if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
|
||||||
|
filter.Phase = phase
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(c.Query("q")); q != "" {
|
||||||
|
filter.Query = q
|
||||||
|
}
|
||||||
|
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
|
||||||
|
parts := strings.Split(statusCodesStr, ",")
|
||||||
|
out := make([]int, 0, len(parts))
|
||||||
|
for _, part := range parts {
|
||||||
|
p := strings.TrimSpace(part)
|
||||||
|
if p == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(p)
|
||||||
|
if err != nil || n < 0 {
|
||||||
|
response.BadRequest(c, "Invalid status_codes")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out = append(out, n)
|
||||||
|
}
|
||||||
|
filter.StatusCodes = out
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetErrorLogByID returns a single error log detail.
|
||||||
|
// GET /api/v1/admin/ops/errors/:id
|
||||||
|
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
idStr := strings.TrimSpace(c.Param("id"))
|
||||||
|
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid error id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Success(c, detail)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListRequestDetails returns a request-level list (success + error) for drill-down.
|
||||||
|
// GET /api/v1/admin/ops/requests
|
||||||
|
func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize := response.ParsePagination(c)
|
||||||
|
if pageSize > 100 {
|
||||||
|
pageSize = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||||
|
if err != nil {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
filter := &service.OpsRequestDetailFilter{
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
StartTime: &startTime,
|
||||||
|
EndTime: &endTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
filter.Kind = strings.TrimSpace(c.Query("kind"))
|
||||||
|
filter.Platform = strings.TrimSpace(c.Query("platform"))
|
||||||
|
filter.Model = strings.TrimSpace(c.Query("model"))
|
||||||
|
filter.RequestID = strings.TrimSpace(c.Query("request_id"))
|
||||||
|
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||||
|
filter.Sort = strings.TrimSpace(c.Query("sort"))
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(c.Query("user_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid user_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.UserID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("api_key_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid api_key_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.APIKeyID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid account_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.AccountID = &id
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.GroupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" {
|
||||||
|
parsed, err := strconv.Atoi(v)
|
||||||
|
if err != nil || parsed < 0 {
|
||||||
|
response.BadRequest(c, "Invalid min_duration_ms")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.MinDurationMs = &parsed
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" {
|
||||||
|
parsed, err := strconv.Atoi(v)
|
||||||
|
if err != nil || parsed < 0 {
|
||||||
|
response.BadRequest(c, "Invalid max_duration_ms")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
filter.MaxDurationMs = &parsed
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter)
|
||||||
|
if err != nil {
|
||||||
|
// Invalid sort/kind/platform etc should be a bad request; keep it simple.
|
||||||
|
if strings.Contains(strings.ToLower(err.Error()), "invalid") {
|
||||||
|
response.BadRequest(c, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to list request details")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsRetryRequest struct {
|
||||||
|
Mode string `json:"mode"`
|
||||||
|
PinnedAccountID *int64 `json:"pinned_account_id"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RetryErrorRequest retries a failed request using stored request_body.
|
||||||
|
// POST /api/v1/admin/ops/errors/:id/retry
|
||||||
|
func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||||
|
if !ok || subject.UserID <= 0 {
|
||||||
|
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
idStr := strings.TrimSpace(c.Param("id"))
|
||||||
|
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid error id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
req := opsRetryRequest{Mode: service.OpsRetryModeClient}
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
response.BadRequest(c, "Invalid request: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(req.Mode) == "" {
|
||||||
|
req.Mode = service.OpsRetryModeClient
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response.Success(c, result)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
|
||||||
|
startStr := strings.TrimSpace(c.Query("start_time"))
|
||||||
|
endStr := strings.TrimSpace(c.Query("end_time"))
|
||||||
|
|
||||||
|
parseTS := func(s string) (time.Time, error) {
|
||||||
|
if s == "" {
|
||||||
|
return time.Time{}, nil
|
||||||
|
}
|
||||||
|
if t, err := time.Parse(time.RFC3339Nano, s); err == nil {
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return time.Parse(time.RFC3339, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
start, err := parseTS(startStr)
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, time.Time{}, err
|
||||||
|
}
|
||||||
|
end, err := parseTS(endStr)
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, time.Time{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// start/end explicitly provided (even partially)
|
||||||
|
if startStr != "" || endStr != "" {
|
||||||
|
if end.IsZero() {
|
||||||
|
end = time.Now()
|
||||||
|
}
|
||||||
|
if start.IsZero() {
|
||||||
|
dur, _ := parseOpsDuration(defaultRange)
|
||||||
|
start = end.Add(-dur)
|
||||||
|
}
|
||||||
|
if start.After(end) {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
if end.Sub(start) > 30*24*time.Hour {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
|
||||||
|
}
|
||||||
|
return start, end, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// time_range fallback
|
||||||
|
tr := strings.TrimSpace(c.Query("time_range"))
|
||||||
|
if tr == "" {
|
||||||
|
tr = defaultRange
|
||||||
|
}
|
||||||
|
dur, ok := parseOpsDuration(tr)
|
||||||
|
if !ok {
|
||||||
|
dur, _ = parseOpsDuration(defaultRange)
|
||||||
|
}
|
||||||
|
|
||||||
|
end = time.Now()
|
||||||
|
start = end.Add(-dur)
|
||||||
|
if end.Sub(start) > 30*24*time.Hour {
|
||||||
|
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
|
||||||
|
}
|
||||||
|
return start, end, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsDuration(v string) (time.Duration, bool) {
|
||||||
|
switch strings.TrimSpace(v) {
|
||||||
|
case "5m":
|
||||||
|
return 5 * time.Minute, true
|
||||||
|
case "30m":
|
||||||
|
return 30 * time.Minute, true
|
||||||
|
case "1h":
|
||||||
|
return time.Hour, true
|
||||||
|
case "6h":
|
||||||
|
return 6 * time.Hour, true
|
||||||
|
case "24h":
|
||||||
|
return 24 * time.Hour, true
|
||||||
|
default:
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
}
|
||||||
120
backend/internal/handler/admin/ops_realtime_handler.go
Normal file
120
backend/internal/handler/admin/ops_realtime_handler.go
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
|
||||||
|
// GET /api/v1/admin/ops/concurrency
|
||||||
|
func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
|
||||||
|
response.Success(c, gin.H{
|
||||||
|
"enabled": false,
|
||||||
|
"platform": map[string]*service.PlatformConcurrencyInfo{},
|
||||||
|
"group": map[int64]*service.GroupConcurrencyInfo{},
|
||||||
|
"account": map[int64]*service.AccountConcurrencyInfo{},
|
||||||
|
"timestamp": time.Now().UTC(),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
platformFilter := strings.TrimSpace(c.Query("platform"))
|
||||||
|
var groupID *int64
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
groupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := gin.H{
|
||||||
|
"enabled": true,
|
||||||
|
"platform": platform,
|
||||||
|
"group": group,
|
||||||
|
"account": account,
|
||||||
|
}
|
||||||
|
if collectedAt != nil {
|
||||||
|
payload["timestamp"] = collectedAt.UTC()
|
||||||
|
}
|
||||||
|
response.Success(c, payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAccountAvailability returns account availability statistics.
|
||||||
|
// GET /api/v1/admin/ops/account-availability
|
||||||
|
//
|
||||||
|
// Query params:
|
||||||
|
// - platform: optional
|
||||||
|
// - group_id: optional
|
||||||
|
func (h *OpsHandler) GetAccountAvailability(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
|
||||||
|
response.Success(c, gin.H{
|
||||||
|
"enabled": false,
|
||||||
|
"platform": map[string]*service.PlatformAvailability{},
|
||||||
|
"group": map[int64]*service.GroupAvailability{},
|
||||||
|
"account": map[int64]*service.AccountAvailability{},
|
||||||
|
"timestamp": time.Now().UTC(),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
platform := strings.TrimSpace(c.Query("platform"))
|
||||||
|
var groupID *int64
|
||||||
|
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||||
|
id, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
if err != nil || id <= 0 {
|
||||||
|
response.BadRequest(c, "Invalid group_id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
groupID = &id
|
||||||
|
}
|
||||||
|
|
||||||
|
platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID)
|
||||||
|
if err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := gin.H{
|
||||||
|
"enabled": true,
|
||||||
|
"platform": platformStats,
|
||||||
|
"group": groupStats,
|
||||||
|
"account": accountStats,
|
||||||
|
}
|
||||||
|
if collectedAt != nil {
|
||||||
|
payload["timestamp"] = collectedAt.UTC()
|
||||||
|
}
|
||||||
|
response.Success(c, payload)
|
||||||
|
}
|
||||||
148
backend/internal/handler/admin/ops_settings_handler.go
Normal file
148
backend/internal/handler/admin/ops_settings_handler.go
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetEmailNotificationConfig returns Ops email notification config (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/email-notification/config
|
||||||
|
func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get email notification config")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/email-notification/config
|
||||||
|
func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsEmailNotificationConfigUpdateRequest
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
// Most failures here are validation errors from request payload; treat as 400.
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/runtime/alert
|
||||||
|
func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/runtime/alert
|
||||||
|
func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsAlertRuntimeSettings
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAdvancedSettings returns Ops advanced settings (DB-backed).
|
||||||
|
// GET /api/v1/admin/ops/advanced-settings
|
||||||
|
func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateAdvancedSettings updates Ops advanced settings (DB-backed).
|
||||||
|
// PUT /api/v1/admin/ops/advanced-settings
|
||||||
|
func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) {
|
||||||
|
if h.opsService == nil {
|
||||||
|
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||||
|
response.ErrorFrom(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req service.OpsAdvancedSettings
|
||||||
|
if err := c.ShouldBindJSON(&req); err != nil {
|
||||||
|
response.BadRequest(c, "Invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req)
|
||||||
|
if err != nil {
|
||||||
|
response.Error(c, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
response.Success(c, updated)
|
||||||
|
}
|
||||||
771
backend/internal/handler/admin/ops_ws_handler.go
Normal file
771
backend/internal/handler/admin/ops_ws_handler.go
Normal file
@@ -0,0 +1,771 @@
|
|||||||
|
package admin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"net/netip"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/gorilla/websocket"
|
||||||
|
)
|
||||||
|
|
||||||
|
type OpsWSProxyConfig struct {
|
||||||
|
TrustProxy bool
|
||||||
|
TrustedProxies []netip.Prefix
|
||||||
|
OriginPolicy string
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY"
|
||||||
|
envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
|
||||||
|
envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY"
|
||||||
|
envOpsWSMaxConns = "OPS_WS_MAX_CONNS"
|
||||||
|
envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
OriginPolicyStrict = "strict"
|
||||||
|
OriginPolicyPermissive = "permissive"
|
||||||
|
)
|
||||||
|
|
||||||
|
var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()
|
||||||
|
|
||||||
|
var upgrader = websocket.Upgrader{
|
||||||
|
CheckOrigin: func(r *http.Request) bool {
|
||||||
|
return isAllowedOpsWSOrigin(r)
|
||||||
|
},
|
||||||
|
// Subprotocol negotiation:
|
||||||
|
// - The frontend passes ["sub2api-admin", "jwt.<token>"].
|
||||||
|
// - We always select "sub2api-admin" so the token is never echoed back in the handshake response.
|
||||||
|
Subprotocols: []string{"sub2api-admin"},
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
qpsWSPushInterval = 2 * time.Second
|
||||||
|
qpsWSRefreshInterval = 5 * time.Second
|
||||||
|
qpsWSRequestCountWindow = 1 * time.Minute
|
||||||
|
|
||||||
|
defaultMaxWSConns = 100
|
||||||
|
defaultMaxWSConnsPerIP = 20
|
||||||
|
)
|
||||||
|
|
||||||
|
var wsConnCount atomic.Int32
|
||||||
|
var wsConnCountByIP sync.Map // map[string]*atomic.Int32
|
||||||
|
|
||||||
|
const qpsWSIdleStopDelay = 30 * time.Second
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsWSCloseRealtimeDisabled = 4001
|
||||||
|
)
|
||||||
|
|
||||||
|
var qpsWSIdleStopMu sync.Mutex
|
||||||
|
var qpsWSIdleStopTimer *time.Timer
|
||||||
|
|
||||||
|
func cancelQPSWSIdleStop() {
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
if qpsWSIdleStopTimer != nil {
|
||||||
|
qpsWSIdleStopTimer.Stop()
|
||||||
|
qpsWSIdleStopTimer = nil
|
||||||
|
}
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func scheduleQPSWSIdleStop() {
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
if qpsWSIdleStopTimer != nil {
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() {
|
||||||
|
// Only stop if truly idle at fire time.
|
||||||
|
if wsConnCount.Load() == 0 {
|
||||||
|
qpsWSCache.Stop()
|
||||||
|
}
|
||||||
|
qpsWSIdleStopMu.Lock()
|
||||||
|
qpsWSIdleStopTimer = nil
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
})
|
||||||
|
qpsWSIdleStopMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsWSRuntimeLimits struct {
|
||||||
|
MaxConns int32
|
||||||
|
MaxConnsPerIP int32
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv()
|
||||||
|
|
||||||
|
const (
|
||||||
|
qpsWSWriteTimeout = 10 * time.Second
|
||||||
|
qpsWSPongWait = 60 * time.Second
|
||||||
|
qpsWSPingInterval = 30 * time.Second
|
||||||
|
|
||||||
|
// We don't expect clients to send application messages; we only read to process control frames (Pong/Close).
|
||||||
|
qpsWSMaxReadBytes = 1024
|
||||||
|
)
|
||||||
|
|
||||||
|
type opsWSQPSCache struct {
|
||||||
|
refreshInterval time.Duration
|
||||||
|
requestCountWindow time.Duration
|
||||||
|
|
||||||
|
lastUpdatedUnixNano atomic.Int64
|
||||||
|
payload atomic.Value // []byte
|
||||||
|
|
||||||
|
opsService *service.OpsService
|
||||||
|
cancel context.CancelFunc
|
||||||
|
done chan struct{}
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
running bool
|
||||||
|
}
|
||||||
|
|
||||||
|
var qpsWSCache = &opsWSQPSCache{
|
||||||
|
refreshInterval: qpsWSRefreshInterval,
|
||||||
|
requestCountWindow: qpsWSRequestCountWindow,
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) start(opsService *service.OpsService) {
|
||||||
|
if c == nil || opsService == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
c.mu.Lock()
|
||||||
|
if c.running {
|
||||||
|
c.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a previous refresh loop is currently stopping, wait for it to fully exit.
|
||||||
|
done := c.done
|
||||||
|
if done != nil {
|
||||||
|
c.mu.Unlock()
|
||||||
|
<-done
|
||||||
|
|
||||||
|
c.mu.Lock()
|
||||||
|
if c.done == done && !c.running {
|
||||||
|
c.done = nil
|
||||||
|
}
|
||||||
|
c.mu.Unlock()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
c.opsService = opsService
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
c.cancel = cancel
|
||||||
|
c.done = make(chan struct{})
|
||||||
|
done = c.done
|
||||||
|
c.running = true
|
||||||
|
c.mu.Unlock()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer close(done)
|
||||||
|
c.refreshLoop(ctx)
|
||||||
|
}()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop stops the background refresh loop.
// It is safe to call multiple times and from multiple goroutines; it blocks
// until the refresh goroutine has fully exited.
func (c *opsWSQPSCache) Stop() {
	if c == nil {
		return
	}

	c.mu.Lock()
	if !c.running {
		// Not running, but a previous loop may still be draining — wait for it.
		done := c.done
		c.mu.Unlock()
		if done != nil {
			<-done
		}
		return
	}
	// Take ownership of the loop's handles under the lock, then release it
	// before blocking so concurrent start/Stop callers are not deadlocked.
	cancel := c.cancel
	c.cancel = nil
	c.running = false
	c.opsService = nil
	done := c.done
	c.mu.Unlock()

	if cancel != nil {
		cancel()
	}
	if done != nil {
		<-done
	}

	// Clear done only if it is still ours and no new loop has started since.
	c.mu.Lock()
	if c.done == done && !c.running {
		c.done = nil
	}
	c.mu.Unlock()
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) refreshLoop(ctx context.Context) {
|
||||||
|
ticker := time.NewTicker(c.refreshInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
c.refresh(ctx)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
c.refresh(ctx)
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// refresh queries the ops service for request statistics over the configured
// lookback window, derives QPS/TPS, and stores the pre-marshaled websocket
// payload into the cache. Failures are logged and leave the previous payload
// in place.
func (c *opsWSQPSCache) refresh(parentCtx context.Context) {
	if c == nil {
		return
	}

	// Snapshot the service pointer under the lock; Stop may nil it concurrently.
	c.mu.Lock()
	opsService := c.opsService
	c.mu.Unlock()
	if opsService == nil {
		return
	}

	if parentCtx == nil {
		parentCtx = context.Background()
	}
	// Bound the stats query so a slow backend cannot stall the loop.
	ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second)
	defer cancel()

	now := time.Now().UTC()
	stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now)
	if err != nil || stats == nil {
		if err != nil {
			log.Printf("[OpsWS] refresh: get window stats failed: %v", err)
		}
		return
	}

	requestCount := stats.SuccessCount + stats.ErrorCountTotal
	qps := 0.0
	tps := 0.0
	if c.requestCountWindow > 0 {
		// Rates are rounded to one decimal place for display.
		seconds := c.requestCountWindow.Seconds()
		qps = roundTo1DP(float64(requestCount) / seconds)
		tps = roundTo1DP(float64(stats.TokenConsumed) / seconds)
	}

	payload := gin.H{
		"type":      "qps_update",
		"timestamp": now.Format(time.RFC3339),
		"data": gin.H{
			"qps":           qps,
			"tps":           tps,
			"request_count": requestCount,
		},
	}

	msg, err := json.Marshal(payload)
	if err != nil {
		log.Printf("[OpsWS] refresh: marshal payload failed: %v", err)
		return
	}

	// Publish atomically; readers in the push loops never block on this.
	c.payload.Store(msg)
	c.lastUpdatedUnixNano.Store(now.UnixNano())
}
|
||||||
|
|
||||||
|
// roundTo1DP rounds v to one decimal place (halves round away from zero,
// matching math.Round).
func roundTo1DP(v float64) float64 {
	scaled := math.Round(v * 10)
	return scaled / 10
}
|
||||||
|
|
||||||
|
func (c *opsWSQPSCache) getPayload() []byte {
|
||||||
|
if c == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if cached, ok := c.payload.Load().([]byte); ok && cached != nil {
|
||||||
|
return cached
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func closeWS(conn *websocket.Conn, code int, reason string) {
|
||||||
|
if conn == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
msg := websocket.FormatCloseMessage(code, reason)
|
||||||
|
_ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout))
|
||||||
|
_ = conn.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// QPSWSHandler handles realtime QPS push via WebSocket.
// GET /api/v1/admin/ops/ws/qps
//
// Flow: reject if the service is unavailable; if realtime monitoring is
// disabled, upgrade and immediately close with a deterministic code; otherwise
// reserve global and per-IP connection slots BEFORE upgrading, upgrade, and
// hand the connection to handleQPSWebSocket.
func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
	clientIP := requestClientIP(c.Request)

	if h == nil || h.opsService == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"})
		return
	}

	// If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close
	// with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops.
	if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
		conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
		if err != nil {
			c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"})
			return
		}
		closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled")
		return
	}

	cancelQPSWSIdleStop()
	// Lazily start the background refresh loop so unit tests that never hit the
	// websocket route don't spawn goroutines that depend on DB/Redis stubs.
	qpsWSCache.start(h.opsService)

	// Reserve a global slot before upgrading the connection to keep the limit strict.
	if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) {
		log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns)
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
		return
	}
	defer func() {
		// Release the global slot; when the last connection goes away, arm the
		// idle shutdown of the shared refresh loop.
		if wsConnCount.Add(-1) == 0 {
			scheduleQPSWSIdleStop()
		}
	}()

	// Per-IP limiting is optional (limit 0 disables it) and requires a
	// resolvable client IP.
	if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" {
		if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) {
			log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP)
			c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
			return
		}
		defer releaseOpsWSIPSlot(clientIP)
	}

	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade writes its own HTTP error response; nothing more to send.
		log.Printf("[OpsWS] upgrade failed: %v", err)
		return
	}

	defer func() {
		_ = conn.Close()
	}()

	handleQPSWebSocket(c.Request.Context(), conn)
}
|
||||||
|
|
||||||
|
func tryAcquireOpsWSTotalSlot(limit int32) bool {
|
||||||
|
if limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
current := wsConnCount.Load()
|
||||||
|
if current >= limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if wsConnCount.CompareAndSwap(current, current+1) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool {
|
||||||
|
if strings.TrimSpace(clientIP) == "" || limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{})
|
||||||
|
counter, ok := v.(*atomic.Int32)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
current := counter.Load()
|
||||||
|
if current >= limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if counter.CompareAndSwap(current, current+1) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func releaseOpsWSIPSlot(clientIP string) {
|
||||||
|
if strings.TrimSpace(clientIP) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
v, ok := wsConnCountByIP.Load(clientIP)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
counter, ok := v.(*atomic.Int32)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
next := counter.Add(-1)
|
||||||
|
if next <= 0 {
|
||||||
|
// Best-effort cleanup; safe even if a new slot was acquired concurrently.
|
||||||
|
wsConnCountByIP.Delete(clientIP)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleQPSWebSocket runs the per-connection pump: a reader goroutine that
// only services control frames (pong/close) and enforces read deadlines, plus
// a writer loop that pushes the cached QPS payload and heartbeat pings.
// It blocks until the client disconnects, an error occurs, or parentCtx is
// cancelled, and guarantees the reader goroutine has exited before returning.
func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) {
	if conn == nil {
		return
	}

	ctx, cancel := context.WithCancel(parentCtx)
	defer cancel()

	// closeConn is safe to call from any exit path; the connection is closed
	// at most once.
	var closeOnce sync.Once
	closeConn := func() {
		closeOnce.Do(func() {
			_ = conn.Close()
		})
	}

	// The reader hands the client's close frame (if any) to the writer loop so
	// we can echo the same code/reason back. Buffered so the handler never blocks.
	closeFrameCh := make(chan []byte, 1)

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Reader exit cancels the writer loop too.
		defer cancel()

		conn.SetReadLimit(qpsWSMaxReadBytes)
		if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil {
			log.Printf("[OpsWS] set read deadline failed: %v", err)
			return
		}
		// Each pong extends the read deadline — the liveness mechanism paired
		// with the writer's periodic pings.
		conn.SetPongHandler(func(string) error {
			return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait))
		})
		conn.SetCloseHandler(func(code int, text string) error {
			select {
			case closeFrameCh <- websocket.FormatCloseMessage(code, text):
			default:
			}
			cancel()
			return nil
		})

		// We don't expect application messages; this loop exists to process
		// control frames and detect disconnects.
		for {
			_, _, err := conn.ReadMessage()
			if err != nil {
				if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) {
					log.Printf("[OpsWS] read failed: %v", err)
				}
				return
			}
		}
	}()

	// Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval).
	pushTicker := time.NewTicker(qpsWSPushInterval)
	defer pushTicker.Stop()

	// Heartbeat ping every 30 seconds.
	pingTicker := time.NewTicker(qpsWSPingInterval)
	defer pingTicker.Stop()

	writeWithTimeout := func(messageType int, data []byte) error {
		if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil {
			return err
		}
		return conn.WriteMessage(messageType, data)
	}

	// sendClose echoes the client's close frame when available, otherwise a
	// normal closure.
	sendClose := func(closeFrame []byte) {
		if closeFrame == nil {
			closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
		}
		_ = writeWithTimeout(websocket.CloseMessage, closeFrame)
	}

	for {
		select {
		case <-pushTicker.C:
			msg := qpsWSCache.getPayload()
			if msg == nil {
				// Cache not primed yet; skip this tick.
				continue
			}
			if err := writeWithTimeout(websocket.TextMessage, msg); err != nil {
				log.Printf("[OpsWS] write failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}

		case <-pingTicker.C:
			if err := writeWithTimeout(websocket.PingMessage, nil); err != nil {
				log.Printf("[OpsWS] ping failed: %v", err)
				cancel()
				closeConn()
				wg.Wait()
				return
			}

		case closeFrame := <-closeFrameCh:
			sendClose(closeFrame)
			closeConn()
			wg.Wait()
			return

		case <-ctx.Done():
			// Drain a pending close frame (non-blocking) so we can mirror it.
			var closeFrame []byte
			select {
			case closeFrame = <-closeFrameCh:
			default:
			}
			sendClose(closeFrame)

			closeConn()
			wg.Wait()
			return
		}
	}
}
|
||||||
|
|
||||||
|
func isAllowedOpsWSOrigin(r *http.Request) bool {
|
||||||
|
if r == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
origin := strings.TrimSpace(r.Header.Get("Origin"))
|
||||||
|
if origin == "" {
|
||||||
|
switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) {
|
||||||
|
case OriginPolicyStrict:
|
||||||
|
return false
|
||||||
|
case OriginPolicyPermissive, "":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parsed, err := url.Parse(origin)
|
||||||
|
if err != nil || parsed.Hostname() == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
originHost := strings.ToLower(parsed.Hostname())
|
||||||
|
|
||||||
|
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
|
||||||
|
reqHost := hostWithoutPort(r.Host)
|
||||||
|
if trustProxyHeaders {
|
||||||
|
xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host"))
|
||||||
|
if xfHost != "" {
|
||||||
|
xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0])
|
||||||
|
if xfHost != "" {
|
||||||
|
reqHost = hostWithoutPort(xfHost)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reqHost = strings.ToLower(reqHost)
|
||||||
|
if reqHost == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return originHost == reqHost
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
|
||||||
|
if r == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !opsWSProxyConfig.TrustProxy {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
peerIP, ok := requestPeerIP(r)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies)
|
||||||
|
}
|
||||||
|
|
||||||
|
func requestPeerIP(r *http.Request) (netip.Addr, bool) {
|
||||||
|
if r == nil {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
|
||||||
|
if err != nil {
|
||||||
|
host = strings.TrimSpace(r.RemoteAddr)
|
||||||
|
}
|
||||||
|
host = strings.TrimPrefix(host, "[")
|
||||||
|
host = strings.TrimSuffix(host, "]")
|
||||||
|
if host == "" {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
addr, err := netip.ParseAddr(host)
|
||||||
|
if err != nil {
|
||||||
|
return netip.Addr{}, false
|
||||||
|
}
|
||||||
|
return addr.Unmap(), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func requestClientIP(r *http.Request) string {
|
||||||
|
if r == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
|
||||||
|
if trustProxyHeaders {
|
||||||
|
xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For"))
|
||||||
|
if xff != "" {
|
||||||
|
// Use the left-most entry (original client). If multiple proxies add values, they are comma-separated.
|
||||||
|
xff = strings.TrimSpace(strings.Split(xff, ",")[0])
|
||||||
|
xff = strings.TrimPrefix(xff, "[")
|
||||||
|
xff = strings.TrimSuffix(xff, "]")
|
||||||
|
if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() {
|
||||||
|
return addr.Unmap().String()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if peer, ok := requestPeerIP(r); ok && peer.IsValid() {
|
||||||
|
return peer.String()
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
|
||||||
|
if !addr.IsValid() {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, p := range trusted {
|
||||||
|
if p.Contains(addr) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
|
||||||
|
cfg := OpsWSProxyConfig{
|
||||||
|
TrustProxy: true,
|
||||||
|
TrustedProxies: defaultTrustedProxies(),
|
||||||
|
OriginPolicy: OriginPolicyPermissive,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" {
|
||||||
|
if parsed, err := strconv.ParseBool(v); err == nil {
|
||||||
|
cfg.TrustProxy = parsed
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
|
||||||
|
prefixes, invalid := parseTrustedProxyList(raw)
|
||||||
|
if len(invalid) > 0 {
|
||||||
|
log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
|
||||||
|
}
|
||||||
|
cfg.TrustedProxies = prefixes
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" {
|
||||||
|
normalized := strings.ToLower(v)
|
||||||
|
switch normalized {
|
||||||
|
case OriginPolicyStrict, OriginPolicyPermissive:
|
||||||
|
cfg.OriginPolicy = normalized
|
||||||
|
default:
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits {
|
||||||
|
cfg := opsWSRuntimeLimits{
|
||||||
|
MaxConns: defaultMaxWSConns,
|
||||||
|
MaxConnsPerIP: defaultMaxWSConnsPerIP,
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" {
|
||||||
|
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
|
||||||
|
cfg.MaxConns = int32(parsed)
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" {
|
||||||
|
if parsed, err := strconv.Atoi(v); err == nil && parsed >= 0 {
|
||||||
|
cfg.MaxConnsPerIP = int32(parsed)
|
||||||
|
} else {
|
||||||
|
log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
func defaultTrustedProxies() []netip.Prefix {
|
||||||
|
prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
|
||||||
|
return prefixes
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseTrustedProxyList parses a comma-separated list of CIDR prefixes and/or
// bare IP addresses. Bare addresses become host prefixes (/32 for IPv4, /128
// for IPv6, after unmapping). Returned prefixes are masked; entries that fail
// to parse are collected in invalid. Empty entries are skipped.
func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
	for _, token := range strings.Split(raw, ",") {
		entry := strings.TrimSpace(token)
		if entry == "" {
			continue
		}

		var (
			prefix netip.Prefix
			err    error
		)
		if strings.Contains(entry, "/") {
			prefix, err = netip.ParsePrefix(entry)
		} else {
			// A bare address becomes a single-host prefix.
			var addr netip.Addr
			if addr, err = netip.ParseAddr(entry); err == nil {
				addr = addr.Unmap()
				bits := 128
				if addr.Is4() {
					bits = 32
				}
				prefix = netip.PrefixFrom(addr, bits)
			}
		}

		if err != nil || !prefix.IsValid() {
			invalid = append(invalid, entry)
			continue
		}
		prefixes = append(prefixes, prefix.Masked())
	}
	return prefixes, invalid
}
|
||||||
|
|
||||||
|
// hostWithoutPort strips an optional :port from a host[:port] string,
// handling bracketed IPv6 literals ("[::1]:80", "[::1]") as well as plain
// hosts. Returns "" for blank input.
func hostWithoutPort(hostport string) string {
	hostport = strings.TrimSpace(hostport)
	switch {
	case hostport == "":
		return ""
	}
	// The common case: a well-formed host:port pair.
	if host, _, err := net.SplitHostPort(hostport); err == nil {
		return host
	}
	// A bracketed IPv6 literal without a port.
	if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
		return strings.Trim(hostport, "[]")
	}
	// Fall back to everything before the first colon (plain host, or a
	// malformed pair).
	return strings.Split(hostport, ":")[0]
}
|
||||||
@@ -19,14 +19,16 @@ type SettingHandler struct {
|
|||||||
settingService *service.SettingService
|
settingService *service.SettingService
|
||||||
emailService *service.EmailService
|
emailService *service.EmailService
|
||||||
turnstileService *service.TurnstileService
|
turnstileService *service.TurnstileService
|
||||||
|
opsService *service.OpsService
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewSettingHandler 创建系统设置处理器
|
// NewSettingHandler 创建系统设置处理器
|
||||||
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService) *SettingHandler {
|
func NewSettingHandler(settingService *service.SettingService, emailService *service.EmailService, turnstileService *service.TurnstileService, opsService *service.OpsService) *SettingHandler {
|
||||||
return &SettingHandler{
|
return &SettingHandler{
|
||||||
settingService: settingService,
|
settingService: settingService,
|
||||||
emailService: emailService,
|
emailService: emailService,
|
||||||
turnstileService: turnstileService,
|
turnstileService: turnstileService,
|
||||||
|
opsService: opsService,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -39,6 +41,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if ops monitoring is enabled (respects config.ops.enabled)
|
||||||
|
opsEnabled := h.opsService != nil && h.opsService.IsMonitoringEnabled(c.Request.Context())
|
||||||
|
|
||||||
response.Success(c, dto.SystemSettings{
|
response.Success(c, dto.SystemSettings{
|
||||||
RegistrationEnabled: settings.RegistrationEnabled,
|
RegistrationEnabled: settings.RegistrationEnabled,
|
||||||
EmailVerifyEnabled: settings.EmailVerifyEnabled,
|
EmailVerifyEnabled: settings.EmailVerifyEnabled,
|
||||||
@@ -72,6 +77,10 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: settings.FallbackModelAntigravity,
|
FallbackModelAntigravity: settings.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: settings.EnableIdentityPatch,
|
EnableIdentityPatch: settings.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: settings.IdentityPatchPrompt,
|
IdentityPatchPrompt: settings.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: opsEnabled && settings.OpsMonitoringEnabled,
|
||||||
|
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
|
||||||
|
OpsQueryModeDefault: settings.OpsQueryModeDefault,
|
||||||
|
OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,7 +104,7 @@ type UpdateSettingsRequest struct {
|
|||||||
TurnstileSiteKey string `json:"turnstile_site_key"`
|
TurnstileSiteKey string `json:"turnstile_site_key"`
|
||||||
TurnstileSecretKey string `json:"turnstile_secret_key"`
|
TurnstileSecretKey string `json:"turnstile_secret_key"`
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
LinuxDoConnectEnabled bool `json:"linuxdo_connect_enabled"`
|
LinuxDoConnectEnabled bool `json:"linuxdo_connect_enabled"`
|
||||||
LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"`
|
LinuxDoConnectClientID string `json:"linuxdo_connect_client_id"`
|
||||||
LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"`
|
LinuxDoConnectClientSecret string `json:"linuxdo_connect_client_secret"`
|
||||||
@@ -124,6 +133,12 @@ type UpdateSettingsRequest struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
|
||||||
|
OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
|
||||||
|
OpsQueryModeDefault *string `json:"ops_query_mode_default"`
|
||||||
|
OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateSettings 更新系统设置
|
// UpdateSettings 更新系统设置
|
||||||
@@ -208,6 +223,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ops metrics collector interval validation (seconds).
|
||||||
|
if req.OpsMetricsIntervalSeconds != nil {
|
||||||
|
v := *req.OpsMetricsIntervalSeconds
|
||||||
|
if v < 60 {
|
||||||
|
v = 60
|
||||||
|
}
|
||||||
|
if v > 3600 {
|
||||||
|
v = 3600
|
||||||
|
}
|
||||||
|
req.OpsMetricsIntervalSeconds = &v
|
||||||
|
}
|
||||||
|
|
||||||
settings := &service.SystemSettings{
|
settings := &service.SystemSettings{
|
||||||
RegistrationEnabled: req.RegistrationEnabled,
|
RegistrationEnabled: req.RegistrationEnabled,
|
||||||
EmailVerifyEnabled: req.EmailVerifyEnabled,
|
EmailVerifyEnabled: req.EmailVerifyEnabled,
|
||||||
@@ -241,6 +268,30 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: req.FallbackModelAntigravity,
|
FallbackModelAntigravity: req.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: req.EnableIdentityPatch,
|
EnableIdentityPatch: req.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: req.IdentityPatchPrompt,
|
IdentityPatchPrompt: req.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: func() bool {
|
||||||
|
if req.OpsMonitoringEnabled != nil {
|
||||||
|
return *req.OpsMonitoringEnabled
|
||||||
|
}
|
||||||
|
return previousSettings.OpsMonitoringEnabled
|
||||||
|
}(),
|
||||||
|
OpsRealtimeMonitoringEnabled: func() bool {
|
||||||
|
if req.OpsRealtimeMonitoringEnabled != nil {
|
||||||
|
return *req.OpsRealtimeMonitoringEnabled
|
||||||
|
}
|
||||||
|
return previousSettings.OpsRealtimeMonitoringEnabled
|
||||||
|
}(),
|
||||||
|
OpsQueryModeDefault: func() string {
|
||||||
|
if req.OpsQueryModeDefault != nil {
|
||||||
|
return *req.OpsQueryModeDefault
|
||||||
|
}
|
||||||
|
return previousSettings.OpsQueryModeDefault
|
||||||
|
}(),
|
||||||
|
OpsMetricsIntervalSeconds: func() int {
|
||||||
|
if req.OpsMetricsIntervalSeconds != nil {
|
||||||
|
return *req.OpsMetricsIntervalSeconds
|
||||||
|
}
|
||||||
|
return previousSettings.OpsMetricsIntervalSeconds
|
||||||
|
}(),
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
|
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
|
||||||
@@ -290,6 +341,10 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
|
|||||||
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
|
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
|
||||||
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
|
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
|
||||||
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
|
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
|
||||||
|
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
|
||||||
|
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
|
||||||
|
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
|
||||||
|
OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -411,6 +466,18 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
|
|||||||
if before.IdentityPatchPrompt != after.IdentityPatchPrompt {
|
if before.IdentityPatchPrompt != after.IdentityPatchPrompt {
|
||||||
changed = append(changed, "identity_patch_prompt")
|
changed = append(changed, "identity_patch_prompt")
|
||||||
}
|
}
|
||||||
|
if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled {
|
||||||
|
changed = append(changed, "ops_monitoring_enabled")
|
||||||
|
}
|
||||||
|
if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled {
|
||||||
|
changed = append(changed, "ops_realtime_monitoring_enabled")
|
||||||
|
}
|
||||||
|
if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
|
||||||
|
changed = append(changed, "ops_query_mode_default")
|
||||||
|
}
|
||||||
|
if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
|
||||||
|
changed = append(changed, "ops_metrics_interval_seconds")
|
||||||
|
}
|
||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,12 @@ type SystemSettings struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
|
||||||
|
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
|
||||||
|
OpsQueryModeDefault string `json:"ops_query_mode_default"`
|
||||||
|
OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type PublicSettings struct {
|
type PublicSettings struct {
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ import (
|
|||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
|
||||||
pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
pkgerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
||||||
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
@@ -89,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
parsedReq, err := service.ParseGatewayRequest(body)
|
parsedReq, err := service.ParseGatewayRequest(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
||||||
@@ -97,8 +98,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
reqModel := parsedReq.Model
|
reqModel := parsedReq.Model
|
||||||
reqStream := parsedReq.Stream
|
reqStream := parsedReq.Stream
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
setOpsRequestContext(c, reqModel, reqStream, body)
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
// 验证 model 必填
|
// 验证 model 必填
|
||||||
if reqModel == "" {
|
if reqModel == "" {
|
||||||
@@ -112,15 +112,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
// 获取订阅信息(可能为nil)- 提前获取用于后续检查
|
// 获取订阅信息(可能为nil)- 提前获取用于后续检查
|
||||||
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
// 获取 User-Agent
|
|
||||||
userAgent := c.Request.UserAgent()
|
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
// 0. 检查wait队列是否已满
|
// 0. 检查wait队列是否已满
|
||||||
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
||||||
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
// On error, allow request to proceed
|
// On error, allow request to proceed
|
||||||
@@ -128,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// 确保在函数退出时减少wait计数
|
if err == nil && canWait {
|
||||||
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
waitCounted = true
|
||||||
|
}
|
||||||
|
// Ensure we decrement if we exit before acquiring the user slot.
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1. 首先获取用户并发槽位
|
// 1. 首先获取用户并发槽位
|
||||||
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
||||||
@@ -138,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
h.handleConcurrencyError(c, err, "user", streamStarted)
|
h.handleConcurrencyError(c, err, "user", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// User slot acquired: no longer waiting in the queue.
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
|
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -184,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 检查预热请求拦截(在账号选择后、转发前检查)
|
// 检查预热请求拦截(在账号选择后、转发前检查)
|
||||||
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
||||||
@@ -200,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
|
|
||||||
// 3. 获取账号并发槽位
|
// 3. 获取账号并发槽位
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -213,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
// Ensure the wait counter is decremented if we exit before acquiring the slot.
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -229,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Slot acquired: no longer waiting in queue.
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 转发请求 - 根据账号平台分流
|
// 转发请求 - 根据账号平台分流
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -254,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -277,7 +287,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 异步记录使用量(subscription已在函数开头获取)
|
// 异步记录使用量(subscription已在函数开头获取)
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -286,12 +296,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -313,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 检查预热请求拦截(在账号选择后、转发前检查)
|
// 检查预热请求拦截(在账号选择后、转发前检查)
|
||||||
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
|
||||||
@@ -329,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
|
|
||||||
// 3. 获取账号并发槽位
|
// 3. 获取账号并发槽位
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -342,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -358,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 转发请求 - 根据账号平台分流
|
// 转发请求 - 根据账号平台分流
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -383,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -406,7 +415,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 异步记录使用量(subscription已在函数开头获取)
|
// 异步记录使用量(subscription已在函数开头获取)
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -415,12 +424,10 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -686,21 +693,22 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
parsedReq, err := service.ParseGatewayRequest(body)
|
parsedReq, err := service.ParseGatewayRequest(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
// 验证 model 必填
|
// 验证 model 必填
|
||||||
if parsedReq.Model == "" {
|
if parsedReq.Model == "" {
|
||||||
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
|
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body)
|
||||||
|
|
||||||
// 获取订阅信息(可能为nil)
|
// 获取订阅信息(可能为nil)
|
||||||
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
subscription, _ := middleware2.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
@@ -721,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
|
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 转发请求(不记录使用量)
|
// 转发请求(不记录使用量)
|
||||||
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {
|
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/googleapi"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/googleapi"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
|
||||||
@@ -162,28 +161,32 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, modelName, stream, body)
|
||||||
|
|
||||||
// Get subscription (may be nil)
|
// Get subscription (may be nil)
|
||||||
subscription, _ := middleware.GetSubscriptionFromContext(c)
|
subscription, _ := middleware.GetSubscriptionFromContext(c)
|
||||||
|
|
||||||
// 获取 User-Agent
|
|
||||||
userAgent := c.Request.UserAgent()
|
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
// For Gemini native API, do not send Claude-style ping frames.
|
// For Gemini native API, do not send Claude-style ping frames.
|
||||||
geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0)
|
geminiConcurrency := NewConcurrencyHelper(h.concurrencyHelper.concurrencyService, SSEPingFormatNone, 0)
|
||||||
|
|
||||||
// 0) wait queue check
|
// 0) wait queue check
|
||||||
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
|
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
|
||||||
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
|
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
} else if !canWait {
|
} else if !canWait {
|
||||||
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
if err == nil && canWait {
|
||||||
|
waitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1) user concurrency slot
|
// 1) user concurrency slot
|
||||||
streamStarted := false
|
streamStarted := false
|
||||||
@@ -192,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
googleError(c, http.StatusTooManyRequests, err.Error())
|
googleError(c, http.StatusTooManyRequests, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if waitCounted {
|
||||||
|
geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -207,10 +214,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
|
|
||||||
// 3) select account (sticky session based on request body)
|
// 3) select account (sticky session based on request body)
|
||||||
parsedReq, _ := service.ParseGatewayRequest(body)
|
parsedReq, _ := service.ParseGatewayRequest(body)
|
||||||
|
|
||||||
// 设置 Claude Code 客户端标识到 context(用于分组限制检查)
|
|
||||||
SetClaudeCodeClientContext(c, body)
|
|
||||||
|
|
||||||
sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)
|
sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)
|
||||||
sessionKey := sessionHash
|
sessionKey := sessionHash
|
||||||
if sessionHash != "" {
|
if sessionHash != "" {
|
||||||
@@ -232,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 4) account concurrency slot
|
// 4) account concurrency slot
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
|
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -248,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -264,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
googleError(c, http.StatusTooManyRequests, err.Error())
|
googleError(c, http.StatusTooManyRequests, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// 5) forward (根据平台分流)
|
// 5) forward (根据平台分流)
|
||||||
var result *service.ForwardResult
|
var result *service.ForwardResult
|
||||||
@@ -288,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -311,7 +315,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 6) record usage async
|
// 6) record usage async
|
||||||
go func(result *service.ForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.ForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.RecordUsageInput{
|
||||||
@@ -320,12 +324,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ type AdminHandlers struct {
|
|||||||
Redeem *admin.RedeemHandler
|
Redeem *admin.RedeemHandler
|
||||||
Promo *admin.PromoHandler
|
Promo *admin.PromoHandler
|
||||||
Setting *admin.SettingHandler
|
Setting *admin.SettingHandler
|
||||||
|
Ops *admin.OpsHandler
|
||||||
System *admin.SystemHandler
|
System *admin.SystemHandler
|
||||||
Subscription *admin.SubscriptionHandler
|
Subscription *admin.SubscriptionHandler
|
||||||
Usage *admin.UsageHandler
|
Usage *admin.UsageHandler
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/ip"
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
|
||||||
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
@@ -77,6 +76,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, "", false, body)
|
||||||
|
|
||||||
// Parse request body to map for potential modification
|
// Parse request body to map for potential modification
|
||||||
var reqBody map[string]any
|
var reqBody map[string]any
|
||||||
if err := json.Unmarshal(body, &reqBody); err != nil {
|
if err := json.Unmarshal(body, &reqBody); err != nil {
|
||||||
@@ -95,10 +96,6 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
userAgent := c.GetHeader("User-Agent")
|
userAgent := c.GetHeader("User-Agent")
|
||||||
|
|
||||||
// 获取客户端 IP
|
|
||||||
clientIP := ip.GetClientIP(c)
|
|
||||||
|
|
||||||
if !openai.IsCodexCLIRequest(userAgent) {
|
if !openai.IsCodexCLIRequest(userAgent) {
|
||||||
existingInstructions, _ := reqBody["instructions"].(string)
|
existingInstructions, _ := reqBody["instructions"].(string)
|
||||||
if strings.TrimSpace(existingInstructions) == "" {
|
if strings.TrimSpace(existingInstructions) == "" {
|
||||||
@@ -114,6 +111,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setOpsRequestContext(c, reqModel, reqStream, body)
|
||||||
|
|
||||||
// Track if we've started streaming (for error handling)
|
// Track if we've started streaming (for error handling)
|
||||||
streamStarted := false
|
streamStarted := false
|
||||||
|
|
||||||
@@ -123,6 +122,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
// 0. Check if wait queue is full
|
// 0. Check if wait queue is full
|
||||||
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
maxWait := service.CalculateMaxWait(subject.Concurrency)
|
||||||
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
|
||||||
|
waitCounted := false
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment wait count failed: %v", err)
|
log.Printf("Increment wait count failed: %v", err)
|
||||||
// On error, allow request to proceed
|
// On error, allow request to proceed
|
||||||
@@ -130,8 +130,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Ensure wait count is decremented when function exits
|
if err == nil && canWait {
|
||||||
defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
waitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// 1. First acquire user concurrency slot
|
// 1. First acquire user concurrency slot
|
||||||
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
|
||||||
@@ -140,6 +146,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
h.handleConcurrencyError(c, err, "user", streamStarted)
|
h.handleConcurrencyError(c, err, "user", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// User slot acquired: no longer waiting.
|
||||||
|
if waitCounted {
|
||||||
|
h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
|
||||||
|
waitCounted = false
|
||||||
|
}
|
||||||
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
|
||||||
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
|
||||||
if userReleaseFunc != nil {
|
if userReleaseFunc != nil {
|
||||||
@@ -177,15 +188,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
account := selection.Account
|
account := selection.Account
|
||||||
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
|
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
|
||||||
|
setOpsSelectedAccount(c, account.ID)
|
||||||
|
|
||||||
// 3. Acquire account concurrency slot
|
// 3. Acquire account concurrency slot
|
||||||
accountReleaseFunc := selection.ReleaseFunc
|
accountReleaseFunc := selection.ReleaseFunc
|
||||||
var accountWaitRelease func()
|
|
||||||
if !selection.Acquired {
|
if !selection.Acquired {
|
||||||
if selection.WaitPlan == nil {
|
if selection.WaitPlan == nil {
|
||||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
accountWaitCounted := false
|
||||||
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Increment account wait count failed: %v", err)
|
log.Printf("Increment account wait count failed: %v", err)
|
||||||
@@ -193,12 +205,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
log.Printf("Account wait queue full: account=%d", account.ID)
|
log.Printf("Account wait queue full: account=%d", account.ID)
|
||||||
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
|
||||||
return
|
return
|
||||||
} else {
|
}
|
||||||
// Only set release function if increment succeeded
|
if err == nil && canWait {
|
||||||
accountWaitRelease = func() {
|
accountWaitCounted = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if accountWaitCounted {
|
||||||
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
}
|
}
|
||||||
}
|
}()
|
||||||
|
|
||||||
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
|
||||||
c,
|
c,
|
||||||
@@ -209,29 +224,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
&streamStarted,
|
&streamStarted,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
log.Printf("Account concurrency acquire failed: %v", err)
|
log.Printf("Account concurrency acquire failed: %v", err)
|
||||||
h.handleConcurrencyError(c, err, "account", streamStarted)
|
h.handleConcurrencyError(c, err, "account", streamStarted)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if accountWaitCounted {
|
||||||
|
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
|
||||||
|
accountWaitCounted = false
|
||||||
|
}
|
||||||
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil {
|
if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil {
|
||||||
log.Printf("Bind sticky session failed: %v", err)
|
log.Printf("Bind sticky session failed: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 账号槽位/等待计数需要在超时或断开时安全回收
|
// 账号槽位/等待计数需要在超时或断开时安全回收
|
||||||
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
|
||||||
accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
|
|
||||||
|
|
||||||
// Forward request
|
// Forward request
|
||||||
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
|
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
|
||||||
if accountReleaseFunc != nil {
|
if accountReleaseFunc != nil {
|
||||||
accountReleaseFunc()
|
accountReleaseFunc()
|
||||||
}
|
}
|
||||||
if accountWaitRelease != nil {
|
|
||||||
accountWaitRelease()
|
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var failoverErr *service.UpstreamFailoverError
|
var failoverErr *service.UpstreamFailoverError
|
||||||
if errors.As(err, &failoverErr) {
|
if errors.As(err, &failoverErr) {
|
||||||
@@ -252,7 +264,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Async record usage
|
// Async record usage
|
||||||
go func(result *service.OpenAIForwardResult, usedAccount *service.Account, ua string, cip string) {
|
go func(result *service.OpenAIForwardResult, usedAccount *service.Account) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{
|
if err := h.gatewayService.RecordUsage(ctx, &service.OpenAIRecordUsageInput{
|
||||||
@@ -261,12 +273,10 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
|
|||||||
User: apiKey.User,
|
User: apiKey.User,
|
||||||
Account: usedAccount,
|
Account: usedAccount,
|
||||||
Subscription: subscription,
|
Subscription: subscription,
|
||||||
UserAgent: ua,
|
|
||||||
IPAddress: cip,
|
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Printf("Record usage failed: %v", err)
|
log.Printf("Record usage failed: %v", err)
|
||||||
}
|
}
|
||||||
}(result, account, userAgent, clientIP)
|
}(result, account)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
954
backend/internal/handler/ops_error_logger.go
Normal file
954
backend/internal/handler/ops_error_logger.go
Normal file
@@ -0,0 +1,954 @@
|
|||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"runtime"
|
||||||
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
|
||||||
|
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsModelKey = "ops_model"
|
||||||
|
opsStreamKey = "ops_stream"
|
||||||
|
opsRequestBodyKey = "ops_request_body"
|
||||||
|
opsAccountIDKey = "ops_account_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsErrorLogTimeout = 5 * time.Second
|
||||||
|
opsErrorLogDrainTimeout = 10 * time.Second
|
||||||
|
|
||||||
|
opsErrorLogMinWorkerCount = 4
|
||||||
|
opsErrorLogMaxWorkerCount = 32
|
||||||
|
|
||||||
|
opsErrorLogQueueSizePerWorker = 128
|
||||||
|
opsErrorLogMinQueueSize = 256
|
||||||
|
opsErrorLogMaxQueueSize = 8192
|
||||||
|
)
|
||||||
|
|
||||||
|
type opsErrorLogJob struct {
|
||||||
|
ops *service.OpsService
|
||||||
|
entry *service.OpsInsertErrorLogInput
|
||||||
|
requestBody []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
opsErrorLogOnce sync.Once
|
||||||
|
opsErrorLogQueue chan opsErrorLogJob
|
||||||
|
|
||||||
|
opsErrorLogStopOnce sync.Once
|
||||||
|
opsErrorLogWorkersWg sync.WaitGroup
|
||||||
|
opsErrorLogMu sync.RWMutex
|
||||||
|
opsErrorLogStopping bool
|
||||||
|
opsErrorLogQueueLen atomic.Int64
|
||||||
|
opsErrorLogEnqueued atomic.Int64
|
||||||
|
opsErrorLogDropped atomic.Int64
|
||||||
|
opsErrorLogProcessed atomic.Int64
|
||||||
|
|
||||||
|
opsErrorLogLastDropLogAt atomic.Int64
|
||||||
|
|
||||||
|
opsErrorLogShutdownCh = make(chan struct{})
|
||||||
|
opsErrorLogShutdownOnce sync.Once
|
||||||
|
opsErrorLogDrained atomic.Bool
|
||||||
|
)
|
||||||
|
|
||||||
|
func startOpsErrorLogWorkers() {
|
||||||
|
opsErrorLogMu.Lock()
|
||||||
|
defer opsErrorLogMu.Unlock()
|
||||||
|
|
||||||
|
if opsErrorLogStopping {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
workerCount, queueSize := opsErrorLogConfig()
|
||||||
|
opsErrorLogQueue = make(chan opsErrorLogJob, queueSize)
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
|
||||||
|
opsErrorLogWorkersWg.Add(workerCount)
|
||||||
|
for i := 0; i < workerCount; i++ {
|
||||||
|
go func() {
|
||||||
|
defer opsErrorLogWorkersWg.Done()
|
||||||
|
for job := range opsErrorLogQueue {
|
||||||
|
opsErrorLogQueueLen.Add(-1)
|
||||||
|
if job.ops == nil || job.entry == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
func() {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
|
||||||
|
_ = job.ops.RecordError(ctx, job.entry, job.requestBody)
|
||||||
|
cancel()
|
||||||
|
opsErrorLogProcessed.Add(1)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) {
|
||||||
|
if ops == nil || entry == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-opsErrorLogShutdownCh:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
stopping := opsErrorLogStopping
|
||||||
|
opsErrorLogMu.RUnlock()
|
||||||
|
if stopping {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
opsErrorLogOnce.Do(startOpsErrorLogWorkers)
|
||||||
|
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
defer opsErrorLogMu.RUnlock()
|
||||||
|
if opsErrorLogStopping || opsErrorLogQueue == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}:
|
||||||
|
opsErrorLogQueueLen.Add(1)
|
||||||
|
opsErrorLogEnqueued.Add(1)
|
||||||
|
default:
|
||||||
|
// Queue is full; drop to avoid blocking request handling.
|
||||||
|
opsErrorLogDropped.Add(1)
|
||||||
|
maybeLogOpsErrorLogDrop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func StopOpsErrorLogWorkers() bool {
|
||||||
|
opsErrorLogStopOnce.Do(func() {
|
||||||
|
opsErrorLogShutdownOnce.Do(func() {
|
||||||
|
close(opsErrorLogShutdownCh)
|
||||||
|
})
|
||||||
|
opsErrorLogDrained.Store(stopOpsErrorLogWorkers())
|
||||||
|
})
|
||||||
|
return opsErrorLogDrained.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func stopOpsErrorLogWorkers() bool {
|
||||||
|
opsErrorLogMu.Lock()
|
||||||
|
opsErrorLogStopping = true
|
||||||
|
ch := opsErrorLogQueue
|
||||||
|
if ch != nil {
|
||||||
|
close(ch)
|
||||||
|
}
|
||||||
|
opsErrorLogQueue = nil
|
||||||
|
opsErrorLogMu.Unlock()
|
||||||
|
|
||||||
|
if ch == nil {
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
opsErrorLogWorkersWg.Wait()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
opsErrorLogQueueLen.Store(0)
|
||||||
|
return true
|
||||||
|
case <-time.After(opsErrorLogDrainTimeout):
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogQueueLength() int64 {
|
||||||
|
return opsErrorLogQueueLen.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogQueueCapacity() int {
|
||||||
|
opsErrorLogMu.RLock()
|
||||||
|
ch := opsErrorLogQueue
|
||||||
|
opsErrorLogMu.RUnlock()
|
||||||
|
if ch == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return cap(ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogDroppedTotal() int64 {
|
||||||
|
return opsErrorLogDropped.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogEnqueuedTotal() int64 {
|
||||||
|
return opsErrorLogEnqueued.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func OpsErrorLogProcessedTotal() int64 {
|
||||||
|
return opsErrorLogProcessed.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func maybeLogOpsErrorLogDrop() {
|
||||||
|
now := time.Now().Unix()
|
||||||
|
|
||||||
|
for {
|
||||||
|
last := opsErrorLogLastDropLogAt.Load()
|
||||||
|
if last != 0 && now-last < 60 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
queued := opsErrorLogQueueLen.Load()
|
||||||
|
queueCap := OpsErrorLogQueueCapacity()
|
||||||
|
|
||||||
|
log.Printf(
|
||||||
|
"[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)",
|
||||||
|
queued,
|
||||||
|
queueCap,
|
||||||
|
opsErrorLogEnqueued.Load(),
|
||||||
|
opsErrorLogDropped.Load(),
|
||||||
|
opsErrorLogProcessed.Load(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsErrorLogConfig() (workerCount int, queueSize int) {
|
||||||
|
workerCount = runtime.GOMAXPROCS(0) * 2
|
||||||
|
if workerCount < opsErrorLogMinWorkerCount {
|
||||||
|
workerCount = opsErrorLogMinWorkerCount
|
||||||
|
}
|
||||||
|
if workerCount > opsErrorLogMaxWorkerCount {
|
||||||
|
workerCount = opsErrorLogMaxWorkerCount
|
||||||
|
}
|
||||||
|
|
||||||
|
queueSize = workerCount * opsErrorLogQueueSizePerWorker
|
||||||
|
if queueSize < opsErrorLogMinQueueSize {
|
||||||
|
queueSize = opsErrorLogMinQueueSize
|
||||||
|
}
|
||||||
|
if queueSize > opsErrorLogMaxQueueSize {
|
||||||
|
queueSize = opsErrorLogMaxQueueSize
|
||||||
|
}
|
||||||
|
|
||||||
|
return workerCount, queueSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Set(opsModelKey, model)
|
||||||
|
c.Set(opsStreamKey, stream)
|
||||||
|
if len(requestBody) > 0 {
|
||||||
|
c.Set(opsRequestBodyKey, requestBody)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setOpsSelectedAccount(c *gin.Context, accountID int64) {
|
||||||
|
if c == nil || accountID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Set(opsAccountIDKey, accountID)
|
||||||
|
}
|
||||||
|
|
||||||
|
type opsCaptureWriter struct {
|
||||||
|
gin.ResponseWriter
|
||||||
|
limit int
|
||||||
|
buf bytes.Buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *opsCaptureWriter) Write(b []byte) (int, error) {
|
||||||
|
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
|
||||||
|
remaining := w.limit - w.buf.Len()
|
||||||
|
if len(b) > remaining {
|
||||||
|
_, _ = w.buf.Write(b[:remaining])
|
||||||
|
} else {
|
||||||
|
_, _ = w.buf.Write(b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w.ResponseWriter.Write(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *opsCaptureWriter) WriteString(s string) (int, error) {
|
||||||
|
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
|
||||||
|
remaining := w.limit - w.buf.Len()
|
||||||
|
if len(s) > remaining {
|
||||||
|
_, _ = w.buf.WriteString(s[:remaining])
|
||||||
|
} else {
|
||||||
|
_, _ = w.buf.WriteString(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w.ResponseWriter.WriteString(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs.
|
||||||
|
//
|
||||||
|
// Notes:
|
||||||
|
// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic.
|
||||||
|
// - Streaming errors after the response has started (SSE) may still need explicit logging.
|
||||||
|
func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
||||||
|
return func(c *gin.Context) {
|
||||||
|
w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024}
|
||||||
|
c.Writer = w
|
||||||
|
c.Next()
|
||||||
|
|
||||||
|
if ops == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !ops.IsMonitoringEnabled(c.Request.Context()) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
status := c.Writer.Status()
|
||||||
|
if status < 400 {
|
||||||
|
// Even when the client request succeeds, we still want to persist upstream error attempts
|
||||||
|
// (retries/failover) so ops can observe upstream instability that gets "covered" by retries.
|
||||||
|
var events []*service.OpsUpstreamErrorEvent
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||||
|
if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 {
|
||||||
|
events = arr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Also accept single upstream fields set by gateway services (rare for successful requests).
|
||||||
|
hasUpstreamContext := len(events) > 0
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
hasUpstreamContext = t > 0
|
||||||
|
case int64:
|
||||||
|
hasUpstreamContext = t > 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
hasUpstreamContext = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
hasUpstreamContext = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasUpstreamContext {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||||
|
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||||
|
|
||||||
|
model, _ := c.Get(opsModelKey)
|
||||||
|
streamV, _ := c.Get(opsStreamKey)
|
||||||
|
accountIDV, _ := c.Get(opsAccountIDKey)
|
||||||
|
|
||||||
|
var modelName string
|
||||||
|
if s, ok := model.(string); ok {
|
||||||
|
modelName = s
|
||||||
|
}
|
||||||
|
stream := false
|
||||||
|
if b, ok := streamV.(bool); ok {
|
||||||
|
stream = b
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer showing the account that experienced the upstream error (if we have events),
|
||||||
|
// otherwise fall back to the final selected account (best-effort).
|
||||||
|
var accountID *int64
|
||||||
|
if len(events) > 0 {
|
||||||
|
if last := events[len(events)-1]; last != nil && last.AccountID > 0 {
|
||||||
|
v := last.AccountID
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if accountID == nil {
|
||||||
|
if v, ok := accountIDV.(int64); ok && v > 0 {
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
|
||||||
|
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
|
||||||
|
|
||||||
|
requestID := c.Writer.Header().Get("X-Request-Id")
|
||||||
|
if requestID == "" {
|
||||||
|
requestID = c.Writer.Header().Get("x-request-id")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort backfill single upstream fields from the last event (if present).
|
||||||
|
var upstreamStatusCode *int
|
||||||
|
var upstreamErrorMessage *string
|
||||||
|
var upstreamErrorDetail *string
|
||||||
|
if len(events) > 0 {
|
||||||
|
last := events[len(events)-1]
|
||||||
|
if last != nil {
|
||||||
|
if last.UpstreamStatusCode > 0 {
|
||||||
|
code := last.UpstreamStatusCode
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
if msg := strings.TrimSpace(last.Message); msg != "" {
|
||||||
|
upstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
if detail := strings.TrimSpace(last.Detail); detail != "" {
|
||||||
|
upstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if upstreamStatusCode == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
if t > 0 {
|
||||||
|
code := t
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
case int64:
|
||||||
|
if t > 0 {
|
||||||
|
code := int(t)
|
||||||
|
upstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if upstreamErrorMessage == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
msg := strings.TrimSpace(s)
|
||||||
|
upstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if upstreamErrorDetail == nil {
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||||
|
detail := strings.TrimSpace(s)
|
||||||
|
upstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we still have nothing meaningful, skip.
|
||||||
|
if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
effectiveUpstreamStatus := 0
|
||||||
|
if upstreamStatusCode != nil {
|
||||||
|
effectiveUpstreamStatus = *upstreamStatusCode
|
||||||
|
}
|
||||||
|
|
||||||
|
recoveredMsg := "Recovered upstream error"
|
||||||
|
if effectiveUpstreamStatus > 0 {
|
||||||
|
recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus)
|
||||||
|
}
|
||||||
|
if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" {
|
||||||
|
recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage)
|
||||||
|
}
|
||||||
|
recoveredMsg = truncateString(recoveredMsg, 2048)
|
||||||
|
|
||||||
|
entry := &service.OpsInsertErrorLogInput{
|
||||||
|
RequestID: requestID,
|
||||||
|
ClientRequestID: clientRequestID,
|
||||||
|
|
||||||
|
AccountID: accountID,
|
||||||
|
Platform: platform,
|
||||||
|
Model: modelName,
|
||||||
|
RequestPath: func() string {
|
||||||
|
if c.Request != nil && c.Request.URL != nil {
|
||||||
|
return c.Request.URL.Path
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
Stream: stream,
|
||||||
|
UserAgent: c.GetHeader("User-Agent"),
|
||||||
|
|
||||||
|
ErrorPhase: "upstream",
|
||||||
|
ErrorType: "upstream_error",
|
||||||
|
// Severity/retryability should reflect the upstream failure, not the final client status (200).
|
||||||
|
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
|
||||||
|
StatusCode: status,
|
||||||
|
IsBusinessLimited: false,
|
||||||
|
|
||||||
|
ErrorMessage: recoveredMsg,
|
||||||
|
ErrorBody: "",
|
||||||
|
|
||||||
|
ErrorSource: "upstream_http",
|
||||||
|
ErrorOwner: "provider",
|
||||||
|
|
||||||
|
UpstreamStatusCode: upstreamStatusCode,
|
||||||
|
UpstreamErrorMessage: upstreamErrorMessage,
|
||||||
|
UpstreamErrorDetail: upstreamErrorDetail,
|
||||||
|
UpstreamErrors: events,
|
||||||
|
|
||||||
|
IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus),
|
||||||
|
RetryCount: 0,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if apiKey != nil {
|
||||||
|
entry.APIKeyID = &apiKey.ID
|
||||||
|
if apiKey.User != nil {
|
||||||
|
entry.UserID = &apiKey.User.ID
|
||||||
|
}
|
||||||
|
if apiKey.GroupID != nil {
|
||||||
|
entry.GroupID = apiKey.GroupID
|
||||||
|
}
|
||||||
|
// Prefer group platform if present (more stable than inferring from path).
|
||||||
|
if apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
entry.Platform = apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var clientIP string
|
||||||
|
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
|
||||||
|
clientIP = ip
|
||||||
|
entry.ClientIP = &clientIP
|
||||||
|
}
|
||||||
|
|
||||||
|
var requestBody []byte
|
||||||
|
if v, ok := c.Get(opsRequestBodyKey); ok {
|
||||||
|
if b, ok := v.([]byte); ok && len(b) > 0 {
|
||||||
|
requestBody = b
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Store request headers/body only when an upstream error occurred to keep overhead minimal.
|
||||||
|
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
|
||||||
|
|
||||||
|
enqueueOpsErrorLog(ops, entry, requestBody)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
body := w.buf.Bytes()
|
||||||
|
parsed := parseOpsErrorResponse(body)
|
||||||
|
|
||||||
|
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||||
|
|
||||||
|
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||||
|
|
||||||
|
model, _ := c.Get(opsModelKey)
|
||||||
|
streamV, _ := c.Get(opsStreamKey)
|
||||||
|
accountIDV, _ := c.Get(opsAccountIDKey)
|
||||||
|
|
||||||
|
var modelName string
|
||||||
|
if s, ok := model.(string); ok {
|
||||||
|
modelName = s
|
||||||
|
}
|
||||||
|
stream := false
|
||||||
|
if b, ok := streamV.(bool); ok {
|
||||||
|
stream = b
|
||||||
|
}
|
||||||
|
var accountID *int64
|
||||||
|
if v, ok := accountIDV.(int64); ok && v > 0 {
|
||||||
|
accountID = &v
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
|
||||||
|
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
|
||||||
|
|
||||||
|
requestID := c.Writer.Header().Get("X-Request-Id")
|
||||||
|
if requestID == "" {
|
||||||
|
requestID = c.Writer.Header().Get("x-request-id")
|
||||||
|
}
|
||||||
|
|
||||||
|
phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code)
|
||||||
|
isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message)
|
||||||
|
|
||||||
|
errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
|
||||||
|
errorSource := classifyOpsErrorSource(phase, parsed.Message)
|
||||||
|
|
||||||
|
entry := &service.OpsInsertErrorLogInput{
|
||||||
|
RequestID: requestID,
|
||||||
|
ClientRequestID: clientRequestID,
|
||||||
|
|
||||||
|
AccountID: accountID,
|
||||||
|
Platform: platform,
|
||||||
|
Model: modelName,
|
||||||
|
RequestPath: func() string {
|
||||||
|
if c.Request != nil && c.Request.URL != nil {
|
||||||
|
return c.Request.URL.Path
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
Stream: stream,
|
||||||
|
UserAgent: c.GetHeader("User-Agent"),
|
||||||
|
|
||||||
|
ErrorPhase: phase,
|
||||||
|
ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code),
|
||||||
|
Severity: classifyOpsSeverity(parsed.ErrorType, status),
|
||||||
|
StatusCode: status,
|
||||||
|
IsBusinessLimited: isBusinessLimited,
|
||||||
|
|
||||||
|
ErrorMessage: parsed.Message,
|
||||||
|
// Keep the full captured error body (capture is already capped at 64KB) so the
|
||||||
|
// service layer can sanitize JSON before truncating for storage.
|
||||||
|
ErrorBody: string(body),
|
||||||
|
ErrorSource: errorSource,
|
||||||
|
ErrorOwner: errorOwner,
|
||||||
|
|
||||||
|
IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status),
|
||||||
|
RetryCount: 0,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Capture upstream error context set by gateway services (if present).
|
||||||
|
// This does NOT affect the client response; it enriches Ops troubleshooting data.
|
||||||
|
{
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case int:
|
||||||
|
if t > 0 {
|
||||||
|
code := t
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
case int64:
|
||||||
|
if t > 0 {
|
||||||
|
code := int(t)
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
if msg := strings.TrimSpace(s); msg != "" {
|
||||||
|
entry.UpstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
if detail := strings.TrimSpace(s); detail != "" {
|
||||||
|
entry.UpstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||||
|
if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 {
|
||||||
|
entry.UpstreamErrors = events
|
||||||
|
// Best-effort backfill the single upstream fields from the last event when missing.
|
||||||
|
last := events[len(events)-1]
|
||||||
|
if last != nil {
|
||||||
|
if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 {
|
||||||
|
code := last.UpstreamStatusCode
|
||||||
|
entry.UpstreamStatusCode = &code
|
||||||
|
}
|
||||||
|
if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" {
|
||||||
|
msg := strings.TrimSpace(last.Message)
|
||||||
|
entry.UpstreamErrorMessage = &msg
|
||||||
|
}
|
||||||
|
if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" {
|
||||||
|
detail := strings.TrimSpace(last.Detail)
|
||||||
|
entry.UpstreamErrorDetail = &detail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if apiKey != nil {
|
||||||
|
entry.APIKeyID = &apiKey.ID
|
||||||
|
if apiKey.User != nil {
|
||||||
|
entry.UserID = &apiKey.User.ID
|
||||||
|
}
|
||||||
|
if apiKey.GroupID != nil {
|
||||||
|
entry.GroupID = apiKey.GroupID
|
||||||
|
}
|
||||||
|
// Prefer group platform if present (more stable than inferring from path).
|
||||||
|
if apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
entry.Platform = apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var clientIP string
|
||||||
|
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
|
||||||
|
clientIP = ip
|
||||||
|
entry.ClientIP = &clientIP
|
||||||
|
}
|
||||||
|
|
||||||
|
var requestBody []byte
|
||||||
|
if v, ok := c.Get(opsRequestBodyKey); ok {
|
||||||
|
if b, ok := v.([]byte); ok && len(b) > 0 {
|
||||||
|
requestBody = b
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Persist only a minimal, whitelisted set of request headers to improve retry fidelity.
|
||||||
|
// Do NOT store Authorization/Cookie/etc.
|
||||||
|
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
|
||||||
|
|
||||||
|
enqueueOpsErrorLog(ops, entry, requestBody)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsRetryRequestHeaderAllowlist = []string{
|
||||||
|
"anthropic-beta",
|
||||||
|
"anthropic-version",
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
|
||||||
|
if c == nil || c.Request == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
headers := make(map[string]string, 4)
|
||||||
|
for _, key := range opsRetryRequestHeaderAllowlist {
|
||||||
|
v := strings.TrimSpace(c.GetHeader(key))
|
||||||
|
if v == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Keep headers small even if a client sends something unexpected.
|
||||||
|
headers[key] = truncateString(v, 512)
|
||||||
|
}
|
||||||
|
if len(headers) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := json.Marshal(headers)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s := string(raw)
|
||||||
|
return &s
|
||||||
|
}
|
||||||
|
|
||||||
|
type parsedOpsError struct {
|
||||||
|
ErrorType string
|
||||||
|
Message string
|
||||||
|
Code string
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseOpsErrorResponse(body []byte) parsedOpsError {
|
||||||
|
if len(body) == 0 {
|
||||||
|
return parsedOpsError{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fast path: attempt to decode into a generic map.
|
||||||
|
var m map[string]any
|
||||||
|
if err := json.Unmarshal(body, &m); err != nil {
|
||||||
|
return parsedOpsError{Message: truncateString(string(body), 1024)}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } }
|
||||||
|
if errObj, ok := m["error"].(map[string]any); ok {
|
||||||
|
t, _ := errObj["type"].(string)
|
||||||
|
msg, _ := errObj["message"].(string)
|
||||||
|
// Gemini googleError also uses "error": { code, message, status }
|
||||||
|
if msg == "" {
|
||||||
|
if v, ok := errObj["message"]; ok {
|
||||||
|
msg, _ = v.(string)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if t == "" {
|
||||||
|
// Gemini error does not have "type" field.
|
||||||
|
t = "api_error"
|
||||||
|
}
|
||||||
|
// For gemini error, capture numeric code as string for business-limited mapping if needed.
|
||||||
|
var code string
|
||||||
|
if v, ok := errObj["code"]; ok {
|
||||||
|
switch n := v.(type) {
|
||||||
|
case float64:
|
||||||
|
code = strconvItoa(int(n))
|
||||||
|
case int:
|
||||||
|
code = strconvItoa(n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return parsedOpsError{ErrorType: t, Message: msg, Code: code}
|
||||||
|
}
|
||||||
|
|
||||||
|
// APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." }
|
||||||
|
code, _ := m["code"].(string)
|
||||||
|
msg, _ := m["message"].(string)
|
||||||
|
if code != "" || msg != "" {
|
||||||
|
return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code}
|
||||||
|
}
|
||||||
|
|
||||||
|
return parsedOpsError{Message: truncateString(string(body), 1024)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
|
||||||
|
if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||||
|
return apiKey.Group.Platform
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
|
||||||
|
func guessPlatformFromPath(path string) string {
|
||||||
|
p := strings.ToLower(path)
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(p, "/antigravity/"):
|
||||||
|
return service.PlatformAntigravity
|
||||||
|
case strings.HasPrefix(p, "/v1beta/"):
|
||||||
|
return service.PlatformGemini
|
||||||
|
case strings.Contains(p, "/responses"):
|
||||||
|
return service.PlatformOpenAI
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeOpsErrorType passes through a non-empty error type unchanged and
// otherwise derives one from the business code, defaulting to "api_error".
func normalizeOpsErrorType(errType string, code string) string {
	if errType != "" {
		return errType
	}
	trimmed := strings.TrimSpace(code)
	if trimmed == "INSUFFICIENT_BALANCE" {
		return "billing_error"
	}
	for _, subscriptionCode := range []string{
		"USAGE_LIMIT_EXCEEDED",
		"SUBSCRIPTION_NOT_FOUND",
		"SUBSCRIPTION_INVALID",
	} {
		if trimmed == subscriptionCode {
			return "subscription_error"
		}
	}
	return "api_error"
}
|
||||||
|
|
||||||
|
// classifyOpsPhase maps an error (type, message, business code) to the gateway
// phase where it originated: billing, auth, concurrency, upstream, response,
// scheduling, or internal.
func classifyOpsPhase(errType, message, code string) string {
	lowered := strings.ToLower(message)

	// Business codes always indicate billing, regardless of error type.
	billingCodes := map[string]bool{
		"INSUFFICIENT_BALANCE":   true,
		"USAGE_LIMIT_EXCEEDED":   true,
		"SUBSCRIPTION_NOT_FOUND": true,
		"SUBSCRIPTION_INVALID":   true,
	}
	if billingCodes[strings.TrimSpace(code)] {
		return "billing"
	}

	switch errType {
	case "authentication_error":
		return "auth"
	case "billing_error", "subscription_error":
		return "billing"
	case "rate_limit_error":
		// Queue/pending wording marks a local concurrency limit rather than
		// an upstream rate limit.
		for _, marker := range []string{"concurrency", "pending", "queue"} {
			if strings.Contains(lowered, marker) {
				return "concurrency"
			}
		}
		return "upstream"
	case "invalid_request_error":
		return "response"
	case "upstream_error", "overloaded_error":
		return "upstream"
	case "api_error":
		if strings.Contains(lowered, "no available accounts") {
			return "scheduling"
		}
		return "internal"
	}
	return "internal"
}
|
||||||
|
|
||||||
|
// classifyOpsSeverity derives an incident priority (P1..P3) from the error
// type and HTTP status. Client-caused error types are always informational.
func classifyOpsSeverity(errType string, status int) string {
	clientCaused := map[string]struct{}{
		"invalid_request_error": {},
		"authentication_error":  {},
		"billing_error":         {},
		"subscription_error":    {},
	}
	if _, ok := clientCaused[errType]; ok {
		return "P3"
	}
	switch {
	case status >= 500, status == 429:
		return "P1"
	case status >= 400:
		return "P2"
	default:
		return "P3"
	}
}
|
||||||
|
|
||||||
|
// classifyOpsIsRetryable reports whether a retry of the original request has
// a reasonable chance of succeeding, based on error type and status code.
func classifyOpsIsRetryable(errType string, statusCode int) bool {
	switch errType {
	case "authentication_error", "invalid_request_error",
		"billing_error", "subscription_error":
		// Deterministic client/account problems: retrying cannot help.
		return false
	case "timeout_error", "rate_limit_error":
		// Both may be transient (upstream or queue); a retry can help.
		return true
	case "upstream_error", "overloaded_error":
		return statusCode >= 500 || statusCode == 429 || statusCode == 529
	}
	return statusCode >= 500
}
|
||||||
|
|
||||||
|
// classifyOpsIsBusinessLimited reports whether an error is a user-level
// business limit (balance/subscription/concurrency). SLA and error-rate
// metrics exclude business-limited requests. The status parameter is kept
// for interface stability but is not consulted.
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
	trimmedCode := strings.TrimSpace(code)
	for _, limited := range []string{
		"INSUFFICIENT_BALANCE",
		"USAGE_LIMIT_EXCEEDED",
		"SUBSCRIPTION_NOT_FOUND",
		"SUBSCRIPTION_INVALID",
	} {
		if trimmedCode == limited {
			return true
		}
	}
	if phase == "billing" || phase == "concurrency" {
		return true
	}
	// Upstream-attributed rate limits are not a user business limit.
	if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") {
		return false
	}
	return false
}
|
||||||
|
|
||||||
|
// classifyOpsErrorOwner attributes an error to the responsible party:
// "provider" (upstream vendor), "client" (caller/account), or "sub2api"
// (this gateway).
func classifyOpsErrorOwner(phase string, message string) string {
	providerPhases := map[string]bool{"upstream": true, "network": true}
	clientPhases := map[string]bool{
		"billing":     true,
		"concurrency": true,
		"auth":        true,
		"response":    true,
	}
	switch {
	case providerPhases[phase]:
		return "provider"
	case clientPhases[phase]:
		return "client"
	case strings.Contains(strings.ToLower(message), "upstream"):
		return "provider"
	default:
		return "sub2api"
	}
}
|
||||||
|
|
||||||
|
// classifyOpsErrorSource maps an error phase to a coarse source label used
// for ops dashboards; messages mentioning "upstream" fall back to
// "upstream_http", everything else to "internal".
func classifyOpsErrorSource(phase string, message string) string {
	sourceByPhase := map[string]string{
		"upstream":    "upstream_http",
		"network":     "upstream_network",
		"billing":     "billing",
		"concurrency": "concurrency",
	}
	if src, ok := sourceByPhase[phase]; ok {
		return src
	}
	if strings.Contains(strings.ToLower(message), "upstream") {
		return "upstream_http"
	}
	return "internal"
}
|
||||||
|
|
||||||
|
// truncateString returns s limited to at most max bytes without splitting a
// multi-byte UTF-8 rune at the cut point. max <= 0 yields "".
//
// Fix: the original re-validated the whole prefix with utf8.ValidString on
// every trim iteration (O(n·max) worst case) and, for strings that already
// contained invalid UTF-8 before the cut, stripped everything back past the
// invalid byte. Only a trailing partial rune (at most utf8.UTFMax-1 bytes)
// can result from the cut itself, so trim just that.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Drop trailing bytes that form an incomplete rune produced by the cut.
	for i := 0; i < utf8.UTFMax-1 && len(cut) > 0; i++ {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size > 1 {
			break // last rune is complete
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
|
||||||
|
|
||||||
|
// strconvItoa renders v in base-10 decimal form.
func strconvItoa(v int) string {
	return strconv.FormatInt(int64(v), 10)
}
|
||||||
@@ -21,6 +21,7 @@ func ProvideAdminHandlers(
|
|||||||
redeemHandler *admin.RedeemHandler,
|
redeemHandler *admin.RedeemHandler,
|
||||||
promoHandler *admin.PromoHandler,
|
promoHandler *admin.PromoHandler,
|
||||||
settingHandler *admin.SettingHandler,
|
settingHandler *admin.SettingHandler,
|
||||||
|
opsHandler *admin.OpsHandler,
|
||||||
systemHandler *admin.SystemHandler,
|
systemHandler *admin.SystemHandler,
|
||||||
subscriptionHandler *admin.SubscriptionHandler,
|
subscriptionHandler *admin.SubscriptionHandler,
|
||||||
usageHandler *admin.UsageHandler,
|
usageHandler *admin.UsageHandler,
|
||||||
@@ -39,6 +40,7 @@ func ProvideAdminHandlers(
|
|||||||
Redeem: redeemHandler,
|
Redeem: redeemHandler,
|
||||||
Promo: promoHandler,
|
Promo: promoHandler,
|
||||||
Setting: settingHandler,
|
Setting: settingHandler,
|
||||||
|
Ops: opsHandler,
|
||||||
System: systemHandler,
|
System: systemHandler,
|
||||||
Subscription: subscriptionHandler,
|
Subscription: subscriptionHandler,
|
||||||
Usage: usageHandler,
|
Usage: usageHandler,
|
||||||
@@ -109,6 +111,7 @@ var ProviderSet = wire.NewSet(
|
|||||||
admin.NewRedeemHandler,
|
admin.NewRedeemHandler,
|
||||||
admin.NewPromoHandler,
|
admin.NewPromoHandler,
|
||||||
admin.NewSettingHandler,
|
admin.NewSettingHandler,
|
||||||
|
admin.NewOpsHandler,
|
||||||
ProvideSystemHandler,
|
ProvideSystemHandler,
|
||||||
admin.NewSubscriptionHandler,
|
admin.NewSubscriptionHandler,
|
||||||
admin.NewUsageHandler,
|
admin.NewUsageHandler,
|
||||||
|
|||||||
@@ -7,7 +7,14 @@ type Key string
|
|||||||
const (
|
const (
|
||||||
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
|
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
|
||||||
ForcePlatform Key = "ctx_force_platform"
|
ForcePlatform Key = "ctx_force_platform"
|
||||||
// IsClaudeCodeClient 是否为 Claude Code 客户端,由中间件设置
|
|
||||||
|
// ClientRequestID 客户端请求的唯一标识,用于追踪请求全生命周期(用于 Ops 监控与排障)。
|
||||||
|
ClientRequestID Key = "ctx_client_request_id"
|
||||||
|
|
||||||
|
// RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。
|
||||||
|
RetryCount Key = "ctx_retry_count"
|
||||||
|
|
||||||
|
// IsClaudeCodeClient 标识当前请求是否来自 Claude Code 客户端
|
||||||
IsClaudeCodeClient Key = "ctx_is_claude_code_client"
|
IsClaudeCodeClient Key = "ctx_is_claude_code_client"
|
||||||
// Group 认证后的分组信息,由 API Key 认证中间件设置
|
// Group 认证后的分组信息,由 API Key 认证中间件设置
|
||||||
Group Key = "ctx_group"
|
Group Key = "ctx_group"
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ var (
|
|||||||
return redis.call('ZCARD', key)
|
return redis.call('ZCARD', key)
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// incrementWaitScript - only sets TTL on first creation to avoid refreshing
|
// incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate
|
||||||
// KEYS[1] = wait queue key
|
// KEYS[1] = wait queue key
|
||||||
// ARGV[1] = maxWait
|
// ARGV[1] = maxWait
|
||||||
// ARGV[2] = TTL in seconds
|
// ARGV[2] = TTL in seconds
|
||||||
@@ -111,15 +111,13 @@ var (
|
|||||||
|
|
||||||
local newVal = redis.call('INCR', KEYS[1])
|
local newVal = redis.call('INCR', KEYS[1])
|
||||||
|
|
||||||
-- Only set TTL on first creation to avoid refreshing zombie data
|
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
|
||||||
if newVal == 1 then
|
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||||
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
|
||||||
end
|
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// incrementAccountWaitScript - account-level wait queue count
|
// incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment)
|
||||||
incrementAccountWaitScript = redis.NewScript(`
|
incrementAccountWaitScript = redis.NewScript(`
|
||||||
local current = redis.call('GET', KEYS[1])
|
local current = redis.call('GET', KEYS[1])
|
||||||
if current == false then
|
if current == false then
|
||||||
@@ -134,10 +132,8 @@ var (
|
|||||||
|
|
||||||
local newVal = redis.call('INCR', KEYS[1])
|
local newVal = redis.call('INCR', KEYS[1])
|
||||||
|
|
||||||
-- Only set TTL on first creation to avoid refreshing zombie data
|
-- Refresh TTL so long-running traffic doesn't expire active queue counters.
|
||||||
if newVal == 1 then
|
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
||||||
redis.call('EXPIRE', KEYS[1], ARGV[2])
|
|
||||||
end
|
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
`)
|
`)
|
||||||
|
|||||||
707
backend/internal/repository/ops_repo.go
Normal file
707
backend/internal/repository/ops_repo.go
Normal file
@@ -0,0 +1,707 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// opsRepository is a raw-SQL implementation of service.OpsRepository backed
// by PostgreSQL (ops_error_logs / ops_retry_attempts tables).
type opsRepository struct {
	db *sql.DB // shared connection pool; every method nil-guards it defensively
}
|
||||||
|
|
||||||
|
// NewOpsRepository returns a service.OpsRepository backed by the given
// database handle.
func NewOpsRepository(db *sql.DB) service.OpsRepository {
	return &opsRepository{db: db}
}
|
||||||
|
|
||||||
|
// InsertErrorLog persists a single gateway error event into ops_error_logs
// and returns the new row's id. Optional fields are mapped to SQL NULL via
// the opsNull* helpers so empty strings / nil pointers are never stored as
// empty values.
func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) {
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}

	// NOTE: the column list, the $1..$34 placeholders, and the argument list
	// passed to QueryRowContext below must stay in exact positional sync.
	q := `
	INSERT INTO ops_error_logs (
		request_id,
		client_request_id,
		user_id,
		api_key_id,
		account_id,
		group_id,
		client_ip,
		platform,
		model,
		request_path,
		stream,
		user_agent,
		error_phase,
		error_type,
		severity,
		status_code,
		is_business_limited,
		error_message,
		error_body,
		error_source,
		error_owner,
		upstream_status_code,
		upstream_error_message,
		upstream_error_detail,
		upstream_errors,
		duration_ms,
		time_to_first_token_ms,
		request_body,
		request_body_truncated,
		request_body_bytes,
		request_headers,
		is_retryable,
		retry_count,
		created_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
	) RETURNING id`

	var id int64
	err := r.db.QueryRowContext(
		ctx,
		q,
		opsNullString(input.RequestID),
		opsNullString(input.ClientRequestID),
		opsNullInt64(input.UserID),
		opsNullInt64(input.APIKeyID),
		opsNullInt64(input.AccountID),
		opsNullInt64(input.GroupID),
		opsNullString(input.ClientIP),
		opsNullString(input.Platform),
		opsNullString(input.Model),
		opsNullString(input.RequestPath),
		input.Stream,
		opsNullString(input.UserAgent),
		input.ErrorPhase,
		input.ErrorType,
		opsNullString(input.Severity),
		opsNullInt(input.StatusCode),
		input.IsBusinessLimited,
		opsNullString(input.ErrorMessage),
		opsNullString(input.ErrorBody),
		opsNullString(input.ErrorSource),
		opsNullString(input.ErrorOwner),
		opsNullInt(input.UpstreamStatusCode),
		opsNullString(input.UpstreamErrorMessage),
		opsNullString(input.UpstreamErrorDetail),
		opsNullString(input.UpstreamErrorsJSON),
		opsNullInt(input.DurationMs),
		opsNullInt64(input.TimeToFirstTokenMs),
		opsNullString(input.RequestBodyJSON),
		input.RequestBodyTruncated,
		opsNullInt(input.RequestBodyBytes),
		opsNullString(input.RequestHeadersJSON),
		input.IsRetryable,
		input.RetryCount,
		input.CreatedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
|
||||||
|
|
||||||
|
// ListErrorLogs returns a page of error logs matching the filter, newest
// first, along with the total match count. A nil filter means "no filter";
// page defaults to 1, pageSize to 20 and is capped at 500.
func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsErrorLogFilter{}
	}

	// Normalize pagination inputs.
	page := filter.Page
	if page <= 0 {
		page = 1
	}
	pageSize := filter.PageSize
	if pageSize <= 0 {
		pageSize = 20
	}
	if pageSize > 500 {
		pageSize = 500
	}

	// The same WHERE clause (and args) is shared by the count and page queries
	// so total and rows always agree.
	where, args := buildOpsErrorLogsWhere(filter)
	countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where

	var total int
	if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil {
		return nil, err
	}

	offset := (page - 1) * pageSize
	argsWithLimit := append(args, pageSize, offset)
	// NOTE: the SELECT column order must stay in sync with the rows.Scan
	// destinations below.
	selectSQL := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream
	FROM ops_error_logs
	` + where + `
	ORDER BY created_at DESC
	LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)

	rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsErrorLog, 0, pageSize)
	for rows.Next() {
		var item service.OpsErrorLog
		// Nullable columns are scanned into sql.Null* intermediates and copied
		// to pointer fields only when present.
		var latency sql.NullInt64
		var statusCode sql.NullInt64
		var clientIP sql.NullString
		var userID sql.NullInt64
		var apiKeyID sql.NullInt64
		var accountID sql.NullInt64
		var groupID sql.NullInt64
		if err := rows.Scan(
			&item.ID,
			&item.CreatedAt,
			&item.Phase,
			&item.Type,
			&item.Severity,
			&statusCode,
			&item.Platform,
			&item.Model,
			&latency,
			&item.ClientRequestID,
			&item.RequestID,
			&item.Message,
			&userID,
			&apiKeyID,
			&accountID,
			&groupID,
			&clientIP,
			&item.RequestPath,
			&item.Stream,
		); err != nil {
			return nil, err
		}
		if latency.Valid {
			v := int(latency.Int64)
			item.LatencyMs = &v
		}
		item.StatusCode = int(statusCode.Int64)
		if clientIP.Valid {
			s := clientIP.String
			item.ClientIP = &s
		}
		if userID.Valid {
			v := userID.Int64
			item.UserID = &v
		}
		if apiKeyID.Valid {
			v := apiKeyID.Int64
			item.APIKeyID = &v
		}
		if accountID.Valid {
			v := accountID.Int64
			item.AccountID = &v
		}
		if groupID.Valid {
			v := groupID.Int64
			item.GroupID = &v
		}
		out = append(out, &item)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return &service.OpsErrorLogList{
		Errors:   out,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}
|
||||||
|
|
||||||
|
// GetErrorLogByID loads the full detail record for one error log, including
// request/response bodies, latency breakdown, and upstream error context.
// It returns sql.ErrNoRows (wrapped by database/sql) when the id is unknown.
func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if id <= 0 {
		return nil, fmt.Errorf("invalid id")
	}

	// NOTE: the SELECT column order must stay in sync with the Scan
	// destinations below.
	q := `
	SELECT
		id,
		created_at,
		error_phase,
		error_type,
		severity,
		COALESCE(upstream_status_code, status_code, 0),
		COALESCE(platform, ''),
		COALESCE(model, ''),
		duration_ms,
		COALESCE(client_request_id, ''),
		COALESCE(request_id, ''),
		COALESCE(error_message, ''),
		COALESCE(error_body, ''),
		upstream_status_code,
		COALESCE(upstream_error_message, ''),
		COALESCE(upstream_error_detail, ''),
		COALESCE(upstream_errors::text, ''),
		is_business_limited,
		user_id,
		api_key_id,
		account_id,
		group_id,
		CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
		COALESCE(request_path, ''),
		stream,
		COALESCE(user_agent, ''),
		auth_latency_ms,
		routing_latency_ms,
		upstream_latency_ms,
		response_latency_ms,
		time_to_first_token_ms,
		COALESCE(request_body::text, ''),
		request_body_truncated,
		request_body_bytes,
		COALESCE(request_headers::text, '')
	FROM ops_error_logs
	WHERE id = $1
	LIMIT 1`

	var out service.OpsErrorLogDetail
	// Nullable columns go through sql.Null* intermediates; pointer fields are
	// populated only when a value is present.
	var latency sql.NullInt64
	var statusCode sql.NullInt64
	var upstreamStatusCode sql.NullInt64
	var clientIP sql.NullString
	var userID sql.NullInt64
	var apiKeyID sql.NullInt64
	var accountID sql.NullInt64
	var groupID sql.NullInt64
	var authLatency sql.NullInt64
	var routingLatency sql.NullInt64
	var upstreamLatency sql.NullInt64
	var responseLatency sql.NullInt64
	var ttft sql.NullInt64
	var requestBodyBytes sql.NullInt64

	err := r.db.QueryRowContext(ctx, q, id).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.Phase,
		&out.Type,
		&out.Severity,
		&statusCode,
		&out.Platform,
		&out.Model,
		&latency,
		&out.ClientRequestID,
		&out.RequestID,
		&out.Message,
		&out.ErrorBody,
		&upstreamStatusCode,
		&out.UpstreamErrorMessage,
		&out.UpstreamErrorDetail,
		&out.UpstreamErrors,
		&out.IsBusinessLimited,
		&userID,
		&apiKeyID,
		&accountID,
		&groupID,
		&clientIP,
		&out.RequestPath,
		&out.Stream,
		&out.UserAgent,
		&authLatency,
		&routingLatency,
		&upstreamLatency,
		&responseLatency,
		&ttft,
		&out.RequestBody,
		&out.RequestBodyTruncated,
		&requestBodyBytes,
		&out.RequestHeaders,
	)
	if err != nil {
		return nil, err
	}

	out.StatusCode = int(statusCode.Int64)
	if latency.Valid {
		v := int(latency.Int64)
		out.LatencyMs = &v
	}
	if clientIP.Valid {
		s := clientIP.String
		out.ClientIP = &s
	}
	// Zero is treated as "no upstream status" even when the column is non-NULL.
	if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 {
		v := int(upstreamStatusCode.Int64)
		out.UpstreamStatusCode = &v
	}
	if userID.Valid {
		v := userID.Int64
		out.UserID = &v
	}
	if apiKeyID.Valid {
		v := apiKeyID.Int64
		out.APIKeyID = &v
	}
	if accountID.Valid {
		v := accountID.Int64
		out.AccountID = &v
	}
	if groupID.Valid {
		v := groupID.Int64
		out.GroupID = &v
	}
	if authLatency.Valid {
		v := authLatency.Int64
		out.AuthLatencyMs = &v
	}
	if routingLatency.Valid {
		v := routingLatency.Int64
		out.RoutingLatencyMs = &v
	}
	if upstreamLatency.Valid {
		v := upstreamLatency.Int64
		out.UpstreamLatencyMs = &v
	}
	if responseLatency.Valid {
		v := responseLatency.Int64
		out.ResponseLatencyMs = &v
	}
	if ttft.Valid {
		v := ttft.Int64
		out.TimeToFirstTokenMs = &v
	}
	if requestBodyBytes.Valid {
		v := int(requestBodyBytes.Int64)
		out.RequestBodyBytes = &v
	}

	// JSONB columns rendered via ::text produce the literal "null" for SQL
	// JSON null; normalize those to empty strings for API consumers.
	// Normalize request_body to empty string when stored as JSON null.
	out.RequestBody = strings.TrimSpace(out.RequestBody)
	if out.RequestBody == "null" {
		out.RequestBody = ""
	}
	// Normalize request_headers to empty string when stored as JSON null.
	out.RequestHeaders = strings.TrimSpace(out.RequestHeaders)
	if out.RequestHeaders == "null" {
		out.RequestHeaders = ""
	}
	// Normalize upstream_errors to empty string when stored as JSON null.
	out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors)
	if out.UpstreamErrors == "null" {
		out.UpstreamErrors = ""
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// InsertRetryAttempt records the start of a manual retry of a logged error
// and returns the attempt's id. SourceErrorID and Mode are mandatory; the
// completion fields are filled in later by UpdateRetryAttempt.
func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) {
	if r == nil || r.db == nil {
		return 0, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return 0, fmt.Errorf("nil input")
	}
	if input.SourceErrorID <= 0 {
		return 0, fmt.Errorf("invalid source_error_id")
	}
	if strings.TrimSpace(input.Mode) == "" {
		return 0, fmt.Errorf("invalid mode")
	}

	q := `
	INSERT INTO ops_retry_attempts (
		requested_by_user_id,
		source_error_id,
		mode,
		pinned_account_id,
		status,
		started_at
	) VALUES (
		$1,$2,$3,$4,$5,$6
	) RETURNING id`

	var id int64
	err := r.db.QueryRowContext(
		ctx,
		q,
		// Zero user id is stored as NULL via the pointer-aware helper.
		opsNullInt64(&input.RequestedByUserID),
		input.SourceErrorID,
		strings.TrimSpace(input.Mode),
		opsNullInt64(input.PinnedAccountID),
		strings.TrimSpace(input.Status),
		input.StartedAt,
	).Scan(&id)
	if err != nil {
		return 0, err
	}
	return id, nil
}
|
||||||
|
|
||||||
|
// UpdateRetryAttempt records the outcome of a retry attempt previously
// created by InsertRetryAttempt: final status, timing, and (when the retry
// itself failed) the resulting error linkage.
func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return fmt.Errorf("invalid id")
	}

	q := `
	UPDATE ops_retry_attempts
	SET
		status = $2,
		finished_at = $3,
		duration_ms = $4,
		result_request_id = $5,
		result_error_id = $6,
		error_message = $7
	WHERE id = $1`

	_, err := r.db.ExecContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Status),
		// A zero FinishedAt is persisted as NULL (see nullTime).
		nullTime(input.FinishedAt),
		input.DurationMs,
		opsNullString(input.ResultRequestID),
		opsNullInt64(input.ResultErrorID),
		opsNullString(input.ErrorMessage),
	)
	return err
}
|
||||||
|
|
||||||
|
// GetLatestRetryAttemptForError returns the most recent retry attempt for a
// given source error, or sql.ErrNoRows when none exists.
func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if sourceErrorID <= 0 {
		return nil, fmt.Errorf("invalid source_error_id")
	}

	// NOTE: the SELECT column order must stay in sync with the Scan
	// destinations below.
	q := `
	SELECT
		id,
		created_at,
		COALESCE(requested_by_user_id, 0),
		source_error_id,
		COALESCE(mode, ''),
		pinned_account_id,
		COALESCE(status, ''),
		started_at,
		finished_at,
		duration_ms,
		result_request_id,
		result_error_id,
		error_message
	FROM ops_retry_attempts
	WHERE source_error_id = $1
	ORDER BY created_at DESC
	LIMIT 1`

	var out service.OpsRetryAttempt
	// Nullable columns are scanned into sql.Null* intermediates and copied to
	// pointer fields only when present.
	var pinnedAccountID sql.NullInt64
	var requestedBy sql.NullInt64
	var startedAt sql.NullTime
	var finishedAt sql.NullTime
	var durationMs sql.NullInt64
	var resultRequestID sql.NullString
	var resultErrorID sql.NullInt64
	var errorMessage sql.NullString

	err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan(
		&out.ID,
		&out.CreatedAt,
		&requestedBy,
		&out.SourceErrorID,
		&out.Mode,
		&pinnedAccountID,
		&out.Status,
		&startedAt,
		&finishedAt,
		&durationMs,
		&resultRequestID,
		&resultErrorID,
		&errorMessage,
	)
	if err != nil {
		return nil, err
	}
	out.RequestedByUserID = requestedBy.Int64
	if pinnedAccountID.Valid {
		v := pinnedAccountID.Int64
		out.PinnedAccountID = &v
	}
	if startedAt.Valid {
		t := startedAt.Time
		out.StartedAt = &t
	}
	if finishedAt.Valid {
		t := finishedAt.Time
		out.FinishedAt = &t
	}
	if durationMs.Valid {
		v := durationMs.Int64
		out.DurationMs = &v
	}
	if resultRequestID.Valid {
		s := resultRequestID.String
		out.ResultRequestID = &s
	}
	if resultErrorID.Valid {
		v := resultErrorID.Int64
		out.ResultErrorID = &v
	}
	if errorMessage.Valid {
		s := errorMessage.String
		out.ErrorMessage = &s
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
func nullTime(t time.Time) sql.NullTime {
|
||||||
|
if t.IsZero() {
|
||||||
|
return sql.NullTime{}
|
||||||
|
}
|
||||||
|
return sql.NullTime{Time: t, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||||
|
clauses := make([]string, 0, 8)
|
||||||
|
args := make([]any, 0, 8)
|
||||||
|
clauses = append(clauses, "1=1")
|
||||||
|
|
||||||
|
phaseFilter := ""
|
||||||
|
if filter != nil {
|
||||||
|
phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase))
|
||||||
|
}
|
||||||
|
// ops_error_logs primarily stores client-visible error requests (status>=400),
|
||||||
|
// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
|
||||||
|
// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
|
||||||
|
if phaseFilter != "upstream" {
|
||||||
|
clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||||
|
args = append(args, filter.StartTime.UTC())
|
||||||
|
clauses = append(clauses, "created_at >= $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.EndTime != nil && !filter.EndTime.IsZero() {
|
||||||
|
args = append(args, filter.EndTime.UTC())
|
||||||
|
// Keep time-window semantics consistent with other ops queries: [start, end)
|
||||||
|
clauses = append(clauses, "created_at < $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if p := strings.TrimSpace(filter.Platform); p != "" {
|
||||||
|
args = append(args, p)
|
||||||
|
clauses = append(clauses, "platform = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
args = append(args, *filter.GroupID)
|
||||||
|
clauses = append(clauses, "group_id = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.AccountID != nil && *filter.AccountID > 0 {
|
||||||
|
args = append(args, *filter.AccountID)
|
||||||
|
clauses = append(clauses, "account_id = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if phase := phaseFilter; phase != "" {
|
||||||
|
args = append(args, phase)
|
||||||
|
clauses = append(clauses, "error_phase = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if len(filter.StatusCodes) > 0 {
|
||||||
|
args = append(args, pq.Array(filter.StatusCodes))
|
||||||
|
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||||
|
like := "%" + q + "%"
|
||||||
|
args = append(args, like)
|
||||||
|
n := itoa(len(args))
|
||||||
|
clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")")
|
||||||
|
}
|
||||||
|
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helpers for nullable args
|
||||||
|
func opsNullString(v any) any {
|
||||||
|
switch s := v.(type) {
|
||||||
|
case nil:
|
||||||
|
return sql.NullString{}
|
||||||
|
case *string:
|
||||||
|
if s == nil || strings.TrimSpace(*s) == "" {
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
return sql.NullString{String: strings.TrimSpace(*s), Valid: true}
|
||||||
|
case string:
|
||||||
|
if strings.TrimSpace(s) == "" {
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
return sql.NullString{String: strings.TrimSpace(s), Valid: true}
|
||||||
|
default:
|
||||||
|
return sql.NullString{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullInt64(v *int64) any {
|
||||||
|
if v == nil || *v == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullInt(v any) any {
|
||||||
|
switch n := v.(type) {
|
||||||
|
case nil:
|
||||||
|
return sql.NullInt64{}
|
||||||
|
case *int:
|
||||||
|
if n == nil || *n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: int64(*n), Valid: true}
|
||||||
|
case *int64:
|
||||||
|
if n == nil || *n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: *n, Valid: true}
|
||||||
|
case int:
|
||||||
|
if n == 0 {
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
return sql.NullInt64{Int64: int64(n), Valid: true}
|
||||||
|
default:
|
||||||
|
return sql.NullInt64{}
|
||||||
|
}
|
||||||
|
}
|
||||||
689
backend/internal/repository/ops_repo_alerts.go
Normal file
689
backend/internal/repository/ops_repo_alerts.go
Normal file
@@ -0,0 +1,689 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ListAlertRules returns every configured alert rule, newest first.
//
// Nullable text columns are COALESCE'd to '' so they scan into plain string
// fields; last_triggered_at and the filters JSONB blob are decoded into
// optional fields only when present.
func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}

	q := `
		SELECT
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at
		FROM ops_alert_rules
		ORDER BY id DESC`

	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := []*service.OpsAlertRule{}
	for rows.Next() {
		var rule service.OpsAlertRule
		var filtersRaw []byte
		var lastTriggeredAt sql.NullTime
		// Scan order must match the SELECT column list above.
		if err := rows.Scan(
			&rule.ID,
			&rule.Name,
			&rule.Description,
			&rule.Enabled,
			&rule.Severity,
			&rule.MetricType,
			&rule.Operator,
			&rule.Threshold,
			&rule.WindowMinutes,
			&rule.SustainedMinutes,
			&rule.CooldownMinutes,
			&rule.NotifyEmail,
			&filtersRaw,
			&lastTriggeredAt,
			&rule.CreatedAt,
			&rule.UpdatedAt,
		); err != nil {
			return nil, err
		}
		if lastTriggeredAt.Valid {
			v := lastTriggeredAt.Time
			rule.LastTriggeredAt = &v
		}
		// JSONB NULL scans back as the literal "null"; skip it along with
		// empty blobs. Decode errors are deliberately ignored so one
		// malformed filters value cannot fail the whole listing.
		if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
				rule.Filters = decoded
			}
		}
		out = append(out, &rule)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
// CreateAlertRule inserts a new alert rule and returns the persisted row,
// including the generated id and server-side created_at/updated_at.
// String inputs are trimmed before storage; Filters are serialized to JSONB
// (NULL when nil).
func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}

	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}

	q := `
		INSERT INTO ops_alert_rules (
			name,
			description,
			enabled,
			severity,
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			notify_email,
			filters,
			created_at,
			updated_at
		) VALUES (
			$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW()
		)
		RETURNING
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at`

	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime

	// Argument order must match the VALUES placeholders; scan order must
	// match the RETURNING column list.
	if err := r.db.QueryRowContext(
		ctx,
		q,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}
	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so a malformed blob cannot fail the create response.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// UpdateAlertRule replaces every mutable column of an existing rule and
// returns the updated row. A missing id surfaces as sql.ErrNoRows from the
// Scan. String inputs are trimmed; Filters are serialized to JSONB (NULL when
// nil). last_triggered_at is intentionally not modified here.
func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return nil, fmt.Errorf("nil input")
	}
	if input.ID <= 0 {
		return nil, fmt.Errorf("invalid id")
	}

	filtersArg, err := opsNullJSONMap(input.Filters)
	if err != nil {
		return nil, err
	}

	q := `
		UPDATE ops_alert_rules
		SET
			name = $2,
			description = $3,
			enabled = $4,
			severity = $5,
			metric_type = $6,
			operator = $7,
			threshold = $8,
			window_minutes = $9,
			sustained_minutes = $10,
			cooldown_minutes = $11,
			notify_email = $12,
			filters = $13,
			updated_at = NOW()
		WHERE id = $1
		RETURNING
			id,
			name,
			COALESCE(description, ''),
			enabled,
			COALESCE(severity, ''),
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			cooldown_minutes,
			COALESCE(notify_email, true),
			filters,
			last_triggered_at,
			created_at,
			updated_at`

	var out service.OpsAlertRule
	var filtersRaw []byte
	var lastTriggeredAt sql.NullTime

	// Argument order must match the SET placeholders ($1 is the id); scan
	// order must match the RETURNING column list.
	if err := r.db.QueryRowContext(
		ctx,
		q,
		input.ID,
		strings.TrimSpace(input.Name),
		strings.TrimSpace(input.Description),
		input.Enabled,
		strings.TrimSpace(input.Severity),
		strings.TrimSpace(input.MetricType),
		strings.TrimSpace(input.Operator),
		input.Threshold,
		input.WindowMinutes,
		input.SustainedMinutes,
		input.CooldownMinutes,
		input.NotifyEmail,
		filtersArg,
	).Scan(
		&out.ID,
		&out.Name,
		&out.Description,
		&out.Enabled,
		&out.Severity,
		&out.MetricType,
		&out.Operator,
		&out.Threshold,
		&out.WindowMinutes,
		&out.SustainedMinutes,
		&out.CooldownMinutes,
		&out.NotifyEmail,
		&filtersRaw,
		&lastTriggeredAt,
		&out.CreatedAt,
		&out.UpdatedAt,
	); err != nil {
		return nil, err
	}

	if lastTriggeredAt.Valid {
		v := lastTriggeredAt.Time
		out.LastTriggeredAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so a malformed blob cannot fail the update response.
	if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
			out.Filters = decoded
		}
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if id <= 0 {
|
||||||
|
return fmt.Errorf("invalid id")
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
affected, err := res.RowsAffected()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if affected == 0 {
|
||||||
|
return sql.ErrNoRows
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAlertEvents returns alert events matching the filter, newest first.
// A nil filter lists everything; Limit defaults to 100 and is capped at 500.
func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		filter = &service.OpsAlertEventFilter{}
	}

	limit := filter.Limit
	if limit <= 0 {
		limit = 100
	}
	if limit > 500 {
		limit = 500
	}

	where, args := buildOpsAlertEventsWhere(filter)
	// Append the limit last so its placeholder index follows the filter args.
	args = append(args, limit)
	limitArg := "$" + itoa(len(args))

	q := `
		SELECT
			id,
			COALESCE(rule_id, 0),
			COALESCE(severity, ''),
			COALESCE(status, ''),
			COALESCE(title, ''),
			COALESCE(description, ''),
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at
		FROM ops_alert_events
		` + where + `
		ORDER BY fired_at DESC
		LIMIT ` + limitArg

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := []*service.OpsAlertEvent{}
	for rows.Next() {
		var ev service.OpsAlertEvent
		var metricValue sql.NullFloat64
		var thresholdValue sql.NullFloat64
		var dimensionsRaw []byte
		var resolvedAt sql.NullTime
		// Scan order must match the SELECT column list above.
		if err := rows.Scan(
			&ev.ID,
			&ev.RuleID,
			&ev.Severity,
			&ev.Status,
			&ev.Title,
			&ev.Description,
			&metricValue,
			&thresholdValue,
			&dimensionsRaw,
			&ev.FiredAt,
			&resolvedAt,
			&ev.EmailSent,
			&ev.CreatedAt,
		); err != nil {
			return nil, err
		}
		// Promote nullable columns to optional pointers only when present.
		if metricValue.Valid {
			v := metricValue.Float64
			ev.MetricValue = &v
		}
		if thresholdValue.Valid {
			v := thresholdValue.Float64
			ev.ThresholdValue = &v
		}
		if resolvedAt.Valid {
			v := resolvedAt.Time
			ev.ResolvedAt = &v
		}
		// JSONB NULL scans as the literal "null"; decode errors are ignored
		// so one bad row cannot fail the whole listing.
		if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
			var decoded map[string]any
			if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
				ev.Dimensions = decoded
			}
		}
		out = append(out, &ev)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, fmt.Errorf("invalid rule id")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
COALESCE(rule_id, 0),
|
||||||
|
COALESCE(severity, ''),
|
||||||
|
COALESCE(status, ''),
|
||||||
|
COALESCE(title, ''),
|
||||||
|
COALESCE(description, ''),
|
||||||
|
metric_value,
|
||||||
|
threshold_value,
|
||||||
|
dimensions,
|
||||||
|
fired_at,
|
||||||
|
resolved_at,
|
||||||
|
email_sent,
|
||||||
|
created_at
|
||||||
|
FROM ops_alert_events
|
||||||
|
WHERE rule_id = $1 AND status = $2
|
||||||
|
ORDER BY fired_at DESC
|
||||||
|
LIMIT 1`
|
||||||
|
|
||||||
|
row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring)
|
||||||
|
ev, err := scanOpsAlertEvent(row)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ev, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, fmt.Errorf("invalid rule id")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
COALESCE(rule_id, 0),
|
||||||
|
COALESCE(severity, ''),
|
||||||
|
COALESCE(status, ''),
|
||||||
|
COALESCE(title, ''),
|
||||||
|
COALESCE(description, ''),
|
||||||
|
metric_value,
|
||||||
|
threshold_value,
|
||||||
|
dimensions,
|
||||||
|
fired_at,
|
||||||
|
resolved_at,
|
||||||
|
email_sent,
|
||||||
|
created_at
|
||||||
|
FROM ops_alert_events
|
||||||
|
WHERE rule_id = $1
|
||||||
|
ORDER BY fired_at DESC
|
||||||
|
LIMIT 1`
|
||||||
|
|
||||||
|
row := r.db.QueryRowContext(ctx, q, ruleID)
|
||||||
|
ev, err := scanOpsAlertEvent(row)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ev, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateAlertEvent inserts one alert event and returns the persisted row.
// Optional columns (rule_id, text fields, metric/threshold values,
// resolved_at) are written as NULL when empty/zero via the opsNull* helpers;
// Dimensions are serialized to JSONB (NULL when nil).
func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if event == nil {
		return nil, fmt.Errorf("nil event")
	}

	dimensionsArg, err := opsNullJSONMap(event.Dimensions)
	if err != nil {
		return nil, err
	}

	q := `
		INSERT INTO ops_alert_events (
			rule_id,
			severity,
			status,
			title,
			description,
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at
		) VALUES (
			$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW()
		)
		RETURNING
			id,
			COALESCE(rule_id, 0),
			COALESCE(severity, ''),
			COALESCE(status, ''),
			COALESCE(title, ''),
			COALESCE(description, ''),
			metric_value,
			threshold_value,
			dimensions,
			fired_at,
			resolved_at,
			email_sent,
			created_at`

	// The RETURNING column order matches scanOpsAlertEvent's destinations;
	// argument order must match the VALUES placeholders.
	row := r.db.QueryRowContext(
		ctx,
		q,
		opsNullInt64(&event.RuleID),
		opsNullString(event.Severity),
		opsNullString(event.Status),
		opsNullString(event.Title),
		opsNullString(event.Description),
		opsNullFloat64(event.MetricValue),
		opsNullFloat64(event.ThresholdValue),
		dimensionsArg,
		event.FiredAt,
		opsNullTime(event.ResolvedAt),
		event.EmailSent,
	)
	return scanOpsAlertEvent(row)
}
|
||||||
|
|
||||||
|
func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return fmt.Errorf("invalid event id")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(status) == "" {
|
||||||
|
return fmt.Errorf("invalid status")
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
UPDATE ops_alert_events
|
||||||
|
SET status = $2,
|
||||||
|
resolved_at = $3
|
||||||
|
WHERE id = $1`
|
||||||
|
|
||||||
|
_, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt))
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return fmt.Errorf("invalid event id")
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsAlertEventRow abstracts the Scan method shared by *sql.Row and
// *sql.Rows so scanOpsAlertEvent can decode an ops_alert_events record from
// either a single-row query or a rows iterator.
type opsAlertEventRow interface {
	Scan(dest ...any) error
}
|
||||||
|
|
||||||
|
// scanOpsAlertEvent decodes one ops_alert_events row into a
// service.OpsAlertEvent. The destination order matches the canonical
// 13-column event SELECT used throughout this file; callers must keep that
// column list in sync with this function.
func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) {
	var ev service.OpsAlertEvent
	var metricValue sql.NullFloat64
	var thresholdValue sql.NullFloat64
	var dimensionsRaw []byte
	var resolvedAt sql.NullTime

	if err := row.Scan(
		&ev.ID,
		&ev.RuleID,
		&ev.Severity,
		&ev.Status,
		&ev.Title,
		&ev.Description,
		&metricValue,
		&thresholdValue,
		&dimensionsRaw,
		&ev.FiredAt,
		&resolvedAt,
		&ev.EmailSent,
		&ev.CreatedAt,
	); err != nil {
		return nil, err
	}
	// Promote nullable columns to optional pointers only when present.
	if metricValue.Valid {
		v := metricValue.Float64
		ev.MetricValue = &v
	}
	if thresholdValue.Valid {
		v := thresholdValue.Float64
		ev.ThresholdValue = &v
	}
	if resolvedAt.Valid {
		v := resolvedAt.Time
		ev.ResolvedAt = &v
	}
	// JSONB NULL scans as the literal "null"; decode errors are deliberately
	// ignored so one malformed blob cannot fail the query.
	if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
		var decoded map[string]any
		if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
			ev.Dimensions = decoded
		}
	}
	return &ev, nil
}
|
||||||
|
|
||||||
|
func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) {
|
||||||
|
clauses := []string{"1=1"}
|
||||||
|
args := []any{}
|
||||||
|
|
||||||
|
if filter == nil {
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
if status := strings.TrimSpace(filter.Status); status != "" {
|
||||||
|
args = append(args, status)
|
||||||
|
clauses = append(clauses, "status = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if severity := strings.TrimSpace(filter.Severity); severity != "" {
|
||||||
|
args = append(args, severity)
|
||||||
|
clauses = append(clauses, "severity = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||||
|
args = append(args, *filter.StartTime)
|
||||||
|
clauses = append(clauses, "fired_at >= $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.EndTime != nil && !filter.EndTime.IsZero() {
|
||||||
|
args = append(args, *filter.EndTime)
|
||||||
|
clauses = append(clauses, "fired_at < $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes.
|
||||||
|
if platform := strings.TrimSpace(filter.Platform); platform != "" {
|
||||||
|
args = append(args, platform)
|
||||||
|
clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
args = append(args, fmt.Sprintf("%d", *filter.GroupID))
|
||||||
|
clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args)))
|
||||||
|
}
|
||||||
|
|
||||||
|
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullJSONMap(v map[string]any) (any, error) {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullString{}, nil
|
||||||
|
}
|
||||||
|
b, err := json.Marshal(v)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(b) == 0 {
|
||||||
|
return sql.NullString{}, nil
|
||||||
|
}
|
||||||
|
return sql.NullString{String: string(b), Valid: true}, nil
|
||||||
|
}
|
||||||
1013
backend/internal/repository/ops_repo_dashboard.go
Normal file
1013
backend/internal/repository/ops_repo_dashboard.go
Normal file
File diff suppressed because it is too large
Load Diff
79
backend/internal/repository/ops_repo_histograms.go
Normal file
79
backend/internal/repository/ops_repo_histograms.go
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetLatencyHistogram buckets usage_logs request durations within
// [StartTime, EndTime) into the fixed latency bins and returns them in
// display order, including zero-count buckets. Both window bounds are
// required; rows without a duration are excluded.
func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if filter == nil {
		return nil, fmt.Errorf("nil filter")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, fmt.Errorf("start_time/end_time required")
	}

	start := filter.StartTime.UTC()
	end := filter.EndTime.UTC()

	// buildUsageWhere supplies the JOIN/WHERE fragments and their args;
	// the CASE expressions map duration_ms to a label and a sort key.
	join, where, args, _ := buildUsageWhere(filter, start, end, 1)
	rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms")
	orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms")

	q := `
		SELECT
			` + rangeExpr + ` AS range,
			COALESCE(COUNT(*), 0) AS count,
			` + orderExpr + ` AS ord
		FROM usage_logs ul
		` + join + `
		` + where + `
		AND ul.duration_ms IS NOT NULL
		GROUP BY 1, 3
		ORDER BY 3 ASC`

	rows, err := r.db.QueryContext(ctx, q, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	// Collect counts per label; ord only drives SQL-side ordering.
	counts := make(map[string]int64, len(latencyHistogramOrderedRanges))
	var total int64
	for rows.Next() {
		var label string
		var count int64
		var _ord int
		if err := rows.Scan(&label, &count, &_ord); err != nil {
			return nil, err
		}
		counts[label] = count
		total += count
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	// Emit every configured bucket in canonical order, zero-filling the
	// buckets the query returned no rows for.
	buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges))
	for _, label := range latencyHistogramOrderedRanges {
		buckets = append(buckets, &service.OpsLatencyHistogramBucket{
			Range: label,
			Count: counts[label],
		})
	}

	return &service.OpsLatencyHistogramResponse{
		StartTime:     start,
		EndTime:       end,
		Platform:      strings.TrimSpace(filter.Platform),
		GroupID:       filter.GroupID,
		TotalRequests: total,
		Buckets:       buckets,
	}, nil
}
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// latencyHistogramBucket describes one histogram bin: requests with
// duration_ms strictly below upperMs fall into the bin labelled label.
// upperMs == 0 marks the open-ended default ("everything slower") bin.
type latencyHistogramBucket struct {
	upperMs int    // exclusive upper bound in milliseconds; 0 = catch-all
	label   string // human-readable range label used in query results
}
|
||||||
|
|
||||||
|
// latencyHistogramBuckets defines the fixed latency bins, ordered fastest to
// slowest. The final entry (upperMs == 0) is the catch-all bucket and must
// stay last — the SQL CASE builders rely on that position.
var latencyHistogramBuckets = []latencyHistogramBucket{
	{upperMs: 100, label: "0-100ms"},
	{upperMs: 200, label: "100-200ms"},
	{upperMs: 500, label: "200-500ms"},
	{upperMs: 1000, label: "500-1000ms"},
	{upperMs: 2000, label: "1000-2000ms"},
	{upperMs: 0, label: "2000ms+"}, // default bucket
}
|
||||||
|
|
||||||
|
var latencyHistogramOrderedRanges = func() []string {
|
||||||
|
out := make([]string, 0, len(latencyHistogramBuckets))
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
out = append(out, b.label)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}()
|
||||||
|
|
||||||
|
func latencyHistogramRangeCaseExpr(column string) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
_, _ = sb.WriteString("CASE\n")
|
||||||
|
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
if b.upperMs <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default bucket.
|
||||||
|
last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1]
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label))
|
||||||
|
_, _ = sb.WriteString("END")
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func latencyHistogramRangeOrderCaseExpr(column string) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
_, _ = sb.WriteString("CASE\n")
|
||||||
|
|
||||||
|
order := 1
|
||||||
|
for _, b := range latencyHistogramBuckets {
|
||||||
|
if b.upperMs <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order))
|
||||||
|
order++
|
||||||
|
}
|
||||||
|
|
||||||
|
_, _ = sb.WriteString(fmt.Sprintf("\tELSE %d\n", order))
|
||||||
|
_, _ = sb.WriteString("END")
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) {
|
||||||
|
require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges))
|
||||||
|
for i, b := range latencyHistogramBuckets {
|
||||||
|
require.Equal(t, b.label, latencyHistogramOrderedRanges[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
422
backend/internal/repository/ops_repo_metrics.go
Normal file
422
backend/internal/repository/ops_repo_metrics.go
Normal file
@@ -0,0 +1,422 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InsertSystemMetrics persists one sampling window of system-level metrics
// into ops_system_metrics. Defensive defaults: a non-positive WindowMinutes is
// coerced to 1 and a zero CreatedAt is replaced with the current UTC time.
// Optional fields (pointer-typed in the input) are written as SQL NULL via the
// opsNull* helpers.
func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}

	// Guard against invalid aggregation windows.
	window := input.WindowMinutes
	if window <= 0 {
		window = 1
	}
	// Default the sample timestamp to "now" (UTC) when the caller left it unset.
	createdAt := input.CreatedAt
	if createdAt.IsZero() {
		createdAt = time.Now().UTC()
	}

	// NOTE: the column list below must stay aligned with the positional
	// $1..$39 placeholders AND the ExecContext argument order further down.
	q := `
	INSERT INTO ops_system_metrics (
		created_at,
		window_minutes,
		platform,
		group_id,

		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,

		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,

		token_consumed,
		qps,
		tps,

		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,

		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,

		cpu_usage_percent,
		memory_used_mb,
		memory_total_mb,
		memory_usage_percent,

		db_ok,
		redis_ok,

		redis_conn_total,
		redis_conn_idle,

		db_conn_active,
		db_conn_idle,
		db_conn_waiting,

		goroutine_count,
		concurrency_queue_depth
	) VALUES (
		$1,$2,$3,$4,
		$5,$6,$7,$8,
		$9,$10,$11,
		$12,$13,$14,
		$15,$16,$17,$18,$19,$20,
		$21,$22,$23,$24,$25,$26,
		$27,$28,$29,$30,
		$31,$32,
		$33,$34,
		$35,$36,$37,
		$38,$39
	)`

	// Argument order mirrors the column list above, grouped the same way.
	_, err := r.db.ExecContext(
		ctx,
		q,
		createdAt,
		window,
		opsNullString(input.Platform),
		opsNullInt64(input.GroupID),

		input.SuccessCount,
		input.ErrorCountTotal,
		input.BusinessLimitedCount,
		input.ErrorCountSLA,

		input.UpstreamErrorCountExcl429529,
		input.Upstream429Count,
		input.Upstream529Count,

		input.TokenConsumed,
		opsNullFloat64(input.QPS),
		opsNullFloat64(input.TPS),

		opsNullInt(input.DurationP50Ms),
		opsNullInt(input.DurationP90Ms),
		opsNullInt(input.DurationP95Ms),
		opsNullInt(input.DurationP99Ms),
		opsNullFloat64(input.DurationAvgMs),
		opsNullInt(input.DurationMaxMs),

		opsNullInt(input.TTFTP50Ms),
		opsNullInt(input.TTFTP90Ms),
		opsNullInt(input.TTFTP95Ms),
		opsNullInt(input.TTFTP99Ms),
		opsNullFloat64(input.TTFTAvgMs),
		opsNullInt(input.TTFTMaxMs),

		opsNullFloat64(input.CPUUsagePercent),
		opsNullInt(input.MemoryUsedMB),
		opsNullInt(input.MemoryTotalMB),
		opsNullFloat64(input.MemoryUsagePercent),

		opsNullBool(input.DBOK),
		opsNullBool(input.RedisOK),

		opsNullInt(input.RedisConnTotal),
		opsNullInt(input.RedisConnIdle),

		opsNullInt(input.DBConnActive),
		opsNullInt(input.DBConnIdle),
		opsNullInt(input.DBConnWaiting),

		opsNullInt(input.GoroutineCount),
		opsNullInt(input.ConcurrencyQueueDepth),
	)
	return err
}
|
||||||
|
|
||||||
|
// GetLatestSystemMetrics returns the most recent overall (platform IS NULL and
// group_id IS NULL) system-metrics row for the given window size. A
// non-positive windowMinutes is coerced to 1. Nullable columns are scanned
// through sql.Null* intermediaries and only copied into the snapshot's pointer
// fields when valid, so SQL NULL surfaces as a nil pointer.
// Note: when no row matches, the underlying sql.ErrNoRows is returned as-is.
func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}
	if windowMinutes <= 0 {
		windowMinutes = 1
	}

	// Column order must match the Scan destination order below.
	q := `
	SELECT
		id,
		created_at,
		window_minutes,

		cpu_usage_percent,
		memory_used_mb,
		memory_total_mb,
		memory_usage_percent,

		db_ok,
		redis_ok,

		redis_conn_total,
		redis_conn_idle,

		db_conn_active,
		db_conn_idle,
		db_conn_waiting,

		goroutine_count,
		concurrency_queue_depth
	FROM ops_system_metrics
	WHERE window_minutes = $1
	AND platform IS NULL
	AND group_id IS NULL
	ORDER BY created_at DESC
	LIMIT 1`

	var out service.OpsSystemMetricsSnapshot
	// Intermediate holders for nullable columns.
	var cpu sql.NullFloat64
	var memUsed sql.NullInt64
	var memTotal sql.NullInt64
	var memPct sql.NullFloat64
	var dbOK sql.NullBool
	var redisOK sql.NullBool
	var redisTotal sql.NullInt64
	var redisIdle sql.NullInt64
	var dbActive sql.NullInt64
	var dbIdle sql.NullInt64
	var dbWaiting sql.NullInt64
	var goroutines sql.NullInt64
	var queueDepth sql.NullInt64

	if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan(
		&out.ID,
		&out.CreatedAt,
		&out.WindowMinutes,
		&cpu,
		&memUsed,
		&memTotal,
		&memPct,
		&dbOK,
		&redisOK,
		&redisTotal,
		&redisIdle,
		&dbActive,
		&dbIdle,
		&dbWaiting,
		&goroutines,
		&queueDepth,
	); err != nil {
		return nil, err
	}

	// Promote valid nullable values into the snapshot's pointer fields.
	if cpu.Valid {
		v := cpu.Float64
		out.CPUUsagePercent = &v
	}
	if memUsed.Valid {
		v := memUsed.Int64
		out.MemoryUsedMB = &v
	}
	if memTotal.Valid {
		v := memTotal.Int64
		out.MemoryTotalMB = &v
	}
	if memPct.Valid {
		v := memPct.Float64
		out.MemoryUsagePercent = &v
	}
	if dbOK.Valid {
		v := dbOK.Bool
		out.DBOK = &v
	}
	if redisOK.Valid {
		v := redisOK.Bool
		out.RedisOK = &v
	}
	if redisTotal.Valid {
		v := int(redisTotal.Int64)
		out.RedisConnTotal = &v
	}
	if redisIdle.Valid {
		v := int(redisIdle.Int64)
		out.RedisConnIdle = &v
	}
	if dbActive.Valid {
		v := int(dbActive.Int64)
		out.DBConnActive = &v
	}
	if dbIdle.Valid {
		v := int(dbIdle.Int64)
		out.DBConnIdle = &v
	}
	if dbWaiting.Valid {
		v := int(dbWaiting.Int64)
		out.DBConnWaiting = &v
	}
	if goroutines.Valid {
		v := int(goroutines.Int64)
		out.GoroutineCount = &v
	}
	if queueDepth.Valid {
		v := int(queueDepth.Int64)
		out.ConcurrencyQueueDepth = &v
	}

	return &out, nil
}
|
||||||
|
|
||||||
|
// UpsertJobHeartbeat records the latest run state for a named background job
// in ops_job_heartbeats (keyed by job_name).
//
// Merge semantics on conflict:
//   - last_run_at / last_success_at / last_duration_ms: new value wins only
//     when non-NULL (COALESCE keeps the stored value otherwise).
//   - last_error_at / last_error: a heartbeat that carries a success timestamp
//     clears the stored error state; otherwise non-NULL new values win.
func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if input == nil {
		return fmt.Errorf("nil input")
	}
	if input.JobName == "" {
		return fmt.Errorf("job_name required")
	}

	q := `
	INSERT INTO ops_job_heartbeats (
		job_name,
		last_run_at,
		last_success_at,
		last_error_at,
		last_error,
		last_duration_ms,
		updated_at
	) VALUES (
		$1,$2,$3,$4,$5,$6,NOW()
	)
	ON CONFLICT (job_name) DO UPDATE SET
		last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at),
		last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at),
		last_error_at = CASE
			WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
			ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at)
		END,
		last_error = CASE
			WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
			ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error)
		END,
		last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms),
		updated_at = NOW()`

	// Optional fields become SQL NULL via the opsNull* helpers so the
	// COALESCE/CASE logic above can distinguish "not provided" from a value.
	_, err := r.db.ExecContext(
		ctx,
		q,
		input.JobName,
		opsNullTime(input.LastRunAt),
		opsNullTime(input.LastSuccessAt),
		opsNullTime(input.LastErrorAt),
		opsNullString(input.LastError),
		opsNullInt(input.LastDurationMs),
	)
	return err
}
|
||||||
|
|
||||||
|
// ListJobHeartbeats returns all recorded background-job heartbeats ordered by
// job name. Nullable columns are scanned through sql.Null* intermediaries and
// surfaced as nil pointers on the returned items when NULL. An empty table
// yields an empty (non-nil) slice.
func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) {
	if r == nil || r.db == nil {
		return nil, fmt.Errorf("nil ops repository")
	}

	// Column order must match the Scan destination order below.
	q := `
	SELECT
		job_name,
		last_run_at,
		last_success_at,
		last_error_at,
		last_error,
		last_duration_ms,
		updated_at
	FROM ops_job_heartbeats
	ORDER BY job_name ASC`

	rows, err := r.db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()

	out := make([]*service.OpsJobHeartbeat, 0, 8)
	for rows.Next() {
		var item service.OpsJobHeartbeat
		// Nullable-column holders for this row.
		var lastRun sql.NullTime
		var lastSuccess sql.NullTime
		var lastErrorAt sql.NullTime
		var lastError sql.NullString
		var lastDuration sql.NullInt64

		if err := rows.Scan(
			&item.JobName,
			&lastRun,
			&lastSuccess,
			&lastErrorAt,
			&lastError,
			&lastDuration,
			&item.UpdatedAt,
		); err != nil {
			return nil, err
		}

		// Promote valid nullable values into the item's pointer fields.
		if lastRun.Valid {
			v := lastRun.Time
			item.LastRunAt = &v
		}
		if lastSuccess.Valid {
			v := lastSuccess.Time
			item.LastSuccessAt = &v
		}
		if lastErrorAt.Valid {
			v := lastErrorAt.Time
			item.LastErrorAt = &v
		}
		if lastError.Valid {
			v := lastError.String
			item.LastError = &v
		}
		if lastDuration.Valid {
			v := lastDuration.Int64
			item.LastDurationMs = &v
		}

		out = append(out, &item)
	}
	// Surface any iteration error (e.g. connection loss mid-scan).
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
|
||||||
|
|
||||||
|
func opsNullBool(v *bool) any {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullBool{}
|
||||||
|
}
|
||||||
|
return sql.NullBool{Bool: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullFloat64(v *float64) any {
|
||||||
|
if v == nil {
|
||||||
|
return sql.NullFloat64{}
|
||||||
|
}
|
||||||
|
return sql.NullFloat64{Float64: *v, Valid: true}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsNullTime(v *time.Time) any {
|
||||||
|
if v == nil || v.IsZero() {
|
||||||
|
return sql.NullTime{}
|
||||||
|
}
|
||||||
|
return sql.NullTime{Time: *v, Valid: true}
|
||||||
|
}
|
||||||
359
backend/internal/repository/ops_repo_preagg.go
Normal file
359
backend/internal/repository/ops_repo_preagg.go
Normal file
@@ -0,0 +1,359 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpsertHourlyMetrics recomputes hourly request/error/latency aggregates for
// the half-open window [startTime, endTime) and upserts them into
// ops_metrics_hourly. Zero or inverted windows are a no-op (nil error).
// Timestamps are normalized to UTC before being bound as $1/$2.
func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	// NOTE:
	// - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly.
	// - We emit three dimension granularities via GROUPING SETS:
	//   1) overall:  (bucket_start)
	//   2) platform: (bucket_start, platform)
	//   3) group:    (bucket_start, platform, group_id)
	//
	// IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based
	// unique index; our ON CONFLICT target must match that expression set.
	q := `
	WITH usage_base AS (
		SELECT
			date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			g.platform AS platform,
			ul.group_id AS group_id,
			ul.duration_ms AS duration_ms,
			ul.first_token_ms AS first_token_ms,
			(ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens
		FROM usage_logs ul
		JOIN groups g ON g.id = ul.group_id
		WHERE ul.created_at >= $1 AND ul.created_at < $2
	),
	usage_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) AS success_count,
			COALESCE(SUM(tokens), 0) AS token_consumed,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms,
			AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms,
			MAX(duration_ms) AS duration_max_ms,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms,
			AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms,
			MAX(first_token_ms) AS ttft_max_ms
		FROM usage_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
	),
	error_base AS (
		SELECT
			date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			platform AS platform,
			group_id AS group_id,
			is_business_limited AS is_business_limited,
			error_owner AS error_owner,
			status_code AS client_status_code,
			COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
		FROM ops_error_logs
		WHERE created_at >= $1 AND created_at < $2
	),
	error_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
		FROM error_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
		HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL
	),
	combined AS (
		SELECT
			COALESCE(u.bucket_start, e.bucket_start) AS bucket_start,
			COALESCE(u.platform, e.platform) AS platform,
			COALESCE(u.group_id, e.group_id) AS group_id,

			COALESCE(u.success_count, 0) AS success_count,
			COALESCE(e.error_count_total, 0) AS error_count_total,
			COALESCE(e.business_limited_count, 0) AS business_limited_count,
			COALESCE(e.error_count_sla, 0) AS error_count_sla,
			COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529,
			COALESCE(e.upstream_429_count, 0) AS upstream_429_count,
			COALESCE(e.upstream_529_count, 0) AS upstream_529_count,

			COALESCE(u.token_consumed, 0) AS token_consumed,

			u.duration_p50_ms,
			u.duration_p90_ms,
			u.duration_p95_ms,
			u.duration_p99_ms,
			u.duration_avg_ms,
			u.duration_max_ms,

			u.ttft_p50_ms,
			u.ttft_p90_ms,
			u.ttft_p95_ms,
			u.ttft_p99_ms,
			u.ttft_avg_ms,
			u.ttft_max_ms
		FROM usage_agg u
		FULL OUTER JOIN error_agg e
			ON u.bucket_start = e.bucket_start
			AND COALESCE(u.platform, '') = COALESCE(e.platform, '')
			AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0)
	)
	INSERT INTO ops_metrics_hourly (
		bucket_start,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		bucket_start,
		NULLIF(platform, '') AS platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms::int,
		duration_p90_ms::int,
		duration_p95_ms::int,
		duration_p99_ms::int,
		duration_avg_ms,
		duration_max_ms::int,
		ttft_p50_ms::int,
		ttft_p90_ms::int,
		ttft_p95_ms::int,
		ttft_p99_ms::int,
		ttft_avg_ms,
		ttft_max_ms::int,
		NOW()
	FROM combined
	WHERE bucket_start IS NOT NULL
	AND (platform IS NULL OR platform <> '')
	ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
|
||||||
|
|
||||||
|
// UpsertDailyMetrics rolls the hourly pre-aggregates in ops_metrics_hourly up
// into ops_metrics_daily for buckets in the half-open window
// [startTime, endTime). Zero or inverted windows are a no-op (nil error).
// Counts/tokens are summed; percentile columns cannot be combined exactly, so
// the SQL uses an explicit approximation (see the comment in the query).
func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	q := `
	INSERT INTO ops_metrics_daily (
		bucket_date,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		(bucket_start AT TIME ZONE 'UTC')::date AS bucket_date,
		platform,
		group_id,

		COALESCE(SUM(success_count), 0) AS success_count,
		COALESCE(SUM(error_count_total), 0) AS error_count_total,
		COALESCE(SUM(business_limited_count), 0) AS business_limited_count,
		COALESCE(SUM(error_count_sla), 0) AS error_count_sla,
		COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529,
		COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count,
		COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count,
		COALESCE(SUM(token_consumed), 0) AS token_consumed,

		-- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail).
		ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms,
		ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms,
		MAX(duration_p95_ms) AS duration_p95_ms,
		MAX(duration_p99_ms) AS duration_p99_ms,
		SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms,
		MAX(duration_max_ms) AS duration_max_ms,

		ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms,
		ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms,
		MAX(ttft_p95_ms) AS ttft_p95_ms,
		MAX(ttft_p99_ms) AS ttft_p99_ms,
		SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms,
		MAX(ttft_max_ms) AS ttft_max_ms,

		NOW()
	FROM ops_metrics_hourly
	WHERE bucket_start >= $1 AND bucket_start < $2
	GROUP BY 1, 2, 3
	ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return time.Time{}, false, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
var value sql.NullTime
|
||||||
|
if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil {
|
||||||
|
return time.Time{}, false, err
|
||||||
|
}
|
||||||
|
if !value.Valid {
|
||||||
|
return time.Time{}, false, nil
|
||||||
|
}
|
||||||
|
return value.Time.UTC(), true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return time.Time{}, false, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
var value sql.NullTime
|
||||||
|
if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil {
|
||||||
|
return time.Time{}, false, err
|
||||||
|
}
|
||||||
|
if !value.Valid {
|
||||||
|
return time.Time{}, false, nil
|
||||||
|
}
|
||||||
|
t := value.Time.UTC()
|
||||||
|
return time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil
|
||||||
|
}
|
||||||
286
backend/internal/repository/ops_repo_request_details.go
Normal file
286
backend/internal/repository/ops_repo_request_details.go
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, 0, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize, startTime, endTime := filter.Normalize()
|
||||||
|
offset := (page - 1) * pageSize
|
||||||
|
|
||||||
|
conditions := make([]string, 0, 16)
|
||||||
|
args := make([]any, 0, 24)
|
||||||
|
|
||||||
|
// Placeholders $1/$2 reserved for time window inside the CTE.
|
||||||
|
args = append(args, startTime.UTC(), endTime.UTC())
|
||||||
|
|
||||||
|
addCondition := func(condition string, values ...any) {
|
||||||
|
conditions = append(conditions, condition)
|
||||||
|
args = append(args, values...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter != nil {
|
||||||
|
if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" {
|
||||||
|
if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) {
|
||||||
|
return nil, 0, fmt.Errorf("invalid kind")
|
||||||
|
}
|
||||||
|
addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind)
|
||||||
|
}
|
||||||
|
|
||||||
|
if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" {
|
||||||
|
addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform)
|
||||||
|
}
|
||||||
|
if filter.GroupID != nil && *filter.GroupID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.UserID != nil && *filter.UserID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID)
|
||||||
|
}
|
||||||
|
if filter.APIKeyID != nil && *filter.APIKeyID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID)
|
||||||
|
}
|
||||||
|
if filter.AccountID != nil && *filter.AccountID > 0 {
|
||||||
|
addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if model := strings.TrimSpace(filter.Model); model != "" {
|
||||||
|
addCondition(fmt.Sprintf("model = $%d", len(args)+1), model)
|
||||||
|
}
|
||||||
|
if requestID := strings.TrimSpace(filter.RequestID); requestID != "" {
|
||||||
|
addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID)
|
||||||
|
}
|
||||||
|
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||||
|
like := "%" + strings.ToLower(q) + "%"
|
||||||
|
startIdx := len(args) + 1
|
||||||
|
addCondition(
|
||||||
|
fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)",
|
||||||
|
startIdx, startIdx+1, startIdx+2,
|
||||||
|
),
|
||||||
|
like, like, like,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if filter.MinDurationMs != nil {
|
||||||
|
addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs)
|
||||||
|
}
|
||||||
|
if filter.MaxDurationMs != nil {
|
||||||
|
addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
where := ""
|
||||||
|
if len(conditions) > 0 {
|
||||||
|
where = "WHERE " + strings.Join(conditions, " AND ")
|
||||||
|
}
|
||||||
|
|
||||||
|
cte := `
|
||||||
|
WITH combined AS (
|
||||||
|
SELECT
|
||||||
|
'success'::TEXT AS kind,
|
||||||
|
ul.created_at AS created_at,
|
||||||
|
ul.request_id AS request_id,
|
||||||
|
COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
|
||||||
|
ul.model AS model,
|
||||||
|
ul.duration_ms AS duration_ms,
|
||||||
|
NULL::INT AS status_code,
|
||||||
|
NULL::BIGINT AS error_id,
|
||||||
|
NULL::TEXT AS phase,
|
||||||
|
NULL::TEXT AS severity,
|
||||||
|
NULL::TEXT AS message,
|
||||||
|
ul.user_id AS user_id,
|
||||||
|
ul.api_key_id AS api_key_id,
|
||||||
|
ul.account_id AS account_id,
|
||||||
|
ul.group_id AS group_id,
|
||||||
|
ul.stream AS stream
|
||||||
|
FROM usage_logs ul
|
||||||
|
LEFT JOIN groups g ON g.id = ul.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = ul.account_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'error'::TEXT AS kind,
|
||||||
|
o.created_at AS created_at,
|
||||||
|
COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id,
|
||||||
|
COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
|
||||||
|
o.model AS model,
|
||||||
|
o.duration_ms AS duration_ms,
|
||||||
|
o.status_code AS status_code,
|
||||||
|
o.id AS error_id,
|
||||||
|
o.error_phase AS phase,
|
||||||
|
o.severity AS severity,
|
||||||
|
o.error_message AS message,
|
||||||
|
o.user_id AS user_id,
|
||||||
|
o.api_key_id AS api_key_id,
|
||||||
|
o.account_id AS account_id,
|
||||||
|
o.group_id AS group_id,
|
||||||
|
o.stream AS stream
|
||||||
|
FROM ops_error_logs o
|
||||||
|
LEFT JOIN groups g ON g.id = o.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = o.account_id
|
||||||
|
WHERE o.created_at >= $1 AND o.created_at < $2
|
||||||
|
AND COALESCE(o.status_code, 0) >= 400
|
||||||
|
)
|
||||||
|
`
|
||||||
|
|
||||||
|
countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where)
|
||||||
|
var total int64
|
||||||
|
if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
total = 0
|
||||||
|
} else {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sort := "ORDER BY created_at DESC"
|
||||||
|
if filter != nil {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(filter.Sort)) {
|
||||||
|
case "", "created_at_desc":
|
||||||
|
// default
|
||||||
|
case "duration_desc":
|
||||||
|
sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC"
|
||||||
|
default:
|
||||||
|
return nil, 0, fmt.Errorf("invalid sort")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
listQuery := fmt.Sprintf(`
|
||||||
|
%s
|
||||||
|
SELECT
|
||||||
|
kind,
|
||||||
|
created_at,
|
||||||
|
request_id,
|
||||||
|
platform,
|
||||||
|
model,
|
||||||
|
duration_ms,
|
||||||
|
status_code,
|
||||||
|
error_id,
|
||||||
|
phase,
|
||||||
|
severity,
|
||||||
|
message,
|
||||||
|
user_id,
|
||||||
|
api_key_id,
|
||||||
|
account_id,
|
||||||
|
group_id,
|
||||||
|
stream
|
||||||
|
FROM combined
|
||||||
|
%s
|
||||||
|
%s
|
||||||
|
LIMIT $%d OFFSET $%d
|
||||||
|
`, cte, where, sort, len(args)+1, len(args)+2)
|
||||||
|
|
||||||
|
listArgs := append(append([]any{}, args...), pageSize, offset)
|
||||||
|
rows, err := r.db.QueryContext(ctx, listQuery, listArgs...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
toIntPtr := func(v sql.NullInt64) *int {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
i := int(v.Int64)
|
||||||
|
return &i
|
||||||
|
}
|
||||||
|
toInt64Ptr := func(v sql.NullInt64) *int64 {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
i := v.Int64
|
||||||
|
return &i
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsRequestDetail, 0, pageSize)
|
||||||
|
for rows.Next() {
|
||||||
|
var (
|
||||||
|
kind string
|
||||||
|
createdAt time.Time
|
||||||
|
requestID sql.NullString
|
||||||
|
platform sql.NullString
|
||||||
|
model sql.NullString
|
||||||
|
|
||||||
|
durationMs sql.NullInt64
|
||||||
|
statusCode sql.NullInt64
|
||||||
|
errorID sql.NullInt64
|
||||||
|
|
||||||
|
phase sql.NullString
|
||||||
|
severity sql.NullString
|
||||||
|
message sql.NullString
|
||||||
|
|
||||||
|
userID sql.NullInt64
|
||||||
|
apiKeyID sql.NullInt64
|
||||||
|
accountID sql.NullInt64
|
||||||
|
groupID sql.NullInt64
|
||||||
|
|
||||||
|
stream bool
|
||||||
|
)
|
||||||
|
|
||||||
|
if err := rows.Scan(
|
||||||
|
&kind,
|
||||||
|
&createdAt,
|
||||||
|
&requestID,
|
||||||
|
&platform,
|
||||||
|
&model,
|
||||||
|
&durationMs,
|
||||||
|
&statusCode,
|
||||||
|
&errorID,
|
||||||
|
&phase,
|
||||||
|
&severity,
|
||||||
|
&message,
|
||||||
|
&userID,
|
||||||
|
&apiKeyID,
|
||||||
|
&accountID,
|
||||||
|
&groupID,
|
||||||
|
&stream,
|
||||||
|
); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
item := &service.OpsRequestDetail{
|
||||||
|
Kind: service.OpsRequestKind(kind),
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
RequestID: strings.TrimSpace(requestID.String),
|
||||||
|
Platform: strings.TrimSpace(platform.String),
|
||||||
|
Model: strings.TrimSpace(model.String),
|
||||||
|
|
||||||
|
DurationMs: toIntPtr(durationMs),
|
||||||
|
StatusCode: toIntPtr(statusCode),
|
||||||
|
ErrorID: toInt64Ptr(errorID),
|
||||||
|
Phase: phase.String,
|
||||||
|
Severity: severity.String,
|
||||||
|
Message: message.String,
|
||||||
|
|
||||||
|
UserID: toInt64Ptr(userID),
|
||||||
|
APIKeyID: toInt64Ptr(apiKeyID),
|
||||||
|
AccountID: toInt64Ptr(accountID),
|
||||||
|
GroupID: toInt64Ptr(groupID),
|
||||||
|
|
||||||
|
Stream: stream,
|
||||||
|
}
|
||||||
|
|
||||||
|
if item.Platform == "" {
|
||||||
|
item.Platform = "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
out = append(out, item)
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, total, nil
|
||||||
|
}
|
||||||
571
backend/internal/repository/ops_repo_trends.go
Normal file
571
backend/internal/repository/ops_repo_trends.go
Normal file
@@ -0,0 +1,571 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
|
||||||
|
// Keep a small, predictable set of supported buckets for now.
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
|
||||||
|
usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1)
|
||||||
|
errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next)
|
||||||
|
|
||||||
|
usageBucketExpr := opsBucketExprForUsage(bucketSeconds)
|
||||||
|
errorBucketExpr := opsBucketExprForError(bucketSeconds)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
WITH usage_buckets AS (
|
||||||
|
SELECT ` + usageBucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
` + usageJoin + `
|
||||||
|
` + usageWhere + `
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
error_buckets AS (
|
||||||
|
SELECT ` + errorBucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + errorWhere + `
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.bucket, e.bucket) AS bucket,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_buckets u
|
||||||
|
FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
bucket,
|
||||||
|
(success_count + error_count) AS request_count,
|
||||||
|
token_consumed
|
||||||
|
FROM combined
|
||||||
|
ORDER BY bucket ASC`
|
||||||
|
|
||||||
|
args := append(usageArgs, errorArgs...)
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
points := make([]*service.OpsThroughputTrendPoint, 0, 256)
|
||||||
|
for rows.Next() {
|
||||||
|
var bucket time.Time
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&bucket, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
|
||||||
|
denom := float64(bucketSeconds)
|
||||||
|
if denom <= 0 {
|
||||||
|
denom = 60
|
||||||
|
}
|
||||||
|
qps := roundTo1DP(float64(requests) / denom)
|
||||||
|
tps := roundTo1DP(float64(tokenConsumed) / denom)
|
||||||
|
|
||||||
|
points = append(points, &service.OpsThroughputTrendPoint{
|
||||||
|
BucketStart: bucket.UTC(),
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
QPS: qps,
|
||||||
|
TPS: tps,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fill missing buckets with zeros so charts render continuous timelines.
|
||||||
|
points = fillOpsThroughputBuckets(start, end, bucketSeconds, points)
|
||||||
|
|
||||||
|
var byPlatform []*service.OpsThroughputPlatformBreakdownItem
|
||||||
|
var topGroups []*service.OpsThroughputGroupBreakdownItem
|
||||||
|
|
||||||
|
platform := ""
|
||||||
|
if filter != nil {
|
||||||
|
platform = strings.TrimSpace(strings.ToLower(filter.Platform))
|
||||||
|
}
|
||||||
|
groupID := (*int64)(nil)
|
||||||
|
if filter != nil {
|
||||||
|
groupID = filter.GroupID
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drilldown helpers:
|
||||||
|
// - No platform/group: totals by platform
|
||||||
|
// - Platform selected but no group: top groups in that platform
|
||||||
|
if platform == "" && (groupID == nil || *groupID <= 0) {
|
||||||
|
items, err := r.getThroughputBreakdownByPlatform(ctx, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
byPlatform = items
|
||||||
|
} else if platform != "" && (groupID == nil || *groupID <= 0) {
|
||||||
|
items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
topGroups = items
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsThroughputTrendResponse{
|
||||||
|
Bucket: opsBucketLabel(bucketSeconds),
|
||||||
|
Points: points,
|
||||||
|
|
||||||
|
ByPlatform: byPlatform,
|
||||||
|
TopGroups: topGroups,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) {
|
||||||
|
q := `
|
||||||
|
WITH usage_totals AS (
|
||||||
|
SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
LEFT JOIN groups g ON g.id = ul.group_id
|
||||||
|
LEFT JOIN accounts a ON a.id = ul.account_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
error_totals AS (
|
||||||
|
SELECT platform,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.platform, e.platform) AS platform,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_totals u
|
||||||
|
FULL OUTER JOIN error_totals e ON u.platform = e.platform
|
||||||
|
)
|
||||||
|
SELECT platform, (success_count + error_count) AS request_count, token_consumed
|
||||||
|
FROM combined
|
||||||
|
WHERE platform IS NOT NULL AND platform <> ''
|
||||||
|
ORDER BY request_count DESC`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8)
|
||||||
|
for rows.Next() {
|
||||||
|
var platform string
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&platform, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
items = append(items, &service.OpsThroughputPlatformBreakdownItem{
|
||||||
|
Platform: platform,
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return items, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) {
|
||||||
|
if strings.TrimSpace(platform) == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if limit <= 0 || limit > 100 {
|
||||||
|
limit = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
q := `
|
||||||
|
WITH usage_totals AS (
|
||||||
|
SELECT ul.group_id AS group_id,
|
||||||
|
g.name AS group_name,
|
||||||
|
COUNT(*) AS success_count,
|
||||||
|
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||||
|
FROM usage_logs ul
|
||||||
|
JOIN groups g ON g.id = ul.group_id
|
||||||
|
WHERE ul.created_at >= $1 AND ul.created_at < $2
|
||||||
|
AND g.platform = $3
|
||||||
|
GROUP BY 1, 2
|
||||||
|
),
|
||||||
|
error_totals AS (
|
||||||
|
SELECT group_id,
|
||||||
|
COUNT(*) AS error_count
|
||||||
|
FROM ops_error_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND platform = $3
|
||||||
|
AND group_id IS NOT NULL
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
),
|
||||||
|
combined AS (
|
||||||
|
SELECT COALESCE(u.group_id, e.group_id) AS group_id,
|
||||||
|
COALESCE(u.group_name, g2.name, '') AS group_name,
|
||||||
|
COALESCE(u.success_count, 0) AS success_count,
|
||||||
|
COALESCE(e.error_count, 0) AS error_count,
|
||||||
|
COALESCE(u.token_consumed, 0) AS token_consumed
|
||||||
|
FROM usage_totals u
|
||||||
|
FULL OUTER JOIN error_totals e ON u.group_id = e.group_id
|
||||||
|
LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id)
|
||||||
|
)
|
||||||
|
SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed
|
||||||
|
FROM combined
|
||||||
|
WHERE group_id IS NOT NULL
|
||||||
|
ORDER BY request_count DESC
|
||||||
|
LIMIT $4`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit)
|
||||||
|
for rows.Next() {
|
||||||
|
var groupID int64
|
||||||
|
var groupName sql.NullString
|
||||||
|
var requests int64
|
||||||
|
var tokens sql.NullInt64
|
||||||
|
if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenConsumed := int64(0)
|
||||||
|
if tokens.Valid {
|
||||||
|
tokenConsumed = tokens.Int64
|
||||||
|
}
|
||||||
|
name := ""
|
||||||
|
if groupName.Valid {
|
||||||
|
name = groupName.String
|
||||||
|
}
|
||||||
|
items = append(items, &service.OpsThroughputGroupBreakdownItem{
|
||||||
|
GroupID: groupID,
|
||||||
|
GroupName: name,
|
||||||
|
RequestCount: requests,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return items, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketExprForUsage(bucketSeconds int) string {
|
||||||
|
switch bucketSeconds {
|
||||||
|
case 3600:
|
||||||
|
return "date_trunc('hour', ul.created_at)"
|
||||||
|
case 300:
|
||||||
|
// 5-minute buckets in UTC.
|
||||||
|
return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)"
|
||||||
|
default:
|
||||||
|
return "date_trunc('minute', ul.created_at)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketExprForError(bucketSeconds int) string {
|
||||||
|
switch bucketSeconds {
|
||||||
|
case 3600:
|
||||||
|
return "date_trunc('hour', created_at)"
|
||||||
|
case 300:
|
||||||
|
return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)"
|
||||||
|
default:
|
||||||
|
return "date_trunc('minute', created_at)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsBucketLabel(bucketSeconds int) string {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
return "1m"
|
||||||
|
}
|
||||||
|
if bucketSeconds%3600 == 0 {
|
||||||
|
h := bucketSeconds / 3600
|
||||||
|
if h <= 0 {
|
||||||
|
h = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dh", h)
|
||||||
|
}
|
||||||
|
m := bucketSeconds / 60
|
||||||
|
if m <= 0 {
|
||||||
|
m = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dm", m)
|
||||||
|
}
|
||||||
|
|
||||||
|
func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time {
|
||||||
|
t = t.UTC()
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
secs := t.Unix()
|
||||||
|
floored := secs - (secs % int64(bucketSeconds))
|
||||||
|
return time.Unix(floored, 0).UTC()
|
||||||
|
}
|
||||||
|
|
||||||
|
func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if !start.Before(end) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
endMinus := end.Add(-time.Nanosecond)
|
||||||
|
if endMinus.Before(start) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
first := opsFloorToBucketStart(start, bucketSeconds)
|
||||||
|
last := opsFloorToBucketStart(endMinus, bucketSeconds)
|
||||||
|
step := time.Duration(bucketSeconds) * time.Second
|
||||||
|
|
||||||
|
existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points))
|
||||||
|
for _, p := range points {
|
||||||
|
if p == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
existing[p.BucketStart.UTC().Unix()] = p
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1)
|
||||||
|
for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
|
||||||
|
if p, ok := existing[cursor.Unix()]; ok && p != nil {
|
||||||
|
out = append(out, p)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, &service.OpsThroughputTrendPoint{
|
||||||
|
BucketStart: cursor,
|
||||||
|
RequestCount: 0,
|
||||||
|
TokenConsumed: 0,
|
||||||
|
QPS: 0,
|
||||||
|
TPS: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
where, args, _ := buildErrorWhere(filter, start, end, 1)
|
||||||
|
bucketExpr := opsBucketExprForError(bucketSeconds)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
` + bucketExpr + ` AS bucket,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited,
|
||||||
|
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429,
|
||||||
|
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + where + `
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY 1 ASC`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
points := make([]*service.OpsErrorTrendPoint, 0, 256)
|
||||||
|
for rows.Next() {
|
||||||
|
var bucket time.Time
|
||||||
|
var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64
|
||||||
|
if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
points = append(points, &service.OpsErrorTrendPoint{
|
||||||
|
BucketStart: bucket.UTC(),
|
||||||
|
|
||||||
|
ErrorCountTotal: total,
|
||||||
|
BusinessLimitedCount: businessLimited,
|
||||||
|
ErrorCountSLA: sla,
|
||||||
|
|
||||||
|
UpstreamErrorCountExcl429529: upstreamExcl,
|
||||||
|
Upstream429Count: upstream429,
|
||||||
|
Upstream529Count: upstream529,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points)
|
||||||
|
|
||||||
|
return &service.OpsErrorTrendResponse{
|
||||||
|
Bucket: opsBucketLabel(bucketSeconds),
|
||||||
|
Points: points,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint {
|
||||||
|
if bucketSeconds <= 0 {
|
||||||
|
bucketSeconds = 60
|
||||||
|
}
|
||||||
|
if !start.Before(end) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
endMinus := end.Add(-time.Nanosecond)
|
||||||
|
if endMinus.Before(start) {
|
||||||
|
return points
|
||||||
|
}
|
||||||
|
|
||||||
|
first := opsFloorToBucketStart(start, bucketSeconds)
|
||||||
|
last := opsFloorToBucketStart(endMinus, bucketSeconds)
|
||||||
|
step := time.Duration(bucketSeconds) * time.Second
|
||||||
|
|
||||||
|
existing := make(map[int64]*service.OpsErrorTrendPoint, len(points))
|
||||||
|
for _, p := range points {
|
||||||
|
if p == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
existing[p.BucketStart.UTC().Unix()] = p
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1)
|
||||||
|
for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
|
||||||
|
if p, ok := existing[cursor.Unix()]; ok && p != nil {
|
||||||
|
out = append(out, p)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, &service.OpsErrorTrendPoint{
|
||||||
|
BucketStart: cursor,
|
||||||
|
|
||||||
|
ErrorCountTotal: 0,
|
||||||
|
BusinessLimitedCount: 0,
|
||||||
|
ErrorCountSLA: 0,
|
||||||
|
|
||||||
|
UpstreamErrorCountExcl429529: 0,
|
||||||
|
Upstream429Count: 0,
|
||||||
|
Upstream529Count: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
where, args, _ := buildErrorWhere(filter, start, end, 1)
|
||||||
|
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
COALESCE(upstream_status_code, status_code, 0) AS status_code,
|
||||||
|
COUNT(*) AS total,
|
||||||
|
COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
|
||||||
|
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
|
||||||
|
FROM ops_error_logs
|
||||||
|
` + where + `
|
||||||
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY total DESC
|
||||||
|
LIMIT 20`
|
||||||
|
|
||||||
|
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
|
||||||
|
items := make([]*service.OpsErrorDistributionItem, 0, 16)
|
||||||
|
var total int64
|
||||||
|
for rows.Next() {
|
||||||
|
var statusCode int
|
||||||
|
var cntTotal, cntSLA, cntBiz int64
|
||||||
|
if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
total += cntTotal
|
||||||
|
items = append(items, &service.OpsErrorDistributionItem{
|
||||||
|
StatusCode: statusCode,
|
||||||
|
Total: cntTotal,
|
||||||
|
SLA: cntSLA,
|
||||||
|
BusinessLimited: cntBiz,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsErrorDistributionResponse{
|
||||||
|
Total: total,
|
||||||
|
Items: items,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
50
backend/internal/repository/ops_repo_window_stats.go
Normal file
50
backend/internal/repository/ops_repo_window_stats.go
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) {
|
||||||
|
if r == nil || r.db == nil {
|
||||||
|
return nil, fmt.Errorf("nil ops repository")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, fmt.Errorf("nil filter")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, fmt.Errorf("start_time/end_time required")
|
||||||
|
}
|
||||||
|
|
||||||
|
start := filter.StartTime.UTC()
|
||||||
|
end := filter.EndTime.UTC()
|
||||||
|
if start.After(end) {
|
||||||
|
return nil, fmt.Errorf("start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
// Bound excessively large windows to prevent accidental heavy queries.
|
||||||
|
if end.Sub(start) > 24*time.Hour {
|
||||||
|
return nil, fmt.Errorf("window too large")
|
||||||
|
}
|
||||||
|
|
||||||
|
successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &service.OpsWindowStats{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
|
||||||
|
SuccessCount: successCount,
|
||||||
|
ErrorCountTotal: errorTotal,
|
||||||
|
TokenConsumed: tokenConsumed,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
@@ -204,7 +204,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
|
|||||||
|
|
||||||
userToday := mustCreateUser(s.T(), s.client, &service.User{
|
userToday := mustCreateUser(s.T(), s.client, &service.User{
|
||||||
Email: "today@example.com",
|
Email: "today@example.com",
|
||||||
CreatedAt: maxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
|
CreatedAt: testMaxTime(todayStart.Add(10*time.Second), now.Add(-10*time.Second)),
|
||||||
UpdatedAt: now,
|
UpdatedAt: now,
|
||||||
})
|
})
|
||||||
userOld := mustCreateUser(s.T(), s.client, &service.User{
|
userOld := mustCreateUser(s.T(), s.client, &service.User{
|
||||||
@@ -237,7 +237,7 @@ func (s *UsageLogRepoSuite) TestDashboardStats_TodayTotalsAndPerformance() {
|
|||||||
TotalCost: 1.5,
|
TotalCost: 1.5,
|
||||||
ActualCost: 1.2,
|
ActualCost: 1.2,
|
||||||
DurationMs: &d1,
|
DurationMs: &d1,
|
||||||
CreatedAt: maxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
|
CreatedAt: testMaxTime(todayStart.Add(2*time.Minute), now.Add(-2*time.Minute)),
|
||||||
}
|
}
|
||||||
_, err = s.repo.Create(s.ctx, logToday)
|
_, err = s.repo.Create(s.ctx, logToday)
|
||||||
s.Require().NoError(err, "Create logToday")
|
s.Require().NoError(err, "Create logToday")
|
||||||
@@ -413,9 +413,17 @@ func (s *UsageLogRepoSuite) TestGetAccountTodayStats() {
|
|||||||
|
|
||||||
func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
||||||
now := time.Now().UTC().Truncate(time.Second)
|
now := time.Now().UTC().Truncate(time.Second)
|
||||||
hour1 := now.Add(-90 * time.Minute).Truncate(time.Hour)
|
// 使用固定的时间偏移确保 hour1 和 hour2 在同一天且都在过去
|
||||||
hour2 := now.Add(-30 * time.Minute).Truncate(time.Hour)
|
// 选择当天 02:00 和 03:00 作为测试时间点(基于 now 的日期)
|
||||||
dayStart := truncateToDayUTC(now)
|
dayStart := truncateToDayUTC(now)
|
||||||
|
hour1 := dayStart.Add(2 * time.Hour) // 当天 02:00
|
||||||
|
hour2 := dayStart.Add(3 * time.Hour) // 当天 03:00
|
||||||
|
// 如果当前时间早于 hour2,则使用昨天的时间
|
||||||
|
if now.Before(hour2.Add(time.Hour)) {
|
||||||
|
dayStart = dayStart.Add(-24 * time.Hour)
|
||||||
|
hour1 = dayStart.Add(2 * time.Hour)
|
||||||
|
hour2 = dayStart.Add(3 * time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"})
|
user1 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u1@test.com"})
|
||||||
user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"})
|
user2 := mustCreateUser(s.T(), s.client, &service.User{Email: "agg-u2@test.com"})
|
||||||
@@ -473,7 +481,7 @@ func (s *UsageLogRepoSuite) TestDashboardAggregationConsistency() {
|
|||||||
|
|
||||||
aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx)
|
aggRepo := newDashboardAggregationRepositoryWithSQL(s.tx)
|
||||||
aggStart := hour1.Add(-5 * time.Minute)
|
aggStart := hour1.Add(-5 * time.Minute)
|
||||||
aggEnd := now.Add(5 * time.Minute)
|
aggEnd := hour2.Add(time.Hour) // 确保覆盖 hour2 的所有数据
|
||||||
s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd))
|
s.Require().NoError(aggRepo.AggregateRange(s.ctx, aggStart, aggEnd))
|
||||||
|
|
||||||
type hourlyRow struct {
|
type hourlyRow struct {
|
||||||
@@ -621,7 +629,7 @@ func (s *UsageLogRepoSuite) TestGetGlobalStats() {
|
|||||||
s.Require().Equal(int64(45), stats.TotalOutputTokens)
|
s.Require().Equal(int64(45), stats.TotalOutputTokens)
|
||||||
}
|
}
|
||||||
|
|
||||||
func maxTime(a, b time.Time) time.Time {
|
func testMaxTime(a, b time.Time) time.Time {
|
||||||
if a.After(b) {
|
if a.After(b) {
|
||||||
return a
|
return a
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ var ProviderSet = wire.NewSet(
|
|||||||
NewUsageLogRepository,
|
NewUsageLogRepository,
|
||||||
NewDashboardAggregationRepository,
|
NewDashboardAggregationRepository,
|
||||||
NewSettingRepository,
|
NewSettingRepository,
|
||||||
|
NewOpsRepository,
|
||||||
NewUserSubscriptionRepository,
|
NewUserSubscriptionRepository,
|
||||||
NewUserAttributeDefinitionRepository,
|
NewUserAttributeDefinitionRepository,
|
||||||
NewUserAttributeValueRepository,
|
NewUserAttributeValueRepository,
|
||||||
|
|||||||
@@ -262,11 +262,11 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
name: "GET /api/v1/admin/settings",
|
name: "GET /api/v1/admin/settings",
|
||||||
setup: func(t *testing.T, deps *contractDeps) {
|
setup: func(t *testing.T, deps *contractDeps) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
deps.settingRepo.SetAll(map[string]string{
|
deps.settingRepo.SetAll(map[string]string{
|
||||||
service.SettingKeyRegistrationEnabled: "true",
|
service.SettingKeyRegistrationEnabled: "true",
|
||||||
service.SettingKeyEmailVerifyEnabled: "false",
|
service.SettingKeyEmailVerifyEnabled: "false",
|
||||||
|
|
||||||
service.SettingKeySMTPHost: "smtp.example.com",
|
service.SettingKeySMTPHost: "smtp.example.com",
|
||||||
service.SettingKeySMTPPort: "587",
|
service.SettingKeySMTPPort: "587",
|
||||||
service.SettingKeySMTPUsername: "user",
|
service.SettingKeySMTPUsername: "user",
|
||||||
service.SettingKeySMTPPassword: "secret",
|
service.SettingKeySMTPPassword: "secret",
|
||||||
@@ -285,10 +285,15 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
service.SettingKeyContactInfo: "support",
|
service.SettingKeyContactInfo: "support",
|
||||||
service.SettingKeyDocURL: "https://docs.example.com",
|
service.SettingKeyDocURL: "https://docs.example.com",
|
||||||
|
|
||||||
service.SettingKeyDefaultConcurrency: "5",
|
service.SettingKeyDefaultConcurrency: "5",
|
||||||
service.SettingKeyDefaultBalance: "1.25",
|
service.SettingKeyDefaultBalance: "1.25",
|
||||||
})
|
|
||||||
},
|
service.SettingKeyOpsMonitoringEnabled: "false",
|
||||||
|
service.SettingKeyOpsRealtimeMonitoringEnabled: "true",
|
||||||
|
service.SettingKeyOpsQueryModeDefault: "auto",
|
||||||
|
service.SettingKeyOpsMetricsIntervalSeconds: "60",
|
||||||
|
})
|
||||||
|
},
|
||||||
method: http.MethodGet,
|
method: http.MethodGet,
|
||||||
path: "/api/v1/admin/settings",
|
path: "/api/v1/admin/settings",
|
||||||
wantStatus: http.StatusOK,
|
wantStatus: http.StatusOK,
|
||||||
@@ -309,13 +314,17 @@ func TestAPIContracts(t *testing.T) {
|
|||||||
"turnstile_site_key": "site-key",
|
"turnstile_site_key": "site-key",
|
||||||
"turnstile_secret_key_configured": true,
|
"turnstile_secret_key_configured": true,
|
||||||
"linuxdo_connect_enabled": false,
|
"linuxdo_connect_enabled": false,
|
||||||
"linuxdo_connect_client_id": "",
|
"linuxdo_connect_client_id": "",
|
||||||
"linuxdo_connect_client_secret_configured": false,
|
"linuxdo_connect_client_secret_configured": false,
|
||||||
"linuxdo_connect_redirect_url": "",
|
"linuxdo_connect_redirect_url": "",
|
||||||
"site_name": "Sub2API",
|
"ops_monitoring_enabled": false,
|
||||||
"site_logo": "",
|
"ops_realtime_monitoring_enabled": true,
|
||||||
"site_subtitle": "Subtitle",
|
"ops_query_mode_default": "auto",
|
||||||
"api_base_url": "https://api.example.com",
|
"ops_metrics_interval_seconds": 60,
|
||||||
|
"site_name": "Sub2API",
|
||||||
|
"site_logo": "",
|
||||||
|
"site_subtitle": "Subtitle",
|
||||||
|
"api_base_url": "https://api.example.com",
|
||||||
"contact_info": "support",
|
"contact_info": "support",
|
||||||
"doc_url": "https://docs.example.com",
|
"doc_url": "https://docs.example.com",
|
||||||
"default_concurrency": 5,
|
"default_concurrency": 5,
|
||||||
@@ -430,7 +439,7 @@ func newContractDeps(t *testing.T) *contractDeps {
|
|||||||
authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil)
|
authHandler := handler.NewAuthHandler(cfg, nil, userService, settingService, nil)
|
||||||
apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService)
|
apiKeyHandler := handler.NewAPIKeyHandler(apiKeyService)
|
||||||
usageHandler := handler.NewUsageHandler(usageService, apiKeyService)
|
usageHandler := handler.NewUsageHandler(usageService, apiKeyService)
|
||||||
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil)
|
adminSettingHandler := adminhandler.NewSettingHandler(settingService, nil, nil, nil)
|
||||||
adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
adminAccountHandler := adminhandler.NewAccountHandler(adminService, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
||||||
|
|
||||||
jwtAuth := func(c *gin.Context) {
|
jwtAuth := func(c *gin.Context) {
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ func ProvideRouter(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
) *gin.Engine {
|
) *gin.Engine {
|
||||||
@@ -50,7 +51,7 @@ func ProvideRouter(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, settingService, cfg, redisClient)
|
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ProvideHTTPServer 提供 HTTP 服务器
|
// ProvideHTTPServer 提供 HTTP 服务器
|
||||||
|
|||||||
@@ -30,6 +30,20 @@ func adminAuth(
|
|||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
) gin.HandlerFunc {
|
) gin.HandlerFunc {
|
||||||
return func(c *gin.Context) {
|
return func(c *gin.Context) {
|
||||||
|
// WebSocket upgrade requests cannot set Authorization headers in browsers.
|
||||||
|
// For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via
|
||||||
|
// Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item:
|
||||||
|
// Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>
|
||||||
|
if isWebSocketUpgradeRequest(c) {
|
||||||
|
if token := extractJWTFromWebSocketSubprotocol(c); token != "" {
|
||||||
|
if !validateJWTForAdmin(c, token, authService, userService) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 检查 x-api-key header(Admin API Key 认证)
|
// 检查 x-api-key header(Admin API Key 认证)
|
||||||
apiKey := c.GetHeader("x-api-key")
|
apiKey := c.GetHeader("x-api-key")
|
||||||
if apiKey != "" {
|
if apiKey != "" {
|
||||||
@@ -58,6 +72,44 @@ func adminAuth(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isWebSocketUpgradeRequest(c *gin.Context) bool {
|
||||||
|
if c == nil || c.Request == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// RFC6455 handshake uses:
|
||||||
|
// Connection: Upgrade
|
||||||
|
// Upgrade: websocket
|
||||||
|
upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade")))
|
||||||
|
if upgrade != "websocket" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
connection := strings.ToLower(c.GetHeader("Connection"))
|
||||||
|
return strings.Contains(connection, "upgrade")
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractJWTFromWebSocketSubprotocol(c *gin.Context) string {
|
||||||
|
if c == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol"))
|
||||||
|
if raw == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// The header is a comma-separated list of tokens. We reserve the prefix "jwt."
|
||||||
|
// for carrying the admin JWT.
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
p := strings.TrimSpace(part)
|
||||||
|
if strings.HasPrefix(p, "jwt.") {
|
||||||
|
token := strings.TrimSpace(strings.TrimPrefix(p, "jwt."))
|
||||||
|
if token != "" {
|
||||||
|
return token
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
// validateAdminAPIKey 验证管理员 API Key
|
// validateAdminAPIKey 验证管理员 API Key
|
||||||
func validateAdminAPIKey(
|
func validateAdminAPIKey(
|
||||||
c *gin.Context,
|
c *gin.Context,
|
||||||
|
|||||||
30
backend/internal/server/middleware/client_request_id.go
Normal file
30
backend/internal/server/middleware/client_request_id.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ClientRequestID ensures every request has a unique client_request_id in request.Context().
|
||||||
|
//
|
||||||
|
// This is used by the Ops monitoring module for end-to-end request correlation.
|
||||||
|
func ClientRequestID() gin.HandlerFunc {
|
||||||
|
return func(c *gin.Context) {
|
||||||
|
if c.Request == nil {
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil {
|
||||||
|
c.Next()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
id := uuid.New().String()
|
||||||
|
c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id))
|
||||||
|
c.Next()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -23,6 +23,7 @@ func SetupRouter(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
settingService *service.SettingService,
|
settingService *service.SettingService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
@@ -46,7 +47,7 @@ func SetupRouter(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 注册路由
|
// 注册路由
|
||||||
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg, redisClient)
|
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg, redisClient)
|
||||||
|
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
@@ -60,6 +61,7 @@ func registerRoutes(
|
|||||||
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
apiKeyAuth middleware2.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
) {
|
) {
|
||||||
@@ -73,5 +75,5 @@ func registerRoutes(
|
|||||||
routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient)
|
routes.RegisterAuthRoutes(v1, h, jwtAuth, redisClient)
|
||||||
routes.RegisterUserRoutes(v1, h, jwtAuth)
|
routes.RegisterUserRoutes(v1, h, jwtAuth)
|
||||||
routes.RegisterAdminRoutes(v1, h, adminAuth)
|
routes.RegisterAdminRoutes(v1, h, adminAuth)
|
||||||
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg)
|
routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,6 +50,9 @@ func RegisterAdminRoutes(
|
|||||||
// 系统设置
|
// 系统设置
|
||||||
registerSettingsRoutes(admin, h)
|
registerSettingsRoutes(admin, h)
|
||||||
|
|
||||||
|
// 运维监控(Ops)
|
||||||
|
registerOpsRoutes(admin, h)
|
||||||
|
|
||||||
// 系统管理
|
// 系统管理
|
||||||
registerSystemRoutes(admin, h)
|
registerSystemRoutes(admin, h)
|
||||||
|
|
||||||
@@ -64,6 +67,58 @@ func RegisterAdminRoutes(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||||
|
ops := admin.Group("/ops")
|
||||||
|
{
|
||||||
|
// Realtime ops signals
|
||||||
|
ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats)
|
||||||
|
ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability)
|
||||||
|
|
||||||
|
// Alerts (rules + events)
|
||||||
|
ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules)
|
||||||
|
ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule)
|
||||||
|
ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule)
|
||||||
|
ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule)
|
||||||
|
ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents)
|
||||||
|
|
||||||
|
// Email notification config (DB-backed)
|
||||||
|
ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig)
|
||||||
|
ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig)
|
||||||
|
|
||||||
|
// Runtime settings (DB-backed)
|
||||||
|
runtime := ops.Group("/runtime")
|
||||||
|
{
|
||||||
|
runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings)
|
||||||
|
runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advanced settings (DB-backed)
|
||||||
|
ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings)
|
||||||
|
ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings)
|
||||||
|
|
||||||
|
// WebSocket realtime (QPS/TPS)
|
||||||
|
ws := ops.Group("/ws")
|
||||||
|
{
|
||||||
|
ws.GET("/qps", h.Admin.Ops.QPSWSHandler)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error logs (MVP-1)
|
||||||
|
ops.GET("/errors", h.Admin.Ops.GetErrorLogs)
|
||||||
|
ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID)
|
||||||
|
ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest)
|
||||||
|
|
||||||
|
// Request drilldown (success + error)
|
||||||
|
ops.GET("/requests", h.Admin.Ops.ListRequestDetails)
|
||||||
|
|
||||||
|
// Dashboard (vNext - raw path for MVP)
|
||||||
|
ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview)
|
||||||
|
ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend)
|
||||||
|
ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram)
|
||||||
|
ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend)
|
||||||
|
ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||||
dashboard := admin.Group("/dashboard")
|
dashboard := admin.Group("/dashboard")
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -16,13 +16,18 @@ func RegisterGatewayRoutes(
|
|||||||
apiKeyAuth middleware.APIKeyAuthMiddleware,
|
apiKeyAuth middleware.APIKeyAuthMiddleware,
|
||||||
apiKeyService *service.APIKeyService,
|
apiKeyService *service.APIKeyService,
|
||||||
subscriptionService *service.SubscriptionService,
|
subscriptionService *service.SubscriptionService,
|
||||||
|
opsService *service.OpsService,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
) {
|
) {
|
||||||
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
|
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
|
||||||
|
clientRequestID := middleware.ClientRequestID()
|
||||||
|
opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService)
|
||||||
|
|
||||||
// API网关(Claude API兼容)
|
// API网关(Claude API兼容)
|
||||||
gateway := r.Group("/v1")
|
gateway := r.Group("/v1")
|
||||||
gateway.Use(bodyLimit)
|
gateway.Use(bodyLimit)
|
||||||
|
gateway.Use(clientRequestID)
|
||||||
|
gateway.Use(opsErrorLogger)
|
||||||
gateway.Use(gin.HandlerFunc(apiKeyAuth))
|
gateway.Use(gin.HandlerFunc(apiKeyAuth))
|
||||||
{
|
{
|
||||||
gateway.POST("/messages", h.Gateway.Messages)
|
gateway.POST("/messages", h.Gateway.Messages)
|
||||||
@@ -36,6 +41,8 @@ func RegisterGatewayRoutes(
|
|||||||
// Gemini 原生 API 兼容层(Gemini SDK/CLI 直连)
|
// Gemini 原生 API 兼容层(Gemini SDK/CLI 直连)
|
||||||
gemini := r.Group("/v1beta")
|
gemini := r.Group("/v1beta")
|
||||||
gemini.Use(bodyLimit)
|
gemini.Use(bodyLimit)
|
||||||
|
gemini.Use(clientRequestID)
|
||||||
|
gemini.Use(opsErrorLogger)
|
||||||
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
||||||
{
|
{
|
||||||
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
|
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
|
||||||
@@ -45,7 +52,7 @@ func RegisterGatewayRoutes(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// OpenAI Responses API(不带v1前缀的别名)
|
// OpenAI Responses API(不带v1前缀的别名)
|
||||||
r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
|
r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
|
||||||
|
|
||||||
// Antigravity 模型列表
|
// Antigravity 模型列表
|
||||||
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
|
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
|
||||||
@@ -53,6 +60,8 @@ func RegisterGatewayRoutes(
|
|||||||
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
|
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
|
||||||
antigravityV1 := r.Group("/antigravity/v1")
|
antigravityV1 := r.Group("/antigravity/v1")
|
||||||
antigravityV1.Use(bodyLimit)
|
antigravityV1.Use(bodyLimit)
|
||||||
|
antigravityV1.Use(clientRequestID)
|
||||||
|
antigravityV1.Use(opsErrorLogger)
|
||||||
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
||||||
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
|
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
|
||||||
{
|
{
|
||||||
@@ -64,6 +73,8 @@ func RegisterGatewayRoutes(
|
|||||||
|
|
||||||
antigravityV1Beta := r.Group("/antigravity/v1beta")
|
antigravityV1Beta := r.Group("/antigravity/v1beta")
|
||||||
antigravityV1Beta.Use(bodyLimit)
|
antigravityV1Beta.Use(bodyLimit)
|
||||||
|
antigravityV1Beta.Use(clientRequestID)
|
||||||
|
antigravityV1Beta.Use(opsErrorLogger)
|
||||||
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
|
||||||
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -564,6 +564,14 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
// 检查是否应触发 URL 降级
|
// 检查是否应触发 URL 降级
|
||||||
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
@@ -579,6 +587,7 @@ urlFallbackLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries")
|
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -586,6 +595,26 @@ urlFallbackLoop:
|
|||||||
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
||||||
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
||||||
continue urlFallbackLoop
|
continue urlFallbackLoop
|
||||||
@@ -596,6 +625,26 @@ urlFallbackLoop:
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if attempt < antigravityMaxRetries {
|
if attempt < antigravityMaxRetries {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
|
log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
|
||||||
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
||||||
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
||||||
@@ -628,6 +677,27 @@ urlFallbackLoop:
|
|||||||
// Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验,
|
// Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验,
|
||||||
// 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。
|
// 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。
|
||||||
if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) {
|
if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// Conservative two-stage fallback:
|
// Conservative two-stage fallback:
|
||||||
// 1) Disable top-level thinking + thinking->text
|
// 1) Disable top-level thinking + thinking->text
|
||||||
// 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text.
|
// 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text.
|
||||||
@@ -661,6 +731,13 @@ urlFallbackLoop:
|
|||||||
}
|
}
|
||||||
retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency)
|
retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if retryErr != nil {
|
if retryErr != nil {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "signature_retry_request_error",
|
||||||
|
Message: sanitizeUpstreamErrorMessage(retryErr.Error()),
|
||||||
|
})
|
||||||
log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr)
|
log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -674,6 +751,25 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
||||||
_ = retryResp.Body.Close()
|
_ = retryResp.Body.Close()
|
||||||
|
kind := "signature_retry"
|
||||||
|
if strings.TrimSpace(stage.name) != "" {
|
||||||
|
kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_")
|
||||||
|
}
|
||||||
|
retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody))
|
||||||
|
retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg)
|
||||||
|
retryUpstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
retryUpstreamDetail = truncateString(string(retryBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: retryResp.StatusCode,
|
||||||
|
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||||
|
Kind: kind,
|
||||||
|
Message: retryUpstreamMsg,
|
||||||
|
Detail: retryUpstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// If this stage fixed the signature issue, we stop; otherwise we may try the next stage.
|
// If this stage fixed the signature issue, we stop; otherwise we may try the next stage.
|
||||||
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) {
|
if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) {
|
||||||
@@ -701,10 +797,30 @@ urlFallbackLoop:
|
|||||||
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody)
|
return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1108,6 +1224,14 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
// 检查是否应触发 URL 降级
|
// 检查是否应触发 URL 降级
|
||||||
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 {
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
@@ -1123,6 +1247,7 @@ urlFallbackLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err)
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries")
|
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1130,6 +1255,26 @@ urlFallbackLoop:
|
|||||||
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 {
|
||||||
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
antigravity.DefaultURLAvailability.MarkUnavailable(baseURL)
|
||||||
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200))
|
||||||
continue urlFallbackLoop
|
continue urlFallbackLoop
|
||||||
@@ -1140,6 +1285,26 @@ urlFallbackLoop:
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if attempt < antigravityMaxRetries {
|
if attempt < antigravityMaxRetries {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries)
|
log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries)
|
||||||
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
if !sleepAntigravityBackoffWithContext(ctx, attempt) {
|
||||||
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
log.Printf("%s status=context_canceled_during_backoff", prefix)
|
||||||
@@ -1205,21 +1370,59 @@ urlFallbackLoop:
|
|||||||
|
|
||||||
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope)
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 解包并返回错误
|
|
||||||
requestID := resp.Header.Get("x-request-id")
|
requestID := resp.Header.Get("x-request-id")
|
||||||
if requestID != "" {
|
if requestID != "" {
|
||||||
c.Header("x-request-id", requestID)
|
c.Header("x-request-id", requestID)
|
||||||
}
|
}
|
||||||
unwrapped, _ := s.unwrapV1InternalResponse(respBody)
|
|
||||||
|
unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody)
|
||||||
|
unwrappedForOps := unwrapped
|
||||||
|
if unwrapErr != nil || len(unwrappedForOps) == 0 {
|
||||||
|
unwrappedForOps = respBody
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(unwrappedForOps), maxBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Always record upstream context for Ops error logs, even when we will failover.
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
|
}
|
||||||
|
|
||||||
contentType := resp.Header.Get("Content-Type")
|
contentType := resp.Header.Get("Content-Type")
|
||||||
if contentType == "" {
|
if contentType == "" {
|
||||||
contentType = "application/json"
|
contentType = "application/json"
|
||||||
}
|
}
|
||||||
c.Data(resp.StatusCode, contentType, unwrapped)
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
c.Data(resp.StatusCode, contentType, unwrappedForOps)
|
||||||
return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode)
|
return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int,
|
|||||||
return fmt.Errorf("%s", message)
|
return fmt.Errorf("%s", message)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error {
|
func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
|
||||||
// 记录上游错误详情便于调试
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body))
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody
|
||||||
|
maxBytes := 2048
|
||||||
|
if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 {
|
||||||
|
maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
upstreamDetail := ""
|
||||||
|
if logBody {
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: upstreamStatus,
|
||||||
|
UpstreamRequestID: upstreamRequestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
// 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端)
|
||||||
|
if logBody {
|
||||||
|
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes))
|
||||||
|
}
|
||||||
|
|
||||||
var statusCode int
|
var statusCode int
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
@@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr
|
|||||||
"type": "error",
|
"type": "error",
|
||||||
"error": gin.H{"type": errType, "message": errMsg},
|
"error": gin.H{"type": errType, "message": errMsg},
|
||||||
})
|
})
|
||||||
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error {
|
func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error {
|
||||||
|
|||||||
@@ -357,7 +357,7 @@ func (s *AuthService) Login(ctx context.Context, email, password string) (string
|
|||||||
// - 如果邮箱已存在:直接登录(不需要本地密码)
|
// - 如果邮箱已存在:直接登录(不需要本地密码)
|
||||||
// - 如果邮箱不存在:创建新用户并登录
|
// - 如果邮箱不存在:创建新用户并登录
|
||||||
//
|
//
|
||||||
// 注意:该函数用于“终端用户登录 Sub2API 本身”的场景(不同于上游账号的 OAuth,例如 OpenAI/Gemini)。
|
// 注意:该函数用于 LinuxDo OAuth 登录场景(不同于上游账号的 OAuth,例如 Claude/OpenAI/Gemini)。
|
||||||
// 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。
|
// 为了满足现有数据库约束(需要密码哈希),新用户会生成随机密码并进行哈希保存。
|
||||||
func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) {
|
func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username string) (string, *User, error) {
|
||||||
email = strings.TrimSpace(email)
|
email = strings.TrimSpace(email)
|
||||||
@@ -376,8 +376,8 @@ func (s *AuthService) LoginOrRegisterOAuth(ctx context.Context, email, username
|
|||||||
user, err := s.userRepo.GetByEmail(ctx, email)
|
user, err := s.userRepo.GetByEmail(ctx, email)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, ErrUserNotFound) {
|
if errors.Is(err, ErrUserNotFound) {
|
||||||
// OAuth 首次登录视为注册。
|
// OAuth 首次登录视为注册(fail-close:settingService 未配置时不允许注册)
|
||||||
if s.settingService != nil && !s.settingService.IsRegistrationEnabled(ctx) {
|
if s.settingService == nil || !s.settingService.IsRegistrationEnabled(ctx) {
|
||||||
return "", nil, ErrRegDisabled
|
return "", nil, ErrRegDisabled
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,6 +63,9 @@ const (
|
|||||||
SubscriptionStatusSuspended = "suspended"
|
SubscriptionStatusSuspended = "suspended"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。
|
||||||
|
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
|
||||||
|
|
||||||
// Setting keys
|
// Setting keys
|
||||||
const (
|
const (
|
||||||
// 注册设置
|
// 注册设置
|
||||||
@@ -83,6 +86,12 @@ const (
|
|||||||
SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key
|
SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key
|
||||||
SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key
|
SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key
|
||||||
|
|
||||||
|
// LinuxDo Connect OAuth 登录设置
|
||||||
|
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
|
||||||
|
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
|
||||||
|
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
|
||||||
|
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
|
||||||
|
|
||||||
// OEM设置
|
// OEM设置
|
||||||
SettingKeySiteName = "site_name" // 网站名称
|
SettingKeySiteName = "site_name" // 网站名称
|
||||||
SettingKeySiteLogo = "site_logo" // 网站Logo (base64)
|
SettingKeySiteLogo = "site_logo" // 网站Logo (base64)
|
||||||
@@ -113,16 +122,31 @@ const (
|
|||||||
SettingKeyEnableIdentityPatch = "enable_identity_patch"
|
SettingKeyEnableIdentityPatch = "enable_identity_patch"
|
||||||
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
|
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// =========================
|
||||||
SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled"
|
// Ops Monitoring (vNext)
|
||||||
SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id"
|
// =========================
|
||||||
SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret"
|
|
||||||
SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url"
|
|
||||||
)
|
|
||||||
|
|
||||||
// LinuxDoConnectSyntheticEmailDomain 是 LinuxDo Connect 用户的合成邮箱后缀(RFC 保留域名)。
|
// SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
|
||||||
// 目的:避免第三方登录返回的用户标识与本地真实邮箱发生碰撞,进而造成账号被接管的风险。
|
SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"
|
||||||
const LinuxDoConnectSyntheticEmailDomain = "@linuxdo-connect.invalid"
|
|
||||||
|
// SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
|
||||||
|
SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"
|
||||||
|
|
||||||
|
// SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
|
||||||
|
SettingKeyOpsQueryModeDefault = "ops_query_mode_default"
|
||||||
|
|
||||||
|
// SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
|
||||||
|
SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"
|
||||||
|
|
||||||
|
// SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
|
||||||
|
SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"
|
||||||
|
|
||||||
|
// SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
|
||||||
|
SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
|
||||||
|
|
||||||
|
// SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation).
|
||||||
|
SettingKeyOpsAdvancedSettings = "ops_advanced_settings"
|
||||||
|
)
|
||||||
|
|
||||||
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
|
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
|
||||||
const AdminAPIKeyPrefix = "admin-"
|
const AdminAPIKeyPrefix = "admin-"
|
||||||
|
|||||||
@@ -1399,7 +1399,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
if resp != nil && resp.Body != nil {
|
if resp != nil && resp.Body != nil {
|
||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
}
|
}
|
||||||
return nil, fmt.Errorf("upstream request failed: %w", err)
|
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
|
"type": "error",
|
||||||
|
"error": gin.H{
|
||||||
|
"type": "upstream_error",
|
||||||
|
"message": "Upstream request failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 优先检测thinking block签名错误(400)并重试一次
|
// 优先检测thinking block签名错误(400)并重试一次
|
||||||
@@ -1409,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if s.isThinkingBlockSignatureError(respBody) {
|
if s.isThinkingBlockSignatureError(respBody) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
|
|
||||||
looksLikeToolSignatureError := func(msg string) bool {
|
looksLikeToolSignatureError := func(msg string) bool {
|
||||||
m := strings.ToLower(msg)
|
m := strings.ToLower(msg)
|
||||||
return strings.Contains(m, "tool_use") ||
|
return strings.Contains(m, "tool_use") ||
|
||||||
@@ -1445,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20))
|
||||||
_ = retryResp.Body.Close()
|
_ = retryResp.Body.Close()
|
||||||
if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) {
|
if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: retryResp.StatusCode,
|
||||||
|
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||||
|
Kind: "signature_retry_thinking",
|
||||||
|
Message: extractUpstreamErrorMessage(retryRespBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
msg2 := extractUpstreamErrorMessage(retryRespBody)
|
msg2 := extractUpstreamErrorMessage(retryRespBody)
|
||||||
if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed {
|
if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed {
|
||||||
log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID)
|
log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID)
|
||||||
@@ -1459,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
if retryResp2 != nil && retryResp2.Body != nil {
|
if retryResp2 != nil && retryResp2.Body != nil {
|
||||||
_ = retryResp2.Body.Close()
|
_ = retryResp2.Body.Close()
|
||||||
}
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "signature_retry_tools_request_error",
|
||||||
|
Message: sanitizeUpstreamErrorMessage(retryErr2.Error()),
|
||||||
|
})
|
||||||
log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2)
|
log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2)
|
log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2)
|
||||||
@@ -1508,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)",
|
log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)",
|
||||||
account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed)
|
account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed)
|
||||||
_ = resp.Body.Close()
|
|
||||||
if err := sleepWithContext(ctx, delay); err != nil {
|
if err := sleepWithContext(ctx, delay); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -1538,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
// 处理重试耗尽的情况
|
// 处理重试耗尽的情况
|
||||||
if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) {
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry_exhausted_failover",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
return s.handleRetryExhaustedError(ctx, resp, c, account)
|
return s.handleRetryExhaustedError(ctx, resp, c, account)
|
||||||
@@ -1546,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
|
|
||||||
// 处理可切换账号的错误
|
// 处理可切换账号的错误
|
||||||
if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleFailoverSideEffects(ctx, resp, account)
|
s.handleFailoverSideEffects(ctx, resp, account)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: extractUpstreamErrorMessage(respBody),
|
||||||
|
Detail: func() string {
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}(),
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1563,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
|||||||
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
if s.shouldFailoverOn400(respBody) {
|
if s.shouldFailoverOn400(respBody) {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover_on_400",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
if s.cfg.Gateway.LogUpstreamErrorBody {
|
if s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
log.Printf(
|
log.Printf(
|
||||||
"Account %d: 400 error, attempting failover: %s",
|
"Account %d: 400 error, attempting failover: %s",
|
||||||
@@ -1859,7 +1983,30 @@ func extractUpstreamErrorMessage(body []byte) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
|
||||||
|
// Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet.
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
// 处理上游错误,标记账号状态
|
// 处理上游错误,标记账号状态
|
||||||
shouldDisable := false
|
shouldDisable := false
|
||||||
@@ -1870,24 +2017,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
|
|||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端)
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"Upstream error %d (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// 根据状态码返回适当的自定义错误响应(不透传上游详细信息)
|
// 根据状态码返回适当的自定义错误响应(不透传上游详细信息)
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
var statusCode int
|
var statusCode int
|
||||||
|
|
||||||
switch resp.StatusCode {
|
switch resp.StatusCode {
|
||||||
case 400:
|
case 400:
|
||||||
// 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开
|
|
||||||
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
|
||||||
log.Printf(
|
|
||||||
"Upstream 400 error (account=%d platform=%s type=%s): %s",
|
|
||||||
account.ID,
|
|
||||||
account.Platform,
|
|
||||||
account.Type,
|
|
||||||
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
c.Data(http.StatusBadRequest, "application/json", body)
|
c.Data(http.StatusBadRequest, "application/json", body)
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
summary := upstreamMsg
|
||||||
|
if summary == "" {
|
||||||
|
summary = truncateForLog(body, 512)
|
||||||
|
}
|
||||||
|
if summary == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary)
|
||||||
case 401:
|
case 401:
|
||||||
statusCode = http.StatusBadGateway
|
statusCode = http.StatusBadGateway
|
||||||
errType = "upstream_error"
|
errType = "upstream_error"
|
||||||
@@ -1923,11 +2079,14 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
statusCode := resp.StatusCode
|
statusCode := resp.StatusCode
|
||||||
|
|
||||||
// OAuth/Setup Token 账号的 403:标记账号异常
|
// OAuth/Setup Token 账号的 403:标记账号异常
|
||||||
@@ -1941,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1949,8 +2108,45 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht
|
|||||||
// OAuth 403:标记账号异常
|
// OAuth 403:标记账号异常
|
||||||
// API Key 未配置错误码:仅返回错误,不标记账号
|
// API Key 未配置错误码:仅返回错误,不标记账号
|
||||||
func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) {
|
||||||
|
// Capture upstream error body before side-effects consume the stream.
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
s.handleRetryExhaustedSideEffects(ctx, resp, account)
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "retry_exhausted",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// 返回统一的重试耗尽错误响应
|
// 返回统一的重试耗尽错误响应
|
||||||
c.JSON(http.StatusBadGateway, gin.H{
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
"type": "error",
|
"type": "error",
|
||||||
@@ -1960,7 +2156,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// streamingResult 流式响应结果
|
// streamingResult 流式响应结果
|
||||||
@@ -2490,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
// 发送请求
|
// 发送请求
|
||||||
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "")
|
||||||
s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed")
|
s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed")
|
||||||
return fmt.Errorf("upstream request failed: %w", err)
|
return fmt.Errorf("upstream request failed: %w", err)
|
||||||
}
|
}
|
||||||
@@ -2527,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
// 标记账号状态(429/529等)
|
// 标记账号状态(429/529等)
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
// 记录上游错误摘要便于排障(不回显请求内容)
|
// 记录上游错误摘要便于排障(不回显请求内容)
|
||||||
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
log.Printf(
|
log.Printf(
|
||||||
@@ -2548,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context,
|
|||||||
errMsg = "Service overloaded"
|
errMsg = "Service overloaded"
|
||||||
}
|
}
|
||||||
s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg)
|
s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg)
|
||||||
return fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 透传成功响应
|
// 透传成功响应
|
||||||
|
|||||||
@@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special-case: signature/thought_signature validation errors are not transient, but may be fixed by
|
// Special-case: signature/thought_signature validation errors are not transient, but may be fixed by
|
||||||
@@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
_ = resp.Body.Close()
|
_ = resp.Body.Close()
|
||||||
|
|
||||||
if isGeminiSignatureRelatedError(respBody) {
|
if isGeminiSignatureRelatedError(respBody) {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "signature_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
var strippedClaudeBody []byte
|
var strippedClaudeBody []byte
|
||||||
stageName := ""
|
stageName := ""
|
||||||
switch signatureRetryStage {
|
switch signatureRetryStage {
|
||||||
@@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
}
|
}
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
@@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
}
|
}
|
||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
if tempMatched {
|
if tempMatched {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody)
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
requestID := resp.Header.Get(requestIDHeader)
|
requestID := resp.Header.Get(requestIDHeader)
|
||||||
@@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
|
|
||||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
@@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
FirstTokenMs: nil,
|
FirstTokenMs: nil,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error()))
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) {
|
if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) {
|
||||||
@@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
}
|
}
|
||||||
if attempt < geminiMaxRetries {
|
if attempt < geminiMaxRetries {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "retry",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries)
|
||||||
sleepGeminiBackoff(attempt)
|
sleepGeminiBackoff(attempt)
|
||||||
continue
|
continue
|
||||||
@@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
}
|
}
|
||||||
|
|
||||||
if tempMatched {
|
if tempMatched {
|
||||||
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(evBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(evBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
|
|
||||||
respBody = unwrapIfNeeded(isOAuth, respBody)
|
respBody = unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
contentType := resp.Header.Get("Content-Type")
|
contentType := resp.Header.Get("Content-Type")
|
||||||
if contentType == "" {
|
if contentType == "" {
|
||||||
contentType = "application/json"
|
contentType = "application/json"
|
||||||
}
|
}
|
||||||
c.Data(resp.StatusCode, contentType, respBody)
|
c.Data(resp.StatusCode, contentType, respBody)
|
||||||
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
var usage *ClaudeUsage
|
var usage *ClaudeUsage
|
||||||
@@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string {
|
|||||||
return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`)
|
return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error {
|
func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error {
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: upstreamStatus,
|
||||||
|
UpstreamRequestID: upstreamRequestID,
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes))
|
||||||
|
}
|
||||||
|
|
||||||
var statusCode int
|
var statusCode int
|
||||||
var errType, errMsg string
|
var errType, errMsg string
|
||||||
|
|
||||||
@@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups
|
|||||||
"type": "error",
|
"type": "error",
|
||||||
"error": gin.H{"type": errType, "message": errMsg},
|
"error": gin.H{"type": errType, "message": errMsg},
|
||||||
})
|
})
|
||||||
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
if upstreamMsg == "" {
|
||||||
|
return fmt.Errorf("upstream error: %d", upstreamStatus)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
type claudeErrorMapping struct {
|
type claudeErrorMapping struct {
|
||||||
|
|||||||
@@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
|
|||||||
existingInstructions = strings.TrimSpace(existingInstructions)
|
existingInstructions = strings.TrimSpace(existingInstructions)
|
||||||
|
|
||||||
if instructions != "" {
|
if instructions != "" {
|
||||||
if existingInstructions != "" && existingInstructions != instructions {
|
|
||||||
if input, ok := reqBody["input"].([]any); ok {
|
|
||||||
reqBody["input"] = prependSystemInstruction(input, existingInstructions)
|
|
||||||
result.Modified = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if existingInstructions != instructions {
|
if existingInstructions != instructions {
|
||||||
reqBody["instructions"] = instructions
|
reqBody["instructions"] = instructions
|
||||||
result.Modified = true
|
result.Modified = true
|
||||||
@@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
|
|||||||
|
|
||||||
if input, ok := reqBody["input"].([]any); ok {
|
if input, ok := reqBody["input"].([]any); ok {
|
||||||
input = filterCodexInput(input)
|
input = filterCodexInput(input)
|
||||||
input = normalizeOrphanedToolOutputs(input)
|
|
||||||
reqBody["input"] = input
|
reqBody["input"] = input
|
||||||
result.Modified = true
|
result.Modified = true
|
||||||
}
|
}
|
||||||
@@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any {
|
|||||||
return filtered
|
return filtered
|
||||||
}
|
}
|
||||||
|
|
||||||
func prependSystemInstruction(input []any, instructions string) []any {
|
|
||||||
message := map[string]any{
|
|
||||||
"role": "system",
|
|
||||||
"content": []any{
|
|
||||||
map[string]any{
|
|
||||||
"type": "input_text",
|
|
||||||
"text": instructions,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
return append([]any{message}, input...)
|
|
||||||
}
|
|
||||||
|
|
||||||
func normalizeCodexTools(reqBody map[string]any) bool {
|
func normalizeCodexTools(reqBody map[string]any) bool {
|
||||||
rawTools, ok := reqBody["tools"]
|
rawTools, ok := reqBody["tools"]
|
||||||
if !ok || rawTools == nil {
|
if !ok || rawTools == nil {
|
||||||
@@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool {
|
|||||||
return modified
|
return modified
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeOrphanedToolOutputs(input []any) []any {
|
|
||||||
functionCallIDs := map[string]bool{}
|
|
||||||
localShellCallIDs := map[string]bool{}
|
|
||||||
customToolCallIDs := map[string]bool{}
|
|
||||||
|
|
||||||
for _, item := range input {
|
|
||||||
m, ok := item.(map[string]any)
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch m["type"] {
|
|
||||||
case "function_call":
|
|
||||||
functionCallIDs[callID] = true
|
|
||||||
case "local_shell_call":
|
|
||||||
localShellCallIDs[callID] = true
|
|
||||||
case "custom_tool_call":
|
|
||||||
customToolCallIDs[callID] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
output := make([]any, 0, len(input))
|
|
||||||
for _, item := range input {
|
|
||||||
m, ok := item.(map[string]any)
|
|
||||||
if !ok {
|
|
||||||
output = append(output, item)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch m["type"] {
|
|
||||||
case "function_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
case "custom_tool_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || !customToolCallIDs[callID] {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
case "local_shell_call_output":
|
|
||||||
callID := getCallID(m)
|
|
||||||
if callID == "" || !localShellCallIDs[callID] {
|
|
||||||
output = append(output, convertOrphanedOutputToMessage(m, callID))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
output = append(output, m)
|
|
||||||
}
|
|
||||||
return output
|
|
||||||
}
|
|
||||||
|
|
||||||
func getCallID(item map[string]any) string {
|
|
||||||
raw, ok := item["call_id"]
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
callID, ok := raw.(string)
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
callID = strings.TrimSpace(callID)
|
|
||||||
if callID == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return callID
|
|
||||||
}
|
|
||||||
|
|
||||||
func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any {
|
|
||||||
toolName := "tool"
|
|
||||||
if name, ok := item["name"].(string); ok && name != "" {
|
|
||||||
toolName = name
|
|
||||||
}
|
|
||||||
labelID := callID
|
|
||||||
if labelID == "" {
|
|
||||||
labelID = "unknown"
|
|
||||||
}
|
|
||||||
text := stringifyOutput(item["output"])
|
|
||||||
if len(text) > 16000 {
|
|
||||||
text = text[:16000] + "\n...[truncated]"
|
|
||||||
}
|
|
||||||
return map[string]any{
|
|
||||||
"type": "message",
|
|
||||||
"role": "assistant",
|
|
||||||
"content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func stringifyOutput(output any) string {
|
|
||||||
switch v := output.(type) {
|
|
||||||
case string:
|
|
||||||
return v
|
|
||||||
default:
|
|
||||||
if data, err := json.Marshal(v); err == nil {
|
|
||||||
return string(data)
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%v", v)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func codexCachePath(filename string) string {
|
func codexCachePath(filename string) string {
|
||||||
home, err := os.UserHomeDir()
|
home, err := os.UserHomeDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -513,7 +512,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -594,13 +593,53 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
|
|||||||
// Send request
|
// Send request
|
||||||
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("upstream request failed: %w", err)
|
// Ensure the client receives an error response (handlers assume Forward writes on non-failover errors).
|
||||||
|
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||||
|
setOpsUpstreamError(c, 0, safeErr, "")
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: 0,
|
||||||
|
Kind: "request_error",
|
||||||
|
Message: safeErr,
|
||||||
|
})
|
||||||
|
c.JSON(http.StatusBadGateway, gin.H{
|
||||||
|
"error": gin.H{
|
||||||
|
"type": "upstream_error",
|
||||||
|
"message": "Upstream request failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("upstream request failed: %s", safeErr)
|
||||||
}
|
}
|
||||||
defer func() { _ = resp.Body.Close() }()
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
|
||||||
// Handle error response
|
// Handle error response
|
||||||
if resp.StatusCode >= 400 {
|
if resp.StatusCode >= 400 {
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
|
_ = resp.Body.Close()
|
||||||
|
resp.Body = io.NopCloser(bytes.NewReader(respBody))
|
||||||
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
|
||||||
s.handleFailoverSideEffects(ctx, resp, account)
|
s.handleFailoverSideEffects(ctx, resp, account)
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
@@ -724,18 +763,52 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin.
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) {
|
func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
|
||||||
logUpstreamErrorBody(account.ID, resp.StatusCode, body)
|
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(body), maxBytes)
|
||||||
|
}
|
||||||
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
log.Printf(
|
||||||
|
"OpenAI upstream error %d (account=%d platform=%s type=%s): %s",
|
||||||
|
resp.StatusCode,
|
||||||
|
account.ID,
|
||||||
|
account.Platform,
|
||||||
|
account.Type,
|
||||||
|
truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// Check custom error codes
|
// Check custom error codes
|
||||||
if !account.ShouldHandleErrorCode(resp.StatusCode) {
|
if !account.ShouldHandleErrorCode(resp.StatusCode) {
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "http_error",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
c.JSON(http.StatusInternalServerError, gin.H{
|
c.JSON(http.StatusInternalServerError, gin.H{
|
||||||
"error": gin.H{
|
"error": gin.H{
|
||||||
"type": "upstream_error",
|
"type": "upstream_error",
|
||||||
"message": "Upstream gateway error",
|
"message": "Upstream gateway error",
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle upstream error (mark account status)
|
// Handle upstream error (mark account status)
|
||||||
@@ -743,6 +816,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
|||||||
if s.rateLimitService != nil {
|
if s.rateLimitService != nil {
|
||||||
shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body)
|
||||||
}
|
}
|
||||||
|
kind := "http_error"
|
||||||
|
if shouldDisable {
|
||||||
|
kind = "failover"
|
||||||
|
}
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: kind,
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
if shouldDisable {
|
if shouldDisable {
|
||||||
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode}
|
||||||
}
|
}
|
||||||
@@ -781,25 +867,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
if upstreamMsg == "" {
|
||||||
}
|
return nil, fmt.Errorf("upstream error: %d", resp.StatusCode)
|
||||||
|
|
||||||
func logUpstreamErrorBody(accountID int64, statusCode int, body []byte) {
|
|
||||||
if strings.ToLower(strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY"))) != "true" {
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg)
|
||||||
maxBytes := 2048
|
|
||||||
if rawMax := strings.TrimSpace(os.Getenv("GATEWAY_LOG_UPSTREAM_ERROR_BODY_MAX_BYTES")); rawMax != "" {
|
|
||||||
if parsed, err := strconv.Atoi(rawMax); err == nil && parsed > 0 {
|
|
||||||
maxBytes = parsed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(body) > maxBytes {
|
|
||||||
body = body[:maxBytes]
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("Upstream error body: account=%d status=%d body=%q", accountID, statusCode, string(body))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// openaiStreamingResult streaming response result
|
// openaiStreamingResult streaming response result
|
||||||
|
|||||||
194
backend/internal/service/ops_account_availability.go
Normal file
194
backend/internal/service/ops_account_availability.go
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetAccountAvailabilityStats returns current account availability stats.
|
||||||
|
//
|
||||||
|
// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
|
||||||
|
func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
|
||||||
|
map[string]*PlatformAvailability,
|
||||||
|
map[int64]*GroupAvailability,
|
||||||
|
map[int64]*AccountAvailability,
|
||||||
|
*time.Time,
|
||||||
|
error,
|
||||||
|
) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
filtered := make([]Account, 0, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp != nil && grp.ID == *groupIDFilter {
|
||||||
|
filtered = append(filtered, acc)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
accounts = filtered
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
collectedAt := now
|
||||||
|
|
||||||
|
platform := make(map[string]*PlatformAvailability)
|
||||||
|
group := make(map[int64]*GroupAvailability)
|
||||||
|
account := make(map[int64]*AccountAvailability)
|
||||||
|
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
isTempUnsched := false
|
||||||
|
if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
|
||||||
|
isTempUnsched = true
|
||||||
|
}
|
||||||
|
|
||||||
|
isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
|
||||||
|
isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
|
||||||
|
hasError := acc.Status == StatusError
|
||||||
|
|
||||||
|
// Normalize exclusive status flags so the UI doesn't show conflicting badges.
|
||||||
|
if hasError {
|
||||||
|
isRateLimited = false
|
||||||
|
isOverloaded = false
|
||||||
|
}
|
||||||
|
|
||||||
|
isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched
|
||||||
|
|
||||||
|
if acc.Platform != "" {
|
||||||
|
if _, ok := platform[acc.Platform]; !ok {
|
||||||
|
platform[acc.Platform] = &PlatformAvailability{
|
||||||
|
Platform: acc.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p := platform[acc.Platform]
|
||||||
|
p.TotalAccounts++
|
||||||
|
if isAvailable {
|
||||||
|
p.AvailableCount++
|
||||||
|
}
|
||||||
|
if isRateLimited {
|
||||||
|
p.RateLimitCount++
|
||||||
|
}
|
||||||
|
if hasError {
|
||||||
|
p.ErrorCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupAvailability{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
g.TotalAccounts++
|
||||||
|
if isAvailable {
|
||||||
|
g.AvailableCount++
|
||||||
|
}
|
||||||
|
if isRateLimited {
|
||||||
|
g.RateLimitCount++
|
||||||
|
}
|
||||||
|
if hasError {
|
||||||
|
g.ErrorCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
displayGroupID := int64(0)
|
||||||
|
displayGroupName := ""
|
||||||
|
if len(acc.Groups) > 0 && acc.Groups[0] != nil {
|
||||||
|
displayGroupID = acc.Groups[0].ID
|
||||||
|
displayGroupName = acc.Groups[0].Name
|
||||||
|
}
|
||||||
|
|
||||||
|
item := &AccountAvailability{
|
||||||
|
AccountID: acc.ID,
|
||||||
|
AccountName: acc.Name,
|
||||||
|
Platform: acc.Platform,
|
||||||
|
GroupID: displayGroupID,
|
||||||
|
GroupName: displayGroupName,
|
||||||
|
Status: acc.Status,
|
||||||
|
|
||||||
|
IsAvailable: isAvailable,
|
||||||
|
IsRateLimited: isRateLimited,
|
||||||
|
IsOverloaded: isOverloaded,
|
||||||
|
HasError: hasError,
|
||||||
|
|
||||||
|
ErrorMessage: acc.ErrorMessage,
|
||||||
|
}
|
||||||
|
|
||||||
|
if isRateLimited && acc.RateLimitResetAt != nil {
|
||||||
|
item.RateLimitResetAt = acc.RateLimitResetAt
|
||||||
|
remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
|
||||||
|
if remainingSec > 0 {
|
||||||
|
item.RateLimitRemainingSec = &remainingSec
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if isOverloaded && acc.OverloadUntil != nil {
|
||||||
|
item.OverloadUntil = acc.OverloadUntil
|
||||||
|
remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
|
||||||
|
if remainingSec > 0 {
|
||||||
|
item.OverloadRemainingSec = &remainingSec
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if isTempUnsched && acc.TempUnschedulableUntil != nil {
|
||||||
|
item.TempUnschedulableUntil = acc.TempUnschedulableUntil
|
||||||
|
}
|
||||||
|
|
||||||
|
account[acc.ID] = item
|
||||||
|
}
|
||||||
|
|
||||||
|
return platform, group, account, &collectedAt, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsAccountAvailability struct {
|
||||||
|
Group *GroupAvailability
|
||||||
|
Accounts map[int64]*AccountAvailability
|
||||||
|
CollectedAt *time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) {
|
||||||
|
if s == nil {
|
||||||
|
return nil, errors.New("ops service is nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.getAccountAvailability != nil {
|
||||||
|
return s.getAccountAvailability(ctx, platformFilter, groupIDFilter)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var group *GroupAvailability
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
group = groupStats[*groupIDFilter]
|
||||||
|
}
|
||||||
|
|
||||||
|
if accountStats == nil {
|
||||||
|
accountStats = map[int64]*AccountAvailability{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &OpsAccountAvailability{
|
||||||
|
Group: group,
|
||||||
|
Accounts: accountStats,
|
||||||
|
CollectedAt: collectedAt,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
46
backend/internal/service/ops_advisory_lock.go
Normal file
46
backend/internal/service/ops_advisory_lock.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"hash/fnv"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func hashAdvisoryLockID(key string) int64 {
|
||||||
|
h := fnv.New64a()
|
||||||
|
_, _ = h.Write([]byte(key))
|
||||||
|
return int64(h.Sum64())
|
||||||
|
}
|
||||||
|
|
||||||
|
func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
|
||||||
|
if db == nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
conn, err := db.Conn(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
acquired := false
|
||||||
|
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if !acquired {
|
||||||
|
_ = conn.Close()
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
release := func() {
|
||||||
|
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
|
||||||
|
_ = conn.Close()
|
||||||
|
}
|
||||||
|
return release, true
|
||||||
|
}
|
||||||
443
backend/internal/service/ops_aggregation_service.go
Normal file
443
backend/internal/service/ops_aggregation_service.go
Normal file
@@ -0,0 +1,443 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
opsAggHourlyJobName = "ops_preaggregation_hourly"
|
||||||
|
opsAggDailyJobName = "ops_preaggregation_daily"
|
||||||
|
|
||||||
|
opsAggHourlyInterval = 10 * time.Minute
|
||||||
|
opsAggDailyInterval = 1 * time.Hour
|
||||||
|
|
||||||
|
// Keep in sync with ops retention target (vNext default 30d).
|
||||||
|
opsAggBackfillWindow = 30 * 24 * time.Hour
|
||||||
|
|
||||||
|
// Recompute overlap to absorb late-arriving rows near boundaries.
|
||||||
|
opsAggHourlyOverlap = 2 * time.Hour
|
||||||
|
opsAggDailyOverlap = 48 * time.Hour
|
||||||
|
|
||||||
|
opsAggHourlyChunk = 24 * time.Hour
|
||||||
|
opsAggDailyChunk = 7 * 24 * time.Hour
|
||||||
|
|
||||||
|
// Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
|
||||||
|
// that may still receive late inserts.
|
||||||
|
opsAggSafeDelay = 5 * time.Minute
|
||||||
|
|
||||||
|
opsAggMaxQueryTimeout = 3 * time.Second
|
||||||
|
opsAggHourlyTimeout = 5 * time.Minute
|
||||||
|
opsAggDailyTimeout = 2 * time.Minute
|
||||||
|
|
||||||
|
opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
|
||||||
|
opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
|
||||||
|
|
||||||
|
opsAggHourlyLeaderLockTTL = 15 * time.Minute
|
||||||
|
opsAggDailyLeaderLockTTL = 10 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
|
||||||
|
// for stable long-window dashboard queries.
|
||||||
|
//
|
||||||
|
// It is safe to run in multi-replica deployments when Redis is available (leader lock).
|
||||||
|
type OpsAggregationService struct {
|
||||||
|
opsRepo OpsRepository
|
||||||
|
settingRepo SettingRepository
|
||||||
|
cfg *config.Config
|
||||||
|
|
||||||
|
db *sql.DB
|
||||||
|
redisClient *redis.Client
|
||||||
|
instanceID string
|
||||||
|
|
||||||
|
stopCh chan struct{}
|
||||||
|
startOnce sync.Once
|
||||||
|
stopOnce sync.Once
|
||||||
|
|
||||||
|
hourlyMu sync.Mutex
|
||||||
|
dailyMu sync.Mutex
|
||||||
|
|
||||||
|
skipLogMu sync.Mutex
|
||||||
|
skipLogAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewOpsAggregationService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAggregationService {
|
||||||
|
return &OpsAggregationService{
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
settingRepo: settingRepo,
|
||||||
|
cfg: cfg,
|
||||||
|
db: db,
|
||||||
|
redisClient: redisClient,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) Start() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.startOnce.Do(func() {
|
||||||
|
if s.stopCh == nil {
|
||||||
|
s.stopCh = make(chan struct{})
|
||||||
|
}
|
||||||
|
go s.hourlyLoop()
|
||||||
|
go s.dailyLoop()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.stopCh != nil {
|
||||||
|
close(s.stopCh)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) hourlyLoop() {
|
||||||
|
// First run immediately.
|
||||||
|
s.aggregateHourly()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(opsAggHourlyInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
s.aggregateHourly()
|
||||||
|
case <-s.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) dailyLoop() {
|
||||||
|
// First run immediately.
|
||||||
|
s.aggregateDaily()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(opsAggDailyInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
s.aggregateDaily()
|
||||||
|
case <-s.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) aggregateHourly() {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.cfg != nil {
|
||||||
|
if !s.cfg.Ops.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !s.cfg.Ops.Aggregation.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if !s.isMonitoringEnabled(ctx) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if release != nil {
|
||||||
|
defer release()
|
||||||
|
}
|
||||||
|
|
||||||
|
s.hourlyMu.Lock()
|
||||||
|
defer s.hourlyMu.Unlock()
|
||||||
|
|
||||||
|
startedAt := time.Now().UTC()
|
||||||
|
runAt := startedAt
|
||||||
|
|
||||||
|
// Aggregate stable full hours only.
|
||||||
|
end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
|
||||||
|
start := end.Add(-opsAggBackfillWindow)
|
||||||
|
|
||||||
|
// Resume from the latest bucket with overlap.
|
||||||
|
{
|
||||||
|
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||||
|
latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
|
||||||
|
cancelMax()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
|
||||||
|
} else if ok {
|
||||||
|
candidate := latest.Add(-opsAggHourlyOverlap)
|
||||||
|
if candidate.After(start) {
|
||||||
|
start = candidate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start = utcFloorToHour(start)
|
||||||
|
if !start.Before(end) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var aggErr error
|
||||||
|
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
|
||||||
|
chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
|
||||||
|
if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||||
|
aggErr = err
|
||||||
|
log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
finishedAt := time.Now().UTC()
|
||||||
|
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||||
|
dur := durationMs
|
||||||
|
|
||||||
|
if aggErr != nil {
|
||||||
|
msg := truncateString(aggErr.Error(), 2048)
|
||||||
|
errAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggHourlyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &errAt,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
successAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggHourlyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &successAt,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) aggregateDaily() {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.cfg != nil {
|
||||||
|
if !s.cfg.Ops.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !s.cfg.Ops.Aggregation.Enabled {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if !s.isMonitoringEnabled(ctx) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if release != nil {
|
||||||
|
defer release()
|
||||||
|
}
|
||||||
|
|
||||||
|
s.dailyMu.Lock()
|
||||||
|
defer s.dailyMu.Unlock()
|
||||||
|
|
||||||
|
startedAt := time.Now().UTC()
|
||||||
|
runAt := startedAt
|
||||||
|
|
||||||
|
end := utcFloorToDay(time.Now().UTC())
|
||||||
|
start := end.Add(-opsAggBackfillWindow)
|
||||||
|
|
||||||
|
{
|
||||||
|
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||||
|
latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
|
||||||
|
cancelMax()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
|
||||||
|
} else if ok {
|
||||||
|
candidate := latest.Add(-opsAggDailyOverlap)
|
||||||
|
if candidate.After(start) {
|
||||||
|
start = candidate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start = utcFloorToDay(start)
|
||||||
|
if !start.Before(end) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var aggErr error
|
||||||
|
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
|
||||||
|
chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
|
||||||
|
if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||||
|
aggErr = err
|
||||||
|
log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
finishedAt := time.Now().UTC()
|
||||||
|
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||||
|
dur := durationMs
|
||||||
|
|
||||||
|
if aggErr != nil {
|
||||||
|
msg := truncateString(aggErr.Error(), 2048)
|
||||||
|
errAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggDailyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &errAt,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
successAt := finishedAt
|
||||||
|
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer hbCancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAggDailyJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &successAt,
|
||||||
|
LastDurationMs: &dur,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
if s == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var opsAggReleaseScript = redis.NewScript(`
|
||||||
|
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||||
|
return redis.call("DEL", KEYS[1])
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
`)
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to become the leader for a scheduled
// aggregation job. It prefers a Redis SetNX lock (multi-instance safe); on a
// Redis error it falls back to a DB advisory lock so a flaky Redis does not
// halt aggregation. It returns a release function (nil when no cleanup is
// needed) and whether the caller now holds the lock.
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
	if s == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Prefer Redis leader lock when available (multi-instance), but avoid stampeding
	// the DB when Redis is flaky by falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				// Another instance holds the lock; log at most once a minute.
				s.maybeLogSkip(logPrefix)
				return nil, false
			}
			release := func() {
				// Fresh short-lived context: release typically runs in a defer,
				// after the caller's ctx may already be cancelled or expired.
				ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
				defer cancel()
				// Compare-and-delete via Lua so we never remove a lock that was
				// re-acquired by another instance after our TTL lapsed.
				_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
			}
			return release, true
		}
		// Redis error: fall through to DB advisory lock.
	}

	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		s.maybeLogSkip(logPrefix)
		return nil, false
	}
	return release, true
}
|
||||||
|
|
||||||
|
func (s *OpsAggregationService) maybeLogSkip(prefix string) {
|
||||||
|
s.skipLogMu.Lock()
|
||||||
|
defer s.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.skipLogAt = now
|
||||||
|
if prefix == "" {
|
||||||
|
prefix = "[OpsAggregation]"
|
||||||
|
}
|
||||||
|
log.Printf("%s leader lock held by another instance; skipping", prefix)
|
||||||
|
}
|
||||||
|
|
||||||
|
func utcFloorToHour(t time.Time) time.Time {
|
||||||
|
return t.UTC().Truncate(time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
|
func utcFloorToDay(t time.Time) time.Time {
|
||||||
|
u := t.UTC()
|
||||||
|
y, m, d := u.Date()
|
||||||
|
return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
|
||||||
|
}
|
||||||
|
|
||||||
|
func minTime(a, b time.Time) time.Time {
|
||||||
|
if a.Before(b) {
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
913
backend/internal/service/ops_alert_evaluator_service.go
Normal file
913
backend/internal/service/ops_alert_evaluator_service.go
Normal file
@@ -0,0 +1,913 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsAlertEvaluatorJobName identifies this job in heartbeat records.
	opsAlertEvaluatorJobName = "ops_alert_evaluator"

	// opsAlertEvaluatorTimeout bounds a single evaluation pass.
	opsAlertEvaluatorTimeout = 45 * time.Second
	// opsAlertEvaluatorLeaderLockKey is the default Redis leader-lock key,
	// used when runtime settings do not supply one.
	opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
	// opsAlertEvaluatorLeaderLockTTL is the default lock TTL; it exceeds the
	// evaluation timeout so the lock outlives a full pass.
	opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
	// opsAlertEvaluatorSkipLogInterval rate-limits "lock held elsewhere" logs.
	opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
)
|
||||||
|
|
||||||
|
// opsAlertEvaluatorReleaseScript atomically deletes the leader-lock key only
// while it still stores this instance's ID (compare-and-delete), so one
// instance never releases a lock another instance has since acquired.
var opsAlertEvaluatorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// OpsAlertEvaluatorService periodically evaluates configured alert rules
// against live metrics, opens and resolves alert events, and optionally
// sends notification emails. In multi-instance deployments a Redis leader
// lock keeps at most one instance evaluating at a time.
type OpsAlertEvaluatorService struct {
	opsService   *OpsService   // runtime settings, monitoring switch, availability lookups
	opsRepo      OpsRepository // rules, events and heartbeat persistence
	emailService *EmailService // outbound notification channel (may be nil)

	redisClient *redis.Client  // optional; enables the distributed leader lock
	cfg         *config.Config // static config; Ops.Enabled gates the whole loop
	instanceID  string         // unique per process; stored as the lock value

	stopCh    chan struct{} // closed by Stop to end the run loop
	startOnce sync.Once
	stopOnce  sync.Once
	wg        sync.WaitGroup // tracks the background run goroutine

	mu         sync.Mutex                   // guards ruleStates
	ruleStates map[int64]*opsAlertRuleState // per-rule consecutive-breach tracking

	emailLimiter *slidingWindowLimiter // caps alert emails per rolling hour

	skipLogMu sync.Mutex // guards skipLogAt
	skipLogAt time.Time  // last "lock held elsewhere" log, for rate limiting

	warnNoRedisOnce sync.Once // one-shot warning when the lock cannot be used
}
|
||||||
|
|
||||||
|
// opsAlertRuleState tracks per-rule evaluation history between cycles.
type opsAlertRuleState struct {
	LastEvaluatedAt     time.Time // when the rule was last evaluated
	ConsecutiveBreaches int       // breaches in a row; reset on a pass or a gap
}
|
||||||
|
|
||||||
|
func NewOpsAlertEvaluatorService(
|
||||||
|
opsService *OpsService,
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAlertEvaluatorService {
|
||||||
|
return &OpsAlertEvaluatorService{
|
||||||
|
opsService: opsService,
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
emailService: emailService,
|
||||||
|
redisClient: redisClient,
|
||||||
|
cfg: cfg,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
ruleStates: map[int64]*opsAlertRuleState{},
|
||||||
|
emailLimiter: newSlidingWindowLimiter(0, time.Hour),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) Start() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.startOnce.Do(func() {
|
||||||
|
if s.stopCh == nil {
|
||||||
|
s.stopCh = make(chan struct{})
|
||||||
|
}
|
||||||
|
go s.run()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.stopCh != nil {
|
||||||
|
close(s.stopCh)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
s.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// run is the evaluator's background loop. It fires immediately on startup,
// then re-arms the timer with the freshly resolved interval after every pass
// (so runtime-setting changes take effect without a restart), and exits when
// stopCh closes.
func (s *OpsAlertEvaluatorService) run() {
	// NOTE(review): wg.Add runs inside the goroutine; a Stop() racing a
	// just-called Start() could observe wg.Wait() before this Add executes.
	// Confirm Start/Stop are not invoked concurrently, or move the Add into
	// Start before the goroutine launch.
	s.wg.Add(1)
	defer s.wg.Done()

	// Start immediately to produce early feedback in ops dashboard.
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-timer.C:
			// Re-read the interval each cycle; it is a runtime setting.
			interval := s.getInterval()
			s.evaluateOnce(interval)
			timer.Reset(interval)
		case <-s.stopCh:
			return
		}
	}
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
|
||||||
|
// Default.
|
||||||
|
interval := 60 * time.Second
|
||||||
|
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
|
||||||
|
if err != nil || cfg == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds <= 0 {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds < 1 {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluateOnce performs a single alert-evaluation pass: it loads all rules,
// computes each enabled rule's metric over its window, tracks consecutive
// breaches, fires new alert events (respecting per-rule cooldown), resolves
// active events whose condition has cleared, and records a job heartbeat.
// interval is the loop period and is used to translate a rule's
// SustainedMinutes into a required consecutive-breach count.
func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
	if s == nil || s.opsRepo == nil {
		return
	}
	// Static kill switch: ops monitoring disabled in config.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
	defer cancel()

	// Dynamic kill switch: monitoring toggled off at runtime.
	if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}

	// Load runtime settings, falling back to defaults on any failure.
	runtimeCfg := defaultOpsAlertRuntimeSettings()
	if s.opsService != nil {
		if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
			runtimeCfg = loaded
		}
	}

	// Leader election: only one instance evaluates per cycle.
	release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	rules, err := s.opsRepo.ListAlertRules(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
		return
	}

	// Align the window end on a minute boundary so windowed queries are stable.
	now := time.Now().UTC()
	safeEnd := now.Truncate(time.Minute)
	if safeEnd.IsZero() {
		safeEnd = now
	}

	// Latest system snapshot (CPU/memory/queue); errors are tolerated since
	// only some metric types depend on it.
	systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)

	// Cleanup stale state for removed rules.
	s.pruneRuleStates(rules)

	for _, rule := range rules {
		if rule == nil || !rule.Enabled || rule.ID <= 0 {
			continue
		}

		scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)

		windowMinutes := rule.WindowMinutes
		if windowMinutes <= 0 {
			windowMinutes = 1
		}
		windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
		windowEnd := safeEnd

		// A metric that cannot be computed resets the streak rather than
		// counting as a breach or a pass.
		metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
		if !ok {
			s.resetRuleState(rule.ID, now)
			continue
		}

		breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
		required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
		consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)

		activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
		if err != nil {
			log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
			continue
		}

		if breachedNow && consecutive >= required {
			// Already firing: nothing to do.
			if activeEvent != nil {
				continue
			}

			// Cooldown: suppress re-firing too soon after the previous event.
			latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
				continue
			}
			if latestEvent != nil && rule.CooldownMinutes > 0 {
				cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
				if now.Sub(latestEvent.FiredAt) < cooldown {
					continue
				}
			}

			firedEvent := &OpsAlertEvent{
				RuleID:         rule.ID,
				Severity:       strings.TrimSpace(rule.Severity),
				Status:         OpsAlertStatusFiring,
				Title:          fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
				Description:    buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
				MetricValue:    float64Ptr(metricValue),
				ThresholdValue: float64Ptr(rule.Threshold),
				Dimensions:     buildOpsAlertDimensions(scopePlatform, scopeGroupID),
				FiredAt:        now,
				CreatedAt:      now,
			}

			created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
			if err != nil {
				log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
				continue
			}

			if created != nil && created.ID > 0 {
				s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
			}
			continue
		}

		// Not breached: resolve active event if present.
		if activeEvent != nil {
			resolvedAt := now
			if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
				log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
			}
		}
	}

	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
live := map[int64]struct{}{}
|
||||||
|
for _, r := range rules {
|
||||||
|
if r != nil && r.ID > 0 {
|
||||||
|
live[r.ID] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for id := range s.ruleStates {
|
||||||
|
if _, ok := live[id]; !ok {
|
||||||
|
delete(s.ruleStates, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
state, ok := s.ruleStates[ruleID]
|
||||||
|
if !ok {
|
||||||
|
state = &opsAlertRuleState{}
|
||||||
|
s.ruleStates[ruleID] = state
|
||||||
|
}
|
||||||
|
state.LastEvaluatedAt = now
|
||||||
|
state.ConsecutiveBreaches = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateRuleBreaches records the outcome of one evaluation for a rule and
// returns the updated consecutive-breach count. If more than two intervals
// elapsed since the previous evaluation (downtime, lost leadership), the
// streak is reset first so stale history cannot trigger an alert.
func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
	if ruleID <= 0 {
		return 0
	}
	s.mu.Lock()
	defer s.mu.Unlock()

	state, ok := s.ruleStates[ruleID]
	if !ok {
		state = &opsAlertRuleState{}
		s.ruleStates[ruleID] = state
	}

	// Evaluations far apart are not "consecutive": reset the streak.
	if !state.LastEvaluatedAt.IsZero() && interval > 0 {
		if now.Sub(state.LastEvaluatedAt) > interval*2 {
			state.ConsecutiveBreaches = 0
		}
	}

	state.LastEvaluatedAt = now
	if breached {
		state.ConsecutiveBreaches++
	} else {
		state.ConsecutiveBreaches = 0
	}
	return state.ConsecutiveBreaches
}
|
||||||
|
|
||||||
|
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
|
||||||
|
if sustainedMinutes <= 0 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
if interval <= 0 {
|
||||||
|
return sustainedMinutes
|
||||||
|
}
|
||||||
|
required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
|
||||||
|
if required < 1 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return required
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseOpsAlertRuleScope extracts the optional "platform" (trimmed string)
// and "group_id" scoping keys from a rule's filter map. group_id may arrive
// as float64 (JSON), int64, int, or a numeric string; only strictly positive
// values produce a non-nil pointer.
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
	if filters == nil {
		return "", nil
	}

	if raw, present := filters["platform"]; present {
		if str, isString := raw.(string); isString {
			platform = strings.TrimSpace(str)
		}
	}

	raw, present := filters["group_id"]
	if !present {
		return platform, nil
	}

	var candidate int64
	switch typed := raw.(type) {
	case float64:
		candidate = int64(typed)
	case int64:
		candidate = typed
	case int:
		candidate = int64(typed)
	case string:
		parsed, parseErr := strconv.ParseInt(strings.TrimSpace(typed), 10, 64)
		if parseErr != nil {
			return platform, nil
		}
		candidate = parsed
	default:
		return platform, nil
	}
	if candidate > 0 {
		groupID = &candidate
	}
	return platform, groupID
}
|
||||||
|
|
||||||
|
// computeRuleMetric resolves the current value of rule.MetricType.
// Point-in-time metrics (CPU/memory/queue depth, account availability) come
// from the latest system snapshot or the live availability view; windowed
// traffic metrics (success/error rates, latency percentiles) come from a raw
// dashboard-overview query over [start, end). The bool result is false when
// the metric is unknown or its inputs are unavailable, which the caller
// treats as "skip this rule" rather than a breach.
func (s *OpsAlertEvaluatorService) computeRuleMetric(
	ctx context.Context,
	rule *OpsAlertRule,
	systemMetrics *OpsSystemMetricsSnapshot,
	start time.Time,
	end time.Time,
	platform string,
	groupID *int64,
) (float64, bool) {
	if rule == nil {
		return 0, false
	}
	switch strings.TrimSpace(rule.MetricType) {
	case "cpu_usage_percent":
		if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
			return *systemMetrics.CPUUsagePercent, true
		}
		return 0, false
	case "memory_usage_percent":
		if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
			return *systemMetrics.MemoryUsagePercent, true
		}
		return 0, false
	case "concurrency_queue_depth":
		if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
			return float64(*systemMetrics.ConcurrencyQueueDepth), true
		}
		return 0, false
	case "group_available_accounts":
		// Group-scoped metrics require an explicit positive group_id filter.
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// A missing group record counts as zero available accounts.
		if availability.Group == nil {
			return 0, true
		}
		return float64(availability.Group.AvailableCount), true
	case "group_available_ratio":
		if groupID == nil || *groupID <= 0 {
			return 0, false
		}
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return computeGroupAvailableRatio(availability.Group), true
	case "account_rate_limited_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})), true
	case "account_error_count":
		if s == nil || s.opsService == nil {
			return 0, false
		}
		availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
		if err != nil || availability == nil {
			return 0, false
		}
		// Count accounts in error, excluding those already temporarily
		// unschedulable (they are handled by the scheduler, not alerts).
		return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})), true
	}

	// Windowed traffic metrics: fetch one raw overview for the window.
	overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
		StartTime: start,
		EndTime:   end,
		Platform:  platform,
		GroupID:   groupID,
		QueryMode: OpsQueryModeRaw,
	})
	if err != nil {
		return 0, false
	}
	if overview == nil {
		return 0, false
	}

	switch strings.TrimSpace(rule.MetricType) {
	case "success_rate":
		// Rates are undefined with no SLA-eligible traffic in the window.
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.SLA * 100, true
	case "error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.ErrorRate * 100, true
	case "upstream_error_rate":
		if overview.RequestCountSLA <= 0 {
			return 0, false
		}
		return overview.UpstreamErrorRate * 100, true
	case "p95_latency_ms":
		if overview.Duration.P95 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P95), true
	case "p99_latency_ms":
		if overview.Duration.P99 == nil {
			return 0, false
		}
		return float64(*overview.Duration.P99), true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
|
// compareMetric applies a rule's comparison operator (one of >, >=, <, <=,
// ==, !=, whitespace-tolerant) to value and threshold. Unknown operators
// never match.
func compareMetric(value float64, operator string, threshold float64) bool {
	op := strings.TrimSpace(operator)
	switch op {
	case ">":
		return value > threshold
	case "<":
		return value < threshold
	case ">=":
		return value >= threshold
	case "<=":
		return value <= threshold
	case "==":
		return value == threshold
	case "!=":
		return value != threshold
	}
	return false
}
|
||||||
|
|
||||||
|
// buildOpsAlertDimensions assembles the event dimension map from the rule's
// scope. It returns nil (not an empty map) when neither a platform nor a
// positive group ID is present.
func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
	var dims map[string]any
	put := func(key string, value any) {
		if dims == nil {
			dims = map[string]any{}
		}
		dims[key] = value
	}
	if trimmed := strings.TrimSpace(platform); trimmed != "" {
		put("platform", trimmed)
	}
	if groupID != nil && *groupID > 0 {
		put("group_id", *groupID)
	}
	return dims
}
|
||||||
|
|
||||||
|
func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
|
||||||
|
if rule == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
scope := "overall"
|
||||||
|
if strings.TrimSpace(platform) != "" {
|
||||||
|
scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform))
|
||||||
|
}
|
||||||
|
if groupID != nil && *groupID > 0 {
|
||||||
|
scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
|
||||||
|
}
|
||||||
|
if windowMinutes <= 0 {
|
||||||
|
windowMinutes = 1
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
|
||||||
|
strings.TrimSpace(rule.MetricType),
|
||||||
|
strings.TrimSpace(rule.Operator),
|
||||||
|
rule.Threshold,
|
||||||
|
value,
|
||||||
|
windowMinutes,
|
||||||
|
strings.TrimSpace(scope),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// maybeSendAlertEmail delivers a notification email for a freshly created
// alert event, subject to gates evaluated in order: the event not already
// notified, the rule opting in, the global email-notification config
// (enabled, recipients, minimum severity), any active silencing window, and
// a rolling per-hour rate limit. Per-recipient failures are ignored; the
// event is marked email-sent when at least one recipient succeeded.
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
	if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
		return
	}
	if event.EmailSent {
		return
	}
	if !rule.NotifyEmail {
		return
	}

	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
		return
	}

	if len(emailCfg.Alert.Recipients) == 0 {
		return
	}
	if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
		return
	}

	// Silencing windows suppress email delivery but not event creation.
	if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
		if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
			return
		}
	}

	// Apply/update rate limiter.
	s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)

	subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
	body := buildOpsAlertEmailBody(rule, event)

	anySent := false
	for _, to := range emailCfg.Alert.Recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		// One rate-limit slot is consumed per recipient, not per event.
		if !s.emailLimiter.Allow(time.Now().UTC()) {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
		anySent = true
	}

	if anySent {
		// Background context so the mark-sent write survives ctx expiry.
		_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
	}
}
|
||||||
|
|
||||||
|
// buildOpsAlertEmailBody renders the HTML body for an alert notification
// email. Dynamic text passes through htmlEscape; the metric line shows the
// observed value and threshold, preferring the event's snapshot values and
// falling back to the rule's threshold ("-" when no value was captured).
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
	if rule == nil || event == nil {
		return ""
	}
	metric := strings.TrimSpace(rule.MetricType)
	value := "-"
	threshold := fmt.Sprintf("%.2f", rule.Threshold)
	if event.MetricValue != nil {
		value = fmt.Sprintf("%.2f", *event.MetricValue)
	}
	if event.ThresholdValue != nil {
		threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
	}
	return fmt.Sprintf(`
<h2>Ops Alert</h2>
<p><b>Rule</b>: %s</p>
<p><b>Severity</b>: %s</p>
<p><b>Status</b>: %s</p>
<p><b>Metric</b>: %s %s %s</p>
<p><b>Fired at</b>: %s</p>
<p><b>Description</b>: %s</p>
`,
		htmlEscape(rule.Name),
		htmlEscape(rule.Severity),
		htmlEscape(event.Status),
		htmlEscape(metric),
		htmlEscape(rule.Operator),
		htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
		event.FiredAt.Format(time.RFC3339),
		htmlEscape(event.Description),
	)
}
|
||||||
|
|
||||||
|
func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
|
||||||
|
minSeverity = strings.ToLower(strings.TrimSpace(minSeverity))
|
||||||
|
if minSeverity == "" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
eventLevel := opsEmailSeverityForOps(ruleSeverity)
|
||||||
|
minLevel := strings.ToLower(minSeverity)
|
||||||
|
|
||||||
|
rank := func(level string) int {
|
||||||
|
switch level {
|
||||||
|
case "critical":
|
||||||
|
return 3
|
||||||
|
case "warning":
|
||||||
|
return 2
|
||||||
|
case "info":
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rank(eventLevel) >= rank(minLevel)
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsEmailSeverityForOps maps an ops rule severity label onto the email
// severity scale: P0 -> critical, P1 -> warning, anything else -> info.
// Matching is case-insensitive and whitespace-tolerant.
func opsEmailSeverityForOps(severity string) string {
	normalized := strings.ToUpper(strings.TrimSpace(severity))
	if normalized == "P0" {
		return "critical"
	}
	if normalized == "P1" {
		return "warning"
	}
	return "info"
}
|
||||||
|
|
||||||
|
func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
|
||||||
|
if !silencing.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now().UTC()
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
|
||||||
|
if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
|
||||||
|
if now.Before(t) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range silencing.Entries {
|
||||||
|
untilRaw := strings.TrimSpace(entry.UntilRFC3339)
|
||||||
|
if untilRaw == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
until, err := time.Parse(time.RFC3339, untilRaw)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.After(until) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(entry.Severities) > 0 {
|
||||||
|
match := false
|
||||||
|
for _, s := range entry.Severities {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
|
||||||
|
match = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !match {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
|
||||||
|
if !lock.Enabled {
|
||||||
|
return nil, true
|
||||||
|
}
|
||||||
|
if s.redisClient == nil {
|
||||||
|
s.warnNoRedisOnce.Do(func() {
|
||||||
|
log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
|
||||||
|
})
|
||||||
|
return nil, true
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(lock.Key)
|
||||||
|
if key == "" {
|
||||||
|
key = opsAlertEvaluatorLeaderLockKey
|
||||||
|
}
|
||||||
|
ttl := time.Duration(lock.TTLSeconds) * time.Second
|
||||||
|
if ttl <= 0 {
|
||||||
|
ttl = opsAlertEvaluatorLeaderLockTTL
|
||||||
|
}
|
||||||
|
|
||||||
|
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
|
||||||
|
if err != nil {
|
||||||
|
// Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
|
||||||
|
// Single-node deployments can disable the distributed lock via runtime settings.
|
||||||
|
s.warnNoRedisOnce.Do(func() {
|
||||||
|
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
|
||||||
|
})
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
s.maybeLogSkip(key)
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
return func() {
|
||||||
|
_, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
|
||||||
|
}, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
|
||||||
|
s.skipLogMu.Lock()
|
||||||
|
defer s.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.skipLogAt = now
|
||||||
|
log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAlertEvaluatorJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &now,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||||
|
if s == nil || s.opsRepo == nil || err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
msg := truncateString(err.Error(), 2048)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsAlertEvaluatorJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &now,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// htmlEscape replaces the five HTML-significant characters with their entity
// references so user-controlled text can be embedded safely in the alert
// email body. The previous version mapped each character to itself (a no-op
// replacer, apparently the result of entity-unescaping corruption), which
// left the generated HTML open to markup injection.
func htmlEscape(s string) string {
	replacer := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
		"'", "&#39;",
	)
	return replacer.Replace(s)
}
|
||||||
|
|
||||||
|
type slidingWindowLimiter struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
limit int
|
||||||
|
window time.Duration
|
||||||
|
sent []time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
|
||||||
|
if window <= 0 {
|
||||||
|
window = time.Hour
|
||||||
|
}
|
||||||
|
return &slidingWindowLimiter{
|
||||||
|
limit: limit,
|
||||||
|
window: window,
|
||||||
|
sent: []time.Time{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *slidingWindowLimiter) SetLimit(limit int) {
|
||||||
|
l.mu.Lock()
|
||||||
|
defer l.mu.Unlock()
|
||||||
|
l.limit = limit
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
|
||||||
|
l.mu.Lock()
|
||||||
|
defer l.mu.Unlock()
|
||||||
|
|
||||||
|
if l.limit <= 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
cutoff := now.Add(-l.window)
|
||||||
|
keep := l.sent[:0]
|
||||||
|
for _, t := range l.sent {
|
||||||
|
if t.After(cutoff) {
|
||||||
|
keep = append(keep, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.sent = keep
|
||||||
|
if len(l.sent) >= l.limit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
l.sent = append(l.sent, now)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeGroupAvailableRatio returns the available percentage for a group.
|
||||||
|
// Formula: (AvailableCount / TotalAccounts) * 100.
|
||||||
|
// Returns 0 when TotalAccounts is 0.
|
||||||
|
func computeGroupAvailableRatio(group *GroupAvailability) float64 {
|
||||||
|
if group == nil || group.TotalAccounts <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// countAccountsByCondition counts accounts that satisfy the given condition.
|
||||||
|
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
|
||||||
|
if len(accounts) == 0 || condition == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
var count int64
|
||||||
|
for _, account := range accounts {
|
||||||
|
if account != nil && condition(account) {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count
|
||||||
|
}
|
||||||
210
backend/internal/service/ops_alert_evaluator_service_test.go
Normal file
210
backend/internal/service/ops_alert_evaluator_service_test.go
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
//go:build unit
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
// stubOpsRepo is a test double for OpsRepository. It embeds the interface so
// only GetDashboardOverview needs overriding (calling any other method on the
// nil embedded interface would panic, which is fine for these tests).
type stubOpsRepo struct {
	OpsRepository
	overview *OpsDashboardOverview // returned when non-nil
	err      error                 // returned when non-nil; takes priority over overview
}
||||||
|
func (s *stubOpsRepo) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
|
||||||
|
if s.err != nil {
|
||||||
|
return nil, s.err
|
||||||
|
}
|
||||||
|
if s.overview != nil {
|
||||||
|
return s.overview, nil
|
||||||
|
}
|
||||||
|
return &OpsDashboardOverview{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestComputeGroupAvailableRatio covers the ratio formula plus its
// zero-total and zero-available edge cases.
func TestComputeGroupAvailableRatio(t *testing.T) {
	t.Parallel()

	// 10 accounts with 8 available => 80%.
	t.Run("正常情况: 10个账号, 8个可用 = 80%", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 8,
		})
		require.InDelta(t, 80.0, got, 0.0001)
	})

	// Zero total must not divide by zero; the result is defined as 0.
	t.Run("边界情况: TotalAccounts = 0 应返回 0", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  0,
			AvailableCount: 8,
		})
		require.Equal(t, 0.0, got)
	})

	// No available accounts yields 0%.
	t.Run("边界情况: AvailableCount = 0 应返回 0%", func(t *testing.T) {
		t.Parallel()

		got := computeGroupAvailableRatio(&GroupAvailability{
			TotalAccounts:  10,
			AvailableCount: 0,
		})
		require.Equal(t, 0.0, got)
	})
}
|
||||||
|
// TestCountAccountsByCondition verifies predicate-based counting over the
// account availability map, including the empty-map edge case.
func TestCountAccountsByCondition(t *testing.T) {
	t.Parallel()

	// Count rate-limited accounts.
	t.Run("测试限流账号统计: acc.IsRateLimited", func(t *testing.T) {
		t.Parallel()

		accounts := map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: false},
			3: {IsRateLimited: true},
		}

		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(2), got)
	})

	// Count errored accounts, excluding temporarily unschedulable ones.
	t.Run("测试错误账号统计(排除临时不可调度): acc.HasError && acc.TempUnschedulableUntil == nil", func(t *testing.T) {
		t.Parallel()

		// Account 2 has an error but is temporarily unschedulable, so the
		// predicate below must exclude it.
		until := time.Now().UTC().Add(5 * time.Minute)
		accounts := map[int64]*AccountAvailability{
			1: {HasError: true},
			2: {HasError: true, TempUnschedulableUntil: &until},
			3: {HasError: false},
		}

		got := countAccountsByCondition(accounts, func(acc *AccountAvailability) bool {
			return acc.HasError && acc.TempUnschedulableUntil == nil
		})
		require.Equal(t, int64(1), got)
	})

	// Empty input: nothing to count.
	t.Run("边界情况: 空 map 应返回 0", func(t *testing.T) {
		t.Parallel()

		got := countAccountsByCondition(map[int64]*AccountAvailability{}, func(acc *AccountAvailability) bool {
			return acc.IsRateLimited
		})
		require.Equal(t, int64(0), got)
	})
}
|
||||||
|
// TestComputeRuleMetricNewIndicators exercises computeRuleMetric for the
// availability-derived metric types, including the requirement that the
// group_* metrics are only computable when a group_id filter is present.
func TestComputeRuleMetricNewIndicators(t *testing.T) {
	t.Parallel()

	groupID := int64(101)
	platform := "openai"

	// Fixture: 8/10 accounts available in the group. In the account map two
	// are rate limited and two have errors, one of which is temporarily
	// unschedulable and so excluded from account_error_count.
	availability := &OpsAccountAvailability{
		Group: &GroupAvailability{
			GroupID:        groupID,
			TotalAccounts:  10,
			AvailableCount: 8,
		},
		Accounts: map[int64]*AccountAvailability{
			1: {IsRateLimited: true},
			2: {IsRateLimited: true},
			3: {HasError: true},
			4: {HasError: true, TempUnschedulableUntil: timePtr(time.Now().UTC().Add(2 * time.Minute))},
			5: {HasError: false, IsRateLimited: false},
		},
	}

	// Inject a stubbed availability lookup so no repository is needed.
	opsService := &OpsService{
		getAccountAvailability: func(_ context.Context, _ string, _ *int64) (*OpsAccountAvailability, error) {
			return availability, nil
		},
	}

	svc := &OpsAlertEvaluatorService{
		opsService: opsService,
		opsRepo:    &stubOpsRepo{overview: &OpsDashboardOverview{}},
	}

	start := time.Now().UTC().Add(-5 * time.Minute)
	end := time.Now().UTC()
	ctx := context.Background()

	tests := []struct {
		name       string
		metricType string
		groupID    *int64
		wantValue  float64
		wantOK     bool
	}{
		{
			name:       "group_available_accounts",
			metricType: "group_available_accounts",
			groupID:    &groupID,
			wantValue:  8,
			wantOK:     true,
		},
		{
			name:       "group_available_ratio",
			metricType: "group_available_ratio",
			groupID:    &groupID,
			wantValue:  80.0,
			wantOK:     true,
		},
		{
			name:       "account_rate_limited_count",
			metricType: "account_rate_limited_count",
			groupID:    nil,
			wantValue:  2,
			wantOK:     true,
		},
		{
			name:       "account_error_count",
			metricType: "account_error_count",
			groupID:    nil,
			wantValue:  1,
			wantOK:     true,
		},
		{
			name:       "group_available_accounts without group_id returns false",
			metricType: "group_available_accounts",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
		{
			name:       "group_available_ratio without group_id returns false",
			metricType: "group_available_ratio",
			groupID:    nil,
			wantValue:  0,
			wantOK:     false,
		},
	}

	for _, tt := range tests {
		tt := tt // capture range variable for parallel subtests (pre-Go 1.22 semantics)
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			rule := &OpsAlertRule{
				MetricType: tt.metricType,
			}
			gotValue, gotOK := svc.computeRuleMetric(ctx, rule, nil, start, end, platform, tt.groupID)
			require.Equal(t, tt.wantOK, gotOK)
			if !tt.wantOK {
				// Metric not computable (e.g. missing group filter): value is unspecified.
				return
			}
			require.InDelta(t, tt.wantValue, gotValue, 0.0001)
		})
	}
}
||||||
74
backend/internal/service/ops_alert_models.go
Normal file
74
backend/internal/service/ops_alert_models.go
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Ops alert rule/event models.
|
||||||
|
//
|
||||||
|
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
|
||||||
|
// with the existing ops dashboard frontend (backup style).
|
||||||
|
|
||||||
|
// Lifecycle states for an OpsAlertEvent.
const (
	OpsAlertStatusFiring   = "firing"
	OpsAlertStatusResolved = "resolved"
)
|
|
||||||
|
// OpsAlertRule is the admin-facing DTO for a configurable alert rule
// (metric <operator> threshold, evaluated over a time window).
type OpsAlertRule struct {
	ID          int64  `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`

	Enabled  bool   `json:"enabled"`
	Severity string `json:"severity"`

	// Condition: MetricType compared against Threshold using Operator.
	MetricType string  `json:"metric_type"`
	Operator   string  `json:"operator"`
	Threshold  float64 `json:"threshold"`

	WindowMinutes    int `json:"window_minutes"`
	SustainedMinutes int `json:"sustained_minutes"`
	CooldownMinutes  int `json:"cooldown_minutes"`

	NotifyEmail bool `json:"notify_email"`

	// Filters holds optional dimension filters (free-form key/value).
	Filters map[string]any `json:"filters,omitempty"`

	LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
	CreatedAt       time.Time  `json:"created_at"`
	UpdatedAt       time.Time  `json:"updated_at"`
}
|
|
||||||
|
// OpsAlertEvent records one firing (and possibly resolution) of an alert rule.
type OpsAlertEvent struct {
	ID       int64  `json:"id"`
	RuleID   int64  `json:"rule_id"`
	Severity string `json:"severity"`
	// Status uses the OpsAlertStatus* constants (firing / resolved).
	Status string `json:"status"`

	Title       string `json:"title"`
	Description string `json:"description"`

	MetricValue    *float64 `json:"metric_value,omitempty"`
	ThresholdValue *float64 `json:"threshold_value,omitempty"`

	// Dimensions holds free-form context (e.g. platform / group) — schema is
	// producer-defined.
	Dimensions map[string]any `json:"dimensions,omitempty"`

	FiredAt    time.Time  `json:"fired_at"`
	ResolvedAt *time.Time `json:"resolved_at,omitempty"`

	EmailSent bool      `json:"email_sent"`
	CreatedAt time.Time `json:"created_at"`
}
|
|
||||||
|
// OpsAlertEventFilter narrows alert-event listings. Zero values mean
// "no filtering" for the corresponding field.
type OpsAlertEventFilter struct {
	Limit int // max rows to return

	// Optional filters.
	Status   string
	Severity string

	StartTime *time.Time
	EndTime   *time.Time

	// Dimensions filters (best-effort).
	Platform string
	GroupID  *int64
}
||||||
162
backend/internal/service/ops_alerts.go
Normal file
162
backend/internal/service/ops_alerts.go
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return []*OpsAlertRule{}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListAlertRules(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if rule == nil {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
|
||||||
|
}
|
||||||
|
|
||||||
|
created, err := s.opsRepo.CreateAlertRule(ctx, rule)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return created, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if rule == nil || rule.ID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
|
||||||
|
}
|
||||||
|
|
||||||
|
updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return updated, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if id <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return []*OpsAlertEvent{}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListAlertEvents(ctx, filter)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if ruleID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if event == nil {
|
||||||
|
return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
|
||||||
|
}
|
||||||
|
|
||||||
|
created, err := s.opsRepo.CreateAlertEvent(ctx, event)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return created, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(status) == "" {
|
||||||
|
return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
|
||||||
|
}
|
||||||
|
return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if eventID <= 0 {
|
||||||
|
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||||
|
}
|
||||||
|
return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
|
||||||
|
}
|
||||||
365
backend/internal/service/ops_cleanup_service.go
Normal file
365
backend/internal/service/ops_cleanup_service.go
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/robfig/cron/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsCleanupJobName is the heartbeat job name recorded for cleanup runs.
	opsCleanupJobName = "ops_cleanup"

	// Redis leader-lock key and TTL used so only one node runs cleanup.
	opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
	opsCleanupLeaderLockTTLDefault = 30 * time.Minute
)
|
|
||||||
|
// opsCleanupCronParser accepts standard 5-field cron specs
// (minute hour dom month dow).
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsCleanupReleaseScript atomically deletes the leader-lock key only when it
// is still owned by this instance (compare-and-delete).
var opsCleanupReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
|
||||||
|
// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
//
// - Scheduling: 5-field cron spec (minute hour dom month dow).
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
// - Safety: deletes in batches to avoid long transactions.
type OpsCleanupService struct {
	opsRepo     OpsRepository  // used for job heartbeat recording
	db          *sql.DB        // raw handle used for batched DELETEs
	redisClient *redis.Client  // optional; enables the cross-instance leader lock
	cfg         *config.Config // retention / schedule / run-mode settings

	// instanceID is the unique owner token written into the Redis leader lock.
	instanceID string

	cron *cron.Cron

	// Guard against repeated Start/Stop calls.
	startOnce sync.Once
	stopOnce  sync.Once

	// Emit the Redis-fallback warning only once per process.
	warnNoRedisOnce sync.Once
}
||||||
|
|
||||||
|
func NewOpsCleanupService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsCleanupService {
|
||||||
|
return &OpsCleanupService{
|
||||||
|
opsRepo: opsRepo,
|
||||||
|
db: db,
|
||||||
|
redisClient: redisClient,
|
||||||
|
cfg: cfg,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start schedules the cleanup job according to the configured cron spec.
// It is a no-op when ops monitoring or cleanup is disabled, or when required
// dependencies are missing. Safe to call repeatedly (guarded by sync.Once).
func (s *OpsCleanupService) Start() {
	if s == nil {
		return
	}
	// Ops monitoring disabled entirely: nothing to schedule (and no log —
	// the whole subsystem is off).
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
		log.Printf("[OpsCleanup] not started (disabled)")
		return
	}
	if s.opsRepo == nil || s.db == nil {
		log.Printf("[OpsCleanup] not started (missing deps)")
		return
	}

	s.startOnce.Do(func() {
		// Default: daily at 02:00.
		schedule := "0 2 * * *"
		if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
			schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
		}

		// Resolve the cron timezone; fall back to time.Local when the
		// configured zone is empty or fails to load.
		loc := time.Local
		if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
			if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
				loc = parsed
			}
		}

		c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
		_, err := c.AddFunc(schedule, func() { s.runScheduled() })
		if err != nil {
			// Invalid spec: log and leave the service un-started.
			log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
			return
		}
		s.cron = c
		s.cron.Start()
		log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
	})
}
||||||
|
|
||||||
|
func (s *OpsCleanupService) Stop() {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.stopOnce.Do(func() {
|
||||||
|
if s.cron != nil {
|
||||||
|
ctx := s.cron.Stop()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(3 * time.Second):
|
||||||
|
log.Printf("[OpsCleanup] cron stop timed out")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// runScheduled is the cron entry point: it acquires the cross-instance leader
// lock, runs one cleanup pass with a 30-minute budget, and records a job
// heartbeat with the outcome.
func (s *OpsCleanupService) runScheduled() {
	if s == nil || s.db == nil || s.opsRepo == nil {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()

	// Only one node should run cleanup; non-leaders silently skip this tick.
	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	// release is nil in simple run mode (no lock taken).
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	counts, err := s.runCleanupOnce(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		log.Printf("[OpsCleanup] cleanup failed: %v", err)
		return
	}
	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
	log.Printf("[OpsCleanup] cleanup complete: %s", counts)
}
||||||
|
|
||||||
|
// opsCleanupDeletedCounts tallies rows deleted per ops table in one cleanup run.
type opsCleanupDeletedCounts struct {
	errorLogs     int64
	retryAttempts int64
	alertEvents   int64
	systemMetrics int64
	hourlyPreagg  int64
	dailyPreagg   int64
}

// String renders the tallies as a single space-separated key=value line,
// suitable for the cleanup completion log.
func (c opsCleanupDeletedCounts) String() string {
	return fmt.Sprintf(
		"error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
		c.errorLogs, c.retryAttempts, c.alertEvents, c.systemMetrics, c.hourlyPreagg, c.dailyPreagg,
	)
}
||||||
|
|
||||||
|
// runCleanupOnce performs a single retention pass over all ops tables, driven
// by the configured per-category retention windows. A retention of <= 0 days
// disables that category. Rows are deleted in batches of 5000 to keep each
// transaction short; processing stops at the first error, returning the
// counts accumulated so far.
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
	out := opsCleanupDeletedCounts{}
	if s == nil || s.db == nil || s.cfg == nil {
		return out, nil
	}

	batchSize := 5000

	now := time.Now().UTC()

	// Error-like tables: error logs / retry attempts / alert events.
	if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.errorLogs = n

		n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.retryAttempts = n

		n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.alertEvents = n
	}

	// Minute-level metrics snapshots.
	if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.systemMetrics = n
	}

	// Pre-aggregation tables (hourly/daily).
	// NOTE(review): the daily table shares HourlyMetricsRetentionDays — confirm
	// there is intentionally no separate daily retention setting.
	if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
		cutoff := now.AddDate(0, 0, -days)
		n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
		if err != nil {
			return out, err
		}
		out.hourlyPreagg = n

		// bucket_date is compared as a DATE, hence the cast flag.
		n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
		if err != nil {
			return out, err
		}
		out.dailyPreagg = n
	}

	return out, nil
}
||||||
|
|
||||||
|
// deleteOldRowsByID deletes rows from table whose timeColumn is older than
// cutoff, in id-ordered batches of batchSize (defaulting to 5000) to avoid
// long-running transactions. When castCutoffToDate is true the cutoff bind is
// compared as a Postgres DATE (for date-typed bucket columns). Missing tables
// (partial deployments) are treated as a no-op. Returns the total number of
// rows deleted, which is best-effort accurate even on error.
func deleteOldRowsByID(
	ctx context.Context,
	db *sql.DB,
	table string,
	timeColumn string,
	cutoff time.Time,
	batchSize int,
	castCutoffToDate bool,
) (int64, error) {
	if db == nil {
		return 0, nil
	}
	if batchSize <= 0 {
		batchSize = 5000
	}

	where := fmt.Sprintf("%s < $1", timeColumn)
	if castCutoffToDate {
		where = fmt.Sprintf("%s < $1::date", timeColumn)
	}

	// table/timeColumn are compile-time constants supplied by runCleanupOnce,
	// never user input, so building SQL with Sprintf is safe here.
	q := fmt.Sprintf(`
WITH batch AS (
SELECT id FROM %s
WHERE %s
ORDER BY id
LIMIT $2
)
DELETE FROM %s
WHERE id IN (SELECT id FROM batch)
`, table, where, table)

	var total int64
	for {
		res, err := db.ExecContext(ctx, q, cutoff, batchSize)
		if err != nil {
			// If ops tables aren't present yet (partial deployments), treat as no-op.
			// Lowercase once instead of per Contains call.
			msg := strings.ToLower(err.Error())
			if strings.Contains(msg, "does not exist") && strings.Contains(msg, "relation") {
				return total, nil
			}
			return total, err
		}
		affected, err := res.RowsAffected()
		if err != nil {
			return total, err
		}
		total += affected
		// An empty batch means nothing older than cutoff remains.
		if affected == 0 {
			break
		}
	}
	return total, nil
}
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to make this instance the single cleanup
// leader. It returns (release, true) on success; release is nil in simple run
// mode where no lock is taken. Preference order: Redis SETNX keyed by
// instanceID, falling back to a DB advisory lock when Redis is absent or errors.
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil {
		return nil, false
	}
	// In simple run mode, assume single instance.
	if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
		return nil, true
	}

	key := opsCleanupLeaderLockKeyDefault
	ttl := opsCleanupLeaderLockTTLDefault

	// Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
	// falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				// Another instance already holds the lock.
				return nil, false
			}
			// Release via compare-and-delete so we never drop a lock a later
			// leader has re-acquired.
			return func() {
				_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
			}, true
		}
		// Redis error: fall back to DB advisory lock.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
		})
	} else {
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
		})
	}

	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		return nil, false
	}
	return release, true
}
||||||
|
|
||||||
|
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||||
|
if s == nil || s.opsRepo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsCleanupJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastSuccessAt: &now,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||||
|
if s == nil || s.opsRepo == nil || err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
durMs := duration.Milliseconds()
|
||||||
|
msg := truncateString(err.Error(), 2048)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||||
|
JobName: opsCleanupJobName,
|
||||||
|
LastRunAt: &runAt,
|
||||||
|
LastErrorAt: &now,
|
||||||
|
LastError: &msg,
|
||||||
|
LastDurationMs: &durMs,
|
||||||
|
})
|
||||||
|
}
|
||||||
257
backend/internal/service/ops_concurrency.go
Normal file
257
backend/internal/service/ops_concurrency.go
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsAccountsPageSize is the repository page size used when listing all accounts.
	opsAccountsPageSize = 100
	// opsConcurrencyBatchChunkSize bounds each load-batch call to the concurrency service.
	opsConcurrencyBatchChunkSize = 200
)
||||||
|
|
||||||
|
func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
|
||||||
|
if s == nil || s.accountRepo == nil {
|
||||||
|
return []Account{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]Account, 0, 128)
|
||||||
|
page := 1
|
||||||
|
for {
|
||||||
|
accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
|
||||||
|
Page: page,
|
||||||
|
PageSize: opsAccountsPageSize,
|
||||||
|
}, platformFilter, "", "", "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
out = append(out, accounts...)
|
||||||
|
if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(accounts) < opsAccountsPageSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
page++
|
||||||
|
if page > 10_000 {
|
||||||
|
log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
|
||||||
|
if s == nil || s.concurrencyService == nil {
|
||||||
|
return map[int64]*AccountLoadInfo{}
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
return map[int64]*AccountLoadInfo{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
|
||||||
|
unique := make(map[int64]int, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev {
|
||||||
|
unique[acc.ID] = acc.Concurrency
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := make([]AccountWithConcurrency, 0, len(unique))
|
||||||
|
for id, maxConc := range unique {
|
||||||
|
batch = append(batch, AccountWithConcurrency{
|
||||||
|
ID: id,
|
||||||
|
MaxConcurrency: maxConc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make(map[int64]*AccountLoadInfo, len(batch))
|
||||||
|
for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize {
|
||||||
|
end := i + opsConcurrencyBatchChunkSize
|
||||||
|
if end > len(batch) {
|
||||||
|
end = len(batch)
|
||||||
|
}
|
||||||
|
part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end])
|
||||||
|
if err != nil {
|
||||||
|
// Best-effort: return zeros rather than failing the ops UI.
|
||||||
|
log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for k, v := range part {
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
|
||||||
|
//
|
||||||
|
// Optional filters:
|
||||||
|
// - platformFilter: only include accounts in that platform (best-effort reduces DB load)
|
||||||
|
// - groupIDFilter: only include accounts that belong to that group
|
||||||
|
func (s *OpsService) GetConcurrencyStats(
|
||||||
|
ctx context.Context,
|
||||||
|
platformFilter string,
|
||||||
|
groupIDFilter *int64,
|
||||||
|
) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
collectedAt := time.Now()
|
||||||
|
loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
|
||||||
|
|
||||||
|
platform := make(map[string]*PlatformConcurrencyInfo)
|
||||||
|
group := make(map[int64]*GroupConcurrencyInfo)
|
||||||
|
account := make(map[int64]*AccountConcurrencyInfo)
|
||||||
|
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
var matchedGroup *Group
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if grp.ID == *groupIDFilter {
|
||||||
|
matchedGroup = grp
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Group filter provided: skip accounts not in that group.
|
||||||
|
if matchedGroup == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
load := loadMap[acc.ID]
|
||||||
|
currentInUse := int64(0)
|
||||||
|
waiting := int64(0)
|
||||||
|
if load != nil {
|
||||||
|
currentInUse = int64(load.CurrentConcurrency)
|
||||||
|
waiting = int64(load.WaitingCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Account-level view picks one display group (the first group).
|
||||||
|
displayGroupID := int64(0)
|
||||||
|
displayGroupName := ""
|
||||||
|
if matchedGroup != nil {
|
||||||
|
displayGroupID = matchedGroup.ID
|
||||||
|
displayGroupName = matchedGroup.Name
|
||||||
|
} else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
|
||||||
|
displayGroupID = acc.Groups[0].ID
|
||||||
|
displayGroupName = acc.Groups[0].Name
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := account[acc.ID]; !ok {
|
||||||
|
info := &AccountConcurrencyInfo{
|
||||||
|
AccountID: acc.ID,
|
||||||
|
AccountName: acc.Name,
|
||||||
|
Platform: acc.Platform,
|
||||||
|
GroupID: displayGroupID,
|
||||||
|
GroupName: displayGroupName,
|
||||||
|
CurrentInUse: currentInUse,
|
||||||
|
MaxCapacity: int64(acc.Concurrency),
|
||||||
|
WaitingInQueue: waiting,
|
||||||
|
}
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
account[acc.ID] = info
|
||||||
|
}
|
||||||
|
|
||||||
|
// Platform aggregation.
|
||||||
|
if acc.Platform != "" {
|
||||||
|
if _, ok := platform[acc.Platform]; !ok {
|
||||||
|
platform[acc.Platform] = &PlatformConcurrencyInfo{
|
||||||
|
Platform: acc.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p := platform[acc.Platform]
|
||||||
|
p.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
p.CurrentInUse += currentInUse
|
||||||
|
p.WaitingInQueue += waiting
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group aggregation (one account may contribute to multiple groups).
|
||||||
|
if matchedGroup != nil {
|
||||||
|
grp := matchedGroup
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupConcurrencyInfo{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
if g.GroupName == "" && grp.Name != "" {
|
||||||
|
g.GroupName = grp.Name
|
||||||
|
}
|
||||||
|
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||||
|
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||||
|
g.Platform = ""
|
||||||
|
}
|
||||||
|
g.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
g.CurrentInUse += currentInUse
|
||||||
|
g.WaitingInQueue += waiting
|
||||||
|
} else {
|
||||||
|
for _, grp := range acc.Groups {
|
||||||
|
if grp == nil || grp.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := group[grp.ID]; !ok {
|
||||||
|
group[grp.ID] = &GroupConcurrencyInfo{
|
||||||
|
GroupID: grp.ID,
|
||||||
|
GroupName: grp.Name,
|
||||||
|
Platform: grp.Platform,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g := group[grp.ID]
|
||||||
|
if g.GroupName == "" && grp.Name != "" {
|
||||||
|
g.GroupName = grp.Name
|
||||||
|
}
|
||||||
|
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||||
|
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||||
|
g.Platform = ""
|
||||||
|
}
|
||||||
|
g.MaxCapacity += int64(acc.Concurrency)
|
||||||
|
g.CurrentInUse += currentInUse
|
||||||
|
g.WaitingInQueue += waiting
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, info := range platform {
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, info := range group {
|
||||||
|
if info.MaxCapacity > 0 {
|
||||||
|
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return platform, group, account, &collectedAt, nil
|
||||||
|
}
|
||||||
90
backend/internal/service/ops_dashboard.go
Normal file
90
backend/internal/service/ops_dashboard.go
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve query mode (requested via query param, or DB default).
|
||||||
|
filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
|
||||||
|
|
||||||
|
overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
|
||||||
|
return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
|
||||||
|
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
|
||||||
|
// Attach config-derived limits so the UI can show "current / max" for connection pools.
|
||||||
|
// These are best-effort and should never block the dashboard rendering.
|
||||||
|
if s != nil && s.cfg != nil {
|
||||||
|
if s.cfg.Database.MaxOpenConns > 0 {
|
||||||
|
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
|
||||||
|
}
|
||||||
|
if s.cfg.Redis.PoolSize > 0 {
|
||||||
|
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
overview.SystemMetrics = metrics
|
||||||
|
} else if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||||
|
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
|
||||||
|
overview.JobHeartbeats = heartbeats
|
||||||
|
} else {
|
||||||
|
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
|
||||||
|
|
||||||
|
return overview, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
|
||||||
|
if requested.IsValid() {
|
||||||
|
// Allow "auto" to be disabled via config until preagg is proven stable in production.
|
||||||
|
// Forced `preagg` via query param still works.
|
||||||
|
if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
|
||||||
|
return OpsQueryModeRaw
|
||||||
|
}
|
||||||
|
return requested
|
||||||
|
}
|
||||||
|
|
||||||
|
mode := OpsQueryModeAuto
|
||||||
|
if s != nil && s.settingRepo != nil {
|
||||||
|
if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
|
||||||
|
mode = ParseOpsQueryMode(raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
|
||||||
|
return OpsQueryModeRaw
|
||||||
|
}
|
||||||
|
return mode
|
||||||
|
}
|
||||||
87
backend/internal/service/ops_dashboard_models.go
Normal file
87
backend/internal/service/ops_dashboard_models.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
type OpsDashboardFilter struct {
|
||||||
|
StartTime time.Time
|
||||||
|
EndTime time.Time
|
||||||
|
|
||||||
|
Platform string
|
||||||
|
GroupID *int64
|
||||||
|
|
||||||
|
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
|
||||||
|
// Expected values: auto/raw/preagg (see OpsQueryMode).
|
||||||
|
QueryMode OpsQueryMode
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsRateSummary struct {
|
||||||
|
Current float64 `json:"current"`
|
||||||
|
Peak float64 `json:"peak"`
|
||||||
|
Avg float64 `json:"avg"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsPercentiles struct {
|
||||||
|
P50 *int `json:"p50_ms"`
|
||||||
|
P90 *int `json:"p90_ms"`
|
||||||
|
P95 *int `json:"p95_ms"`
|
||||||
|
P99 *int `json:"p99_ms"`
|
||||||
|
Avg *int `json:"avg_ms"`
|
||||||
|
Max *int `json:"max_ms"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type OpsDashboardOverview struct {
|
||||||
|
StartTime time.Time `json:"start_time"`
|
||||||
|
EndTime time.Time `json:"end_time"`
|
||||||
|
Platform string `json:"platform"`
|
||||||
|
GroupID *int64 `json:"group_id"`
|
||||||
|
|
||||||
|
// HealthScore is a backend-computed overall health score (0-100).
|
||||||
|
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
|
||||||
|
HealthScore int `json:"health_score"`
|
||||||
|
|
||||||
|
// Latest system-level snapshot (window=1m, global).
|
||||||
|
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
|
||||||
|
|
||||||
|
// Background jobs health (heartbeats).
|
||||||
|
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
|
||||||
|
|
||||||
|
SuccessCount int64 `json:"success_count"`
|
||||||
|
ErrorCountTotal int64 `json:"error_count_total"`
|
||||||
|
BusinessLimitedCount int64 `json:"business_limited_count"`
|
||||||
|
|
||||||
|
ErrorCountSLA int64 `json:"error_count_sla"`
|
||||||
|
RequestCountTotal int64 `json:"request_count_total"`
|
||||||
|
RequestCountSLA int64 `json:"request_count_sla"`
|
||||||
|
|
||||||
|
TokenConsumed int64 `json:"token_consumed"`
|
||||||
|
|
||||||
|
SLA float64 `json:"sla"`
|
||||||
|
ErrorRate float64 `json:"error_rate"`
|
||||||
|
UpstreamErrorRate float64 `json:"upstream_error_rate"`
|
||||||
|
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
|
||||||
|
Upstream429Count int64 `json:"upstream_429_count"`
|
||||||
|
Upstream529Count int64 `json:"upstream_529_count"`
|
||||||
|
|
||||||
|
QPS OpsRateSummary `json:"qps"`
|
||||||
|
TPS OpsRateSummary `json:"tps"`
|
||||||
|
|
||||||
|
Duration OpsPercentiles `json:"duration"`
|
||||||
|
TTFT OpsPercentiles `json:"ttft"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsLatencyHistogramBucket is one labeled bucket of the latency histogram.
type OpsLatencyHistogramBucket struct {
	Range string `json:"range"`
	Count int64  `json:"count"`
}

// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
// It is used by the Ops dashboard to quickly identify tail latency regressions.
type OpsLatencyHistogramResponse struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`
	Platform  string    `json:"platform"`
	GroupID   *int64    `json:"group_id"`

	TotalRequests int64                        `json:"total_requests"`
	Buckets       []*OpsLatencyHistogramBucket `json:"buckets"`
}
|
||||||
45
backend/internal/service/ops_errors.go
Normal file
45
backend/internal/service/ops_errors.go
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetErrorDistribution(ctx, filter)
|
||||||
|
}
|
||||||
154
backend/internal/service/ops_health_score.go
Normal file
154
backend/internal/service/ops_health_score.go
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
|
||||||
|
//
|
||||||
|
// Design goals:
|
||||||
|
// - Backend-owned scoring (UI only displays).
|
||||||
|
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%)
|
||||||
|
// - Avoids double-counting (e.g., DB failure affects both infra and business metrics)
|
||||||
|
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
|
||||||
|
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
|
||||||
|
if overview == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
|
||||||
|
// UI can still render a gray/idle state based on QPS + error rate.
|
||||||
|
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
|
||||||
|
return 100
|
||||||
|
}
|
||||||
|
|
||||||
|
businessHealth := computeBusinessHealth(overview)
|
||||||
|
infraHealth := computeInfraHealth(now, overview)
|
||||||
|
|
||||||
|
// Weighted combination: 70% business + 30% infrastructure
|
||||||
|
score := businessHealth*0.7 + infraHealth*0.3
|
||||||
|
return int(math.Round(clampFloat64(score, 0, 100)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeBusinessHealth calculates business health score (0-100)
|
||||||
|
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
|
||||||
|
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||||
|
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
||||||
|
slaScore := 100.0
|
||||||
|
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
||||||
|
if slaPct < 99.5 {
|
||||||
|
if slaPct >= 95 {
|
||||||
|
slaScore = (slaPct - 95) / 4.5 * 100
|
||||||
|
} else {
|
||||||
|
slaScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error rate score: 0.5% → 100, 5% → 0 (linear)
|
||||||
|
// Combines request errors and upstream errors
|
||||||
|
errorScore := 100.0
|
||||||
|
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
||||||
|
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
||||||
|
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
||||||
|
if combinedErrorPct > 0.5 {
|
||||||
|
if combinedErrorPct <= 5 {
|
||||||
|
errorScore = (5 - combinedErrorPct) / 4.5 * 100
|
||||||
|
} else {
|
||||||
|
errorScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Latency score: 1s → 100, 10s → 0 (linear)
|
||||||
|
// Uses P99 of duration (TTFT is less critical for overall health)
|
||||||
|
latencyScore := 100.0
|
||||||
|
if overview.Duration.P99 != nil {
|
||||||
|
p99 := float64(*overview.Duration.P99)
|
||||||
|
if p99 > 1000 {
|
||||||
|
if p99 <= 10000 {
|
||||||
|
latencyScore = (10000 - p99) / 9000 * 100
|
||||||
|
} else {
|
||||||
|
latencyScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weighted combination
|
||||||
|
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||||
|
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
|
||||||
|
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
|
||||||
|
// Storage score: DB critical, Redis less critical
|
||||||
|
storageScore := 100.0
|
||||||
|
if overview.SystemMetrics != nil {
|
||||||
|
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
|
||||||
|
storageScore = 0 // DB failure is critical
|
||||||
|
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
|
||||||
|
storageScore = 50 // Redis failure is degraded but not critical
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute resources score: CPU + Memory
|
||||||
|
computeScore := 100.0
|
||||||
|
if overview.SystemMetrics != nil {
|
||||||
|
cpuScore := 100.0
|
||||||
|
if overview.SystemMetrics.CPUUsagePercent != nil {
|
||||||
|
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
|
||||||
|
if cpuPct > 80 {
|
||||||
|
if cpuPct <= 100 {
|
||||||
|
cpuScore = (100 - cpuPct) / 20 * 100
|
||||||
|
} else {
|
||||||
|
cpuScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memScore := 100.0
|
||||||
|
if overview.SystemMetrics.MemoryUsagePercent != nil {
|
||||||
|
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
|
||||||
|
if memPct > 85 {
|
||||||
|
if memPct <= 100 {
|
||||||
|
memScore = (100 - memPct) / 15 * 100
|
||||||
|
} else {
|
||||||
|
memScore = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
computeScore = (cpuScore + memScore) / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// Background jobs score
|
||||||
|
jobScore := 100.0
|
||||||
|
failedJobs := 0
|
||||||
|
totalJobs := 0
|
||||||
|
for _, hb := range overview.JobHeartbeats {
|
||||||
|
if hb == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
totalJobs++
|
||||||
|
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
|
||||||
|
failedJobs++
|
||||||
|
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
|
||||||
|
failedJobs++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if totalJobs > 0 && failedJobs > 0 {
|
||||||
|
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weighted combination
|
||||||
|
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
|
||||||
|
}
|
||||||
|
|
||||||
|
// clampFloat64 restricts v to the inclusive range [lo, hi].
// NaN passes through unchanged (both comparisons are false for NaN).
// Parameters renamed from min/max to avoid shadowing the Go 1.21
// predeclared min/max builtins.
func clampFloat64(v float64, lo float64, hi float64) float64 {
	if v < lo {
		return lo
	}
	if v > hi {
		return hi
	}
	return v
}
|
||||||
431
backend/internal/service/ops_health_score_test.go
Normal file
431
backend/internal/service/ops_health_score_test.go
Normal file
@@ -0,0 +1,431 @@
|
|||||||
|
//go:build unit
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
|
||||||
|
require.Equal(t, 100, score)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ov := &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 100,
|
||||||
|
RequestCountSLA: 100,
|
||||||
|
SuccessCount: 90,
|
||||||
|
ErrorCountTotal: 10,
|
||||||
|
ErrorCountSLA: 10,
|
||||||
|
|
||||||
|
SLA: 0.90,
|
||||||
|
ErrorRate: 0.10,
|
||||||
|
UpstreamErrorRate: 0.08,
|
||||||
|
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(20_000)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(2_000)},
|
||||||
|
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(98.0),
|
||||||
|
MemoryUsagePercent: float64Ptr(97.0),
|
||||||
|
DBConnWaiting: intPtr(3),
|
||||||
|
ConcurrencyQueueDepth: intPtr(10),
|
||||||
|
},
|
||||||
|
JobHeartbeats: []*OpsJobHeartbeat{
|
||||||
|
{
|
||||||
|
JobName: "job-a",
|
||||||
|
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
|
||||||
|
LastError: stringPtr("boom"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), ov)
|
||||||
|
require.Less(t, score, 80)
|
||||||
|
require.GreaterOrEqual(t, score, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
overview *OpsDashboardOverview
|
||||||
|
wantMin int
|
||||||
|
wantMax int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nil overview returns 0",
|
||||||
|
overview: nil,
|
||||||
|
wantMin: 0,
|
||||||
|
wantMax: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "perfect health",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 1.0,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(100)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "good health - SLA 99.8%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.998,
|
||||||
|
ErrorRate: 0.003,
|
||||||
|
UpstreamErrorRate: 0.001,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(800)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(200)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(50),
|
||||||
|
MemoryUsagePercent: float64Ptr(60),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "medium health - SLA 96%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.96,
|
||||||
|
ErrorRate: 0.02,
|
||||||
|
UpstreamErrorRate: 0.01,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(3000)},
|
||||||
|
TTFT: OpsPercentiles{P99: intPtr(600)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(70),
|
||||||
|
MemoryUsagePercent: float64Ptr(75),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 60,
|
||||||
|
wantMax: 85,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DB failure",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 70,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Redis failure",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(30),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 85,
|
||||||
|
wantMax: 95,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "high CPU usage",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(95),
|
||||||
|
MemoryUsagePercent: float64Ptr(40),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 85,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "combined failures - business degraded + infra healthy",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.90,
|
||||||
|
ErrorRate: 0.05,
|
||||||
|
UpstreamErrorRate: 0.02,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(10000)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(true),
|
||||||
|
RedisOK: boolPtr(true),
|
||||||
|
CPUUsagePercent: float64Ptr(20),
|
||||||
|
MemoryUsagePercent: float64Ptr(30),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 25,
|
||||||
|
wantMax: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "combined failures - business healthy + infra degraded",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
RequestCountTotal: 1000,
|
||||||
|
RequestCountSLA: 1000,
|
||||||
|
SLA: 0.998,
|
||||||
|
ErrorRate: 0.001,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(600)},
|
||||||
|
SystemMetrics: &OpsSystemMetricsSnapshot{
|
||||||
|
DBOK: boolPtr(false),
|
||||||
|
RedisOK: boolPtr(false),
|
||||||
|
CPUUsagePercent: float64Ptr(95),
|
||||||
|
MemoryUsagePercent: float64Ptr(95),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantMin: 70,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
|
||||||
|
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
|
||||||
|
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
|
||||||
|
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
|
||||||
|
require.LessOrEqual(t, score, 100, "score must be <= 100")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestComputeBusinessHealth(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
overview *OpsDashboardOverview
|
||||||
|
wantMin float64
|
||||||
|
wantMax float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "perfect metrics",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 1.0,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SLA boundary 99.5%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 100,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SLA boundary 95%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.95,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 50,
|
||||||
|
wantMax: 60,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error rate boundary 0.5%",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0.005,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "latency boundary 1000ms",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0,
|
||||||
|
UpstreamErrorRate: 0,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(1000)},
|
||||||
|
},
|
||||||
|
wantMin: 95,
|
||||||
|
wantMax: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "upstream error dominates",
|
||||||
|
overview: &OpsDashboardOverview{
|
||||||
|
SLA: 0.995,
|
||||||
|
ErrorRate: 0.001,
|
||||||
|
UpstreamErrorRate: 0.03,
|
||||||
|
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||||
|
},
|
||||||
|
wantMin: 75,
|
||||||
|
wantMax: 90,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
score := computeBusinessHealth(tt.overview)
|
||||||
|
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
|
||||||
|
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
|
||||||
|
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
|
||||||
|
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestComputeInfraHealth verifies that computeInfraHealth scores a set of
// representative infrastructure states inside the expected band, and that
// every score stays within [0, 100].
//
// The wantMin/wantMax bands are intentionally loose: they pin the scoring
// direction (e.g. a DB outage is penalized more than a Redis outage) without
// coupling the test to exact weighting constants.
func TestComputeInfraHealth(t *testing.T) {
	t.Parallel()

	now := time.Now().UTC()

	tests := []struct {
		name     string
		overview *OpsDashboardOverview
		wantMin  float64
		wantMax  float64
	}{
		{
			// Baseline: both stores up, low resource usage => perfect score.
			name: "all infrastructure healthy",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 100,
			wantMax: 100,
		},
		{
			// A database outage is the heaviest single penalty (50-70 band).
			name: "DB down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(false),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 50,
			wantMax: 70,
		},
		{
			// A Redis outage is penalized less than a DB outage (80-95 band).
			name: "Redis down",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(false),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 80,
			wantMax: 95,
		},
		{
			// High CPU applies a mild penalty while services remain up.
			name: "CPU at 90%",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(90),
					MemoryUsagePercent: float64Ptr(40),
				},
			},
			wantMin: 85,
			wantMax: 95,
		},
		{
			// A background job with a recent LastErrorAt drags the score down
			// even when the core services are healthy.
			name: "failed background job",
			overview: &OpsDashboardOverview{
				RequestCountTotal: 1000,
				SystemMetrics: &OpsSystemMetricsSnapshot{
					DBOK:               boolPtr(true),
					RedisOK:            boolPtr(true),
					CPUUsagePercent:    float64Ptr(30),
					MemoryUsagePercent: float64Ptr(40),
				},
				JobHeartbeats: []*OpsJobHeartbeat{
					{
						JobName:     "test-job",
						LastErrorAt: &now,
					},
				},
			},
			wantMin: 70,
			wantMax: 90,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := computeInfraHealth(now, tt.overview)
			require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
			require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
			// Invariant independent of the per-case bands.
			require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
			require.LessOrEqual(t, score, 100.0, "score must be <= 100")
		})
	}
}
|
||||||
|
|
||||||
|
func timePtr(v time.Time) *time.Time { return &v }
|
||||||
|
|
||||||
|
// stringPtr returns a pointer to a copy of v.
func stringPtr(v string) *string {
	p := v
	return &p
}
|
||||||
26
backend/internal/service/ops_histograms.go
Normal file
26
backend/internal/service/ops_histograms.go
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetLatencyHistogram returns the request-latency histogram for the given
// time range. It checks that monitoring is enabled, the ops repository is
// wired, and the filter carries a well-formed [StartTime, EndTime] window
// before delegating to the repository. Validation order matters to callers:
// the monitoring gate is checked first, then repository availability, then
// filter shape.
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
	// Feature gate: ops monitoring can be disabled globally.
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetLatencyHistogram(ctx, filter)
}
|
||||||
920
backend/internal/service/ops_metrics_collector.go
Normal file
920
backend/internal/service/ops_metrics_collector.go
Normal file
@@ -0,0 +1,920 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/shirou/gopsutil/v4/cpu"
|
||||||
|
"github.com/shirou/gopsutil/v4/mem"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsMetricsCollectorJobName identifies this job in heartbeat records.
	opsMetricsCollectorJobName = "ops_metrics_collector"
	// The collection interval is clamped to [1m, 1h] regardless of settings.
	opsMetricsCollectorMinInterval = 60 * time.Second
	opsMetricsCollectorMaxInterval = 1 * time.Hour

	// opsMetricsCollectorTimeout bounds a single collect-and-persist pass.
	opsMetricsCollectorTimeout = 10 * time.Second

	// Cross-instance leader election: only one replica collects at a time.
	// The lock TTL (90s) exceeds the minimum interval (60s).
	opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
	opsMetricsCollectorLeaderLockTTL = 90 * time.Second

	// opsMetricsCollectorHeartbeatTimeout bounds the best-effort heartbeat upsert.
	opsMetricsCollectorHeartbeatTimeout = 2 * time.Second

	// bytesPerMB converts raw byte counts into MB for memory gauges.
	bytesPerMB = 1024 * 1024
)
|
||||||
|
|
||||||
|
// opsMetricsCollectorAdvisoryLockID is a stable advisory-lock ID derived from
// the Redis leader-lock key. NOTE(review): presumably used as a DB-level
// fallback for leader election — confirm against tryAcquireLeaderLock.
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
|
||||||
|
|
||||||
|
// OpsMetricsCollector periodically samples traffic, error, and system metrics
// and persists one-minute snapshots through OpsRepository. Replicas compete
// for a Redis leader lock so that only one instance writes per window.
type OpsMetricsCollector struct {
	opsRepo     OpsRepository     // sink for metric snapshots and job heartbeats
	settingRepo SettingRepository // runtime settings (monitoring flag, interval)
	cfg         *config.Config    // static config; Ops.Enabled gates the collector

	accountRepo        AccountRepository   // enumerates schedulable accounts for queue sampling
	concurrencyService *ConcurrencyService // per-account concurrency load lookup

	db          *sql.DB       // raw handle for aggregate SQL and the DB health check
	redisClient *redis.Client // leader lock, health check, pool stats
	instanceID  string        // random per-process ID stored as the leader-lock value

	// Baseline for cgroup CPU usage deltas. Mutated without locking;
	// assumes access only from the single collection goroutine.
	lastCgroupCPUUsageNanos uint64
	lastCgroupCPUSampleAt   time.Time

	stopCh    chan struct{} // closed by Stop to end the run loop
	startOnce sync.Once
	stopOnce  sync.Once

	// NOTE(review): skipLogMu/skipLogAt look like a rate limiter for
	// "collection skipped" logging, but are not used in this file chunk —
	// confirm usage elsewhere.
	skipLogMu sync.Mutex
	skipLogAt time.Time
}
|
||||||
|
|
||||||
|
// NewOpsMetricsCollector wires the collector's dependencies. It does no
// background work until Start is called. The instance ID is randomized per
// process and becomes the leader-lock value, letting a replica recognize and
// safely release its own lock.
func NewOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	return &OpsMetricsCollector{
		opsRepo:            opsRepo,
		settingRepo:        settingRepo,
		cfg:                cfg,
		accountRepo:        accountRepo,
		concurrencyService: concurrencyService,
		db:                 db,
		redisClient:        redisClient,
		instanceID:         uuid.NewString(),
	}
}
|
||||||
|
|
||||||
|
// Start launches the background collection goroutine. Only the first call
// has any effect (sync.Once); calling Start on a nil collector is a no-op.
func (c *OpsMetricsCollector) Start() {
	if c == nil {
		return
	}
	c.startOnce.Do(func() {
		// Lazily create the stop channel so Stop has something to close.
		if c.stopCh == nil {
			c.stopCh = make(chan struct{})
		}
		go c.run()
	})
}
|
||||||
|
|
||||||
|
// Stop signals the collection goroutine to exit. Idempotent (sync.Once) and
// safe on a nil collector; it does not wait for an in-flight collection.
func (c *OpsMetricsCollector) Stop() {
	if c == nil {
		return
	}
	c.stopOnce.Do(func() {
		// stopCh is nil when Start was never called; nothing to close then.
		if c.stopCh != nil {
			close(c.stopCh)
		}
	})
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) run() {
|
||||||
|
// First run immediately so the dashboard has data soon after startup.
|
||||||
|
c.collectOnce()
|
||||||
|
|
||||||
|
for {
|
||||||
|
interval := c.getInterval()
|
||||||
|
timer := time.NewTimer(interval)
|
||||||
|
select {
|
||||||
|
case <-timer.C:
|
||||||
|
c.collectOnce()
|
||||||
|
case <-c.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) getInterval() time.Duration {
|
||||||
|
interval := opsMetricsCollectorMinInterval
|
||||||
|
|
||||||
|
if c.settingRepo == nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
|
||||||
|
if err != nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
|
||||||
|
seconds, err := strconv.Atoi(raw)
|
||||||
|
if err != nil {
|
||||||
|
return interval
|
||||||
|
}
|
||||||
|
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
|
||||||
|
seconds = int(opsMetricsCollectorMinInterval.Seconds())
|
||||||
|
}
|
||||||
|
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
|
||||||
|
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
|
||||||
|
}
|
||||||
|
return time.Duration(seconds) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectOnce performs one guarded collection pass:
//  1. bail out when ops is disabled (config or runtime setting) or
//     dependencies are missing,
//  2. acquire the cross-instance leader lock so only one replica collects,
//  3. collect and persist one window of metrics,
//  4. record a job heartbeat (success or failure) with the run duration.
//
// Heartbeat writes deliberately use a fresh short-lived context (not the
// collection ctx) so they can still be recorded after the main context has
// expired; their errors are ignored because heartbeats are best-effort.
func (c *OpsMetricsCollector) collectOnce() {
	if c == nil {
		return
	}
	if c.cfg != nil && !c.cfg.Ops.Enabled {
		return
	}
	if c.opsRepo == nil {
		return
	}
	if c.db == nil {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
	defer cancel()

	if !c.isMonitoringEnabled(ctx) {
		return
	}

	release, ok := c.tryAcquireLeaderLock(ctx)
	if !ok {
		// Another instance holds the leader lock; nothing to do.
		return
	}
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	err := c.collectAndPersist(ctx)
	finishedAt := time.Now().UTC()

	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	dur := durationMs
	runAt := startedAt

	if err != nil {
		// Record the failure in the heartbeat (message truncated to 2048
		// chars) and log it.
		msg := truncateString(err.Error(), 2048)
		errAt := finishedAt
		hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
		defer hbCancel()
		_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
			JobName:        opsMetricsCollectorJobName,
			LastRunAt:      &runAt,
			LastErrorAt:    &errAt,
			LastError:      &msg,
			LastDurationMs: &dur,
		})
		log.Printf("[OpsMetricsCollector] collect failed: %v", err)
		return
	}

	successAt := finishedAt
	hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
	defer hbCancel()
	_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsMetricsCollectorJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &successAt,
		LastDurationMs: &dur,
	})
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
if c == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if c.cfg != nil && !c.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if c.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
// Fail-open: collector should not become a hard dependency.
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectAndPersist gathers one minute of metrics and writes a single
// snapshot row through the ops repository.
//
// The window is the previous full minute [end-1m, end), aligned to minute
// boundaries so buckets are stable and never include a partial, still-filling
// minute. System stats are best-effort (logged and skipped on failure); the
// SQL aggregations are required and abort the pass on error.
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
	if ctx == nil {
		ctx = context.Background()
	}

	// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
	now := time.Now().UTC()
	windowEnd := now.Truncate(time.Minute)
	windowStart := windowEnd.Add(-1 * time.Minute)

	sys, err := c.collectSystemStats(ctx)
	if err != nil {
		// Continue; system stats are best-effort.
		log.Printf("[OpsMetricsCollector] system stats error: %v", err)
	}

	// Infrastructure health and connection-pool gauges.
	dbOK := c.checkDB(ctx)
	redisOK := c.checkRedis(ctx)
	active, idle := c.dbPoolStats()
	redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()

	// Traffic aggregates for the window.
	successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage counts: %w", err)
	}

	duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query usage latency: %w", err)
	}

	errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
	if err != nil {
		return fmt.Errorf("query error counts: %w", err)
	}

	// Defensive: the window is normally exactly 60 seconds.
	windowSeconds := windowEnd.Sub(windowStart).Seconds()
	if windowSeconds <= 0 {
		windowSeconds = 60
	}
	requestTotal := successCount + errorTotal
	qps := float64(requestTotal) / windowSeconds
	tps := float64(tokenConsumed) / windowSeconds

	goroutines := runtime.NumGoroutine()
	concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)

	input := &OpsInsertSystemMetricsInput{
		CreatedAt:     windowEnd,
		WindowMinutes: 1,

		SuccessCount:         successCount,
		ErrorCountTotal:      errorTotal,
		BusinessLimitedCount: businessLimited,
		ErrorCountSLA:        errorSLA,

		UpstreamErrorCountExcl429529: upstreamExcl,
		Upstream429Count:             upstream429,
		Upstream529Count:             upstream529,

		TokenConsumed: tokenConsumed,
		QPS:           float64Ptr(roundTo1DP(qps)),
		TPS:           float64Ptr(roundTo1DP(tps)),

		DurationP50Ms: duration.p50,
		DurationP90Ms: duration.p90,
		DurationP95Ms: duration.p95,
		DurationP99Ms: duration.p99,
		DurationAvgMs: duration.avg,
		DurationMaxMs: duration.max,

		TTFTP50Ms: ttft.p50,
		TTFTP90Ms: ttft.p90,
		TTFTP95Ms: ttft.p95,
		TTFTP99Ms: ttft.p99,
		TTFTAvgMs: ttft.avg,
		TTFTMaxMs: ttft.max,

		CPUUsagePercent:    sys.cpuUsagePercent,
		MemoryUsedMB:       sys.memoryUsedMB,
		MemoryTotalMB:      sys.memoryTotalMB,
		MemoryUsagePercent: sys.memoryUsagePercent,

		DBOK:    boolPtr(dbOK),
		RedisOK: boolPtr(redisOK),

		// Redis pool gauges are only meaningful when pool stats were readable.
		RedisConnTotal: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisTotal)
		}(),
		RedisConnIdle: func() *int {
			if !redisStatsOK {
				return nil
			}
			return intPtr(redisIdle)
		}(),

		DBConnActive:          intPtr(active),
		DBConnIdle:            intPtr(idle),
		GoroutineCount:        intPtr(goroutines),
		ConcurrencyQueueDepth: concurrencyQueueDepth,
	}

	return c.opsRepo.InsertSystemMetrics(ctx, input)
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
|
||||||
|
if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if parentCtx == nil {
|
||||||
|
parentCtx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort: never let concurrency sampling break the metrics collector.
|
||||||
|
ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
accounts, err := c.accountRepo.ListSchedulable(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := make([]AccountWithConcurrency, 0, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxConc := acc.Concurrency
|
||||||
|
if maxConc < 0 {
|
||||||
|
maxConc = 0
|
||||||
|
}
|
||||||
|
batch = append(batch, AccountWithConcurrency{
|
||||||
|
ID: acc.ID,
|
||||||
|
MaxConcurrency: maxConc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if len(batch) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var total int64
|
||||||
|
for _, info := range loadMap {
|
||||||
|
if info == nil || info.WaitingCount <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += int64(info.WaitingCount)
|
||||||
|
}
|
||||||
|
if total < 0 {
|
||||||
|
total = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
maxInt := int64(^uint(0) >> 1)
|
||||||
|
if total > maxInt {
|
||||||
|
total = maxInt
|
||||||
|
}
|
||||||
|
v := int(total)
|
||||||
|
return &v
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsCollectedPercentiles carries one latency-distribution snapshot.
// Pointer fields stay nil when the source window had no rows (SQL NULLs).
type opsCollectedPercentiles struct {
	p50 *int
	p90 *int
	p95 *int
	p99 *int
	avg *float64 // mean, rounded to one decimal place
	max *int
}
|
||||||
|
|
||||||
|
// queryUsageCounts returns the request count recorded in usage_logs and the
// total tokens consumed (input + output + cache creation + cache read) for
// the window [start, end). The caller treats this count as the success
// count; failures are aggregated separately from ops_error_logs (see
// queryErrorCounts).
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
	q := `
        SELECT
            COALESCE(COUNT(*), 0) AS success_count,
            COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
        FROM usage_logs
        WHERE created_at >= $1 AND created_at < $2`

	// Scan the sum through NullInt64 defensively in case the driver still
	// reports NULL for an empty window.
	var tokens sql.NullInt64
	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
		return 0, 0, err
	}
	if tokens.Valid {
		tokenConsumed = tokens.Int64
	}
	return successCount, tokenConsumed, nil
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
|
||||||
|
{
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
|
||||||
|
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
|
||||||
|
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
|
||||||
|
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
|
||||||
|
AVG(duration_ms) AS avg_ms,
|
||||||
|
MAX(duration_ms) AS max_ms
|
||||||
|
FROM usage_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND duration_ms IS NOT NULL`
|
||||||
|
|
||||||
|
var p50, p90, p95, p99 sql.NullFloat64
|
||||||
|
var avg sql.NullFloat64
|
||||||
|
var max sql.NullInt64
|
||||||
|
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||||
|
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||||
|
}
|
||||||
|
duration.p50 = floatToIntPtr(p50)
|
||||||
|
duration.p90 = floatToIntPtr(p90)
|
||||||
|
duration.p95 = floatToIntPtr(p95)
|
||||||
|
duration.p99 = floatToIntPtr(p99)
|
||||||
|
if avg.Valid {
|
||||||
|
v := roundTo1DP(avg.Float64)
|
||||||
|
duration.avg = &v
|
||||||
|
}
|
||||||
|
if max.Valid {
|
||||||
|
v := int(max.Int64)
|
||||||
|
duration.max = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
q := `
|
||||||
|
SELECT
|
||||||
|
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
|
||||||
|
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
|
||||||
|
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
|
||||||
|
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
|
||||||
|
AVG(first_token_ms) AS avg_ms,
|
||||||
|
MAX(first_token_ms) AS max_ms
|
||||||
|
FROM usage_logs
|
||||||
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND first_token_ms IS NOT NULL`
|
||||||
|
|
||||||
|
var p50, p90, p95, p99 sql.NullFloat64
|
||||||
|
var avg sql.NullFloat64
|
||||||
|
var max sql.NullInt64
|
||||||
|
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||||
|
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||||
|
}
|
||||||
|
ttft.p50 = floatToIntPtr(p50)
|
||||||
|
ttft.p90 = floatToIntPtr(p90)
|
||||||
|
ttft.p95 = floatToIntPtr(p95)
|
||||||
|
ttft.p99 = floatToIntPtr(p99)
|
||||||
|
if avg.Valid {
|
||||||
|
v := roundTo1DP(avg.Float64)
|
||||||
|
ttft.avg = &v
|
||||||
|
}
|
||||||
|
if max.Valid {
|
||||||
|
v := int(max.Int64)
|
||||||
|
ttft.max = &v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return duration, ttft, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// queryErrorCounts aggregates ops_error_logs rows in [start, end) into the
// dashboard's error buckets:
//   - errorTotal:         all responses with status >= 400
//   - businessLimited:    >=400 responses flagged is_business_limited
//   - errorSLA:           >=400 responses NOT business-limited (count against SLA)
//   - upstreamExcl429529: provider-owned, non-business-limited errors other than 429/529
//   - upstream429/529:    provider-owned 429 (rate limit) / 529 responses
//
// Upstream buckets prefer upstream_status_code and fall back to status_code.
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
	errorTotal int64,
	businessLimited int64,
	errorSLA int64,
	upstreamExcl429529 int64,
	upstream429 int64,
	upstream529 int64,
	err error,
) {
	q := `
        SELECT
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
            COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
            COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
        FROM ops_error_logs
        WHERE created_at >= $1 AND created_at < $2`

	if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
		&errorTotal,
		&businessLimited,
		&errorSLA,
		&upstreamExcl429529,
		&upstream429,
		&upstream529,
	); err != nil {
		return 0, 0, 0, 0, 0, 0, err
	}
	return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
|
||||||
|
|
||||||
|
// opsCollectedSystemStats holds best-effort resource gauges. Nil fields mean
// the value could not be determined from either cgroup or host sources.
type opsCollectedSystemStats struct {
	cpuUsagePercent    *float64 // 0-100; normalized to the cgroup CPU limit when known
	memoryUsedMB       *int64
	memoryTotalMB      *int64
	memoryUsagePercent *float64 // 0-100
}
|
||||||
|
|
||||||
|
// collectSystemStats gathers CPU and memory gauges, preferring cgroup
// (container) readings and falling back to host-level gopsutil readings for
// any value the cgroup could not provide. It never fails hard: values that
// cannot be determined are left nil. (The error return is currently always
// nil; it is kept for interface stability.)
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
	out := &opsCollectedSystemStats{}
	if ctx == nil {
		ctx = context.Background()
	}

	sampleAt := time.Now().UTC()

	// Prefer cgroup (container) metrics when available.
	if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
		out.cpuUsagePercent = cpuPct
	}

	cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
	if cgroupOK {
		usedMB := int64(cgroupUsed / bytesPerMB)
		out.memoryUsedMB = &usedMB
		// cgroupTotal == 0 means "no finite limit" (e.g. memory.max = "max");
		// total/percent stay nil so the host fallback below can fill them.
		if cgroupTotal > 0 {
			totalMB := int64(cgroupTotal / bytesPerMB)
			out.memoryTotalMB = &totalMB
			pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
			out.memoryUsagePercent = &pct
		}
	}

	// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
	if out.cpuUsagePercent == nil {
		if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
			v := roundTo1DP(cpuPercents[0])
			out.cpuUsagePercent = &v
		}
	}

	// If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host.
	if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
		if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
			if out.memoryUsedMB == nil {
				usedMB := int64(vm.Used / bytesPerMB)
				out.memoryUsedMB = &usedMB
			}
			if out.memoryTotalMB == nil {
				totalMB := int64(vm.Total / bytesPerMB)
				out.memoryTotalMB = &totalMB
			}
			if out.memoryUsagePercent == nil {
				// Prefer a ratio consistent with the (possibly mixed
				// cgroup/host) used+total pair; otherwise use the host's own.
				if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
					pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
					out.memoryUsagePercent = &pct
				} else {
					pct := roundTo1DP(vm.UsedPercent)
					out.memoryUsagePercent = &pct
				}
			}
		}
	}

	return out, nil
}
|
||||||
|
|
||||||
|
// tryCgroupCPUPercent derives CPU usage (0-100, relative to the container's
// CPU limit) from the cumulative cgroup usage counter. Two samples are needed
// to form a delta, so it returns nil on the first call, after a counter reset,
// or when the limit is unknown — callers then fall back to host metrics.
//
// It mutates the collector's baseline fields without locking; it appears to
// be called only from the single collection goroutine — confirm before adding
// concurrent callers.
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
	usageNanos, ok := readCgroupCPUUsageNanos()
	if !ok {
		return nil
	}

	// Initialize baseline sample.
	if c.lastCgroupCPUSampleAt.IsZero() {
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}

	elapsed := now.Sub(c.lastCgroupCPUSampleAt)
	if elapsed <= 0 {
		// Clock went backwards or duplicate sample; reset the baseline.
		c.lastCgroupCPUUsageNanos = usageNanos
		c.lastCgroupCPUSampleAt = now
		return nil
	}

	// Advance the baseline before computing so the next call gets a fresh window.
	prev := c.lastCgroupCPUUsageNanos
	c.lastCgroupCPUUsageNanos = usageNanos
	c.lastCgroupCPUSampleAt = now

	if usageNanos < prev {
		// Counter reset (container restarted).
		return nil
	}

	deltaUsageSec := float64(usageNanos-prev) / 1e9
	elapsedSec := elapsed.Seconds()
	if elapsedSec <= 0 {
		return nil
	}

	cores := readCgroupCPULimitCores()
	if cores <= 0 {
		// Can't reliably normalize; skip and fall back to gopsutil.
		return nil
	}

	pct := (deltaUsageSec / (elapsedSec * cores)) * 100
	if pct < 0 {
		pct = 0
	}
	// Clamp to avoid noise/jitter showing impossible values.
	if pct > 100 {
		pct = 100
	}
	v := roundTo1DP(pct)
	return &v
}
|
||||||
|
|
||||||
|
// readCgroupMemoryBytes reads container memory usage and limit from the
// cgroup filesystem, preferring cgroup v2 and falling back to v1. totalBytes
// is left 0 when the cgroup reports no finite limit; ok is false when neither
// hierarchy is readable.
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
	// cgroup v2 (most common in modern containers)
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
		usedBytes = used
		rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
		if err == nil {
			s := strings.TrimSpace(string(rawMax))
			// "max" means unlimited; leave totalBytes at 0 in that case.
			if s != "" && s != "max" {
				if v, err := strconv.ParseUint(s, 10, 64); err == nil {
					totalBytes = v
				}
			}
		}
		return usedBytes, totalBytes, true
	}

	// cgroup v1 fallback
	if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
		usedBytes = used
		if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
			// Some environments report a very large number when unlimited.
			if limit > 0 && limit < (1<<60) {
				totalBytes = limit
			}
		}
		return usedBytes, totalBytes, true
	}

	return 0, 0, false
}
|
||||||
|
|
||||||
|
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
|
||||||
|
// cgroup v2: cpu.stat has usage_usec
|
||||||
|
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
|
||||||
|
lines := strings.Split(string(raw), "\n")
|
||||||
|
for _, line := range lines {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fields[0] != "usage_usec" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
v, err := strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return v * 1000, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cgroup v1: cpuacct.usage is in nanoseconds
|
||||||
|
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
|
||||||
|
return v, true
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// readCgroupCPULimitCores returns the container's CPU limit in cores
// (quota/period), trying cgroup v2 then v1. It returns 0 when no finite
// limit can be determined, which callers treat as "unknown".
func readCgroupCPULimitCores() float64 {
	// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
	if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
		fields := strings.Fields(string(raw))
		if len(fields) >= 2 && fields[0] != "max" {
			quota, err1 := strconv.ParseFloat(fields[0], 64)
			period, err2 := strconv.ParseFloat(fields[1], 64)
			if err1 == nil && err2 == nil && quota > 0 && period > 0 {
				return quota / period
			}
		}
	}

	// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us (quota <= 0 when unlimited)
	quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
	period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
	if okQuota && okPeriod && quota > 0 && period > 0 {
		return float64(quota) / float64(period)
	}

	return 0
}
|
||||||
|
|
||||||
|
// readUintFile reads path and parses its whitespace-trimmed contents as an
// unsigned base-10 integer. It returns ok=false on read failure, empty
// content, or a parse error.
func readUintFile(path string) (uint64, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}
	trimmed := strings.TrimSpace(string(raw))
	if trimmed == "" {
		return 0, false
	}
	value, perr := strconv.ParseUint(trimmed, 10, 64)
	if perr != nil {
		return 0, false
	}
	return value, true
}
|
||||||
|
|
||||||
|
// readIntFile reads path and parses its whitespace-trimmed contents as a
// signed base-10 integer. It returns ok=false on read failure, empty
// content, or a parse error.
func readIntFile(path string) (int64, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}
	trimmed := strings.TrimSpace(string(raw))
	if trimmed == "" {
		return 0, false
	}
	value, perr := strconv.ParseInt(trimmed, 10, 64)
	if perr != nil {
		return 0, false
	}
	return value, true
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
|
||||||
|
if c == nil || c.db == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
var one int
|
||||||
|
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return one == 1
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
|
||||||
|
if c == nil || c.redisClient == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
return c.redisClient.Ping(ctx).Err() == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
|
||||||
|
if c == nil || c.redisClient == nil {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
stats := c.redisClient.PoolStats()
|
||||||
|
if stats == nil {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
return int(stats.TotalConns), int(stats.IdleConns), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
|
||||||
|
if c == nil || c.db == nil {
|
||||||
|
return 0, 0
|
||||||
|
}
|
||||||
|
stats := c.db.Stats()
|
||||||
|
return stats.InUse, stats.Idle
|
||||||
|
}
|
||||||
|
|
||||||
|
// opsMetricsCollectorReleaseScript atomically deletes the leader-lock key only
// when it is still owned by the caller (its value matches ARGV[1]). Running
// this as a Lua script makes the compare-and-delete atomic, so an instance can
// never release a lock that another instance has since acquired.
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to make this instance the single leader for a
// collection cycle. It returns (release, true) when acquired — release may be
// nil when no coordination backend exists — and (nil, false) when another
// instance holds the lock and this cycle should be skipped.
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	// No Redis configured: assume a single-instance deployment and proceed.
	if c == nil || c.redisClient == nil {
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// SETNX with the instance ID as value marks ownership; the TTL guards
	// against a crashed leader holding the lock forever.
	ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
	if err != nil {
		// Prefer fail-closed to avoid stampeding the database when Redis is flaky.
		// Fallback to a DB advisory lock when Redis is present but unavailable.
		release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
		if !ok {
			c.maybeLogSkip()
			return nil, false
		}
		return release, true
	}
	if !ok {
		// Another instance owns the lock; log (rate-limited) and skip.
		c.maybeLogSkip()
		return nil, false
	}

	// release compare-and-deletes the key via the Lua script, using a fresh
	// short-lived context so release still works after the caller's ctx ends.
	release := func() {
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
	}
	return release, true
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) maybeLogSkip() {
|
||||||
|
c.skipLogMu.Lock()
|
||||||
|
defer c.skipLogMu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.skipLogAt = now
|
||||||
|
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
|
||||||
|
}
|
||||||
|
|
||||||
|
func floatToIntPtr(v sql.NullFloat64) *int {
|
||||||
|
if !v.Valid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := int(math.Round(v.Float64))
|
||||||
|
return &n
|
||||||
|
}
|
||||||
|
|
||||||
|
// roundTo1DP rounds v to one decimal place (half away from zero, per math.Round).
func roundTo1DP(v float64) float64 {
	const scale = 10
	return math.Round(v*scale) / scale
}
|
||||||
|
|
||||||
|
// truncateString returns at most max bytes of s without splitting a UTF-8
// rune at the cut point. max <= 0 yields "".
//
// Fix vs. the previous version: instead of re-validating the entire prefix
// with utf8.ValidString after each trimmed byte (O(n²) in the worst case, and
// over-trimming to "" whenever s contains invalid UTF-8 anywhere before the
// cut), we only strip the trailing partial rune produced by the byte-level
// cut. For valid input the result is identical.
func truncateString(s string, max int) string {
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}
	cut := s[:max]
	// Drop trailing bytes that form an incomplete rune. DecodeLastRuneInString
	// reports (RuneError, 1) for a dangling lead/continuation byte; a genuine
	// U+FFFD in the input decodes with size 3 and is kept.
	for len(cut) > 0 {
		r, size := utf8.DecodeLastRuneInString(cut)
		if r != utf8.RuneError || size > 1 {
			break
		}
		cut = cut[:len(cut)-1]
	}
	return cut
}
|
||||||
|
|
||||||
|
// boolPtr returns a pointer to a fresh copy of v.
func boolPtr(v bool) *bool {
	return &v
}
|
||||||
|
|
||||||
|
// intPtr returns a pointer to a fresh copy of v.
func intPtr(v int) *int {
	return &v
}
|
||||||
|
|
||||||
|
// float64Ptr returns a pointer to a fresh copy of v.
func float64Ptr(v float64) *float64 {
	return &v
}
|
||||||
124
backend/internal/service/ops_models.go
Normal file
124
backend/internal/service/ops_models.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// OpsErrorLog is the list-view projection of a recorded request error.
// Heavier fields (bodies, headers, upstream detail) live on OpsErrorLogDetail.
type OpsErrorLog struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	// Classification of the failure.
	Phase    string `json:"phase"`
	Type     string `json:"type"`
	Severity string `json:"severity"`

	StatusCode int    `json:"status_code"`
	Platform   string `json:"platform"`
	Model      string `json:"model"`

	LatencyMs *int `json:"latency_ms"`

	// Correlation identifiers and the short error message.
	ClientRequestID string `json:"client_request_id"`
	RequestID       string `json:"request_id"`
	Message         string `json:"message"`

	// Attribution; nil when unknown at record time.
	UserID    *int64 `json:"user_id"`
	APIKeyID  *int64 `json:"api_key_id"`
	AccountID *int64 `json:"account_id"`
	GroupID   *int64 `json:"group_id"`

	ClientIP    *string `json:"client_ip"`
	RequestPath string  `json:"request_path"`
	Stream      bool    `json:"stream"`
}
|
||||||
|
|
||||||
|
// OpsErrorLogDetail extends OpsErrorLog with the full captured context of a
// single failed request, as returned by GetErrorLogByID.
type OpsErrorLogDetail struct {
	OpsErrorLog

	ErrorBody string `json:"error_body"`
	UserAgent string `json:"user_agent"`

	// Upstream context (optional)
	UpstreamStatusCode   *int   `json:"upstream_status_code,omitempty"`
	UpstreamErrorMessage string `json:"upstream_error_message,omitempty"`
	UpstreamErrorDetail  string `json:"upstream_error_detail,omitempty"`
	UpstreamErrors       string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing

	// Timings (optional)
	AuthLatencyMs      *int64 `json:"auth_latency_ms"`
	RoutingLatencyMs   *int64 `json:"routing_latency_ms"`
	UpstreamLatencyMs  *int64 `json:"upstream_latency_ms"`
	ResponseLatencyMs  *int64 `json:"response_latency_ms"`
	TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`

	// Retry context: the captured request body/headers used to replay the call.
	RequestBody          string `json:"request_body"`
	RequestBodyTruncated bool   `json:"request_body_truncated"`
	RequestBodyBytes     *int   `json:"request_body_bytes"`
	RequestHeaders       string `json:"request_headers,omitempty"`

	// vNext metric semantics
	IsBusinessLimited bool `json:"is_business_limited"`
}
|
||||||
|
|
||||||
|
// OpsErrorLogFilter narrows ListErrorLogs queries.
// Nil pointers and zero-value strings/slices mean "do not filter on this field".
type OpsErrorLogFilter struct {
	StartTime *time.Time
	EndTime   *time.Time

	Platform  string
	GroupID   *int64
	AccountID *int64

	StatusCodes []int
	Phase       string
	Query       string // free-text search term

	// Pagination (1-based page).
	Page     int
	PageSize int
}
|
||||||
|
|
||||||
|
// OpsErrorLogList is one page of error logs plus pagination metadata.
type OpsErrorLogList struct {
	Errors   []*OpsErrorLog `json:"errors"`
	Total    int            `json:"total"`
	Page     int            `json:"page"`
	PageSize int            `json:"page_size"`
}
|
||||||
|
|
||||||
|
// OpsRetryAttempt is a persisted record of one admin-triggered retry of a
// previously failed request.
type OpsRetryAttempt struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	RequestedByUserID int64  `json:"requested_by_user_id"`
	SourceErrorID     int64  `json:"source_error_id"`
	Mode              string `json:"mode"` // client|upstream (see OpsRetryMode* consts)
	PinnedAccountID   *int64 `json:"pinned_account_id"`

	// Execution lifecycle (see opsRetryStatus* consts).
	Status     string     `json:"status"`
	StartedAt  *time.Time `json:"started_at"`
	FinishedAt *time.Time `json:"finished_at"`
	DurationMs *int64     `json:"duration_ms"`

	// Correlation with the request issued by the retry, when available.
	ResultRequestID *string `json:"result_request_id"`
	ResultErrorID   *int64  `json:"result_error_id"`

	ErrorMessage *string `json:"error_message"`
}
|
||||||
|
|
||||||
|
// OpsRetryResult is the synchronous outcome returned to the caller that
// triggered a retry.
type OpsRetryResult struct {
	AttemptID int64  `json:"attempt_id"`
	Mode      string `json:"mode"`
	Status    string `json:"status"`

	PinnedAccountID *int64 `json:"pinned_account_id"`
	UsedAccountID   *int64 `json:"used_account_id"`

	HTTPStatusCode    int    `json:"http_status_code"`
	UpstreamRequestID string `json:"upstream_request_id"`

	// Truncated preview of the response body captured during the retry.
	ResponsePreview   string `json:"response_preview"`
	ResponseTruncated bool   `json:"response_truncated"`

	ErrorMessage string `json:"error_message"`

	StartedAt  time.Time `json:"started_at"`
	FinishedAt time.Time `json:"finished_at"`
	DurationMs int64     `json:"duration_ms"`
}
|
||||||
242
backend/internal/service/ops_port.go
Normal file
242
backend/internal/service/ops_port.go
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsRepository is the persistence port consumed by the ops service layer and
// its background jobs. Implementations live outside this package.
type OpsRepository interface {
	// Error logs and request-level drilldown.
	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)

	// Retry attempt bookkeeping.
	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)

	// Lightweight window stats (for realtime WS / quick sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)

	// Dashboard aggregations.
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)

	// System metrics snapshots written by the metrics collector.
	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)

	// Background-job liveness.
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)

	// Alerts (rules + events)
	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error

	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

	// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
	UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
|
||||||
|
|
||||||
|
// OpsInsertErrorLogInput carries everything needed to persist one error log
// row. It is assembled during request handling and finalized by OpsService
// before hitting the repository.
type OpsInsertErrorLogInput struct {
	// Correlation identifiers.
	RequestID       string
	ClientRequestID string

	// Attribution; nil when unknown.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string

	// Request shape.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string

	// Failure classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool

	ErrorMessage string
	ErrorBody    string

	ErrorSource string
	ErrorOwner  string

	// Upstream context (optional).
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string
	// UpstreamErrors captures all upstream error attempts observed during handling this request.
	// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent
	// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
	// It is set by OpsService.RecordError before persisting.
	UpstreamErrorsJSON *string

	// Timings (optional).
	DurationMs         *int
	TimeToFirstTokenMs *int64

	// Captured request payload for later replay/inspection.
	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string

	IsRetryable bool
	RetryCount  int

	CreatedAt time.Time
}
|
||||||
|
|
||||||
|
// OpsInsertRetryAttemptInput creates the initial row for a retry attempt,
// before its outcome is known.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	PinnedAccountID   *int64

	// running|queued etc.
	Status    string
	StartedAt time.Time
}
|
||||||
|
|
||||||
|
// OpsUpdateRetryAttemptInput records the final outcome of a retry attempt
// identified by ID.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// succeeded|failed
	Status     string
	FinishedAt time.Time
	DurationMs int64

	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64

	ErrorMessage *string
}
|
||||||
|
|
||||||
|
// OpsInsertSystemMetricsInput is one aggregated metrics sample covering
// WindowMinutes ending at CreatedAt. Pointer fields are optional and omitted
// when the collector could not obtain them.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Optional scoping; nil means an instance-wide sample.
	Platform *string
	GroupID  *int64

	// Request counters for the window.
	SuccessCount         int64
	ErrorCountTotal      int64
	BusinessLimitedCount int64
	ErrorCountSLA        int64

	// Upstream error breakdown (429/529 tracked separately).
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64

	TokenConsumed int64

	// Derived rates.
	QPS *float64
	TPS *float64

	// End-to-end duration distribution (milliseconds).
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// Time-to-first-token distribution (milliseconds).
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// Host resource usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency health probes.
	DBOK    *bool
	RedisOK *bool

	// Connection pool gauges.
	RedisConnTotal *int
	RedisConnIdle  *int

	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	// Runtime gauges.
	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
|
||||||
|
|
||||||
|
// OpsSystemMetricsSnapshot is the most recent stored system-metrics sample,
// enriched with config-derived limits for UI display.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// Host resource usage at sample time.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency health probes.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`

	DBConnActive  *int `json:"db_conn_active"`
	DBConnIdle    *int `json:"db_conn_idle"`
	DBConnWaiting *int `json:"db_conn_waiting"`

	// Runtime gauges.
	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
|
||||||
|
|
||||||
|
// OpsUpsertJobHeartbeatInput updates the liveness record for a named
// background job. Nil fields leave the corresponding columns untouched.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt      *time.Time
	LastSuccessAt  *time.Time
	LastErrorAt    *time.Time
	LastError      *string
	LastDurationMs *int64
}
|
||||||
|
|
||||||
|
// OpsJobHeartbeat is the stored liveness record for one background job.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
// OpsWindowStats is a lightweight request-count summary for the
// [StartTime, EndTime] window, used by realtime sampling.
type OpsWindowStats struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}
|
||||||
40
backend/internal/service/ops_query_mode.go
Normal file
40
backend/internal/service/ops_query_mode.go
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsQueryMode selects which backing data source serves an ops dashboard query.
type OpsQueryMode string

const (
	OpsQueryModeAuto   OpsQueryMode = "auto"
	OpsQueryModeRaw    OpsQueryMode = "raw"
	OpsQueryModePreagg OpsQueryMode = "preagg"
)

// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")

// ParseOpsQueryMode maps a raw, user-supplied string onto a known query mode,
// trimming whitespace and ignoring case. Unrecognized input falls back to
// OpsQueryModeAuto.
func ParseOpsQueryMode(raw string) OpsQueryMode {
	normalized := OpsQueryMode(strings.ToLower(strings.TrimSpace(raw)))
	switch normalized {
	case OpsQueryModeRaw, OpsQueryModePreagg:
		return normalized
	}
	return OpsQueryModeAuto
}

// IsValid reports whether m is one of the declared query modes.
func (m OpsQueryMode) IsValid() bool {
	return m == OpsQueryModeAuto || m == OpsQueryModeRaw || m == OpsQueryModePreagg
}
|
||||||
36
backend/internal/service/ops_realtime.go
Normal file
36
backend/internal/service/ops_realtime.go
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is also gated by the hard switch/soft switch of overall ops monitoring.
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
	if !s.IsMonitoringEnabled(ctx) {
		return false
	}
	// No settings repository wired: treat the feature as enabled.
	if s.settingRepo == nil {
		return true
	}

	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
	if err != nil {
		// Default enabled when key is missing; fail-open on transient errors.
		// NOTE(review): both branches below return true, so the errors.Is check
		// is currently redundant — presumably kept so the two cases can diverge
		// later; confirm intent before simplifying.
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		return true
	}

	// Only an explicit "off"-style value disables the feature.
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}
|
||||||
81
backend/internal/service/ops_realtime_models.go
Normal file
81
backend/internal/service/ops_realtime_models.go
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
	AccountID      int64   `json:"account_id"`
	AccountName    string  `json:"account_name"`
	Platform       string  `json:"platform"`
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||||
|
|
||||||
|
// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||||
|
|
||||||
|
// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
	GroupID        int64  `json:"group_id"`
	GroupName      string `json:"group_name"`
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||||
|
|
||||||
|
// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
	AccountID   int64  `json:"account_id"`
	AccountName string `json:"account_name"`
	Platform    string `json:"platform"`
	GroupID     int64  `json:"group_id"`
	GroupName   string `json:"group_name"`

	Status string `json:"status"`

	// Derived availability flags.
	IsAvailable   bool `json:"is_available"`
	IsRateLimited bool `json:"is_rate_limited"`
	IsOverloaded  bool `json:"is_overloaded"`
	HasError      bool `json:"has_error"`

	// Recovery timing details; nil when the corresponding state is not active.
	RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
	RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
	OverloadUntil          *time.Time `json:"overload_until"`
	OverloadRemainingSec   *int64     `json:"overload_remaining_sec"`
	ErrorMessage           string     `json:"error_message"`
	TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
|
||||||
151
backend/internal/service/ops_request_details.go
Normal file
151
backend/internal/service/ops_request_details.go
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OpsRequestKind distinguishes which underlying table an OpsRequestDetail row
// came from: success (usage logs) or error (ops error logs).
type OpsRequestKind string

const (
	OpsRequestKindSuccess OpsRequestKind = "success"
	OpsRequestKindError   OpsRequestKind = "error"
)
|
||||||
|
|
||||||
|
// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
	Kind      OpsRequestKind `json:"kind"`
	CreatedAt time.Time      `json:"created_at"`
	RequestID string         `json:"request_id"`

	Platform string `json:"platform,omitempty"`
	Model    string `json:"model,omitempty"`

	DurationMs *int `json:"duration_ms,omitempty"`
	StatusCode *int `json:"status_code,omitempty"`

	// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
	ErrorID *int64 `json:"error_id,omitempty"`

	// Error-only classification fields; empty for success rows.
	Phase    string `json:"phase,omitempty"`
	Severity string `json:"severity,omitempty"`
	Message  string `json:"message,omitempty"`

	// Attribution; nil when unknown.
	UserID    *int64 `json:"user_id,omitempty"`
	APIKeyID  *int64 `json:"api_key_id,omitempty"`
	AccountID *int64 `json:"account_id,omitempty"`
	GroupID   *int64 `json:"group_id,omitempty"`

	Stream bool `json:"stream"`
}
|
||||||
|
|
||||||
|
type OpsRequestDetailFilter struct {
|
||||||
|
StartTime *time.Time
|
||||||
|
EndTime *time.Time
|
||||||
|
|
||||||
|
// kind: success|error|all
|
||||||
|
Kind string
|
||||||
|
|
||||||
|
Platform string
|
||||||
|
GroupID *int64
|
||||||
|
|
||||||
|
UserID *int64
|
||||||
|
APIKeyID *int64
|
||||||
|
AccountID *int64
|
||||||
|
|
||||||
|
Model string
|
||||||
|
RequestID string
|
||||||
|
Query string
|
||||||
|
|
||||||
|
MinDurationMs *int
|
||||||
|
MaxDurationMs *int
|
||||||
|
|
||||||
|
// Sort: created_at_desc (default) or duration_desc.
|
||||||
|
Sort string
|
||||||
|
|
||||||
|
Page int
|
||||||
|
PageSize int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
|
||||||
|
page = 1
|
||||||
|
pageSize = 50
|
||||||
|
endTime = time.Now()
|
||||||
|
startTime = endTime.Add(-1 * time.Hour)
|
||||||
|
|
||||||
|
if f == nil {
|
||||||
|
return page, pageSize, startTime, endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
if f.Page > 0 {
|
||||||
|
page = f.Page
|
||||||
|
}
|
||||||
|
if f.PageSize > 0 {
|
||||||
|
pageSize = f.PageSize
|
||||||
|
}
|
||||||
|
if pageSize > 100 {
|
||||||
|
pageSize = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
if f.EndTime != nil {
|
||||||
|
endTime = *f.EndTime
|
||||||
|
}
|
||||||
|
if f.StartTime != nil {
|
||||||
|
startTime = *f.StartTime
|
||||||
|
} else if f.EndTime != nil {
|
||||||
|
startTime = endTime.Add(-1 * time.Hour)
|
||||||
|
}
|
||||||
|
|
||||||
|
if startTime.After(endTime) {
|
||||||
|
startTime, endTime = endTime, startTime
|
||||||
|
}
|
||||||
|
|
||||||
|
return page, pageSize, startTime, endTime
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsRequestDetailList is one page of request details plus pagination metadata.
type OpsRequestDetailList struct {
	Items    []*OpsRequestDetail `json:"items"`
	Total    int64               `json:"total"`
	Page     int                 `json:"page"`
	PageSize int                 `json:"page_size"`
}
|
||||||
|
|
||||||
|
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return &OpsRequestDetailList{
|
||||||
|
Items: []*OpsRequestDetail{},
|
||||||
|
Total: 0,
|
||||||
|
Page: 1,
|
||||||
|
PageSize: 50,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
page, pageSize, startTime, endTime := filter.Normalize()
|
||||||
|
filterCopy := &OpsRequestDetailFilter{}
|
||||||
|
if filter != nil {
|
||||||
|
*filterCopy = *filter
|
||||||
|
}
|
||||||
|
filterCopy.Page = page
|
||||||
|
filterCopy.PageSize = pageSize
|
||||||
|
filterCopy.StartTime = &startTime
|
||||||
|
filterCopy.EndTime = &endTime
|
||||||
|
|
||||||
|
items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if items == nil {
|
||||||
|
items = []*OpsRequestDetail{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &OpsRequestDetailList{
|
||||||
|
Items: items,
|
||||||
|
Total: total,
|
||||||
|
Page: page,
|
||||||
|
PageSize: pageSize,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
632
backend/internal/service/ops_retry.go
Normal file
632
backend/internal/service/ops_retry.go
Normal file
@@ -0,0 +1,632 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Retry modes accepted by RetryError.
const (
	OpsRetryModeClient   = "client"
	OpsRetryModeUpstream = "upstream"
)

// Lifecycle states stored on a retry attempt.
const (
	opsRetryStatusRunning   = "running"
	opsRetryStatusSucceeded = "succeeded"
	opsRetryStatusFailed    = "failed"
)

// Operational limits for a single retry execution.
const (
	opsRetryTimeout            = 60 * time.Second
	opsRetryCaptureBytesLimit  = 64 * 1024
	opsRetryResponsePreviewMax = 8 * 1024
	opsRetryMinIntervalPerError = 10 * time.Second
	opsRetryMaxAccountSwitches  = 3
)

// opsRetryRequestHeaderAllowlist lists the only request headers carried over
// when rebuilding a retried request (presumably to avoid replaying
// credentials or hop-by-hop headers — confirm against usage in RetryError).
var opsRetryRequestHeaderAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}

// opsRetryRequestType identifies which upstream API shape the original
// request used.
type opsRetryRequestType string

const (
	opsRetryTypeMessages  opsRetryRequestType = "messages"
	opsRetryTypeOpenAI    opsRetryRequestType = "openai_responses"
	opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
|
||||||
|
|
||||||
|
// limitedResponseWriter is an in-memory http.ResponseWriter that captures at
// most `limit` bytes of the response body while pretending to accept all of
// it, so gateway code that writes the response never sees a short write.
type limitedResponseWriter struct {
	header      http.Header
	wroteHeader bool

	limit        int   // max bytes kept in buf
	totalWritten int64 // total bytes offered by Write, including dropped ones
	buf          bytes.Buffer
}

// newLimitedResponseWriter returns a writer that buffers up to limit bytes.
// A non-positive limit is clamped to 1 so the buffer logic stays valid.
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
	if limit <= 0 {
		limit = 1
	}
	return &limitedResponseWriter{
		header: make(http.Header),
		limit:  limit,
	}
}

func (w *limitedResponseWriter) Header() http.Header {
	return w.header
}

// WriteHeader records that the header was written. NOTE(review): the status
// code itself is not stored here; callers read the status from gin's wrapping
// writer instead — confirm if this type is ever used outside gin.
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
	if w.wroteHeader {
		return
	}
	w.wroteHeader = true
}

// Write buffers up to the configured limit and silently discards the rest.
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.WriteHeader(http.StatusOK)
	}
	w.totalWritten += int64(len(p))

	if w.buf.Len() < w.limit {
		remaining := w.limit - w.buf.Len()
		if len(p) > remaining {
			_, _ = w.buf.Write(p[:remaining])
		} else {
			_, _ = w.buf.Write(p)
		}
	}

	// Pretend we wrote everything to avoid upstream/client code treating it as an error.
	return len(p), nil
}

// Flush is a no-op; it exists so streaming handlers that assert http.Flusher
// keep working.
func (w *limitedResponseWriter) Flush() {}

// bodyBytes returns the captured (possibly truncated) response body.
func (w *limitedResponseWriter) bodyBytes() []byte {
	return w.buf.Bytes()
}

// truncated reports whether more bytes were offered than were kept.
func (w *limitedResponseWriter) truncated() bool {
	return w.totalWritten > int64(w.limit)
}
|
||||||
|
|
||||||
|
// RetryError replays the original request behind an ops error log entry.
//
// mode is OpsRetryModeClient (re-select an account like normal traffic) or
// OpsRetryModeUpstream (replay on a pinned account). For upstream mode the
// pinned account defaults to the account recorded on the error log when the
// caller does not supply one. The attempt is persisted before execution and
// updated best-effort afterwards; a failure to persist the final state does
// not fail the API response.
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}

	mode = strings.ToLower(strings.TrimSpace(mode))
	switch mode {
	case OpsRetryModeClient, OpsRetryModeUpstream:
	default:
		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
	}

	// Guard against concurrent or too-frequent retries of the same error.
	latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
	}
	if latest != nil {
		if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}

		// Prefer the most recent known timestamp for throttling purposes.
		lastAttemptAt := latest.CreatedAt
		if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
			lastAttemptAt = *latest.FinishedAt
		} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
			lastAttemptAt = *latest.StartedAt
		}

		if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
			return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
		}
	}

	errorLog, err := s.GetErrorLogByID(ctx, errorID)
	if err != nil {
		return nil, err
	}
	if strings.TrimSpace(errorLog.RequestBody) == "" {
		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
	}

	// Resolve the pinned account for upstream mode (explicit > recorded).
	var pinned *int64
	if mode == OpsRetryModeUpstream {
		if pinnedAccountID != nil && *pinnedAccountID > 0 {
			pinned = pinnedAccountID
		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
			pinned = errorLog.AccountID
		} else {
			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
		}
	}

	// Persist the attempt first so concurrent callers collide on the insert.
	startedAt := time.Now()
	attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
		RequestedByUserID: requestedByUserID,
		SourceErrorID:     errorID,
		Mode:              mode,
		PinnedAccountID:   pinned,
		Status:            opsRetryStatusRunning,
		StartedAt:         startedAt,
	})
	if err != nil {
		// 23505 = Postgres unique_violation: another attempt won the race.
		var pqErr *pq.Error
		if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}
		return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
	}

	result := &OpsRetryResult{
		AttemptID:         attemptID,
		Mode:              mode,
		Status:            opsRetryStatusFailed,
		PinnedAccountID:   pinned,
		HTTPStatusCode:    0,
		UpstreamRequestID: "",
		ResponsePreview:   "",
		ResponseTruncated: false,
		ErrorMessage:      "",
		StartedAt:         startedAt,
	}

	execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
	defer cancel()

	execRes := s.executeRetry(execCtx, errorLog, mode, pinned)

	finishedAt := time.Now()
	result.FinishedAt = finishedAt
	result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()

	if execRes != nil {
		result.Status = execRes.status
		result.UsedAccountID = execRes.usedAccountID
		result.HTTPStatusCode = execRes.httpStatusCode
		result.UpstreamRequestID = execRes.upstreamRequestID
		result.ResponsePreview = execRes.responsePreview
		result.ResponseTruncated = execRes.responseTruncated
		result.ErrorMessage = execRes.errorMessage
	}

	// Use a fresh short-lived context so the final update survives a
	// cancelled/expired request context.
	updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer updateCancel()

	var updateErrMsg *string
	if strings.TrimSpace(result.ErrorMessage) != "" {
		msg := result.ErrorMessage
		updateErrMsg = &msg
	}
	var resultRequestID *string
	if strings.TrimSpace(result.UpstreamRequestID) != "" {
		v := result.UpstreamRequestID
		resultRequestID = &v
	}

	finalStatus := result.Status
	if strings.TrimSpace(finalStatus) == "" {
		finalStatus = opsRetryStatusFailed
	}

	if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
		ID:              attemptID,
		Status:          finalStatus,
		FinishedAt:      finishedAt,
		DurationMs:      result.DurationMs,
		ResultRequestID: resultRequestID,
		ErrorMessage:    updateErrMsg,
	}); err != nil {
		// Best-effort: retry itself already executed; do not fail the API response.
		log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
	}

	return result, nil
}
|
||||||
|
|
||||||
|
// opsRetryExecution is the internal outcome of a single replay against one
// account, later copied into the public OpsRetryResult.
type opsRetryExecution struct {
	status string // one of the opsRetryStatus* constants

	usedAccountID     *int64
	httpStatusCode    int
	upstreamRequestID string

	responsePreview   string
	responseTruncated bool

	errorMessage string
}
|
||||||
|
|
||||||
|
// executeRetry prepares the replay body for the detected request family and
// dispatches to pinned (upstream) or account-reselecting (client) execution.
// It never returns nil.
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
	if errorLog == nil {
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "missing error log",
		}
	}

	reqType := detectOpsRetryType(errorLog.RequestPath)
	bodyBytes := []byte(errorLog.RequestBody)

	// Messages-family bodies get thinking blocks stripped before replay;
	// other families are replayed verbatim.
	switch reqType {
	case opsRetryTypeMessages:
		bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
	case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
		// No-op
	}

	switch strings.ToLower(strings.TrimSpace(mode)) {
	case OpsRetryModeUpstream:
		if pinnedAccountID == nil || *pinnedAccountID <= 0 {
			return &opsRetryExecution{
				status:       opsRetryStatusFailed,
				errorMessage: "pinned_account_id required for upstream retry",
			}
		}
		return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
	case OpsRetryModeClient:
		return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
	default:
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "invalid retry mode",
		}
	}
}
|
||||||
|
|
||||||
|
func detectOpsRetryType(path string) opsRetryRequestType {
|
||||||
|
p := strings.ToLower(strings.TrimSpace(path))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(p, "/responses"):
|
||||||
|
return opsRetryTypeOpenAI
|
||||||
|
case strings.Contains(p, "/v1beta/"):
|
||||||
|
return opsRetryTypeGeminiV1B
|
||||||
|
default:
|
||||||
|
return opsRetryTypeMessages
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// executePinnedRetry replays the request on one explicitly chosen account.
// The account must exist, be schedulable, and belong to the original
// request's group; a concurrency slot is acquired (and released) when a
// concurrency service is configured.
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
	if s.accountRepo == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
	}

	account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
	if err != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
	}
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
	}
	if !account.IsSchedulable() {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
	}
	// Group check: never replay through an account the original group could
	// not have used.
	if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
		if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
		}
	}

	var release func()
	if s.concurrencyService != nil {
		acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
		if err != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
		}
		if acq == nil || !acq.Acquired {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
		}
		release = acq.ReleaseFunc
	}
	if release != nil {
		defer release()
	}

	usedID := account.ID
	exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
	exec.usedAccountID = &usedID
	if exec.status == "" {
		exec.status = opsRetryStatusFailed
	}
	return exec
}
|
||||||
|
|
||||||
|
// executeClientRetry replays the request the way normal client traffic is
// routed: it re-selects an account from the original group and fails over to
// another account (up to opsRetryMaxAccountSwitches) when the gateway signals
// a failover-worthy upstream error.
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
	groupID := errorLog.GroupID
	if groupID == nil || *groupID <= 0 {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
	}

	model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
	if parsedErr != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
	}
	// stream is parsed for validation only; the replay path does not use it here.
	_ = stream

	excluded := make(map[int64]struct{})
	switches := 0

	for {
		if switches >= opsRetryMaxAccountSwitches {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
		}

		selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
		if selErr != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
		}
		if selection == nil || selection.Account == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
		}

		account := selection.Account
		// Could not get a concurrency slot on this account: exclude and retry.
		if !selection.Acquired || selection.ReleaseFunc == nil {
			excluded[account.ID] = struct{}{}
			switches++
			continue
		}

		// Closure scopes the slot release to this single attempt (defer in a
		// loop would otherwise pile up until function return).
		exec := func() *opsRetryExecution {
			defer selection.ReleaseFunc()
			return s.executeWithAccount(ctx, reqType, errorLog, body, account)
		}()

		if exec != nil {
			if exec.status == opsRetryStatusSucceeded {
				usedID := account.ID
				exec.usedAccountID = &usedID
				return exec
			}
			// If the gateway services ask for failover, try another account.
			if s.isFailoverError(exec.errorMessage) {
				excluded[account.ID] = struct{}{}
				switches++
				continue
			}
			usedID := account.ID
			exec.usedAccountID = &usedID
			return exec
		}

		// nil execution result: treat as a soft failure and move on.
		excluded[account.ID] = struct{}{}
		switches++
	}
}
|
||||||
|
|
||||||
|
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
|
||||||
|
switch reqType {
|
||||||
|
case opsRetryTypeOpenAI:
|
||||||
|
if s.openAIGatewayService == nil {
|
||||||
|
return nil, fmt.Errorf("openai gateway service not available")
|
||||||
|
}
|
||||||
|
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||||
|
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
|
||||||
|
if s.gatewayService == nil {
|
||||||
|
return nil, fmt.Errorf("gateway service not available")
|
||||||
|
}
|
||||||
|
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractRetryModelAndStream pulls the model name and streaming flag needed
// for account selection out of the stored request. Messages and OpenAI
// bodies are parsed; Gemini v1beta requests carry the model in the URL, so
// the values recorded on the error log are used instead.
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
	switch reqType {
	case opsRetryTypeMessages:
		parsed, parseErr := ParseGatewayRequest(body)
		if parseErr != nil {
			return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
		}
		return parsed.Model, parsed.Stream, nil
	case opsRetryTypeOpenAI:
		var v struct {
			Model  string `json:"model"`
			Stream bool   `json:"stream"`
		}
		if err := json.Unmarshal(body, &v); err != nil {
			return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
		}
		return strings.TrimSpace(v.Model), v.Stream, nil
	case opsRetryTypeGeminiV1B:
		if strings.TrimSpace(errorLog.Model) == "" {
			return "", false, fmt.Errorf("missing model for gemini v1beta retry")
		}
		return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
	default:
		return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
	}
}
|
||||||
|
|
||||||
|
// executeWithAccount performs one replay against a concrete account through
// the platform-appropriate gateway, capturing the response into an in-memory
// writer. Success requires both a nil forward error and an HTTP status < 400.
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
	}

	// Synthetic gin context backed by the capped capture writer.
	c, w := newOpsRetryContext(ctx, errorLog)

	var err error
	switch reqType {
	case opsRetryTypeOpenAI:
		if s.openAIGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
		}
		_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
	case opsRetryTypeGeminiV1B:
		// NOTE(review): this requires BOTH gemini services to be non-nil even
		// though only one is used per account platform — confirm whether a
		// gemini-only deployment should be allowed to retry here.
		if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
		}
		modelName := strings.TrimSpace(errorLog.Model)
		action := "generateContent"
		if errorLog.Stream {
			action = "streamGenerateContent"
		}
		if account.Platform == PlatformAntigravity {
			_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
		} else {
			_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
		}
	case opsRetryTypeMessages:
		// Messages traffic dispatches by account platform.
		switch account.Platform {
		case PlatformAntigravity:
			if s.antigravityGatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
			}
			_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
		case PlatformGemini:
			if s.geminiCompatService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
			}
			_, err = s.geminiCompatService.Forward(ctx, c, account, body)
		default:
			if s.gatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
			}
			parsedReq, parseErr := ParseGatewayRequest(body)
			if parseErr != nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
			}
			_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
		}
	default:
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
	}

	// Status comes from gin's writer wrapper, not our capture writer.
	statusCode := http.StatusOK
	if c != nil && c.Writer != nil {
		statusCode = c.Writer.Status()
	}

	upstreamReqID := extractUpstreamRequestID(c)
	preview, truncated := extractResponsePreview(w)

	exec := &opsRetryExecution{
		status:            opsRetryStatusFailed,
		httpStatusCode:    statusCode,
		upstreamRequestID: upstreamReqID,
		responsePreview:   preview,
		responseTruncated: truncated,
		errorMessage:      "",
	}

	if err == nil && statusCode < 400 {
		exec.status = opsRetryStatusSucceeded
		return exec
	}

	if err != nil {
		exec.errorMessage = err.Error()
	} else {
		exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
	}

	return exec
}
|
||||||
|
|
||||||
|
// newOpsRetryContext builds a synthetic gin context for a replay: a POST to
// the original path with a JSON content type, the original user-agent, and a
// small allowlisted subset of the original headers. The response is captured
// by the returned limitedResponseWriter.
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
	w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
	c, _ := gin.CreateTestContext(w)

	path := "/"
	if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
		path = errorLog.RequestPath
	}

	// Body is nil here; gateways receive the replay body as an argument.
	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
	req.Header.Set("content-type", "application/json")
	if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
		req.Header.Set("user-agent", errorLog.UserAgent)
	}
	// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
	// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
	if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
		var stored map[string]string
		if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
			for k, v := range stored {
				key := strings.TrimSpace(k)
				if key == "" {
					continue
				}
				if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
					continue
				}
				val := strings.TrimSpace(v)
				if val == "" {
					continue
				}
				req.Header.Set(key, val)
			}
		}
	}

	c.Request = req
	return c, w
}
|
||||||
|
|
||||||
|
func extractUpstreamRequestID(c *gin.Context) string {
|
||||||
|
if c == nil || c.Writer == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
h := c.Writer.Header()
|
||||||
|
if h == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
|
||||||
|
if v := strings.TrimSpace(h.Get(key)); v != "" {
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
|
||||||
|
if w == nil {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
b := bytes.TrimSpace(w.bodyBytes())
|
||||||
|
if len(b) == 0 {
|
||||||
|
return "", w.truncated()
|
||||||
|
}
|
||||||
|
if len(b) > opsRetryResponsePreviewMax {
|
||||||
|
return string(b[:opsRetryResponsePreviewMax]), true
|
||||||
|
}
|
||||||
|
return string(b), w.truncated()
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsInt64 reports whether needle occurs in items.
func containsInt64(items []int64, needle int64) bool {
	found := false
	for i := 0; i < len(items) && !found; i++ {
		found = items[i] == needle
	}
	return found
}
|
||||||
|
|
||||||
|
func (s *OpsService) isFailoverError(message string) bool {
|
||||||
|
msg := strings.ToLower(strings.TrimSpace(message))
|
||||||
|
if msg == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
|
||||||
|
}
|
||||||
705
backend/internal/service/ops_scheduled_report_service.go
Normal file
705
backend/internal/service/ops_scheduled_report_service.go
Normal file
@@ -0,0 +1,705 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
|
"github.com/robfig/cron/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// opsScheduledReportJobName identifies this background job (heartbeats etc.).
	opsScheduledReportJobName = "ops_scheduled_reports"

	// Leader-election lock key/TTL used in multi-instance deployments.
	opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader"
	opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute

	// Per-report-type key prefix recording when a report last ran.
	opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:"

	// The scheduler wakes up once a minute to evaluate cron schedules.
	opsScheduledReportTickInterval = 1 * time.Minute
)

// opsScheduledReportCronParser accepts standard 5-field cron specs
// (minute hour dom month dow).
var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

// opsScheduledReportReleaseScript atomically releases the leader lock only if
// this instance still owns it (value matches our instance ID).
var opsScheduledReportReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
	return redis.call("DEL", KEYS[1])
end
return 0
`)
|
||||||
|
|
||||||
|
// OpsScheduledReportService periodically generates and emails operational
// reports (daily/weekly summaries, error digests, account health) based on
// cron schedules stored in the email notification config. In multi-instance
// deployments a Redis leader lock ensures only one instance sends.
type OpsScheduledReportService struct {
	opsService   *OpsService
	userService  *UserService
	emailService *EmailService
	redisClient  *redis.Client
	cfg          *config.Config

	instanceID string         // unique per process; used as the leader-lock value
	loc        *time.Location // timezone used to evaluate cron schedules

	distributedLockOn bool      // false in "simple" run mode (single instance)
	warnNoRedisOnce   sync.Once // log the missing-Redis warning only once

	// Lifecycle management for the background goroutine.
	startOnce sync.Once
	stopOnce  sync.Once
	stopCtx   context.Context
	stop      context.CancelFunc
	wg        sync.WaitGroup
}
|
||||||
|
|
||||||
|
// NewOpsScheduledReportService wires up the scheduled-report service.
// Distributed locking is enabled unless the config selects simple run mode,
// and cron schedules are evaluated in the configured timezone (falling back
// to the process-local zone).
func NewOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple

	loc := time.Local
	if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" {
		if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil {
			loc = parsed
		}
	}
	// NOTE(review): zero-value fields are listed explicitly — presumably to
	// satisfy an exhaustive-struct linter; confirm before removing them.
	return &OpsScheduledReportService{
		opsService:   opsService,
		userService:  userService,
		emailService: emailService,
		redisClient:  redisClient,
		cfg:          cfg,

		instanceID:        uuid.NewString(),
		loc:               loc,
		distributedLockOn: lockOn,
		warnNoRedisOnce:   sync.Once{},
		startOnce:         sync.Once{},
		stopOnce:          sync.Once{},
		stopCtx:           nil,
		stop:              nil,
		wg:                sync.WaitGroup{},
	}
}
|
||||||
|
|
||||||
|
// Start launches the scheduler loop with a background context.
func (s *OpsScheduledReportService) Start() {
	s.StartWithContext(context.Background())
}

// StartWithContext launches the scheduler goroutine once. It is a no-op when
// ops monitoring is disabled in config or required dependencies are missing.
// Subsequent calls are ignored (startOnce).
func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) {
	if s == nil {
		return
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.opsService == nil || s.emailService == nil {
		return
	}

	s.startOnce.Do(func() {
		s.stopCtx, s.stop = context.WithCancel(ctx)
		s.wg.Add(1)
		go s.run()
	})
}
|
||||||
|
|
||||||
|
// Stop cancels the scheduler goroutine and blocks until it exits. Safe to
// call multiple times and safe to call when Start was never invoked (the
// zero WaitGroup returns immediately).
func (s *OpsScheduledReportService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.stop != nil {
			s.stop()
		}
	})
	s.wg.Wait()
}
|
||||||
|
|
||||||
|
// run is the scheduler loop: one immediate evaluation at startup, then one
// per tick until the stop context is cancelled.
func (s *OpsScheduledReportService) run() {
	defer s.wg.Done()

	ticker := time.NewTicker(opsScheduledReportTickInterval)
	defer ticker.Stop()

	s.runOnce()
	for {
		select {
		case <-ticker.C:
			s.runOnce()
		case <-s.stopCtx.Done():
			return
		}
	}
}
|
||||||
|
|
||||||
|
// runOnce performs a single scheduler pass: checks the monitoring switch,
// takes the leader lock, and runs every enabled report whose next run time
// has arrived. Heartbeats record the pass outcome.
//
// NOTE(review): a failing report aborts the pass and skips the remaining
// reports until the next tick — confirm that is the intended behavior.
func (s *OpsScheduledReportService) runOnce() {
	if s == nil || s.opsService == nil || s.emailService == nil {
		return
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	// Bound the whole pass; derived from stopCtx so shutdown cancels it.
	ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second)
	defer cancel()

	// Respect ops monitoring enabled switch.
	if !s.opsService.IsMonitoringEnabled(ctx) {
		return
	}

	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}

	// Evaluate cron schedules in the configured timezone.
	now := time.Now()
	if s.loc != nil {
		now = now.In(s.loc)
	}

	reports := s.listScheduledReports(ctx, now)
	if len(reports) == 0 {
		return
	}

	for _, report := range reports {
		if report == nil || !report.Enabled {
			continue
		}
		if report.NextRunAt.After(now) {
			continue
		}

		if err := s.runReport(ctx, report, now); err != nil {
			s.recordHeartbeatError(runAt, time.Since(startedAt), err)
			return
		}
	}

	s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
|
||||||
|
|
||||||
|
// opsScheduledReport is one resolved report definition for a scheduler pass:
// its cron schedule, lookback window, recipients, thresholds, and computed
// last/next run times.
type opsScheduledReport struct {
	Name       string // display name used in the email subject/body
	ReportType string // stable kind key, e.g. "daily_summary"
	Schedule   string // 5-field cron spec
	Enabled    bool

	// TimeRange is the lookback window the report covers.
	TimeRange time.Duration

	Recipients []string

	// Thresholds applied by specific report kinds.
	ErrorDigestMinCount             int
	AccountHealthErrorRateThreshold float64

	LastRunAt *time.Time // nil when the report has never run
	NextRunAt time.Time  // next scheduled run computed from the cron spec
}
|
||||||
|
|
||||||
|
// listScheduledReports resolves the email notification config into the set
// of enabled report definitions with valid cron schedules, computing each
// report's next run time from its last recorded run (or from one minute ago
// when it never ran, so a schedule matching the current minute fires right
// after startup). Invalid cron specs are logged and skipped.
func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport {
	if s == nil || s.opsService == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
	if err != nil || emailCfg == nil {
		return nil
	}
	if !emailCfg.Report.Enabled {
		return nil
	}

	recipients := normalizeEmails(emailCfg.Report.Recipients)

	// Local helper type describing one configurable report kind.
	type reportDef struct {
		enabled   bool
		name      string
		kind      string
		timeRange time.Duration
		schedule  string
	}

	defs := []reportDef{
		{enabled: emailCfg.Report.DailySummaryEnabled, name: "日报", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule},
		{enabled: emailCfg.Report.WeeklySummaryEnabled, name: "周报", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule},
		{enabled: emailCfg.Report.ErrorDigestEnabled, name: "错误摘要", kind: "error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule},
		{enabled: emailCfg.Report.AccountHealthEnabled, name: "账号健康", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule},
	}

	out := make([]*opsScheduledReport, 0, len(defs))
	for _, d := range defs {
		if !d.enabled {
			continue
		}
		spec := strings.TrimSpace(d.schedule)
		if spec == "" {
			continue
		}
		sched, err := opsScheduledReportCronParser.Parse(spec)
		if err != nil {
			log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err)
			continue
		}

		lastRun := s.getLastRunAt(ctx, d.kind)
		base := lastRun
		if base.IsZero() {
			// Allow a schedule matching the current minute to trigger right after startup.
			base = now.Add(-1 * time.Minute)
		}
		next := sched.Next(base)
		if next.IsZero() {
			continue
		}

		// Copy before taking the address so the pointer does not alias the
		// loop-local value on the next iteration.
		var lastRunPtr *time.Time
		if !lastRun.IsZero() {
			lastCopy := lastRun
			lastRunPtr = &lastCopy
		}

		out = append(out, &opsScheduledReport{
			Name:       d.name,
			ReportType: d.kind,
			Schedule:   spec,
			Enabled:    true,

			TimeRange: d.timeRange,

			Recipients: recipients,

			ErrorDigestMinCount:             emailCfg.Report.ErrorDigestMinCount,
			AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold,

			LastRunAt: lastRunPtr,
			NextRunAt: next,
		})
	}

	return out
}
|
||||||
|
|
||||||
|
// runReport executes a single scheduled report: it renders the HTML body and
// sends it to the configured recipients (falling back to the first admin's
// email when none are configured). Per-recipient send failures are ignored
// so one bad address cannot block the rest.
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
	if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute.
	s.setLastRunAt(ctx, report.ReportType, now)

	content, err := s.generateReportHTML(ctx, report, now)
	if err != nil {
		return err
	}
	if strings.TrimSpace(content) == "" {
		// Skip sending when the report decides not to emit content (e.g., digest below min count).
		return nil
	}

	// Recipient resolution: explicit config wins; otherwise best-effort
	// fall back to the first admin account's email.
	recipients := report.Recipients
	if len(recipients) == 0 && s.userService != nil {
		admin, err := s.userService.GetFirstAdmin(ctx)
		if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" {
			recipients = []string{strings.TrimSpace(admin.Email)}
		}
	}
	if len(recipients) == 0 {
		return nil
	}

	subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))

	for _, to := range recipients {
		addr := strings.TrimSpace(to)
		if addr == "" {
			continue
		}
		if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
			// Ignore per-recipient failures; continue best-effort.
			continue
		}
	}
	return nil
}
|
||||||
|
|
||||||
|
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
|
||||||
|
if s == nil || s.opsService == nil || report == nil {
|
||||||
|
return "", fmt.Errorf("service not initialized")
|
||||||
|
}
|
||||||
|
if report.TimeRange <= 0 {
|
||||||
|
return "", fmt.Errorf("invalid time range")
|
||||||
|
}
|
||||||
|
|
||||||
|
end := now.UTC()
|
||||||
|
start := end.Add(-report.TimeRange)
|
||||||
|
|
||||||
|
switch strings.TrimSpace(report.ReportType) {
|
||||||
|
case "daily_summary", "weekly_summary":
|
||||||
|
overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
Platform: "",
|
||||||
|
GroupID: nil,
|
||||||
|
QueryMode: OpsQueryModeAuto,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
// If pre-aggregation isn't ready but the report is requested, fall back to raw.
|
||||||
|
if strings.TrimSpace(report.ReportType) == "daily_summary" || strings.TrimSpace(report.ReportType) == "weekly_summary" {
|
||||||
|
overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||||
|
StartTime: start,
|
||||||
|
EndTime: end,
|
||||||
|
Platform: "",
|
||||||
|
GroupID: nil,
|
||||||
|
QueryMode: OpsQueryModeRaw,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil
|
||||||
|
case "error_digest":
|
||||||
|
// Lightweight digest: list recent errors (status>=400) and breakdown by type.
|
||||||
|
startTime := start
|
||||||
|
endTime := end
|
||||||
|
filter := &OpsErrorLogFilter{
|
||||||
|
StartTime: &startTime,
|
||||||
|
EndTime: &endTime,
|
||||||
|
Page: 1,
|
||||||
|
PageSize: 100,
|
||||||
|
}
|
||||||
|
out, err := s.opsService.GetErrorLogs(ctx, filter)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil
|
||||||
|
case "account_health":
|
||||||
|
// Best-effort: use account availability (not error rate yet).
|
||||||
|
avail, err := s.opsService.GetAccountAvailability(ctx, "", nil)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
_ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report
|
||||||
|
return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("unknown report type: %s", report.ReportType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string {
|
||||||
|
if overview == nil {
|
||||||
|
return fmt.Sprintf("<h2>%s</h2><p>No data.</p>", htmlEscape(title))
|
||||||
|
}
|
||||||
|
|
||||||
|
latP50 := "-"
|
||||||
|
latP99 := "-"
|
||||||
|
if overview.Duration.P50 != nil {
|
||||||
|
latP50 = fmt.Sprintf("%dms", *overview.Duration.P50)
|
||||||
|
}
|
||||||
|
if overview.Duration.P99 != nil {
|
||||||
|
latP99 = fmt.Sprintf("%dms", *overview.Duration.P99)
|
||||||
|
}
|
||||||
|
|
||||||
|
ttftP50 := "-"
|
||||||
|
ttftP99 := "-"
|
||||||
|
if overview.TTFT.P50 != nil {
|
||||||
|
ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50)
|
||||||
|
}
|
||||||
|
if overview.TTFT.P99 != nil {
|
||||||
|
ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99)
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<ul>
|
||||||
|
<li><b>Total Requests</b>: %d</li>
|
||||||
|
<li><b>Success</b>: %d</li>
|
||||||
|
<li><b>Errors (SLA)</b>: %d</li>
|
||||||
|
<li><b>Business Limited</b>: %d</li>
|
||||||
|
<li><b>SLA</b>: %.2f%%</li>
|
||||||
|
<li><b>Error Rate</b>: %.2f%%</li>
|
||||||
|
<li><b>Upstream Error Rate (excl 429/529)</b>: %.2f%%</li>
|
||||||
|
<li><b>Upstream Errors</b>: excl429/529=%d, 429=%d, 529=%d</li>
|
||||||
|
<li><b>Latency</b>: p50=%s, p99=%s</li>
|
||||||
|
<li><b>TTFT</b>: p50=%s, p99=%s</li>
|
||||||
|
<li><b>Tokens</b>: %d</li>
|
||||||
|
<li><b>QPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
|
||||||
|
<li><b>TPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
|
||||||
|
</ul>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
overview.RequestCountTotal,
|
||||||
|
overview.SuccessCount,
|
||||||
|
overview.ErrorCountSLA,
|
||||||
|
overview.BusinessLimitedCount,
|
||||||
|
overview.SLA*100,
|
||||||
|
overview.ErrorRate*100,
|
||||||
|
overview.UpstreamErrorRate*100,
|
||||||
|
overview.UpstreamErrorCountExcl429529,
|
||||||
|
overview.Upstream429Count,
|
||||||
|
overview.Upstream529Count,
|
||||||
|
htmlEscape(latP50),
|
||||||
|
htmlEscape(latP99),
|
||||||
|
htmlEscape(ttftP50),
|
||||||
|
htmlEscape(ttftP99),
|
||||||
|
overview.TokenConsumed,
|
||||||
|
overview.QPS.Current,
|
||||||
|
overview.QPS.Peak,
|
||||||
|
overview.QPS.Avg,
|
||||||
|
overview.TPS.Current,
|
||||||
|
overview.TPS.Peak,
|
||||||
|
overview.TPS.Avg,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string {
|
||||||
|
total := 0
|
||||||
|
recent := []*OpsErrorLog{}
|
||||||
|
if list != nil {
|
||||||
|
total = list.Total
|
||||||
|
recent = list.Errors
|
||||||
|
}
|
||||||
|
if len(recent) > 10 {
|
||||||
|
recent = recent[:10]
|
||||||
|
}
|
||||||
|
|
||||||
|
rows := ""
|
||||||
|
for _, item := range recent {
|
||||||
|
if item == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rows += fmt.Sprintf(
|
||||||
|
"<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>",
|
||||||
|
htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(item.Platform),
|
||||||
|
item.StatusCode,
|
||||||
|
htmlEscape(truncateString(item.Message, 180)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if rows == "" {
|
||||||
|
rows = "<tr><td colspan=\"4\">No recent errors.</td></tr>"
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<p><b>Total Errors</b>: %d</p>
|
||||||
|
<h3>Recent</h3>
|
||||||
|
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;">
|
||||||
|
<thead><tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr></thead>
|
||||||
|
<tbody>%s</tbody>
|
||||||
|
</table>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
total,
|
||||||
|
rows,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string {
|
||||||
|
total := 0
|
||||||
|
available := 0
|
||||||
|
rateLimited := 0
|
||||||
|
hasError := 0
|
||||||
|
|
||||||
|
if avail != nil && avail.Accounts != nil {
|
||||||
|
for _, a := range avail.Accounts {
|
||||||
|
if a == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total++
|
||||||
|
if a.IsAvailable {
|
||||||
|
available++
|
||||||
|
}
|
||||||
|
if a.IsRateLimited {
|
||||||
|
rateLimited++
|
||||||
|
}
|
||||||
|
if a.HasError {
|
||||||
|
hasError++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
<h2>%s</h2>
|
||||||
|
<p><b>Period</b>: %s ~ %s (UTC)</p>
|
||||||
|
<ul>
|
||||||
|
<li><b>Total Accounts</b>: %d</li>
|
||||||
|
<li><b>Available</b>: %d</li>
|
||||||
|
<li><b>Rate Limited</b>: %d</li>
|
||||||
|
<li><b>Error</b>: %d</li>
|
||||||
|
</ul>
|
||||||
|
<p>Note: This report currently reflects account availability status only.</p>
|
||||||
|
`,
|
||||||
|
htmlEscape(strings.TrimSpace(title)),
|
||||||
|
htmlEscape(start.UTC().Format(time.RFC3339)),
|
||||||
|
htmlEscape(end.UTC().Format(time.RFC3339)),
|
||||||
|
total,
|
||||||
|
available,
|
||||||
|
rateLimited,
|
||||||
|
hasError,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// tryAcquireLeaderLock attempts to become the single scheduler instance via a
// Redis SetNX lease. It returns (release, ok): release is non-nil only when a
// Redis lock was actually taken and must be called to free it; ok=false means
// another instance holds the lock (or Redis failed) and this cycle should be
// skipped. When distributed locking is off or Redis is absent, it returns
// (nil, true) so a single-node deployment keeps working.
func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil || !s.distributedLockOn {
		return nil, true
	}
	if s.redisClient == nil {
		// Warn only once per process to avoid log spam.
		s.warnNoRedisOnce.Do(func() {
			log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock")
		})
		return nil, true
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Defensive defaults in case the package-level constants are misconfigured.
	key := opsScheduledReportLeaderLockKeyDefault
	ttl := opsScheduledReportLeaderLockTTLDefault
	if strings.TrimSpace(key) == "" {
		key = "ops:scheduled_reports:leader"
	}
	if ttl <= 0 {
		ttl = 5 * time.Minute
	}

	ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
	if err != nil {
		// Prefer fail-closed to avoid duplicate report sends when Redis is flaky.
		log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err)
		return nil, false
	}
	if !ok {
		return nil, false
	}
	// Release via Lua script so we only delete the key if we still own it
	// (instanceID match), preventing deletion of a successor's lock.
	return func() {
		_, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
	}, true
}
|
||||||
|
|
||||||
|
// getLastRunAt loads the persisted last-run timestamp (unix seconds in Redis)
// for the given report type. It returns the zero time when Redis is absent,
// the key is missing, or the stored value is unparsable.
func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time {
	if s == nil || s.redisClient == nil {
		return time.Time{}
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return time.Time{}
	}
	key := opsScheduledReportLastRunKeyPrefix + kind

	raw, err := s.redisClient.Get(ctx, key).Result()
	if err != nil || strings.TrimSpace(raw) == "" {
		return time.Time{}
	}
	sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
	if err != nil || sec <= 0 {
		return time.Time{}
	}
	last := time.Unix(sec, 0)
	// Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time
	// passed into cron.Next() uses the same location; otherwise the job will drift by timezone
	// offset (e.g. Asia/Shanghai default would run 8h later after the first execution).
	if s.loc != nil {
		return last.In(s.loc)
	}
	return last.UTC()
}
|
||||||
|
|
||||||
|
// setLastRunAt persists the last-run timestamp for a report type as unix
// seconds with a 14-day TTL. Best-effort: errors are ignored and a zero t is
// replaced with the current time. No-op without a Redis client.
func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) {
	if s == nil || s.redisClient == nil {
		return
	}
	kind := strings.TrimSpace(reportType)
	if kind == "" {
		return
	}
	if t.IsZero() {
		t = time.Now().UTC()
	}
	key := opsScheduledReportLastRunKeyPrefix + kind
	// TTL comfortably exceeds the longest schedule gap (weekly) so the marker
	// survives between runs but eventually self-cleans.
	_ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err()
}
|
||||||
|
|
||||||
|
// recordHeartbeatSuccess best-effort upserts a success heartbeat for the
// scheduled-report job. Uses a fresh 2s-timeout context so heartbeat writes
// can never block or inherit cancellation from the caller.
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
	if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// Errors are deliberately ignored: heartbeats are observability only.
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &now,
		LastDurationMs: &durMs,
	})
}
|
||||||
|
|
||||||
|
// recordHeartbeatError best-effort upserts a failure heartbeat (with a
// truncated error message) for the scheduled-report job. No-op when err is
// nil or the ops repository is unavailable.
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	// Cap stored error text to keep the heartbeat row small.
	msg := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// Errors are deliberately ignored: heartbeats are observability only.
	_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsScheduledReportJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &now,
		LastError:      &msg,
		LastDurationMs: &durMs,
	})
}
|
||||||
|
|
||||||
|
// normalizeEmails lowercases and trims each address, drops empties, and
// removes duplicates while preserving first-seen order. Returns nil for an
// empty input.
func normalizeEmails(in []string) []string {
	if len(in) == 0 {
		return nil
	}
	dedup := make(map[string]struct{}, len(in))
	result := make([]string, 0, len(in))
	for _, candidate := range in {
		addr := strings.ToLower(strings.TrimSpace(candidate))
		if addr == "" {
			continue
		}
		if _, dup := dedup[addr]; !dup {
			dedup[addr] = struct{}{}
			result = append(result, addr)
		}
	}
	return result
}
|
||||||
537
backend/internal/service/ops_service.go
Normal file
537
backend/internal/service/ops_service.go
Normal file
@@ -0,0 +1,537 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrOpsDisabled is returned by ops query APIs when monitoring is switched
// off (via the config hard switch or the runtime setting).
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")

const (
	// Size caps for sanitized payloads persisted alongside error logs.
	opsMaxStoredRequestBodyBytes = 10 * 1024
	opsMaxStoredErrorBodyBytes   = 20 * 1024
)
|
||||||
|
|
||||||
|
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
	opsRepo     OpsRepository     // persistence for error logs, metrics, heartbeats
	settingRepo SettingRepository // runtime on/off switch lookup
	cfg         *config.Config    // hard enable/disable switch

	accountRepo AccountRepository

	// getAccountAvailability is a unit-test hook for overriding account availability lookup.
	getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)

	// Gateway/concurrency services consulted for live runtime state.
	concurrencyService        *ConcurrencyService
	gatewayService            *GatewayService
	openAIGatewayService      *OpenAIGatewayService
	geminiCompatService       *GeminiMessagesCompatService
	antigravityGatewayService *AntigravityGatewayService
}
|
||||||
|
|
||||||
|
// NewOpsService wires an OpsService from its repositories, config, and the
// gateway-side services it inspects. Nil dependencies are tolerated; the
// service degrades to best-effort no-ops where a dependency is missing.
func NewOpsService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	cfg *config.Config,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	gatewayService *GatewayService,
	openAIGatewayService *OpenAIGatewayService,
	geminiCompatService *GeminiMessagesCompatService,
	antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
	return &OpsService{
		opsRepo:     opsRepo,
		settingRepo: settingRepo,
		cfg:         cfg,

		accountRepo: accountRepo,

		concurrencyService:        concurrencyService,
		gatewayService:            gatewayService,
		openAIGatewayService:      openAIGatewayService,
		geminiCompatService:       geminiCompatService,
		antigravityGatewayService: antigravityGatewayService,
	}
}
|
||||||
|
|
||||||
|
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
|
||||||
|
if s.IsMonitoringEnabled(ctx) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return ErrOpsDisabled
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
|
||||||
|
// Hard switch: disable ops entirely.
|
||||||
|
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if s.settingRepo == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||||
|
if err != nil {
|
||||||
|
// Default enabled when key is missing, and fail-open on transient errors
|
||||||
|
// (ops should never block gateway traffic).
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
|
case "false", "0", "off", "disabled":
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordError sanitizes and persists one gateway error-log entry. It is
// best-effort by design: it silently no-ops when monitoring is disabled or
// the repository is missing, and a failed insert is logged but still returned
// so callers can decide whether to care. rawRequestBody is only stored after
// redaction and size trimming; all upstream error text is sanitized too.
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
	if entry == nil {
		return nil
	}
	if !s.IsMonitoringEnabled(ctx) {
		return nil
	}
	if s.opsRepo == nil {
		return nil
	}

	// Ensure timestamps are always populated.
	if entry.CreatedAt.IsZero() {
		entry.CreatedAt = time.Now()
	}

	// Ensure required fields exist (DB has NOT NULL constraints).
	entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
	entry.ErrorType = strings.TrimSpace(entry.ErrorType)
	if entry.ErrorPhase == "" {
		entry.ErrorPhase = "internal"
	}
	if entry.ErrorType == "" {
		entry.ErrorType = "api_error"
	}

	// Sanitize + trim request body (errors only).
	if len(rawRequestBody) > 0 {
		sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
		if sanitized != "" {
			entry.RequestBodyJSON = &sanitized
		}
		entry.RequestBodyTruncated = truncated
		entry.RequestBodyBytes = &bytesLen
	}

	// Sanitize + truncate error_body to avoid storing sensitive data.
	if strings.TrimSpace(entry.ErrorBody) != "" {
		sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
		entry.ErrorBody = sanitized
	}

	// Sanitize upstream error context if provided by gateway services.
	// Non-positive status codes are treated as "unknown" and dropped.
	if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 {
		entry.UpstreamStatusCode = nil
	}
	if entry.UpstreamErrorMessage != nil {
		msg := strings.TrimSpace(*entry.UpstreamErrorMessage)
		msg = sanitizeUpstreamErrorMessage(msg)
		msg = truncateString(msg, 2048)
		if strings.TrimSpace(msg) == "" {
			entry.UpstreamErrorMessage = nil
		} else {
			entry.UpstreamErrorMessage = &msg
		}
	}
	if entry.UpstreamErrorDetail != nil {
		detail := strings.TrimSpace(*entry.UpstreamErrorDetail)
		if detail == "" {
			entry.UpstreamErrorDetail = nil
		} else {
			sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
			if strings.TrimSpace(sanitized) == "" {
				entry.UpstreamErrorDetail = nil
			} else {
				entry.UpstreamErrorDetail = &sanitized
			}
		}
	}

	// Sanitize + serialize upstream error events list.
	if len(entry.UpstreamErrors) > 0 {
		// Keep only the newest events to bound stored size.
		const maxEvents = 32
		events := entry.UpstreamErrors
		if len(events) > maxEvents {
			events = events[len(events)-maxEvents:]
		}

		sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events))
		for _, ev := range events {
			if ev == nil {
				continue
			}
			// Work on a copy so the caller's event is never mutated.
			out := *ev

			out.Platform = strings.TrimSpace(out.Platform)
			out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128)
			out.Kind = truncateString(strings.TrimSpace(out.Kind), 64)

			// Clamp negative numeric fields to zero ("unknown").
			if out.AccountID < 0 {
				out.AccountID = 0
			}
			if out.UpstreamStatusCode < 0 {
				out.UpstreamStatusCode = 0
			}
			if out.AtUnixMs < 0 {
				out.AtUnixMs = 0
			}

			msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message))
			msg = truncateString(msg, 2048)
			out.Message = msg

			detail := strings.TrimSpace(out.Detail)
			if detail != "" {
				// Keep upstream detail small; request bodies are not stored here, only upstream error payloads.
				sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
				out.Detail = sanitizedDetail
			} else {
				out.Detail = ""
			}

			// Drop fully-empty events (can happen if only status code was known).
			if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
				continue
			}

			evCopy := out
			sanitized = append(sanitized, &evCopy)
		}

		// Store the serialized form; the in-memory slice is not persisted.
		entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized)
		entry.UpstreamErrors = nil
	}

	if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Never bubble up to gateway; best-effort logging.
		log.Printf("[Ops] RecordError failed: %v", err)
		return err
	}
	return nil
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
|
||||||
|
}
|
||||||
|
return s.opsRepo.ListErrorLogs(ctx, filter)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
|
||||||
|
}
|
||||||
|
return detail, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeAndTrimRequestBody redacts sensitive keys from a JSON request body
// and shrinks it to at most maxBytes using progressively lossier strategies:
// (1) full redacted body, (2) drop oldest conversation entries,
// (3) keep only essential fields plus the last message, (4) a minimal
// placeholder. Returns the stored JSON string, whether it was truncated, and
// the ORIGINAL byte length. Non-JSON input is not stored at all.
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
	bytesLen = len(raw)
	if len(raw) == 0 {
		return "", false, 0
	}

	var decoded any
	if err := json.Unmarshal(raw, &decoded); err != nil {
		// If it's not valid JSON, don't store (retry would not be reliable anyway).
		return "", false, bytesLen
	}

	decoded = redactSensitiveJSON(decoded)

	encoded, err := json.Marshal(decoded)
	if err != nil {
		return "", false, bytesLen
	}
	if len(encoded) <= maxBytes {
		// Fits without truncation after redaction.
		return string(encoded), false, bytesLen
	}

	// Trim conversation history to keep the most recent context.
	if root, ok := decoded.(map[string]any); ok {
		if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
			encoded2, err2 := json.Marshal(trimmed)
			if err2 == nil && len(encoded2) <= maxBytes {
				return string(encoded2), true, bytesLen
			}
			// Fallthrough: keep shrinking.
			decoded = trimmed
		}

		// Stage 3: essentials only (model/params + last conversation entry).
		essential := shrinkToEssentials(root)
		encoded3, err3 := json.Marshal(essential)
		if err3 == nil && len(encoded3) <= maxBytes {
			return string(encoded3), true, bytesLen
		}
	}

	// Last resort: store a minimal placeholder (still valid JSON).
	placeholder := map[string]any{
		"request_body_truncated": true,
	}
	if model := extractString(decoded, "model"); model != "" {
		placeholder["model"] = model
	}
	encoded4, err4 := json.Marshal(placeholder)
	if err4 != nil {
		return "", true, bytesLen
	}
	return string(encoded4), true, bytesLen
}
|
||||||
|
|
||||||
|
func redactSensitiveJSON(v any) any {
|
||||||
|
switch t := v.(type) {
|
||||||
|
case map[string]any:
|
||||||
|
out := make(map[string]any, len(t))
|
||||||
|
for k, vv := range t {
|
||||||
|
if isSensitiveKey(k) {
|
||||||
|
out[k] = "[REDACTED]"
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[k] = redactSensitiveJSON(vv)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
case []any:
|
||||||
|
out := make([]any, 0, len(t))
|
||||||
|
for _, vv := range t {
|
||||||
|
out = append(out, redactSensitiveJSON(vv))
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
default:
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSensitiveKey reports whether a JSON key looks like it carries credentials
// or other secrets. Matching is case-insensitive and checks, in order: exact
// well-known field names, credential-like suffixes, then broad substrings
// (deliberately aggressive — privacy over recall).
func isSensitiveKey(key string) bool {
	k := strings.ToLower(strings.TrimSpace(key))
	if k == "" {
		return false
	}

	// Exact matches (common credential fields).
	switch k {
	case "authorization", "proxy-authorization", "x-api-key", "api_key", "apikey",
		"access_token", "refresh_token", "id_token", "session_token", "token",
		"password", "passwd", "passphrase", "secret", "client_secret",
		"private_key", "jwt", "signature", "accesskeyid", "secretaccesskey":
		return true
	}

	// Suffix matches.
	suffixes := []string{
		"_secret", "_token", "_id_token", "_session_token", "_password",
		"_passwd", "_passphrase", "_key", "secret_key", "private_key",
	}
	for _, suffix := range suffixes {
		if strings.HasSuffix(k, suffix) {
			return true
		}
	}

	// Substring matches (conservative, but errs on the side of privacy).
	fragments := []string{
		"secret", "token", "password", "passwd", "passphrase", "privatekey",
		"private_key", "apikey", "api_key", "accesskeyid", "secretaccesskey",
		"bearer", "cookie", "credential", "session", "jwt", "signature",
	}
	for _, fragment := range fragments {
		if strings.Contains(k, fragment) {
			return true
		}
	}

	return false
}
|
||||||
|
|
||||||
|
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
|
||||||
|
// Supported: anthropic/openai: messages; gemini: contents.
|
||||||
|
if out, ok := trimArrayField(root, "messages", maxBytes); ok {
|
||||||
|
return out, true
|
||||||
|
}
|
||||||
|
if out, ok := trimArrayField(root, "contents", maxBytes); ok {
|
||||||
|
return out, true
|
||||||
|
}
|
||||||
|
return root, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// trimArrayField drops elements from the FRONT of root[field] (oldest context
// first) until the whole encoded object fits within maxBytes, keeping at
// least the last element. A binary search over the drop point avoids
// re-marshaling O(n) times. Returns (nil, false) when the field is absent or
// not a non-empty array; otherwise returns a shallow copy of root with the
// trimmed array and true — even if only the single last element still doesn't
// fit (the caller then falls back to shrinkToEssentials).
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
	raw, ok := root[field]
	if !ok {
		return nil, false
	}
	arr, ok := raw.([]any)
	if !ok || len(arr) == 0 {
		return nil, false
	}

	// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
	// We are dropping from the *front* of the array (oldest context first).
	lo := 0
	hi := len(arr) - 1 // inclusive; hi ensures at least one item remains

	var best map[string]any
	found := false

	for lo <= hi {
		mid := (lo + hi) / 2
		candidateArr := arr[mid:]
		if len(candidateArr) == 0 {
			lo = mid + 1
			continue
		}

		// Re-encode a shallow copy with the candidate suffix to measure size.
		next := shallowCopyMap(root)
		next[field] = candidateArr
		encoded, err := json.Marshal(next)
		if err != nil {
			// If marshal fails, try dropping more.
			lo = mid + 1
			continue
		}

		if len(encoded) <= maxBytes {
			best = next
			found = true
			// Try to keep more context by dropping fewer items.
			hi = mid - 1
			continue
		}

		// Need to drop more.
		lo = mid + 1
	}

	if found {
		return best, true
	}

	// Nothing fit (even with only one element); return the smallest slice and let the
	// caller fall back to shrinkToEssentials().
	next := shallowCopyMap(root)
	next[field] = arr[len(arr)-1:]
	return next, true
}
|
||||||
|
|
||||||
|
// shrinkToEssentials builds a minimal copy of the request payload: only the
// core sampling/model parameters plus the last element of the conversation
// array ("messages" or "contents"). Used as the last resort when trimming
// cannot make the payload fit.
func shrinkToEssentials(root map[string]any) map[string]any {
	essentials := make(map[string]any)

	// Carry over scalar request parameters when present.
	for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
		if value, present := root[key]; present {
			essentials[key] = value
		}
	}

	// Keep only the newest conversation entry from either array field.
	for _, field := range []string{"messages", "contents"} {
		value, present := root[field]
		if !present {
			continue
		}
		arr, isSlice := value.([]any)
		if isSlice && len(arr) > 0 {
			essentials[field] = []any{arr[len(arr)-1]}
		}
	}

	return essentials
}
|
||||||
|
|
||||||
|
// shallowCopyMap returns a new map with the same key/value pairs as m.
// Values are not deep-copied: both maps share any underlying slices/maps.
func shallowCopyMap(m map[string]any) map[string]any {
	dup := make(map[string]any, len(m))
	for key, value := range m {
		dup[key] = value
	}
	return dup
}
|
||||||
|
|
||||||
|
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefer JSON-safe sanitization when possible.
|
||||||
|
if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
|
||||||
|
return out, trunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-JSON: best-effort truncate.
|
||||||
|
if maxBytes > 0 && len(raw) > maxBytes {
|
||||||
|
return truncateString(raw, maxBytes), true
|
||||||
|
}
|
||||||
|
return raw, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractString returns the trimmed string stored at key in v, when v is a
// map[string]any and the value is a string; otherwise it returns "".
func extractString(v any, key string) string {
	if root, ok := v.(map[string]any); ok {
		if s, ok := root[key].(string); ok {
			return strings.TrimSpace(s)
		}
	}
	return ""
}
|
||||||
465
backend/internal/service/ops_settings.go
Normal file
465
backend/internal/service/ops_settings.go
Normal file
@@ -0,0 +1,465 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Defaults for the alert evaluator's leader-election lock (used when the
// stored distributed-lock settings are missing or blank).
const (
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Email notification config
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
|
||||||
|
defaultCfg := defaultOpsEmailNotificationConfig()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
// Initialize defaults on first read (best-effort).
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsEmailNotificationConfig{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
// Corrupted JSON should not break ops UI; fall back to defaults.
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
normalizeOpsEmailNotificationConfig(cfg)
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateEmailNotificationConfig applies a partial update on top of the
// currently stored config, validates the merged result, normalizes it, and
// persists it. Returns the stored config.
//
// Merge semantics: a nil Alert/Report section leaves that section untouched.
// Within a provided section, a nil Recipients slice preserves the existing
// recipients, while every other field (booleans and numerics included) is
// overwritten from the request.
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if req == nil {
		return nil, errors.New("invalid request")
	}

	// Start from the stored config (or defaults) so omitted sections persist.
	cfg, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}

	if req.Alert != nil {
		cfg.Alert.Enabled = req.Alert.Enabled
		if req.Alert.Recipients != nil {
			cfg.Alert.Recipients = req.Alert.Recipients
		}
		cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
		cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
		cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
		cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
	}

	if req.Report != nil {
		cfg.Report.Enabled = req.Report.Enabled
		if req.Report.Recipients != nil {
			cfg.Report.Recipients = req.Report.Recipients
		}
		cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
		cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
		cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
		cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
		cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
		cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
		cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
		cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
		cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
		cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
	}

	// Validate before normalization so invalid input is rejected rather than
	// silently rewritten; normalization then backfills empty schedules.
	if err := validateOpsEmailNotificationConfig(cfg); err != nil {
		return nil, err
	}

	normalizeOpsEmailNotificationConfig(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
		return nil, err
	}
	return cfg, nil
}
|
||||||
|
|
||||||
|
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
|
||||||
|
return &OpsEmailNotificationConfig{
|
||||||
|
Alert: OpsEmailAlertConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Recipients: []string{},
|
||||||
|
MinSeverity: "",
|
||||||
|
RateLimitPerHour: 0,
|
||||||
|
BatchingWindowSeconds: 0,
|
||||||
|
IncludeResolvedAlerts: false,
|
||||||
|
},
|
||||||
|
Report: OpsEmailReportConfig{
|
||||||
|
Enabled: false,
|
||||||
|
Recipients: []string{},
|
||||||
|
DailySummaryEnabled: false,
|
||||||
|
DailySummarySchedule: "0 9 * * *",
|
||||||
|
WeeklySummaryEnabled: false,
|
||||||
|
WeeklySummarySchedule: "0 9 * * 1",
|
||||||
|
ErrorDigestEnabled: false,
|
||||||
|
ErrorDigestSchedule: "0 9 * * *",
|
||||||
|
ErrorDigestMinCount: 10,
|
||||||
|
AccountHealthEnabled: false,
|
||||||
|
AccountHealthSchedule: "0 9 * * *",
|
||||||
|
AccountHealthErrorRateThreshold: 10.0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if cfg.Alert.Recipients == nil {
|
||||||
|
cfg.Alert.Recipients = []string{}
|
||||||
|
}
|
||||||
|
if cfg.Report.Recipients == nil {
|
||||||
|
cfg.Report.Recipients = []string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
|
||||||
|
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
|
||||||
|
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
|
||||||
|
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
|
||||||
|
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
|
||||||
|
|
||||||
|
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
|
||||||
|
if cfg.Report.DailySummarySchedule == "" {
|
||||||
|
cfg.Report.DailySummarySchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
if cfg.Report.WeeklySummarySchedule == "" {
|
||||||
|
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
|
||||||
|
}
|
||||||
|
if cfg.Report.ErrorDigestSchedule == "" {
|
||||||
|
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
if cfg.Report.AccountHealthSchedule == "" {
|
||||||
|
cfg.Report.AccountHealthSchedule = "0 9 * * *"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
|
||||||
|
if cfg == nil {
|
||||||
|
return errors.New("invalid config")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Alert.RateLimitPerHour < 0 {
|
||||||
|
return errors.New("alert.rate_limit_per_hour must be >= 0")
|
||||||
|
}
|
||||||
|
if cfg.Alert.BatchingWindowSeconds < 0 {
|
||||||
|
return errors.New("alert.batching_window_seconds must be >= 0")
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
|
||||||
|
case "", "critical", "warning", "info":
|
||||||
|
default:
|
||||||
|
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Report.ErrorDigestMinCount < 0 {
|
||||||
|
return errors.New("report.error_digest_min_count must be >= 0")
|
||||||
|
}
|
||||||
|
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
|
||||||
|
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Alert runtime settings
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
|
||||||
|
return &OpsAlertRuntimeSettings{
|
||||||
|
EvaluationIntervalSeconds: 60,
|
||||||
|
DistributedLock: OpsDistributedLockSettings{
|
||||||
|
Enabled: true,
|
||||||
|
Key: opsAlertEvaluatorLeaderLockKeyDefault,
|
||||||
|
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
|
||||||
|
},
|
||||||
|
Silencing: OpsAlertSilencingSettings{
|
||||||
|
Enabled: false,
|
||||||
|
GlobalUntilRFC3339: "",
|
||||||
|
GlobalReason: "",
|
||||||
|
Entries: []OpsAlertSilenceEntry{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.Key = strings.TrimSpace(s.Key)
|
||||||
|
if s.Key == "" {
|
||||||
|
s.Key = defaultKey
|
||||||
|
}
|
||||||
|
if s.TTLSeconds <= 0 {
|
||||||
|
s.TTLSeconds = defaultTTLSeconds
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
|
||||||
|
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
|
||||||
|
if s.Entries == nil {
|
||||||
|
s.Entries = []OpsAlertSilenceEntry{}
|
||||||
|
}
|
||||||
|
for i := range s.Entries {
|
||||||
|
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
|
||||||
|
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
|
||||||
|
if strings.TrimSpace(s.Key) == "" {
|
||||||
|
return errors.New("distributed_lock.key is required")
|
||||||
|
}
|
||||||
|
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
|
||||||
|
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
|
||||||
|
parse := func(raw string) error {
|
||||||
|
if strings.TrimSpace(raw) == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, err := time.Parse(time.RFC3339, raw); err != nil {
|
||||||
|
return errors.New("silencing time must be RFC3339")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := parse(s.GlobalUntilRFC3339); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range s.Entries {
|
||||||
|
if strings.TrimSpace(entry.UntilRFC3339) == "" {
|
||||||
|
return errors.New("silencing.entries.until_rfc3339 is required")
|
||||||
|
}
|
||||||
|
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
|
||||||
|
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
|
||||||
|
defaultCfg := defaultOpsAlertRuntimeSettings()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsAlertRuntimeSettings{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.EvaluationIntervalSeconds <= 0 {
|
||||||
|
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
|
||||||
|
}
|
||||||
|
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
|
||||||
|
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
|
||||||
|
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateOpsAlertRuntimeSettings validates, normalizes, and persists the
// alert evaluator runtime settings, returning a detached copy of the stored
// value (decoded back from the persisted JSON) so callers cannot alias
// internal slices.
//
// Note: the distributed-lock and silencing sub-configs are only validated
// when their Enabled flags are set; normalization still backfills the lock
// key/TTL defaults regardless, so a disabled section cannot persist a blank
// lock key.
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	// The interval is always validated — the evaluator ticks on it even when
	// locking/silencing are disabled.
	if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if cfg.DistributedLock.Enabled {
		if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
			return nil, err
		}
	}
	if cfg.Silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
			return nil, err
		}
	}

	defaultCfg := defaultOpsAlertRuntimeSettings()
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
		return nil, err
	}

	// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
	updated := &OpsAlertRuntimeSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
|
||||||
|
|
||||||
|
// =========================
|
||||||
|
// Advanced settings
|
||||||
|
// =========================
|
||||||
|
|
||||||
|
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
|
||||||
|
return &OpsAdvancedSettings{
|
||||||
|
DataRetention: OpsDataRetentionSettings{
|
||||||
|
CleanupEnabled: false,
|
||||||
|
CleanupSchedule: "0 2 * * *",
|
||||||
|
ErrorLogRetentionDays: 30,
|
||||||
|
MinuteMetricsRetentionDays: 30,
|
||||||
|
HourlyMetricsRetentionDays: 30,
|
||||||
|
},
|
||||||
|
Aggregation: OpsAggregationSettings{
|
||||||
|
AggregationEnabled: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
|
||||||
|
if cfg.DataRetention.CleanupSchedule == "" {
|
||||||
|
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.ErrorLogRetentionDays = 30
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.MinuteMetricsRetentionDays = 30
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
|
||||||
|
cfg.DataRetention.HourlyMetricsRetentionDays = 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
||||||
|
if cfg == nil {
|
||||||
|
return errors.New("invalid config")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
|
||||||
|
return errors.New("error_log_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
|
||||||
|
return errors.New("minute_metrics_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
||||||
|
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
|
||||||
|
defaultCfg := defaultOpsAdvancedSettings()
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrSettingNotFound) {
|
||||||
|
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||||
|
_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
|
||||||
|
}
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &OpsAdvancedSettings{}
|
||||||
|
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||||
|
return defaultCfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
normalizeOpsAdvancedSettings(cfg)
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
|
||||||
|
if s == nil || s.settingRepo == nil {
|
||||||
|
return nil, errors.New("setting repository not initialized")
|
||||||
|
}
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
if cfg == nil {
|
||||||
|
return nil, errors.New("invalid config")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := validateOpsAdvancedSettings(cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
normalizeOpsAdvancedSettings(cfg)
|
||||||
|
raw, err := json.Marshal(cfg)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
updated := &OpsAdvancedSettings{}
|
||||||
|
_ = json.Unmarshal(raw, updated)
|
||||||
|
return updated, nil
|
||||||
|
}
|
||||||
87
backend/internal/service/ops_settings_models.go
Normal file
87
backend/internal/service/ops_settings_models.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
// Ops settings models stored in DB `settings` table (JSON blobs).

// OpsEmailNotificationConfig is the persisted email notification settings,
// split into alert-triggered emails and scheduled report emails.
type OpsEmailNotificationConfig struct {
	Alert  OpsEmailAlertConfig  `json:"alert"`
	Report OpsEmailReportConfig `json:"report"`
}

// OpsEmailAlertConfig controls alert-triggered email delivery.
type OpsEmailAlertConfig struct {
	Enabled    bool     `json:"enabled"`
	Recipients []string `json:"recipients"`
	// MinSeverity is "critical"/"warning"/"info" or empty (no minimum);
	// see validateOpsEmailNotificationConfig.
	MinSeverity string `json:"min_severity"`
	// RateLimitPerHour and BatchingWindowSeconds must be >= 0.
	RateLimitPerHour      int  `json:"rate_limit_per_hour"`
	BatchingWindowSeconds int  `json:"batching_window_seconds"`
	IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
}

// OpsEmailReportConfig controls the scheduled summary/digest report emails.
// The *_schedule fields hold cron expressions (e.g. "0 9 * * *").
type OpsEmailReportConfig struct {
	Enabled               bool     `json:"enabled"`
	Recipients            []string `json:"recipients"`
	DailySummaryEnabled   bool     `json:"daily_summary_enabled"`
	DailySummarySchedule  string   `json:"daily_summary_schedule"`
	WeeklySummaryEnabled  bool     `json:"weekly_summary_enabled"`
	WeeklySummarySchedule string   `json:"weekly_summary_schedule"`
	ErrorDigestEnabled    bool     `json:"error_digest_enabled"`
	ErrorDigestSchedule   string   `json:"error_digest_schedule"`
	// ErrorDigestMinCount must be >= 0.
	ErrorDigestMinCount   int    `json:"error_digest_min_count"`
	AccountHealthEnabled  bool   `json:"account_health_enabled"`
	AccountHealthSchedule string `json:"account_health_schedule"`
	// AccountHealthErrorRateThreshold is a percentage in [0, 100].
	AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
}

// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
// frontend can still send the full config shape. A nil section is left
// untouched by UpdateEmailNotificationConfig.
type OpsEmailNotificationConfigUpdateRequest struct {
	Alert  *OpsEmailAlertConfig  `json:"alert"`
	Report *OpsEmailReportConfig `json:"report"`
}

// OpsDistributedLockSettings configures the evaluator's leader-election
// lock. Key must be non-blank and TTLSeconds in (0, 86400] when Enabled.
type OpsDistributedLockSettings struct {
	Enabled    bool   `json:"enabled"`
	Key        string `json:"key"`
	TTLSeconds int    `json:"ttl_seconds"`
}

// OpsAlertSilenceEntry silences matching alerts until a deadline.
type OpsAlertSilenceEntry struct {
	// RuleID/Severities scope the entry; semantics of an empty scope are
	// decided by the evaluator — TODO confirm against the consumer.
	RuleID     *int64   `json:"rule_id,omitempty"`
	Severities []string `json:"severities,omitempty"`

	// UntilRFC3339 is required and must parse as RFC3339.
	UntilRFC3339 string `json:"until_rfc3339"`
	Reason       string `json:"reason"`
}

// OpsAlertSilencingSettings configures alert silencing: an optional global
// window plus per-rule/severity entries.
type OpsAlertSilencingSettings struct {
	Enabled bool `json:"enabled"`

	// GlobalUntilRFC3339 is optional; when set it must parse as RFC3339.
	GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
	GlobalReason       string `json:"global_reason"`

	Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}

// OpsAlertRuntimeSettings is the persisted runtime configuration of the
// alert evaluator.
type OpsAlertRuntimeSettings struct {
	// EvaluationIntervalSeconds must be in [1, 86400].
	EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`

	DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
	Silencing       OpsAlertSilencingSettings  `json:"silencing"`
}

// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
	DataRetention OpsDataRetentionSettings `json:"data_retention"`
	Aggregation   OpsAggregationSettings   `json:"aggregation"`
}

// OpsDataRetentionSettings controls the cleanup job schedule and how long
// each data series is kept. Retention days must be in [1, 365].
type OpsDataRetentionSettings struct {
	CleanupEnabled             bool   `json:"cleanup_enabled"`
	CleanupSchedule            string `json:"cleanup_schedule"`
	ErrorLogRetentionDays      int    `json:"error_log_retention_days"`
	MinuteMetricsRetentionDays int    `json:"minute_metrics_retention_days"`
	HourlyMetricsRetentionDays int    `json:"hourly_metrics_retention_days"`
}

// OpsAggregationSettings toggles the metrics aggregation job.
type OpsAggregationSettings struct {
	AggregationEnabled bool `json:"aggregation_enabled"`
}
|
||||||
65
backend/internal/service/ops_trend_models.go
Normal file
65
backend/internal/service/ops_trend_models.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// OpsThroughputTrendPoint is one time bucket of request/token throughput.
type OpsThroughputTrendPoint struct {
	BucketStart   time.Time `json:"bucket_start"`
	RequestCount  int64     `json:"request_count"`
	TokenConsumed int64     `json:"token_consumed"`
	// QPS/TPS are per-second request/token rates for the bucket
	// (computed by the repository layer — confirm derivation there).
	QPS float64 `json:"qps"`
	TPS float64 `json:"tps"`
}

// OpsThroughputPlatformBreakdownItem is per-platform throughput totals.
type OpsThroughputPlatformBreakdownItem struct {
	Platform      string `json:"platform"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputGroupBreakdownItem is per-group throughput totals.
type OpsThroughputGroupBreakdownItem struct {
	GroupID       int64  `json:"group_id"`
	GroupName     string `json:"group_name"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}

// OpsThroughputTrendResponse carries bucketed throughput points plus
// optional breakdowns for dashboard drilldown.
type OpsThroughputTrendResponse struct {
	// Bucket describes the bucket granularity used for Points.
	Bucket string `json:"bucket"`

	Points []*OpsThroughputTrendPoint `json:"points"`

	// Optional drilldown helpers:
	// - When no platform/group is selected: returns totals by platform.
	// - When platform is selected but group is not: returns top groups in that platform.
	ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
	TopGroups  []*OpsThroughputGroupBreakdownItem    `json:"top_groups,omitempty"`
}

// OpsErrorTrendPoint is one time bucket of error counts, split by class.
type OpsErrorTrendPoint struct {
	BucketStart time.Time `json:"bucket_start"`

	ErrorCountTotal      int64 `json:"error_count_total"`
	BusinessLimitedCount int64 `json:"business_limited_count"`
	ErrorCountSLA        int64 `json:"error_count_sla"`

	// Upstream error splits; 429/529 are tracked separately from other
	// upstream failures.
	UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
	Upstream429Count             int64 `json:"upstream_429_count"`
	Upstream529Count             int64 `json:"upstream_529_count"`
}

// OpsErrorTrendResponse carries bucketed error-trend points.
type OpsErrorTrendResponse struct {
	Bucket string                `json:"bucket"`
	Points []*OpsErrorTrendPoint `json:"points"`
}

// OpsErrorDistributionItem aggregates error counts for one HTTP status code.
type OpsErrorDistributionItem struct {
	StatusCode      int   `json:"status_code"`
	Total           int64 `json:"total"`
	SLA             int64 `json:"sla"`
	BusinessLimited int64 `json:"business_limited"`
}

// OpsErrorDistributionResponse is the per-status-code error distribution.
type OpsErrorDistributionResponse struct {
	Total int64                       `json:"total"`
	Items []*OpsErrorDistributionItem `json:"items"`
}
|
||||||
26
backend/internal/service/ops_trends.go
Normal file
26
backend/internal/service/ops_trends.go
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if filter == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
|
||||||
|
}
|
||||||
|
if filter.StartTime.After(filter.EndTime) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
|
||||||
|
}
|
||||||
94
backend/internal/service/ops_upstream_context.go
Normal file
94
backend/internal/service/ops_upstream_context.go
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Gin context keys used by Ops error logger for capturing upstream error details.
// These keys are set by gateway services and consumed by handler/ops_error_logger.go.
const (
	// OpsUpstreamStatusCodeKey holds the last upstream HTTP status (int).
	OpsUpstreamStatusCodeKey = "ops_upstream_status_code"
	// OpsUpstreamErrorMessageKey holds the last upstream error message (string).
	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
	// OpsUpstreamErrorDetailKey holds the last upstream error detail (string).
	OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
	// OpsUpstreamErrorsKey holds the per-request []*OpsUpstreamErrorEvent.
	OpsUpstreamErrorsKey = "ops_upstream_errors"
)
|
||||||
|
|
||||||
|
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if upstreamStatusCode > 0 {
|
||||||
|
c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode)
|
||||||
|
}
|
||||||
|
if msg := strings.TrimSpace(upstreamMessage); msg != "" {
|
||||||
|
c.Set(OpsUpstreamErrorMessageKey, msg)
|
||||||
|
}
|
||||||
|
if detail := strings.TrimSpace(upstreamDetail); detail != "" {
|
||||||
|
c.Set(OpsUpstreamErrorDetailKey, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request.
// It is stored in ops_error_logs.upstream_errors as a JSON array.
type OpsUpstreamErrorEvent struct {
	// AtUnixMs is the event time in Unix milliseconds; defaulted to "now"
	// by appendOpsUpstreamError when unset.
	AtUnixMs int64 `json:"at_unix_ms,omitempty"`

	// Context
	Platform  string `json:"platform,omitempty"`
	AccountID int64  `json:"account_id,omitempty"`

	// Outcome
	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`

	// Kind: http_error | request_error | retry_exhausted | failover
	Kind string `json:"kind,omitempty"`

	// Message is sanitized before storage (see appendOpsUpstreamError).
	Message string `json:"message,omitempty"`
	Detail  string `json:"detail,omitempty"`
}
|
||||||
|
|
||||||
|
func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if ev.AtUnixMs <= 0 {
|
||||||
|
ev.AtUnixMs = time.Now().UnixMilli()
|
||||||
|
}
|
||||||
|
ev.Platform = strings.TrimSpace(ev.Platform)
|
||||||
|
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
|
||||||
|
ev.Kind = strings.TrimSpace(ev.Kind)
|
||||||
|
ev.Message = strings.TrimSpace(ev.Message)
|
||||||
|
ev.Detail = strings.TrimSpace(ev.Detail)
|
||||||
|
if ev.Message != "" {
|
||||||
|
ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
|
||||||
|
}
|
||||||
|
|
||||||
|
var existing []*OpsUpstreamErrorEvent
|
||||||
|
if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
|
||||||
|
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
|
||||||
|
existing = arr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
evCopy := ev
|
||||||
|
existing = append(existing, &evCopy)
|
||||||
|
c.Set(OpsUpstreamErrorsKey, existing)
|
||||||
|
}
|
||||||
|
|
||||||
|
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
|
||||||
|
if len(events) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Ensure we always store a valid JSON value.
|
||||||
|
raw, err := json.Marshal(events)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s := string(raw)
|
||||||
|
return &s
|
||||||
|
}
|
||||||
24
backend/internal/service/ops_window_stats.go
Normal file
24
backend/internal/service/ops_window_stats.go
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetWindowStats returns lightweight request/token counts for the provided window.
|
||||||
|
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks.
|
||||||
|
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
filter := &OpsDashboardFilter{
|
||||||
|
StartTime: startTime,
|
||||||
|
EndTime: endTime,
|
||||||
|
}
|
||||||
|
return s.opsRepo.GetWindowStats(ctx, filter)
|
||||||
|
}
|
||||||
@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
|
|||||||
}
|
}
|
||||||
|
|
||||||
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
|
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
|
||||||
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
|
||||||
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
|
||||||
|
}
|
||||||
|
|
||||||
switch statusCode {
|
switch statusCode {
|
||||||
case 401:
|
case 401:
|
||||||
// 认证失败:停止调度,记录错误
|
// 认证失败:停止调度,记录错误
|
||||||
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials")
|
msg := "Authentication failed (401): invalid or expired credentials"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Authentication failed (401): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 402:
|
case 402:
|
||||||
// 支付要求:余额不足或计费问题,停止调度
|
// 支付要求:余额不足或计费问题,停止调度
|
||||||
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue")
|
msg := "Payment required (402): insufficient balance or billing issue"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Payment required (402): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 403:
|
case 403:
|
||||||
// 禁止访问:停止调度,记录错误
|
// 禁止访问:停止调度,记录错误
|
||||||
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions")
|
msg := "Access forbidden (403): account may be suspended or lack permissions"
|
||||||
|
if upstreamMsg != "" {
|
||||||
|
msg = "Access forbidden (403): " + upstreamMsg
|
||||||
|
}
|
||||||
|
s.handleAuthError(ctx, account, msg)
|
||||||
shouldDisable = true
|
shouldDisable = true
|
||||||
case 429:
|
case 429:
|
||||||
s.handle429(ctx, account, headers)
|
s.handle429(ctx, account, headers)
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
|
|||||||
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
|
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
|
||||||
}
|
}
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
|
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
|
||||||
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
|
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
|
||||||
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
|
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
|
||||||
@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
|
|||||||
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
|
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
|
||||||
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
|
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
|
||||||
|
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
|
||||||
|
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
|
||||||
|
if settings.OpsMetricsIntervalSeconds > 0 {
|
||||||
|
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
|
||||||
|
}
|
||||||
|
|
||||||
err := s.settingRepo.SetMultiple(ctx, updates)
|
err := s.settingRepo.SetMultiple(ctx, updates)
|
||||||
if err == nil && s.onUpdate != nil {
|
if err == nil && s.onUpdate != nil {
|
||||||
s.onUpdate() // Invalidate cache after settings update
|
s.onUpdate() // Invalidate cache after settings update
|
||||||
@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
|
|||||||
// Identity patch defaults
|
// Identity patch defaults
|
||||||
SettingKeyEnableIdentityPatch: "true",
|
SettingKeyEnableIdentityPatch: "true",
|
||||||
SettingKeyIdentityPatchPrompt: "",
|
SettingKeyIdentityPatchPrompt: "",
|
||||||
|
|
||||||
|
// Ops monitoring defaults (vNext)
|
||||||
|
SettingKeyOpsMonitoringEnabled: "true",
|
||||||
|
SettingKeyOpsRealtimeMonitoringEnabled: "true",
|
||||||
|
SettingKeyOpsQueryModeDefault: "auto",
|
||||||
|
SettingKeyOpsMetricsIntervalSeconds: "60",
|
||||||
}
|
}
|
||||||
|
|
||||||
return s.settingRepo.SetMultiple(ctx, defaults)
|
return s.settingRepo.SetMultiple(ctx, defaults)
|
||||||
@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
|
|||||||
}
|
}
|
||||||
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
|
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
|
||||||
|
|
||||||
|
// Ops monitoring settings (default: enabled, fail-open)
|
||||||
|
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
|
||||||
|
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
|
||||||
|
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
|
||||||
|
result.OpsMetricsIntervalSeconds = 60
|
||||||
|
if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
|
||||||
|
if v, err := strconv.Atoi(raw); err == nil {
|
||||||
|
if v < 60 {
|
||||||
|
v = 60
|
||||||
|
}
|
||||||
|
if v > 3600 {
|
||||||
|
v = 3600
|
||||||
|
}
|
||||||
|
result.OpsMetricsIntervalSeconds = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。
|
func isFalseSettingValue(value string) bool {
|
||||||
//
|
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||||
// 优先级:
|
case "false", "0", "off", "disabled":
|
||||||
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值
|
return true
|
||||||
// - 否则回退到 config.yaml/env 的值
|
|
||||||
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
|
|
||||||
if s == nil || s.cfg == nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
|
|
||||||
}
|
|
||||||
|
|
||||||
effective := s.cfg.LinuxDo
|
|
||||||
|
|
||||||
keys := []string{
|
|
||||||
SettingKeyLinuxDoConnectEnabled,
|
|
||||||
SettingKeyLinuxDoConnectClientID,
|
|
||||||
SettingKeyLinuxDoConnectClientSecret,
|
|
||||||
SettingKeyLinuxDoConnectRedirectURL,
|
|
||||||
}
|
|
||||||
settings, err := s.settingRepo.GetMultiple(ctx, keys)
|
|
||||||
if err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
|
|
||||||
effective.Enabled = raw == "true"
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.ClientID = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.ClientSecret = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
|
|
||||||
effective.RedirectURL = strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
|
|
||||||
if !effective.Enabled {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
|
|
||||||
}
|
|
||||||
|
|
||||||
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
|
|
||||||
if strings.TrimSpace(effective.ClientID) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.AuthorizeURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.TokenURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.UserInfoURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.RedirectURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
|
|
||||||
}
|
|
||||||
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
|
|
||||||
}
|
|
||||||
|
|
||||||
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
|
|
||||||
switch method {
|
|
||||||
case "", "client_secret_post", "client_secret_basic":
|
|
||||||
if strings.TrimSpace(effective.ClientSecret) == "" {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
|
|
||||||
}
|
|
||||||
case "none":
|
|
||||||
if !effective.UsePKCE {
|
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
|
|
||||||
}
|
|
||||||
default:
|
default:
|
||||||
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
return effective, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// getStringOrDefault 获取字符串值或默认值
|
// getStringOrDefault 获取字符串值或默认值
|
||||||
@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
|
|||||||
}
|
}
|
||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetLinuxDoConnectOAuthConfig 返回用于登录的"最终生效" LinuxDo Connect 配置。
|
||||||
|
//
|
||||||
|
// 优先级:
|
||||||
|
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值
|
||||||
|
// - 否则回退到 config.yaml/env 的值
|
||||||
|
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
|
||||||
|
if s == nil || s.cfg == nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
|
||||||
|
}
|
||||||
|
|
||||||
|
effective := s.cfg.LinuxDo
|
||||||
|
|
||||||
|
keys := []string{
|
||||||
|
SettingKeyLinuxDoConnectEnabled,
|
||||||
|
SettingKeyLinuxDoConnectClientID,
|
||||||
|
SettingKeyLinuxDoConnectClientSecret,
|
||||||
|
SettingKeyLinuxDoConnectRedirectURL,
|
||||||
|
}
|
||||||
|
settings, err := s.settingRepo.GetMultiple(ctx, keys)
|
||||||
|
if err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
|
||||||
|
effective.Enabled = raw == "true"
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.ClientID = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.ClientSecret = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
|
||||||
|
effective.RedirectURL = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !effective.Enabled {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
|
||||||
|
}
|
||||||
|
|
||||||
|
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
|
||||||
|
if strings.TrimSpace(effective.ClientID) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.AuthorizeURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.TokenURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.UserInfoURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.RedirectURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
|
||||||
|
}
|
||||||
|
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
|
||||||
|
switch method {
|
||||||
|
case "", "client_secret_post", "client_secret_basic":
|
||||||
|
if strings.TrimSpace(effective.ClientSecret) == "" {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
|
||||||
|
}
|
||||||
|
case "none":
|
||||||
|
if !effective.UsePKCE {
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
return effective, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ type SystemSettings struct {
|
|||||||
TurnstileSecretKey string
|
TurnstileSecretKey string
|
||||||
TurnstileSecretKeyConfigured bool
|
TurnstileSecretKeyConfigured bool
|
||||||
|
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
// LinuxDo Connect OAuth 登录
|
||||||
LinuxDoConnectEnabled bool
|
LinuxDoConnectEnabled bool
|
||||||
LinuxDoConnectClientID string
|
LinuxDoConnectClientID string
|
||||||
LinuxDoConnectClientSecret string
|
LinuxDoConnectClientSecret string
|
||||||
@@ -46,6 +46,12 @@ type SystemSettings struct {
|
|||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
EnableIdentityPatch bool `json:"enable_identity_patch"`
|
||||||
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
IdentityPatchPrompt string `json:"identity_patch_prompt"`
|
||||||
|
|
||||||
|
// Ops monitoring (vNext)
|
||||||
|
OpsMonitoringEnabled bool
|
||||||
|
OpsRealtimeMonitoringEnabled bool
|
||||||
|
OpsQueryModeDefault string
|
||||||
|
OpsMetricsIntervalSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
type PublicSettings struct {
|
type PublicSettings struct {
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
package service
|
package service
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||||
"github.com/google/wire"
|
"github.com/google/wire"
|
||||||
|
"github.com/redis/go-redis/v9"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BuildInfo contains build information
|
// BuildInfo contains build information
|
||||||
@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
|
|||||||
return svc
|
return svc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector.
|
||||||
|
func ProvideOpsMetricsCollector(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
accountRepo AccountRepository,
|
||||||
|
concurrencyService *ConcurrencyService,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsMetricsCollector {
|
||||||
|
collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
|
||||||
|
collector.Start()
|
||||||
|
return collector
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation).
|
||||||
|
func ProvideOpsAggregationService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
settingRepo SettingRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAggregationService {
|
||||||
|
svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService.
|
||||||
|
func ProvideOpsAlertEvaluatorService(
|
||||||
|
opsService *OpsService,
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsAlertEvaluatorService {
|
||||||
|
svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled).
|
||||||
|
func ProvideOpsCleanupService(
|
||||||
|
opsRepo OpsRepository,
|
||||||
|
db *sql.DB,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsCleanupService {
|
||||||
|
svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProvideOpsScheduledReportService creates and starts OpsScheduledReportService.
|
||||||
|
func ProvideOpsScheduledReportService(
|
||||||
|
opsService *OpsService,
|
||||||
|
userService *UserService,
|
||||||
|
emailService *EmailService,
|
||||||
|
redisClient *redis.Client,
|
||||||
|
cfg *config.Config,
|
||||||
|
) *OpsScheduledReportService {
|
||||||
|
svc := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
|
||||||
|
svc.Start()
|
||||||
|
return svc
|
||||||
|
}
|
||||||
|
|
||||||
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
|
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
|
||||||
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
|
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
|
||||||
return apiKeyService
|
return apiKeyService
|
||||||
@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
|
|||||||
NewAccountUsageService,
|
NewAccountUsageService,
|
||||||
NewAccountTestService,
|
NewAccountTestService,
|
||||||
NewSettingService,
|
NewSettingService,
|
||||||
|
NewOpsService,
|
||||||
|
ProvideOpsMetricsCollector,
|
||||||
|
ProvideOpsAggregationService,
|
||||||
|
ProvideOpsAlertEvaluatorService,
|
||||||
|
ProvideOpsCleanupService,
|
||||||
|
ProvideOpsScheduledReportService,
|
||||||
NewEmailService,
|
NewEmailService,
|
||||||
ProvideEmailQueueService,
|
ProvideEmailQueueService,
|
||||||
NewTurnstileService,
|
NewTurnstileService,
|
||||||
|
|||||||
717
backend/migrations/033_ops_monitoring_vnext.sql
Normal file
717
backend/migrations/033_ops_monitoring_vnext.sql
Normal file
@@ -0,0 +1,717 @@
|
|||||||
|
-- Ops Monitoring (vNext): squashed migration (030)
|
||||||
|
--
|
||||||
|
-- This repository originally planned Ops vNext as migrations 030-036:
|
||||||
|
-- 030 drop legacy ops tables
|
||||||
|
-- 031 core schema
|
||||||
|
-- 032 pre-aggregation tables
|
||||||
|
-- 033 indexes + optional extensions
|
||||||
|
-- 034 add avg/max to preagg
|
||||||
|
-- 035 add notify_email to alert rules
|
||||||
|
-- 036 seed default alert rules
|
||||||
|
--
|
||||||
|
-- Since these migrations have NOT been applied to any environment yet, we squash them
|
||||||
|
-- into a single 030 migration for easier review and a cleaner migration history.
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
|
||||||
|
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 030_ops_drop_legacy_ops_tables.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
|
||||||
|
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
|
||||||
|
|
||||||
|
-- Core ops tables that may exist in some deployments / branches
|
||||||
|
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_error_logs CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_alert_events CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
|
||||||
|
|
||||||
|
-- Optional legacy tables (best-effort cleanup)
|
||||||
|
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
|
||||||
|
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
|
||||||
|
|
||||||
|
-- Optional legacy views/indexes
|
||||||
|
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 031_ops_core_schema.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
|
||||||
|
--
|
||||||
|
-- Design goals:
|
||||||
|
-- - Support global filtering (time/platform/group) across all ops modules.
|
||||||
|
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
|
||||||
|
-- - Make ops background jobs observable via job heartbeats.
|
||||||
|
-- - Keep schema stable and indexes targeted (high-write tables).
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - This migration is idempotent.
|
||||||
|
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) ops_error_logs: error log details (high-write)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_error_logs (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
-- Correlation / identities
|
||||||
|
request_id VARCHAR(64),
|
||||||
|
client_request_id VARCHAR(64),
|
||||||
|
user_id BIGINT,
|
||||||
|
api_key_id BIGINT,
|
||||||
|
account_id BIGINT,
|
||||||
|
group_id BIGINT,
|
||||||
|
client_ip inet,
|
||||||
|
|
||||||
|
-- Dimensions for global filtering
|
||||||
|
platform VARCHAR(32),
|
||||||
|
|
||||||
|
-- Request metadata
|
||||||
|
model VARCHAR(100),
|
||||||
|
request_path VARCHAR(256),
|
||||||
|
stream BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
user_agent TEXT,
|
||||||
|
|
||||||
|
-- Core error classification
|
||||||
|
error_phase VARCHAR(32) NOT NULL,
|
||||||
|
error_type VARCHAR(64) NOT NULL,
|
||||||
|
severity VARCHAR(8) NOT NULL DEFAULT 'P2',
|
||||||
|
status_code INT,
|
||||||
|
|
||||||
|
-- vNext metric semantics
|
||||||
|
is_business_limited BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
|
||||||
|
-- Error details (sanitized/truncated at ingest time)
|
||||||
|
error_message TEXT,
|
||||||
|
error_body TEXT,
|
||||||
|
|
||||||
|
-- Provider/upstream details (optional; useful for trends & account health)
|
||||||
|
error_source VARCHAR(64),
|
||||||
|
error_owner VARCHAR(32),
|
||||||
|
account_status VARCHAR(50),
|
||||||
|
upstream_status_code INT,
|
||||||
|
upstream_error_message TEXT,
|
||||||
|
upstream_error_detail TEXT,
|
||||||
|
provider_error_code VARCHAR(64),
|
||||||
|
provider_error_type VARCHAR(64),
|
||||||
|
network_error_type VARCHAR(50),
|
||||||
|
retry_after_seconds INT,
|
||||||
|
|
||||||
|
-- Timings (ms) - optional
|
||||||
|
duration_ms INT,
|
||||||
|
time_to_first_token_ms BIGINT,
|
||||||
|
auth_latency_ms BIGINT,
|
||||||
|
routing_latency_ms BIGINT,
|
||||||
|
upstream_latency_ms BIGINT,
|
||||||
|
response_latency_ms BIGINT,
|
||||||
|
|
||||||
|
-- Retry context (only stored for error requests)
|
||||||
|
request_body JSONB,
|
||||||
|
request_headers JSONB,
|
||||||
|
request_body_truncated BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
request_body_bytes INT,
|
||||||
|
|
||||||
|
-- Retryability flags (best-effort classification)
|
||||||
|
is_retryable BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
retry_count INT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) ops_retry_attempts: audit log for retries
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
|
||||||
|
requested_by_user_id BIGINT,
|
||||||
|
source_error_id BIGINT,
|
||||||
|
|
||||||
|
-- client|upstream
|
||||||
|
mode VARCHAR(16) NOT NULL,
|
||||||
|
pinned_account_id BIGINT,
|
||||||
|
|
||||||
|
-- queued|running|succeeded|failed
|
||||||
|
status VARCHAR(16) NOT NULL DEFAULT 'queued',
|
||||||
|
started_at TIMESTAMPTZ,
|
||||||
|
finished_at TIMESTAMPTZ,
|
||||||
|
duration_ms BIGINT,
|
||||||
|
|
||||||
|
-- Optional result correlation
|
||||||
|
result_request_id VARCHAR(64),
|
||||||
|
result_error_id BIGINT,
|
||||||
|
result_usage_request_id VARCHAR(64),
|
||||||
|
|
||||||
|
error_message TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 3) ops_system_metrics: system + request window snapshots
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_system_metrics (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
window_minutes INT NOT NULL DEFAULT 1,
|
||||||
|
|
||||||
|
-- Optional dimensions (only if collector chooses to write per-dimension snapshots)
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
-- Core counts
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
-- Rates
|
||||||
|
qps DOUBLE PRECISION,
|
||||||
|
tps DOUBLE PRECISION,
|
||||||
|
|
||||||
|
-- Duration percentiles (ms) - success requests
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
duration_avg_ms DOUBLE PRECISION,
|
||||||
|
duration_max_ms INT,
|
||||||
|
|
||||||
|
-- TTFT percentiles (ms) - success requests (streaming)
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ttft_max_ms INT,
|
||||||
|
|
||||||
|
-- System resources
|
||||||
|
cpu_usage_percent DOUBLE PRECISION,
|
||||||
|
memory_used_mb BIGINT,
|
||||||
|
memory_total_mb BIGINT,
|
||||||
|
memory_usage_percent DOUBLE PRECISION,
|
||||||
|
|
||||||
|
-- Dependency health (best-effort)
|
||||||
|
db_ok BOOLEAN,
|
||||||
|
redis_ok BOOLEAN,
|
||||||
|
|
||||||
|
-- DB pool & runtime
|
||||||
|
db_conn_active INT,
|
||||||
|
db_conn_idle INT,
|
||||||
|
db_conn_waiting INT,
|
||||||
|
goroutine_count INT,
|
||||||
|
|
||||||
|
-- Queue / concurrency
|
||||||
|
concurrency_queue_depth INT
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 4) ops_job_heartbeats: background jobs health
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
|
||||||
|
job_name VARCHAR(64) PRIMARY KEY,
|
||||||
|
|
||||||
|
last_run_at TIMESTAMPTZ,
|
||||||
|
last_success_at TIMESTAMPTZ,
|
||||||
|
last_error_at TIMESTAMPTZ,
|
||||||
|
last_error TEXT,
|
||||||
|
last_duration_ms BIGINT,
|
||||||
|
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 5) ops_alert_rules / ops_alert_events
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_alert_rules (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
name VARCHAR(128) NOT NULL,
|
||||||
|
description TEXT,
|
||||||
|
enabled BOOLEAN NOT NULL DEFAULT true,
|
||||||
|
|
||||||
|
severity VARCHAR(16) NOT NULL DEFAULT 'warning',
|
||||||
|
|
||||||
|
-- Metric definition
|
||||||
|
-- Metric definition
|
||||||
|
metric_type VARCHAR(64) NOT NULL,
|
||||||
|
operator VARCHAR(8) NOT NULL,
|
||||||
|
threshold DOUBLE PRECISION NOT NULL,
|
||||||
|
|
||||||
|
window_minutes INT NOT NULL DEFAULT 5,
|
||||||
|
sustained_minutes INT NOT NULL DEFAULT 5,
|
||||||
|
cooldown_minutes INT NOT NULL DEFAULT 10,
|
||||||
|
|
||||||
|
-- Optional scoping: platform/group filters etc.
|
||||||
|
filters JSONB,
|
||||||
|
|
||||||
|
last_triggered_at TIMESTAMPTZ,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
|
||||||
|
ON ops_alert_rules (name);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
|
||||||
|
ON ops_alert_rules (enabled);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_alert_events (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
rule_id BIGINT,
|
||||||
|
severity VARCHAR(16) NOT NULL,
|
||||||
|
status VARCHAR(16) NOT NULL DEFAULT 'firing',
|
||||||
|
|
||||||
|
title VARCHAR(200),
|
||||||
|
description TEXT,
|
||||||
|
|
||||||
|
metric_value DOUBLE PRECISION,
|
||||||
|
threshold_value DOUBLE PRECISION,
|
||||||
|
dimensions JSONB,
|
||||||
|
|
||||||
|
fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
resolved_at TIMESTAMPTZ,
|
||||||
|
|
||||||
|
email_sent BOOLEAN NOT NULL DEFAULT false,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
|
||||||
|
ON ops_alert_events (rule_id, status);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
|
||||||
|
ON ops_alert_events (fired_at DESC);
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 032_ops_preaggregation_tables.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): pre-aggregation tables
|
||||||
|
--
|
||||||
|
-- Purpose:
|
||||||
|
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
|
||||||
|
-- percentile_cont scans on raw logs for every dashboard refresh.
|
||||||
|
-- - Support global filter dimensions: overall / platform / group.
|
||||||
|
--
|
||||||
|
-- Design note:
|
||||||
|
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
|
||||||
|
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) ops_metrics_hourly
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
bucket_start TIMESTAMPTZ NOT NULL,
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
-- Duration percentiles (ms)
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
|
||||||
|
-- TTFT percentiles (ms)
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
|
||||||
|
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Uniqueness across three “dimension modes” (overall / platform / group).
|
||||||
|
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
|
||||||
|
ON ops_metrics_hourly (
|
||||||
|
bucket_start,
|
||||||
|
COALESCE(platform, ''),
|
||||||
|
COALESCE(group_id, 0)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
|
||||||
|
ON ops_metrics_hourly (bucket_start DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
|
||||||
|
ON ops_metrics_hourly (platform, bucket_start DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
|
||||||
|
ON ops_metrics_hourly (group_id, bucket_start DESC)
|
||||||
|
WHERE group_id IS NOT NULL AND group_id <> 0;
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) ops_metrics_daily (optional; for longer windows)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
|
||||||
|
bucket_date DATE NOT NULL,
|
||||||
|
platform VARCHAR(32),
|
||||||
|
group_id BIGINT,
|
||||||
|
|
||||||
|
success_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_total BIGINT NOT NULL DEFAULT 0,
|
||||||
|
business_limited_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
error_count_sla BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_429_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
upstream_529_count BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
token_consumed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
duration_p50_ms INT,
|
||||||
|
duration_p90_ms INT,
|
||||||
|
duration_p95_ms INT,
|
||||||
|
duration_p99_ms INT,
|
||||||
|
|
||||||
|
ttft_p50_ms INT,
|
||||||
|
ttft_p90_ms INT,
|
||||||
|
ttft_p95_ms INT,
|
||||||
|
ttft_p99_ms INT,
|
||||||
|
|
||||||
|
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
|
||||||
|
ON ops_metrics_daily (
|
||||||
|
bucket_date,
|
||||||
|
COALESCE(platform, ''),
|
||||||
|
COALESCE(group_id, 0)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
|
||||||
|
ON ops_metrics_daily (bucket_date DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
|
||||||
|
ON ops_metrics_daily (platform, bucket_date DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
|
||||||
|
ON ops_metrics_daily (group_id, bucket_date DESC)
|
||||||
|
WHERE group_id IS NOT NULL AND group_id <> 0;
|
||||||
|
|
||||||
|
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 033_ops_indexes_and_extensions.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): indexes and optional extensions
|
||||||
|
--
|
||||||
|
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
|
||||||
|
-- so environments without extension privileges won't fail the whole migration chain.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 1) Core btree indexes (always safe)
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
-- ops_error_logs
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
|
||||||
|
ON ops_error_logs (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
|
||||||
|
ON ops_error_logs (platform, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
|
||||||
|
ON ops_error_logs (group_id, created_at DESC)
|
||||||
|
WHERE group_id IS NOT NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
|
||||||
|
ON ops_error_logs (account_id, created_at DESC)
|
||||||
|
WHERE account_id IS NOT NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
|
||||||
|
ON ops_error_logs (status_code, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
|
||||||
|
ON ops_error_logs (error_phase, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
|
||||||
|
ON ops_error_logs (error_type, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
|
||||||
|
ON ops_error_logs (request_id);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
|
||||||
|
ON ops_error_logs (client_request_id);
|
||||||
|
|
||||||
|
-- ops_system_metrics
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
|
||||||
|
ON ops_system_metrics (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
|
||||||
|
ON ops_system_metrics (window_minutes, created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
|
||||||
|
ON ops_system_metrics (platform, created_at DESC)
|
||||||
|
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
|
||||||
|
ON ops_system_metrics (group_id, created_at DESC)
|
||||||
|
WHERE group_id IS NOT NULL;
|
||||||
|
|
||||||
|
-- ops_retry_attempts
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
|
||||||
|
ON ops_retry_attempts (created_at DESC);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
|
||||||
|
ON ops_retry_attempts (source_error_id, created_at DESC)
|
||||||
|
WHERE source_error_id IS NOT NULL;
|
||||||
|
|
||||||
|
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
|
||||||
|
ON ops_retry_attempts (source_error_id)
|
||||||
|
WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
|
||||||
|
|
||||||
|
-- ============================================
|
||||||
|
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
|
||||||
|
-- ============================================
|
||||||
|
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
BEGIN
|
||||||
|
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||||
|
EXCEPTION WHEN OTHERS THEN
|
||||||
|
-- Missing privileges or extension package should not block migrations.
|
||||||
|
RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
|
||||||
|
END;
|
||||||
|
|
||||||
|
IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
|
||||||
|
-- request_id / client_request_id fuzzy search
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
|
||||||
|
ON ops_error_logs USING gin (request_id gin_trgm_ops)';
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
|
||||||
|
ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
|
||||||
|
|
||||||
|
-- error_message fuzzy search
|
||||||
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
|
||||||
|
ON ops_error_logs USING gin (error_message gin_trgm_ops)';
|
||||||
|
END IF;
|
||||||
|
END $$;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 034_ops_preaggregation_add_avg_max.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
|
||||||
|
--
|
||||||
|
-- Why:
|
||||||
|
-- - The dashboard overview returns avg/max for duration/TTFT.
|
||||||
|
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
|
||||||
|
-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
|
||||||
|
--
|
||||||
|
-- This migration is idempotent and safe to run multiple times.
|
||||||
|
--
|
||||||
|
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
|
||||||
|
-- approximate long-window summaries.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- Hourly table
|
||||||
|
ALTER TABLE ops_metrics_hourly
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
|
||||||
|
|
||||||
|
-- Daily table
|
||||||
|
ALTER TABLE ops_metrics_daily
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
|
||||||
|
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 035_ops_alert_rules_notify_email.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): alert rule notify settings
|
||||||
|
--
|
||||||
|
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
|
||||||
|
-- Migration is idempotent.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
ALTER TABLE ops_alert_rules
|
||||||
|
ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
|
||||||
|
|
||||||
|
-- =====================================================================
|
||||||
|
-- 036_ops_seed_default_alert_rules.sql
|
||||||
|
-- =====================================================================
|
||||||
|
|
||||||
|
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
|
||||||
|
--
|
||||||
|
-- Goal:
|
||||||
|
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
|
||||||
|
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
|
||||||
|
--
|
||||||
|
-- Notes:
|
||||||
|
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
|
||||||
|
-- - Metric semantics follow vNext:
|
||||||
|
-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
|
||||||
|
-- - upstream_error_rate excludes 429/529.
|
||||||
|
|
||||||
|
SET LOCAL lock_timeout = '5s';
|
||||||
|
SET LOCAL statement_timeout = '10min';
|
||||||
|
|
||||||
|
-- 1) High error rate (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'错误率过高',
|
||||||
|
'当错误率超过 5% 且持续 5 分钟时触发告警',
|
||||||
|
true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 2) Low success rate (P0)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'成功率过低',
|
||||||
|
'当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
|
||||||
|
true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 3) P99 latency too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'P99延迟过高',
|
||||||
|
'当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
|
||||||
|
true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 4) P95 latency too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'P95延迟过高',
|
||||||
|
'当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
|
||||||
|
true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 5) CPU usage too high (P2)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'CPU使用率过高',
|
||||||
|
'当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
|
||||||
|
true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 6) Memory usage too high (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'内存使用率过高',
|
||||||
|
'当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
|
||||||
|
true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 7) Concurrency queue buildup (P1)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'并发队列积压',
|
||||||
|
'当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
|
||||||
|
true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- 8) Extremely high error rate (P0)
|
||||||
|
INSERT INTO ops_alert_rules (
|
||||||
|
name, description, enabled, metric_type, operator, threshold,
|
||||||
|
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
|
||||||
|
created_at, updated_at
|
||||||
|
) VALUES (
|
||||||
|
'错误率极高',
|
||||||
|
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
|
||||||
|
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
|
||||||
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
|
||||||
|
-- This migration is intentionally idempotent.
|
||||||
|
|
||||||
|
ALTER TABLE ops_system_metrics
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
|
||||||
9
backend/migrations/034_ops_upstream_error_events.sql
Normal file
9
backend/migrations/034_ops_upstream_error_events.sql
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
|
||||||
|
--
|
||||||
|
-- This is intentionally idempotent.
|
||||||
|
|
||||||
|
ALTER TABLE ops_error_logs
|
||||||
|
ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
|
||||||
|
'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
|
||||||
37
config.yaml
37
config.yaml
@@ -159,7 +159,7 @@ gateway:
|
|||||||
max_line_size: 41943040
|
max_line_size: 41943040
|
||||||
# Log upstream error response body summary (safe/truncated; does not log request content)
|
# Log upstream error response body summary (safe/truncated; does not log request content)
|
||||||
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
||||||
log_upstream_error_body: false
|
log_upstream_error_body: true
|
||||||
# Max bytes to log from upstream error body
|
# Max bytes to log from upstream error body
|
||||||
# 记录上游错误响应体的最大字节数
|
# 记录上游错误响应体的最大字节数
|
||||||
log_upstream_error_body_max_bytes: 2048
|
log_upstream_error_body_max_bytes: 2048
|
||||||
@@ -302,6 +302,41 @@ redis:
|
|||||||
# 数据库编号(0-15)
|
# 数据库编号(0-15)
|
||||||
db: 0
|
db: 0
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ops Monitoring (Optional)
|
||||||
|
# 运维监控 (可选)
|
||||||
|
# =============================================================================
|
||||||
|
ops:
|
||||||
|
# Hard switch: disable all ops background jobs and APIs when false
|
||||||
|
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
|
||||||
|
# 优先使用预聚合表(用于长时间窗口查询性能)
|
||||||
|
use_preaggregated_tables: false
|
||||||
|
|
||||||
|
# Data cleanup configuration
|
||||||
|
# 数据清理配置(vNext 默认统一保留 30 天)
|
||||||
|
cleanup:
|
||||||
|
enabled: true
|
||||||
|
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
|
||||||
|
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
|
||||||
|
schedule: "0 2 * * *"
|
||||||
|
error_log_retention_days: 30
|
||||||
|
minute_metrics_retention_days: 30
|
||||||
|
hourly_metrics_retention_days: 30
|
||||||
|
|
||||||
|
# Pre-aggregation configuration
|
||||||
|
# 预聚合任务配置
|
||||||
|
aggregation:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
|
||||||
|
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
|
||||||
|
metrics_collector_cache:
|
||||||
|
enabled: true
|
||||||
|
ttl: 65s
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# JWT Configuration
|
# JWT Configuration
|
||||||
# JWT 配置
|
# JWT 配置
|
||||||
|
|||||||
@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
|
|||||||
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
|
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
|
||||||
GEMINI_QUOTA_POLICY=
|
GEMINI_QUOTA_POLICY=
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Ops Monitoring Configuration (运维监控配置)
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Enable ops monitoring features (background jobs and APIs)
|
||||||
|
# 是否启用运维监控功能(后台任务和接口)
|
||||||
|
# Set to false to hide ops menu in sidebar and disable all ops features
|
||||||
|
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
|
||||||
|
OPS_ENABLED=true
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Update Configuration (在线更新配置)
|
# Update Configuration (在线更新配置)
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -159,7 +159,7 @@ gateway:
|
|||||||
max_line_size: 41943040
|
max_line_size: 41943040
|
||||||
# Log upstream error response body summary (safe/truncated; does not log request content)
|
# Log upstream error response body summary (safe/truncated; does not log request content)
|
||||||
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
|
||||||
log_upstream_error_body: false
|
log_upstream_error_body: true
|
||||||
# Max bytes to log from upstream error body
|
# Max bytes to log from upstream error body
|
||||||
# 记录上游错误响应体的最大字节数
|
# 记录上游错误响应体的最大字节数
|
||||||
log_upstream_error_body_max_bytes: 2048
|
log_upstream_error_body_max_bytes: 2048
|
||||||
@@ -302,6 +302,19 @@ redis:
|
|||||||
# 数据库编号(0-15)
|
# 数据库编号(0-15)
|
||||||
db: 0
|
db: 0
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ops Monitoring (Optional)
|
||||||
|
# 运维监控 (可选)
|
||||||
|
# =============================================================================
|
||||||
|
ops:
|
||||||
|
# Enable ops monitoring features (background jobs and APIs)
|
||||||
|
# 是否启用运维监控功能(后台任务和接口)
|
||||||
|
# Set to false to hide ops menu in sidebar and disable all ops features
|
||||||
|
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
|
||||||
|
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
|
||||||
|
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
|
||||||
|
enabled: true
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# JWT Configuration
|
# JWT Configuration
|
||||||
# JWT 配置
|
# JWT 配置
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import usageAPI from './usage'
|
|||||||
import geminiAPI from './gemini'
|
import geminiAPI from './gemini'
|
||||||
import antigravityAPI from './antigravity'
|
import antigravityAPI from './antigravity'
|
||||||
import userAttributesAPI from './userAttributes'
|
import userAttributesAPI from './userAttributes'
|
||||||
|
import opsAPI from './ops'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unified admin API object for convenient access
|
* Unified admin API object for convenient access
|
||||||
@@ -35,7 +36,8 @@ export const adminAPI = {
|
|||||||
usage: usageAPI,
|
usage: usageAPI,
|
||||||
gemini: geminiAPI,
|
gemini: geminiAPI,
|
||||||
antigravity: antigravityAPI,
|
antigravity: antigravityAPI,
|
||||||
userAttributes: userAttributesAPI
|
userAttributes: userAttributesAPI,
|
||||||
|
ops: opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export {
|
export {
|
||||||
@@ -52,7 +54,8 @@ export {
|
|||||||
usageAPI,
|
usageAPI,
|
||||||
geminiAPI,
|
geminiAPI,
|
||||||
antigravityAPI,
|
antigravityAPI,
|
||||||
userAttributesAPI
|
userAttributesAPI,
|
||||||
|
opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export default adminAPI
|
export default adminAPI
|
||||||
|
|||||||
958
frontend/src/api/admin/ops.ts
Normal file
958
frontend/src/api/admin/ops.ts
Normal file
@@ -0,0 +1,958 @@
|
|||||||
|
/**
|
||||||
|
* Admin Ops API endpoints (vNext)
|
||||||
|
* - Error logs list/detail + retry (client/upstream)
|
||||||
|
* - Dashboard overview (raw path)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { apiClient } from '../client'
|
||||||
|
import type { PaginatedResponse } from '@/types'
|
||||||
|
|
||||||
|
export type OpsRetryMode = 'client' | 'upstream'
|
||||||
|
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
|
||||||
|
|
||||||
|
export interface OpsRequestOptions {
|
||||||
|
signal?: AbortSignal
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryRequest {
|
||||||
|
mode: OpsRetryMode
|
||||||
|
pinned_account_id?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryResult {
|
||||||
|
attempt_id: number
|
||||||
|
mode: OpsRetryMode
|
||||||
|
status: 'running' | 'succeeded' | 'failed' | string
|
||||||
|
|
||||||
|
pinned_account_id?: number | null
|
||||||
|
used_account_id?: number | null
|
||||||
|
|
||||||
|
http_status_code: number
|
||||||
|
upstream_request_id: string
|
||||||
|
|
||||||
|
response_preview: string
|
||||||
|
response_truncated: boolean
|
||||||
|
|
||||||
|
error_message: string
|
||||||
|
|
||||||
|
started_at: string
|
||||||
|
finished_at: string
|
||||||
|
duration_ms: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDashboardOverview {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
health_score?: number
|
||||||
|
|
||||||
|
system_metrics?: OpsSystemMetricsSnapshot | null
|
||||||
|
job_heartbeats?: OpsJobHeartbeat[] | null
|
||||||
|
|
||||||
|
success_count: number
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
request_count_total: number
|
||||||
|
request_count_sla: number
|
||||||
|
|
||||||
|
token_consumed: number
|
||||||
|
|
||||||
|
sla: number
|
||||||
|
error_rate: number
|
||||||
|
upstream_error_rate: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
|
||||||
|
qps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
tps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
|
||||||
|
duration: OpsPercentiles
|
||||||
|
ttft: OpsPercentiles
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsPercentiles {
|
||||||
|
p50_ms?: number | null
|
||||||
|
p90_ms?: number | null
|
||||||
|
p95_ms?: number | null
|
||||||
|
p99_ms?: number | null
|
||||||
|
avg_ms?: number | null
|
||||||
|
max_ms?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
qps: number
|
||||||
|
tps: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputPlatformBreakdownItem {
|
||||||
|
platform: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputGroupBreakdownItem {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsThroughputTrendPoint[]
|
||||||
|
by_platform?: OpsThroughputPlatformBreakdownItem[]
|
||||||
|
top_groups?: OpsThroughputGroupBreakdownItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestKind = 'success' | 'error'
|
||||||
|
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
|
||||||
|
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
|
||||||
|
|
||||||
|
export interface OpsRequestDetail {
|
||||||
|
kind: OpsRequestKind
|
||||||
|
created_at: string
|
||||||
|
request_id: string
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
model?: string
|
||||||
|
duration_ms?: number | null
|
||||||
|
status_code?: number | null
|
||||||
|
|
||||||
|
error_id?: number | null
|
||||||
|
phase?: string
|
||||||
|
severity?: string
|
||||||
|
message?: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRequestDetailsParams {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
|
||||||
|
kind?: OpsRequestDetailsKind
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
user_id?: number
|
||||||
|
api_key_id?: number
|
||||||
|
account_id?: number
|
||||||
|
|
||||||
|
model?: string
|
||||||
|
request_id?: string
|
||||||
|
q?: string
|
||||||
|
|
||||||
|
min_duration_ms?: number
|
||||||
|
max_duration_ms?: number
|
||||||
|
|
||||||
|
sort?: OpsRequestDetailsSort
|
||||||
|
|
||||||
|
page?: number
|
||||||
|
page_size?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramBucket {
|
||||||
|
range: string
|
||||||
|
count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramResponse {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
total_requests: number
|
||||||
|
buckets: OpsLatencyHistogramBucket[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsErrorTrendPoint[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionItem {
|
||||||
|
status_code: number
|
||||||
|
total: number
|
||||||
|
sla: number
|
||||||
|
business_limited: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionResponse {
|
||||||
|
total: number
|
||||||
|
items: OpsErrorDistributionItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsSystemMetricsSnapshot {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
window_minutes: number
|
||||||
|
|
||||||
|
cpu_usage_percent?: number | null
|
||||||
|
memory_used_mb?: number | null
|
||||||
|
memory_total_mb?: number | null
|
||||||
|
memory_usage_percent?: number | null
|
||||||
|
|
||||||
|
db_ok?: boolean | null
|
||||||
|
redis_ok?: boolean | null
|
||||||
|
|
||||||
|
// Config-derived limits (best-effort) for rendering "current vs max".
|
||||||
|
db_max_open_conns?: number | null
|
||||||
|
redis_pool_size?: number | null
|
||||||
|
|
||||||
|
redis_conn_total?: number | null
|
||||||
|
redis_conn_idle?: number | null
|
||||||
|
|
||||||
|
db_conn_active?: number | null
|
||||||
|
db_conn_idle?: number | null
|
||||||
|
db_conn_waiting?: number | null
|
||||||
|
|
||||||
|
goroutine_count?: number | null
|
||||||
|
concurrency_queue_depth?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsJobHeartbeat {
|
||||||
|
job_name: string
|
||||||
|
last_run_at?: string | null
|
||||||
|
last_success_at?: string | null
|
||||||
|
last_error_at?: string | null
|
||||||
|
last_error?: string | null
|
||||||
|
last_duration_ms?: number | null
|
||||||
|
updated_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PlatformConcurrencyInfo {
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupConcurrencyInfo {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountConcurrencyInfo {
|
||||||
|
account_id: number
|
||||||
|
account_name?: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsConcurrencyStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformConcurrencyInfo>
|
||||||
|
group: Record<string, GroupConcurrencyInfo>
|
||||||
|
account: Record<string, AccountConcurrencyInfo>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
|
||||||
|
const params: Record<string, any> = {}
|
||||||
|
if (platform) {
|
||||||
|
params.platform = platform
|
||||||
|
}
|
||||||
|
if (typeof groupId === 'number' && groupId > 0) {
|
||||||
|
params.group_id = groupId
|
||||||
|
}
|
||||||
|
|
||||||
|
const { data } = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PlatformAvailability {
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupAvailability {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountAvailability {
|
||||||
|
account_id: number
|
||||||
|
account_name: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
status: string
|
||||||
|
is_available: boolean
|
||||||
|
is_rate_limited: boolean
|
||||||
|
rate_limit_reset_at?: string
|
||||||
|
rate_limit_remaining_sec?: number
|
||||||
|
is_overloaded: boolean
|
||||||
|
overload_until?: string
|
||||||
|
overload_remaining_sec?: number
|
||||||
|
has_error: boolean
|
||||||
|
error_message?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAccountAvailabilityStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformAvailability>
|
||||||
|
group: Record<string, GroupAvailability>
|
||||||
|
account: Record<string, AccountAvailability>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
|
||||||
|
const params: Record<string, any> = {}
|
||||||
|
if (platform) {
|
||||||
|
params.platform = platform
|
||||||
|
}
|
||||||
|
if (typeof groupId === 'number' && groupId > 0) {
|
||||||
|
params.group_id = groupId
|
||||||
|
}
|
||||||
|
const { data } = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Subscribe to realtime QPS updates via WebSocket.
|
||||||
|
*
|
||||||
|
* Note: browsers cannot set Authorization headers for WebSockets.
|
||||||
|
* We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
|
||||||
|
* ["sub2api-admin", "jwt.<token>"]
|
||||||
|
*/
|
||||||
|
export interface SubscribeQPSOptions {
|
||||||
|
token?: string | null
|
||||||
|
onOpen?: () => void
|
||||||
|
onClose?: (event: CloseEvent) => void
|
||||||
|
onError?: (event: Event) => void
|
||||||
|
/**
|
||||||
|
* Called when the server closes with an application close code that indicates
|
||||||
|
* reconnecting is not useful (e.g. feature flag disabled).
|
||||||
|
*/
|
||||||
|
onFatalClose?: (event: CloseEvent) => void
|
||||||
|
/**
|
||||||
|
* More granular status updates for UI (connecting/reconnecting/offline/etc).
|
||||||
|
*/
|
||||||
|
onStatusChange?: (status: OpsWSStatus) => void
|
||||||
|
/**
|
||||||
|
* Called when a reconnect is scheduled (helps display "retry in Xs").
|
||||||
|
*/
|
||||||
|
onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
|
||||||
|
wsBaseUrl?: string
|
||||||
|
/**
|
||||||
|
* Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
|
||||||
|
* Set to 0 to disable reconnect.
|
||||||
|
*/
|
||||||
|
maxReconnectAttempts?: number
|
||||||
|
reconnectBaseDelayMs?: number
|
||||||
|
reconnectMaxDelayMs?: number
|
||||||
|
/**
|
||||||
|
* Stale connection detection (heartbeat-by-observation).
|
||||||
|
* If no messages are received within this window, the socket is closed to trigger a reconnect.
|
||||||
|
* Set to 0 to disable.
|
||||||
|
*/
|
||||||
|
staleTimeoutMs?: number
|
||||||
|
/**
|
||||||
|
* How often to check staleness. Only used when `staleTimeoutMs > 0`.
|
||||||
|
*/
|
||||||
|
staleCheckIntervalMs?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
|
||||||
|
|
||||||
|
export const OPS_WS_CLOSE_CODES = {
|
||||||
|
REALTIME_DISABLED: 4001
|
||||||
|
} as const
|
||||||
|
|
||||||
|
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
|
||||||
|
|
||||||
|
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
|
||||||
|
let ws: WebSocket | null = null
|
||||||
|
let reconnectAttempts = 0
|
||||||
|
const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
|
||||||
|
? (options.maxReconnectAttempts as number)
|
||||||
|
: Infinity
|
||||||
|
const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
|
||||||
|
const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
|
||||||
|
let reconnectTimer: ReturnType<typeof setTimeout> | null = null
|
||||||
|
let shouldReconnect = true
|
||||||
|
let isConnecting = false
|
||||||
|
let hasConnectedOnce = false
|
||||||
|
let lastMessageAt = 0
|
||||||
|
const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
|
||||||
|
const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
|
||||||
|
let staleTimer: ReturnType<typeof setInterval> | null = null
|
||||||
|
|
||||||
|
const setStatus = (status: OpsWSStatus) => {
|
||||||
|
options.onStatusChange?.(status)
|
||||||
|
}
|
||||||
|
|
||||||
|
const clearReconnectTimer = () => {
|
||||||
|
if (reconnectTimer) {
|
||||||
|
clearTimeout(reconnectTimer)
|
||||||
|
reconnectTimer = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const clearStaleTimer = () => {
|
||||||
|
if (staleTimer) {
|
||||||
|
clearInterval(staleTimer)
|
||||||
|
staleTimer = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const startStaleTimer = () => {
|
||||||
|
clearStaleTimer()
|
||||||
|
if (!staleTimeoutMs || staleTimeoutMs <= 0) return
|
||||||
|
staleTimer = setInterval(() => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (!ws || ws.readyState !== WebSocket.OPEN) return
|
||||||
|
if (!lastMessageAt) return
|
||||||
|
const ageMs = Date.now() - lastMessageAt
|
||||||
|
if (ageMs > staleTimeoutMs) {
|
||||||
|
// Treat as a half-open connection; closing triggers the normal reconnect path.
|
||||||
|
ws.close()
|
||||||
|
}
|
||||||
|
}, staleCheckIntervalMs)
|
||||||
|
}
|
||||||
|
|
||||||
|
const scheduleReconnect = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
|
||||||
|
|
||||||
|
// If we're offline, wait for the browser to come back online.
|
||||||
|
if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
|
||||||
|
setStatus('offline')
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
|
||||||
|
const delay = Math.min(expDelay, maxDelayMs)
|
||||||
|
const jitter = Math.floor(Math.random() * 250)
|
||||||
|
clearReconnectTimer()
|
||||||
|
reconnectTimer = setTimeout(() => {
|
||||||
|
reconnectAttempts++
|
||||||
|
connect()
|
||||||
|
}, delay + jitter)
|
||||||
|
options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOnline = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
|
||||||
|
connect()
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOffline = () => {
|
||||||
|
setStatus('offline')
|
||||||
|
}
|
||||||
|
|
||||||
|
const connect = () => {
|
||||||
|
if (!shouldReconnect) return
|
||||||
|
if (isConnecting) return
|
||||||
|
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
|
||||||
|
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
|
||||||
|
|
||||||
|
isConnecting = true
|
||||||
|
setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
|
||||||
|
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
|
||||||
|
const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
|
||||||
|
const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
|
||||||
|
|
||||||
|
// Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
|
||||||
|
// Browsers cannot set Authorization headers for WebSockets, so we pass the token via
|
||||||
|
// Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
|
||||||
|
const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
|
||||||
|
const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
|
||||||
|
if (rawToken) protocols.push(`jwt.${rawToken}`)
|
||||||
|
|
||||||
|
ws = new WebSocket(wsURL.toString(), protocols)
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
reconnectAttempts = 0
|
||||||
|
isConnecting = false
|
||||||
|
hasConnectedOnce = true
|
||||||
|
clearReconnectTimer()
|
||||||
|
lastMessageAt = Date.now()
|
||||||
|
startStaleTimer()
|
||||||
|
setStatus('connected')
|
||||||
|
options.onOpen?.()
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onmessage = (e) => {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(e.data)
|
||||||
|
lastMessageAt = Date.now()
|
||||||
|
onMessage(data)
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[OpsWS] Failed to parse message:', err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onerror = (error) => {
|
||||||
|
console.error('[OpsWS] Connection error:', error)
|
||||||
|
options.onError?.(error)
|
||||||
|
}
|
||||||
|
|
||||||
|
ws.onclose = (event) => {
|
||||||
|
isConnecting = false
|
||||||
|
options.onClose?.(event)
|
||||||
|
clearStaleTimer()
|
||||||
|
ws = null
|
||||||
|
|
||||||
|
// If the server explicitly tells us to stop reconnecting, honor it.
|
||||||
|
if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
|
||||||
|
shouldReconnect = false
|
||||||
|
clearReconnectTimer()
|
||||||
|
setStatus('closed')
|
||||||
|
options.onFatalClose?.(event)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
scheduleReconnect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
window.addEventListener('online', handleOnline)
|
||||||
|
window.addEventListener('offline', handleOffline)
|
||||||
|
connect()
|
||||||
|
|
||||||
|
return () => {
|
||||||
|
shouldReconnect = false
|
||||||
|
window.removeEventListener('online', handleOnline)
|
||||||
|
window.removeEventListener('offline', handleOffline)
|
||||||
|
clearReconnectTimer()
|
||||||
|
clearStaleTimer()
|
||||||
|
if (ws) ws.close()
|
||||||
|
ws = null
|
||||||
|
setStatus('closed')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsSeverity = string
|
||||||
|
export type OpsPhase = string
|
||||||
|
|
||||||
|
export type AlertSeverity = 'critical' | 'warning' | 'info'
|
||||||
|
export type ThresholdMode = 'count' | 'percentage' | 'both'
|
||||||
|
export type MetricType =
|
||||||
|
| 'success_rate'
|
||||||
|
| 'error_rate'
|
||||||
|
| 'upstream_error_rate'
|
||||||
|
| 'p95_latency_ms'
|
||||||
|
| 'p99_latency_ms'
|
||||||
|
| 'cpu_usage_percent'
|
||||||
|
| 'memory_usage_percent'
|
||||||
|
| 'concurrency_queue_depth'
|
||||||
|
| 'group_available_accounts'
|
||||||
|
| 'group_available_ratio'
|
||||||
|
| 'group_rate_limit_ratio'
|
||||||
|
| 'account_rate_limited_count'
|
||||||
|
| 'account_error_count'
|
||||||
|
| 'account_error_ratio'
|
||||||
|
| 'overload_account_count'
|
||||||
|
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
|
||||||
|
|
||||||
|
export interface AlertRule {
|
||||||
|
id?: number
|
||||||
|
name: string
|
||||||
|
description?: string
|
||||||
|
enabled: boolean
|
||||||
|
metric_type: MetricType
|
||||||
|
operator: Operator
|
||||||
|
threshold: number
|
||||||
|
window_minutes: number
|
||||||
|
sustained_minutes: number
|
||||||
|
severity: OpsSeverity
|
||||||
|
cooldown_minutes: number
|
||||||
|
notify_email: boolean
|
||||||
|
filters?: Record<string, any>
|
||||||
|
created_at?: string
|
||||||
|
updated_at?: string
|
||||||
|
last_triggered_at?: string | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AlertEvent {
|
||||||
|
id: number
|
||||||
|
rule_id: number
|
||||||
|
severity: OpsSeverity | string
|
||||||
|
status: 'firing' | 'resolved' | string
|
||||||
|
title?: string
|
||||||
|
description?: string
|
||||||
|
metric_value?: number
|
||||||
|
threshold_value?: number
|
||||||
|
dimensions?: Record<string, any>
|
||||||
|
fired_at: string
|
||||||
|
resolved_at?: string | null
|
||||||
|
email_sent: boolean
|
||||||
|
created_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface EmailNotificationConfig {
|
||||||
|
alert: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
min_severity: AlertSeverity | ''
|
||||||
|
rate_limit_per_hour: number
|
||||||
|
batching_window_seconds: number
|
||||||
|
include_resolved_alerts: boolean
|
||||||
|
}
|
||||||
|
report: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
daily_summary_enabled: boolean
|
||||||
|
daily_summary_schedule: string
|
||||||
|
weekly_summary_enabled: boolean
|
||||||
|
weekly_summary_schedule: string
|
||||||
|
error_digest_enabled: boolean
|
||||||
|
error_digest_schedule: string
|
||||||
|
error_digest_min_count: number
|
||||||
|
account_health_enabled: boolean
|
||||||
|
account_health_schedule: string
|
||||||
|
account_health_error_rate_threshold: number
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDistributedLockSettings {
|
||||||
|
enabled: boolean
|
||||||
|
key: string
|
||||||
|
ttl_seconds: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAlertRuntimeSettings {
|
||||||
|
evaluation_interval_seconds: number
|
||||||
|
distributed_lock: OpsDistributedLockSettings
|
||||||
|
silencing: {
|
||||||
|
enabled: boolean
|
||||||
|
global_until_rfc3339: string
|
||||||
|
global_reason: string
|
||||||
|
entries?: Array<{
|
||||||
|
rule_id?: number
|
||||||
|
severities?: Array<OpsSeverity | string>
|
||||||
|
until_rfc3339: string
|
||||||
|
reason: string
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAdvancedSettings {
|
||||||
|
data_retention: OpsDataRetentionSettings
|
||||||
|
aggregation: OpsAggregationSettings
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDataRetentionSettings {
|
||||||
|
cleanup_enabled: boolean
|
||||||
|
cleanup_schedule: string
|
||||||
|
error_log_retention_days: number
|
||||||
|
minute_metrics_retention_days: number
|
||||||
|
hourly_metrics_retention_days: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAggregationSettings {
|
||||||
|
aggregation_enabled: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorLog {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
phase: OpsPhase
|
||||||
|
type: string
|
||||||
|
severity: OpsSeverity
|
||||||
|
status_code: number
|
||||||
|
platform: string
|
||||||
|
model: string
|
||||||
|
latency_ms?: number | null
|
||||||
|
client_request_id: string
|
||||||
|
request_id: string
|
||||||
|
message: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
client_ip?: string | null
|
||||||
|
request_path?: string
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDetail extends OpsErrorLog {
|
||||||
|
error_body: string
|
||||||
|
user_agent: string
|
||||||
|
|
||||||
|
// Upstream context (optional; enriched by gateway services)
|
||||||
|
upstream_status_code?: number | null
|
||||||
|
upstream_error_message?: string
|
||||||
|
upstream_error_detail?: string
|
||||||
|
upstream_errors?: string
|
||||||
|
|
||||||
|
auth_latency_ms?: number | null
|
||||||
|
routing_latency_ms?: number | null
|
||||||
|
upstream_latency_ms?: number | null
|
||||||
|
response_latency_ms?: number | null
|
||||||
|
time_to_first_token_ms?: number | null
|
||||||
|
|
||||||
|
request_body: string
|
||||||
|
request_body_truncated: boolean
|
||||||
|
request_body_bytes?: number | null
|
||||||
|
|
||||||
|
is_business_limited: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
|
||||||
|
|
||||||
|
export async function getDashboardOverview(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsDashboardOverview> {
|
||||||
|
const { data } = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getThroughputTrend(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsThroughputTrendResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getLatencyHistogram(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsLatencyHistogramResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorTrend(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsErrorTrendResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorDistribution(
|
||||||
|
params: {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
mode?: OpsQueryMode
|
||||||
|
},
|
||||||
|
options: OpsRequestOptions = {}
|
||||||
|
): Promise<OpsErrorDistributionResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
|
||||||
|
params,
|
||||||
|
signal: options.signal
|
||||||
|
})
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listErrorLogs(params: {
|
||||||
|
page?: number
|
||||||
|
page_size?: number
|
||||||
|
time_range?: string
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
phase?: string
|
||||||
|
q?: string
|
||||||
|
status_codes?: string
|
||||||
|
}): Promise<OpsErrorLogsResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
|
||||||
|
const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
|
||||||
|
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
|
||||||
|
const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Alert rules
|
||||||
|
export async function listAlertRules(): Promise<AlertRule[]> {
|
||||||
|
const { data } = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
|
||||||
|
const { data } = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
|
||||||
|
const { data } = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function deleteAlertRule(id: number): Promise<void> {
|
||||||
|
await apiClient.delete(`/admin/ops/alert-rules/${id}`)
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
|
||||||
|
const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } })
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Email notification config
|
||||||
|
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
|
||||||
|
const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
|
||||||
|
const { data } = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runtime settings (DB-backed)
|
||||||
|
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
|
||||||
|
const { data } = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
|
||||||
|
const { data } = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advanced settings (DB-backed)
|
||||||
|
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
|
||||||
|
const { data } = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
|
||||||
|
const { data } = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
export const opsAPI = {
|
||||||
|
getDashboardOverview,
|
||||||
|
getThroughputTrend,
|
||||||
|
getLatencyHistogram,
|
||||||
|
getErrorTrend,
|
||||||
|
getErrorDistribution,
|
||||||
|
getConcurrencyStats,
|
||||||
|
getAccountAvailabilityStats,
|
||||||
|
subscribeQPS,
|
||||||
|
listErrorLogs,
|
||||||
|
getErrorLogDetail,
|
||||||
|
retryErrorRequest,
|
||||||
|
listRequestDetails,
|
||||||
|
listAlertRules,
|
||||||
|
createAlertRule,
|
||||||
|
updateAlertRule,
|
||||||
|
deleteAlertRule,
|
||||||
|
listAlertEvents,
|
||||||
|
getEmailNotificationConfig,
|
||||||
|
updateEmailNotificationConfig,
|
||||||
|
getAlertRuntimeSettings,
|
||||||
|
updateAlertRuntimeSettings,
|
||||||
|
getAdvancedSettings,
|
||||||
|
updateAdvancedSettings
|
||||||
|
}
|
||||||
|
|
||||||
|
export default opsAPI
|
||||||
@@ -35,14 +35,29 @@ export interface SystemSettings {
|
|||||||
turnstile_enabled: boolean
|
turnstile_enabled: boolean
|
||||||
turnstile_site_key: string
|
turnstile_site_key: string
|
||||||
turnstile_secret_key_configured: boolean
|
turnstile_secret_key_configured: boolean
|
||||||
// LinuxDo Connect OAuth 登录(终端用户 SSO)
|
|
||||||
|
// LinuxDo Connect OAuth settings
|
||||||
linuxdo_connect_enabled: boolean
|
linuxdo_connect_enabled: boolean
|
||||||
linuxdo_connect_client_id: string
|
linuxdo_connect_client_id: string
|
||||||
linuxdo_connect_client_secret_configured: boolean
|
linuxdo_connect_client_secret_configured: boolean
|
||||||
linuxdo_connect_redirect_url: string
|
linuxdo_connect_redirect_url: string
|
||||||
|
|
||||||
|
// Model fallback configuration
|
||||||
|
enable_model_fallback: boolean
|
||||||
|
fallback_model_anthropic: string
|
||||||
|
fallback_model_openai: string
|
||||||
|
fallback_model_gemini: string
|
||||||
|
fallback_model_antigravity: string
|
||||||
|
|
||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
enable_identity_patch: boolean
|
enable_identity_patch: boolean
|
||||||
identity_patch_prompt: string
|
identity_patch_prompt: string
|
||||||
|
|
||||||
|
// Ops Monitoring (vNext)
|
||||||
|
ops_monitoring_enabled: boolean
|
||||||
|
ops_realtime_monitoring_enabled: boolean
|
||||||
|
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
|
||||||
|
ops_metrics_interval_seconds: number
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface UpdateSettingsRequest {
|
export interface UpdateSettingsRequest {
|
||||||
@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest {
|
|||||||
linuxdo_connect_client_id?: string
|
linuxdo_connect_client_id?: string
|
||||||
linuxdo_connect_client_secret?: string
|
linuxdo_connect_client_secret?: string
|
||||||
linuxdo_connect_redirect_url?: string
|
linuxdo_connect_redirect_url?: string
|
||||||
|
enable_model_fallback?: boolean
|
||||||
|
fallback_model_anthropic?: string
|
||||||
|
fallback_model_openai?: string
|
||||||
|
fallback_model_gemini?: string
|
||||||
|
fallback_model_antigravity?: string
|
||||||
enable_identity_patch?: boolean
|
enable_identity_patch?: boolean
|
||||||
identity_patch_prompt?: string
|
identity_patch_prompt?: string
|
||||||
|
ops_monitoring_enabled?: boolean
|
||||||
|
ops_realtime_monitoring_enabled?: boolean
|
||||||
|
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
|
||||||
|
ops_metrics_interval_seconds?: number
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
|
|||||||
return response
|
return response
|
||||||
},
|
},
|
||||||
(error: AxiosError<ApiResponse<unknown>>) => {
|
(error: AxiosError<ApiResponse<unknown>>) => {
|
||||||
|
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
|
||||||
|
// Otherwise we'd misclassify it as a generic "network error".
|
||||||
|
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
|
||||||
|
return Promise.reject(error)
|
||||||
|
}
|
||||||
|
|
||||||
// Handle common errors
|
// Handle common errors
|
||||||
if (error.response) {
|
if (error.response) {
|
||||||
const { status, data } = error.response
|
const { status, data } = error.response
|
||||||
|
const url = String(error.config?.url || '')
|
||||||
|
|
||||||
|
// Validate `data` shape to avoid HTML error pages breaking our error handling.
|
||||||
|
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
|
||||||
|
|
||||||
|
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
|
||||||
|
// from ops pages to avoid broken UI states.
|
||||||
|
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
|
||||||
|
try {
|
||||||
|
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
|
||||||
|
} catch {
|
||||||
|
// ignore localStorage failures
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
|
||||||
|
} catch {
|
||||||
|
// ignore event failures
|
||||||
|
}
|
||||||
|
|
||||||
|
if (window.location.pathname.startsWith('/admin/ops')) {
|
||||||
|
window.location.href = '/admin/settings'
|
||||||
|
}
|
||||||
|
|
||||||
|
return Promise.reject({
|
||||||
|
status,
|
||||||
|
code: 'OPS_DISABLED',
|
||||||
|
message: apiData.message || error.message,
|
||||||
|
url
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// 401: Unauthorized - clear token and redirect to login
|
// 401: Unauthorized - clear token and redirect to login
|
||||||
if (status === 401) {
|
if (status === 401) {
|
||||||
@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
|
|||||||
// Return structured error
|
// Return structured error
|
||||||
return Promise.reject({
|
return Promise.reject({
|
||||||
status,
|
status,
|
||||||
code: data?.code,
|
code: apiData.code,
|
||||||
message: data?.message || error.message
|
message: apiData.message || apiData.detail || error.message
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
44
frontend/src/components/common/HelpTooltip.vue
Normal file
44
frontend/src/components/common/HelpTooltip.vue
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
<script setup lang="ts">
|
||||||
|
import { ref } from 'vue'
|
||||||
|
|
||||||
|
defineProps<{
|
||||||
|
content?: string
|
||||||
|
}>()
|
||||||
|
|
||||||
|
const show = ref(false)
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<template>
|
||||||
|
<div
|
||||||
|
class="group relative ml-1 inline-flex items-center align-middle"
|
||||||
|
@mouseenter="show = true"
|
||||||
|
@mouseleave="show = false"
|
||||||
|
>
|
||||||
|
<!-- Trigger Icon -->
|
||||||
|
<slot name="trigger">
|
||||||
|
<svg
|
||||||
|
class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
|
||||||
|
fill="none"
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
stroke="currentColor"
|
||||||
|
stroke-width="2"
|
||||||
|
>
|
||||||
|
<path
|
||||||
|
stroke-linecap="round"
|
||||||
|
stroke-linejoin="round"
|
||||||
|
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
|
||||||
|
/>
|
||||||
|
</svg>
|
||||||
|
</slot>
|
||||||
|
|
||||||
|
<!-- Popover Content -->
|
||||||
|
<div
|
||||||
|
v-show="show"
|
||||||
|
class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
|
||||||
|
>
|
||||||
|
<slot>{{ content }}</slot>
|
||||||
|
<div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
@@ -67,12 +67,13 @@
|
|||||||
:aria-selected="isSelected(option)"
|
:aria-selected="isSelected(option)"
|
||||||
:aria-disabled="isOptionDisabled(option)"
|
:aria-disabled="isOptionDisabled(option)"
|
||||||
@click.stop="!isOptionDisabled(option) && selectOption(option)"
|
@click.stop="!isOptionDisabled(option) && selectOption(option)"
|
||||||
@mouseenter="focusedIndex = index"
|
@mouseenter="handleOptionMouseEnter(option, index)"
|
||||||
:class="[
|
:class="[
|
||||||
'select-option',
|
'select-option',
|
||||||
|
isGroupHeaderOption(option) && 'select-option-group',
|
||||||
isSelected(option) && 'select-option-selected',
|
isSelected(option) && 'select-option-selected',
|
||||||
isOptionDisabled(option) && 'select-option-disabled',
|
isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
|
||||||
focusedIndex === index && 'select-option-focused'
|
focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
|
||||||
]"
|
]"
|
||||||
>
|
>
|
||||||
<slot name="option" :option="option" :selected="isSelected(option)">
|
<slot name="option" :option="option" :selected="isSelected(option)">
|
||||||
@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const isGroupHeaderOption = (option: any): boolean => {
|
||||||
|
if (typeof option === 'object' && option !== null) {
|
||||||
|
return option.kind === 'group'
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
const selectedOption = computed(() => {
|
const selectedOption = computed(() => {
|
||||||
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
|
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
|
||||||
})
|
})
|
||||||
@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
|
|||||||
return getOptionValue(option) === props.modelValue
|
return getOptionValue(option) === props.modelValue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const findNextEnabledIndex = (startIndex: number): number => {
|
||||||
|
const opts = filteredOptions.value
|
||||||
|
if (opts.length === 0) return -1
|
||||||
|
for (let offset = 0; offset < opts.length; offset++) {
|
||||||
|
const idx = (startIndex + offset) % opts.length
|
||||||
|
if (!isOptionDisabled(opts[idx])) return idx
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
const findPrevEnabledIndex = (startIndex: number): number => {
|
||||||
|
const opts = filteredOptions.value
|
||||||
|
if (opts.length === 0) return -1
|
||||||
|
for (let offset = 0; offset < opts.length; offset++) {
|
||||||
|
const idx = (startIndex - offset + opts.length) % opts.length
|
||||||
|
if (!isOptionDisabled(opts[idx])) return idx
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
const handleOptionMouseEnter = (option: any, index: number) => {
|
||||||
|
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
|
||||||
|
focusedIndex.value = index
|
||||||
|
}
|
||||||
|
|
||||||
// Update trigger rect periodically while open to follow scroll/resize
|
// Update trigger rect periodically while open to follow scroll/resize
|
||||||
const updateTriggerRect = () => {
|
const updateTriggerRect = () => {
|
||||||
if (containerRef.value) {
|
if (containerRef.value) {
|
||||||
@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
|
|||||||
if (open) {
|
if (open) {
|
||||||
calculateDropdownPosition()
|
calculateDropdownPosition()
|
||||||
// Reset focused index to current selection or first item
|
// Reset focused index to current selection or first item
|
||||||
const selectedIdx = filteredOptions.value.findIndex(isSelected)
|
if (filteredOptions.value.length === 0) {
|
||||||
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0
|
focusedIndex.value = -1
|
||||||
|
} else {
|
||||||
|
const selectedIdx = filteredOptions.value.findIndex(isSelected)
|
||||||
|
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
|
||||||
|
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
|
||||||
|
? findNextEnabledIndex(initialIdx + 1)
|
||||||
|
: initialIdx
|
||||||
|
}
|
||||||
|
|
||||||
if (props.searchable) {
|
if (props.searchable) {
|
||||||
nextTick(() => searchInputRef.value?.focus())
|
nextTick(() => searchInputRef.value?.focus())
|
||||||
@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
|
|||||||
switch (e.key) {
|
switch (e.key) {
|
||||||
case 'ArrowDown':
|
case 'ArrowDown':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length
|
focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
|
||||||
scrollToFocused()
|
if (focusedIndex.value >= 0) scrollToFocused()
|
||||||
break
|
break
|
||||||
case 'ArrowUp':
|
case 'ArrowUp':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length
|
focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
|
||||||
scrollToFocused()
|
if (focusedIndex.value >= 0) scrollToFocused()
|
||||||
break
|
break
|
||||||
case 'Enter':
|
case 'Enter':
|
||||||
e.preventDefault()
|
e.preventDefault()
|
||||||
@@ -441,6 +481,17 @@ onUnmounted(() => {
|
|||||||
@apply cursor-not-allowed opacity-40;
|
@apply cursor-not-allowed opacity-40;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.select-dropdown-portal .select-option-group {
|
||||||
|
@apply cursor-default select-none;
|
||||||
|
@apply bg-gray-50 dark:bg-dark-900;
|
||||||
|
@apply text-[11px] font-bold uppercase tracking-wider;
|
||||||
|
@apply text-gray-500 dark:text-gray-400;
|
||||||
|
}
|
||||||
|
|
||||||
|
.select-dropdown-portal .select-option-group:hover {
|
||||||
|
@apply bg-gray-50 dark:bg-dark-900;
|
||||||
|
}
|
||||||
|
|
||||||
.select-dropdown-portal .select-option-label {
|
.select-dropdown-portal .select-option-label {
|
||||||
@apply flex-1 min-w-0 truncate text-left;
|
@apply flex-1 min-w-0 truncate text-left;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,8 +28,8 @@
|
|||||||
{{ platformDescription }}
|
{{ platformDescription }}
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<!-- Client Tabs (only for Antigravity platform) -->
|
<!-- Client Tabs -->
|
||||||
<div v-if="platform === 'antigravity'" class="border-b border-gray-200 dark:border-dark-700">
|
<div v-if="clientTabs.length" class="border-b border-gray-200 dark:border-dark-700">
|
||||||
<nav class="-mb-px flex space-x-6" aria-label="Client">
|
<nav class="-mb-px flex space-x-6" aria-label="Client">
|
||||||
<button
|
<button
|
||||||
v-for="tab in clientTabs"
|
v-for="tab in clientTabs"
|
||||||
@@ -51,7 +51,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- OS/Shell Tabs -->
|
<!-- OS/Shell Tabs -->
|
||||||
<div class="border-b border-gray-200 dark:border-dark-700">
|
<div v-if="showShellTabs" class="border-b border-gray-200 dark:border-dark-700">
|
||||||
<nav class="-mb-px flex space-x-4" aria-label="Tabs">
|
<nav class="-mb-px flex space-x-4" aria-label="Tabs">
|
||||||
<button
|
<button
|
||||||
v-for="tab in currentTabs"
|
v-for="tab in currentTabs"
|
||||||
@@ -111,7 +111,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Usage Note -->
|
<!-- Usage Note -->
|
||||||
<div class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
|
<div v-if="showPlatformNote" class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
|
||||||
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
|
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
|
||||||
<p class="text-sm text-blue-700 dark:text-blue-300">
|
<p class="text-sm text-blue-700 dark:text-blue-300">
|
||||||
{{ platformNote }}
|
{{ platformNote }}
|
||||||
@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard()
|
|||||||
|
|
||||||
const copiedIndex = ref<number | null>(null)
|
const copiedIndex = ref<number | null>(null)
|
||||||
const activeTab = ref<string>('unix')
|
const activeTab = ref<string>('unix')
|
||||||
const activeClientTab = ref<string>('claude') // Level 1 tab for antigravity platform
|
const activeClientTab = ref<string>('claude')
|
||||||
|
|
||||||
// Reset tabs when platform changes
|
// Reset tabs when platform changes
|
||||||
watch(() => props.platform, (newPlatform) => {
|
const defaultClientTab = computed(() => {
|
||||||
activeTab.value = 'unix'
|
switch (props.platform) {
|
||||||
if (newPlatform === 'antigravity') {
|
case 'openai':
|
||||||
activeClientTab.value = 'claude'
|
return 'codex'
|
||||||
|
case 'gemini':
|
||||||
|
return 'gemini'
|
||||||
|
case 'antigravity':
|
||||||
|
return 'claude'
|
||||||
|
default:
|
||||||
|
return 'claude'
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
// Reset shell tab when client changes (for antigravity)
|
watch(() => props.platform, () => {
|
||||||
|
activeTab.value = 'unix'
|
||||||
|
activeClientTab.value = defaultClientTab.value
|
||||||
|
}, { immediate: true })
|
||||||
|
|
||||||
|
// Reset shell tab when client changes
|
||||||
watch(activeClientTab, () => {
|
watch(activeClientTab, () => {
|
||||||
activeTab.value = 'unix'
|
activeTab.value = 'unix'
|
||||||
})
|
})
|
||||||
@@ -251,11 +262,32 @@ const SparkleIcon = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Client tabs for Antigravity platform (Level 1)
|
const clientTabs = computed((): TabConfig[] => {
|
||||||
const clientTabs = computed((): TabConfig[] => [
|
if (!props.platform) return []
|
||||||
{ id: 'claude', label: t('keys.useKeyModal.antigravity.claudeCode'), icon: TerminalIcon },
|
switch (props.platform) {
|
||||||
{ id: 'gemini', label: t('keys.useKeyModal.antigravity.geminiCli'), icon: SparkleIcon }
|
case 'openai':
|
||||||
])
|
return [
|
||||||
|
{ id: 'codex', label: t('keys.useKeyModal.cliTabs.codexCli'), icon: TerminalIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
case 'gemini':
|
||||||
|
return [
|
||||||
|
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
case 'antigravity':
|
||||||
|
return [
|
||||||
|
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
|
||||||
|
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
default:
|
||||||
|
return [
|
||||||
|
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
|
||||||
|
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
// Shell tabs (3 types for environment variable based configs)
|
// Shell tabs (3 types for environment variable based configs)
|
||||||
const shellTabs: TabConfig[] = [
|
const shellTabs: TabConfig[] = [
|
||||||
@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [
|
|||||||
{ id: 'windows', label: 'Windows', icon: WindowsIcon }
|
{ id: 'windows', label: 'Windows', icon: WindowsIcon }
|
||||||
]
|
]
|
||||||
|
|
||||||
|
const showShellTabs = computed(() => activeClientTab.value !== 'opencode')
|
||||||
|
|
||||||
const currentTabs = computed(() => {
|
const currentTabs = computed(() => {
|
||||||
|
if (!showShellTabs.value) return []
|
||||||
if (props.platform === 'openai') {
|
if (props.platform === 'openai') {
|
||||||
return openaiTabs // 2 tabs: unix, windows
|
return openaiTabs
|
||||||
}
|
}
|
||||||
// All other platforms (anthropic, gemini, antigravity) use shell tabs
|
|
||||||
return shellTabs
|
return shellTabs
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -308,6 +342,8 @@ const platformNote = computed(() => {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
const showPlatformNote = computed(() => activeClientTab.value !== 'opencode')
|
||||||
|
|
||||||
const escapeHtml = (value: string) => value
|
const escapeHtml = (value: string) => value
|
||||||
.replace(/&/g, '&')
|
.replace(/&/g, '&')
|
||||||
.replace(/</g, '<')
|
.replace(/</g, '<')
|
||||||
@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value)
|
|||||||
const currentFiles = computed((): FileConfig[] => {
|
const currentFiles = computed((): FileConfig[] => {
|
||||||
const baseUrl = props.baseUrl || window.location.origin
|
const baseUrl = props.baseUrl || window.location.origin
|
||||||
const apiKey = props.apiKey
|
const apiKey = props.apiKey
|
||||||
|
const baseRoot = baseUrl.replace(/\/v1\/?$/, '').replace(/\/+$/, '')
|
||||||
|
const ensureV1 = (value: string) => {
|
||||||
|
const trimmed = value.replace(/\/+$/, '')
|
||||||
|
return trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`
|
||||||
|
}
|
||||||
|
const apiBase = ensureV1(baseRoot)
|
||||||
|
const antigravityBase = ensureV1(`${baseRoot}/antigravity`)
|
||||||
|
const antigravityGeminiBase = (() => {
|
||||||
|
const trimmed = `${baseRoot}/antigravity`.replace(/\/+$/, '')
|
||||||
|
return trimmed.endsWith('/v1beta') ? trimmed : `${trimmed}/v1beta`
|
||||||
|
})()
|
||||||
|
|
||||||
|
if (activeClientTab.value === 'opencode') {
|
||||||
|
switch (props.platform) {
|
||||||
|
case 'anthropic':
|
||||||
|
return [generateOpenCodeConfig('anthropic', apiBase, apiKey)]
|
||||||
|
case 'openai':
|
||||||
|
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
|
||||||
|
case 'gemini':
|
||||||
|
return [generateOpenCodeConfig('gemini', apiBase, apiKey)]
|
||||||
|
case 'antigravity':
|
||||||
|
return [
|
||||||
|
generateOpenCodeConfig('antigravity-claude', antigravityBase, apiKey, 'opencode.json (Claude)'),
|
||||||
|
generateOpenCodeConfig('antigravity-gemini', antigravityGeminiBase, apiKey, 'opencode.json (Gemini)')
|
||||||
|
]
|
||||||
|
default:
|
||||||
|
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
switch (props.platform) {
|
switch (props.platform) {
|
||||||
case 'openai':
|
case 'openai':
|
||||||
@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => {
|
|||||||
case 'gemini':
|
case 'gemini':
|
||||||
return [generateGeminiCliContent(baseUrl, apiKey)]
|
return [generateGeminiCliContent(baseUrl, apiKey)]
|
||||||
case 'antigravity':
|
case 'antigravity':
|
||||||
// Both Claude Code and Gemini CLI need /antigravity suffix for antigravity platform
|
if (activeClientTab.value === 'gemini') {
|
||||||
if (activeClientTab.value === 'claude') {
|
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
|
||||||
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
|
|
||||||
}
|
}
|
||||||
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
|
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
|
||||||
default: // anthropic
|
default:
|
||||||
return generateAnthropicFiles(baseUrl, apiKey)
|
return generateAnthropicFiles(baseUrl, apiKey)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -456,6 +520,76 @@ requires_openai_auth = true`
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function generateOpenCodeConfig(platform: string, baseUrl: string, apiKey: string, pathLabel?: string): FileConfig {
|
||||||
|
const provider: Record<string, any> = {
|
||||||
|
[platform]: {
|
||||||
|
options: {
|
||||||
|
baseURL: baseUrl,
|
||||||
|
apiKey,
|
||||||
|
...(platform === 'openai' ? { store: false } : {})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const openaiModels = {
|
||||||
|
'gpt-5.2-codex': {
|
||||||
|
name: 'GPT-5.2 Codex',
|
||||||
|
variants: {
|
||||||
|
low: {},
|
||||||
|
medium: {},
|
||||||
|
high: {},
|
||||||
|
xhigh: {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const geminiModels = {
|
||||||
|
'gemini-3-pro-high': { name: 'Gemini 3 Pro High' },
|
||||||
|
'gemini-3-pro-low': { name: 'Gemini 3 Pro Low' },
|
||||||
|
'gemini-3-pro-preview': { name: 'Gemini 3 Pro Preview' },
|
||||||
|
'gemini-3-pro-image': { name: 'Gemini 3 Pro Image' },
|
||||||
|
'gemini-3-flash': { name: 'Gemini 3 Flash' },
|
||||||
|
'gemini-2.5-flash-thinking': { name: 'Gemini 2.5 Flash Thinking' },
|
||||||
|
'gemini-2.5-flash': { name: 'Gemini 2.5 Flash' },
|
||||||
|
'gemini-2.5-flash-lite': { name: 'Gemini 2.5 Flash Lite' }
|
||||||
|
}
|
||||||
|
const claudeModels = {
|
||||||
|
'claude-opus-4-5-thinking': { name: 'Claude Opus 4.5 Thinking' },
|
||||||
|
'claude-sonnet-4-5-thinking': { name: 'Claude Sonnet 4.5 Thinking' },
|
||||||
|
'claude-sonnet-4-5': { name: 'Claude Sonnet 4.5' }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (platform === 'gemini') {
|
||||||
|
provider[platform].npm = '@ai-sdk/google'
|
||||||
|
provider[platform].models = geminiModels
|
||||||
|
} else if (platform === 'anthropic') {
|
||||||
|
provider[platform].npm = '@ai-sdk/anthropic'
|
||||||
|
} else if (platform === 'antigravity-claude') {
|
||||||
|
provider[platform].npm = '@ai-sdk/anthropic'
|
||||||
|
provider[platform].name = 'Antigravity (Claude)'
|
||||||
|
provider[platform].models = claudeModels
|
||||||
|
} else if (platform === 'antigravity-gemini') {
|
||||||
|
provider[platform].npm = '@ai-sdk/google'
|
||||||
|
provider[platform].name = 'Antigravity (Gemini)'
|
||||||
|
provider[platform].models = geminiModels
|
||||||
|
} else if (platform === 'openai') {
|
||||||
|
provider[platform].models = openaiModels
|
||||||
|
}
|
||||||
|
|
||||||
|
const content = JSON.stringify(
|
||||||
|
{
|
||||||
|
provider,
|
||||||
|
$schema: 'https://opencode.ai/config.json'
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
path: pathLabel ?? 'opencode.json',
|
||||||
|
content,
|
||||||
|
hint: t('keys.useKeyModal.opencode.hint')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const copyContent = async (content: string, index: number) => {
|
const copyContent = async (content: string, index: number) => {
|
||||||
const success = await clipboardCopy(content, t('keys.copied'))
|
const success = await clipboardCopy(content, t('keys.copied'))
|
||||||
if (success) {
|
if (success) {
|
||||||
|
|||||||
@@ -144,10 +144,10 @@
|
|||||||
</template>
|
</template>
|
||||||
|
|
||||||
<script setup lang="ts">
|
<script setup lang="ts">
|
||||||
import { computed, h, ref } from 'vue'
|
import { computed, h, onMounted, ref, watch } from 'vue'
|
||||||
import { useRoute } from 'vue-router'
|
import { useRoute } from 'vue-router'
|
||||||
import { useI18n } from 'vue-i18n'
|
import { useI18n } from 'vue-i18n'
|
||||||
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
|
import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
|
||||||
import VersionBadge from '@/components/common/VersionBadge.vue'
|
import VersionBadge from '@/components/common/VersionBadge.vue'
|
||||||
|
|
||||||
const { t } = useI18n()
|
const { t } = useI18n()
|
||||||
@@ -156,6 +156,7 @@ const route = useRoute()
|
|||||||
const appStore = useAppStore()
|
const appStore = useAppStore()
|
||||||
const authStore = useAuthStore()
|
const authStore = useAuthStore()
|
||||||
const onboardingStore = useOnboardingStore()
|
const onboardingStore = useOnboardingStore()
|
||||||
|
const adminSettingsStore = useAdminSettingsStore()
|
||||||
|
|
||||||
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
|
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
|
||||||
const mobileOpen = computed(() => appStore.mobileOpen)
|
const mobileOpen = computed(() => appStore.mobileOpen)
|
||||||
@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
|
|||||||
const adminNavItems = computed(() => {
|
const adminNavItems = computed(() => {
|
||||||
const baseItems = [
|
const baseItems = [
|
||||||
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
|
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
|
||||||
|
...(adminSettingsStore.opsMonitoringEnabled
|
||||||
|
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
|
||||||
|
: []),
|
||||||
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
|
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
|
||||||
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
|
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
|
||||||
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
|
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
|
||||||
@@ -511,6 +515,23 @@ if (
|
|||||||
isDark.value = true
|
isDark.value = true
|
||||||
document.documentElement.classList.add('dark')
|
document.documentElement.classList.add('dark')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fetch admin settings (for feature-gated nav items like Ops).
|
||||||
|
watch(
|
||||||
|
isAdmin,
|
||||||
|
(v) => {
|
||||||
|
if (v) {
|
||||||
|
adminSettingsStore.fetch()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ immediate: true }
|
||||||
|
)
|
||||||
|
|
||||||
|
onMounted(() => {
|
||||||
|
if (isAdmin.value) {
|
||||||
|
adminSettingsStore.fetch()
|
||||||
|
}
|
||||||
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style scoped>
|
<style scoped>
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ export default {
|
|||||||
noData: 'No data',
|
noData: 'No data',
|
||||||
success: 'Success',
|
success: 'Success',
|
||||||
error: 'Error',
|
error: 'Error',
|
||||||
|
critical: 'Critical',
|
||||||
warning: 'Warning',
|
warning: 'Warning',
|
||||||
info: 'Info',
|
info: 'Info',
|
||||||
active: 'Active',
|
active: 'Active',
|
||||||
@@ -145,9 +146,11 @@ export default {
|
|||||||
copiedToClipboard: 'Copied to clipboard',
|
copiedToClipboard: 'Copied to clipboard',
|
||||||
copyFailed: 'Failed to copy',
|
copyFailed: 'Failed to copy',
|
||||||
contactSupport: 'Contact Support',
|
contactSupport: 'Contact Support',
|
||||||
|
add: 'Add',
|
||||||
|
invalidEmail: 'Please enter a valid email address',
|
||||||
optional: 'optional',
|
optional: 'optional',
|
||||||
selectOption: 'Select an option',
|
selectOption: 'Select an option',
|
||||||
searchPlaceholder: 'Search...',
|
searchPlaceholder: 'Search...',
|
||||||
noOptionsFound: 'No options found',
|
noOptionsFound: 'No options found',
|
||||||
noGroupsAvailable: 'No groups available',
|
noGroupsAvailable: 'No groups available',
|
||||||
unknownError: 'Unknown error occurred',
|
unknownError: 'Unknown error occurred',
|
||||||
@@ -178,6 +181,7 @@ export default {
|
|||||||
accounts: 'Accounts',
|
accounts: 'Accounts',
|
||||||
proxies: 'Proxies',
|
proxies: 'Proxies',
|
||||||
redeemCodes: 'Redeem Codes',
|
redeemCodes: 'Redeem Codes',
|
||||||
|
ops: 'Ops',
|
||||||
promoCodes: 'Promo Codes',
|
promoCodes: 'Promo Codes',
|
||||||
settings: 'Settings',
|
settings: 'Settings',
|
||||||
myAccount: 'My Account',
|
myAccount: 'My Account',
|
||||||
@@ -364,6 +368,12 @@ export default {
|
|||||||
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
|
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
|
||||||
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
|
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
|
||||||
},
|
},
|
||||||
|
cliTabs: {
|
||||||
|
claudeCode: 'Claude Code',
|
||||||
|
geminiCli: 'Gemini CLI',
|
||||||
|
codexCli: 'Codex CLI',
|
||||||
|
opencode: 'OpenCode',
|
||||||
|
},
|
||||||
antigravity: {
|
antigravity: {
|
||||||
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
|
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
|
||||||
claudeCode: 'Claude Code',
|
claudeCode: 'Claude Code',
|
||||||
@@ -376,6 +386,11 @@ export default {
|
|||||||
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
|
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
|
||||||
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
|
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
|
||||||
},
|
},
|
||||||
|
opencode: {
|
||||||
|
title: 'OpenCode Example',
|
||||||
|
subtitle: 'opencode.json',
|
||||||
|
hint: 'This is a group configuration example. Adjust model and options as needed.',
|
||||||
|
},
|
||||||
},
|
},
|
||||||
customKeyLabel: 'Custom Key',
|
customKeyLabel: 'Custom Key',
|
||||||
customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
|
customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
|
||||||
@@ -1826,6 +1841,524 @@ export default {
|
|||||||
ipAddress: 'IP'
|
ipAddress: 'IP'
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Ops Monitoring
|
||||||
|
ops: {
|
||||||
|
title: 'Ops Monitoring',
|
||||||
|
description: 'Operational monitoring and troubleshooting',
|
||||||
|
// Dashboard
|
||||||
|
systemHealth: 'System Health',
|
||||||
|
overview: 'Overview',
|
||||||
|
noSystemMetrics: 'No system metrics collected yet.',
|
||||||
|
collectedAt: 'Collected at:',
|
||||||
|
window: 'window',
|
||||||
|
cpu: 'CPU',
|
||||||
|
memory: 'Memory',
|
||||||
|
db: 'DB',
|
||||||
|
redis: 'Redis',
|
||||||
|
goroutines: 'Goroutines',
|
||||||
|
jobs: 'Jobs',
|
||||||
|
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
|
||||||
|
active: 'active',
|
||||||
|
idle: 'idle',
|
||||||
|
waiting: 'waiting',
|
||||||
|
conns: 'conns',
|
||||||
|
queue: 'queue',
|
||||||
|
ok: 'ok',
|
||||||
|
lastRun: 'last_run:',
|
||||||
|
lastSuccess: 'last_success:',
|
||||||
|
lastError: 'last_error:',
|
||||||
|
noData: 'No data.',
|
||||||
|
loadingText: 'loading',
|
||||||
|
ready: 'ready',
|
||||||
|
requestsTotal: 'Requests (total)',
|
||||||
|
slaScope: 'SLA scope:',
|
||||||
|
tokens: 'Tokens',
|
||||||
|
tps: 'TPS:',
|
||||||
|
current: 'current',
|
||||||
|
peak: 'peak',
|
||||||
|
average: 'average',
|
||||||
|
totalRequests: 'Total Requests',
|
||||||
|
avgQps: 'Avg QPS',
|
||||||
|
avgTps: 'Avg TPS',
|
||||||
|
avgLatency: 'Avg Latency',
|
||||||
|
avgTtft: 'Avg TTFT',
|
||||||
|
exceptions: 'Exceptions',
|
||||||
|
requestErrors: 'Request Errors',
|
||||||
|
errorCount: 'Error Count',
|
||||||
|
upstreamErrors: 'Upstream Errors',
|
||||||
|
errorCountExcl429529: 'Error Count (excl 429/529)',
|
||||||
|
sla: 'SLA (excl business limits)',
|
||||||
|
businessLimited: 'business_limited:',
|
||||||
|
errors: 'Errors',
|
||||||
|
errorRate: 'error_rate:',
|
||||||
|
upstreamRate: 'upstream_rate:',
|
||||||
|
latencyDuration: 'Latency (duration_ms)',
|
||||||
|
ttftLabel: 'TTFT (first_token_ms)',
|
||||||
|
p50: 'p50:',
|
||||||
|
p90: 'p90:',
|
||||||
|
p95: 'p95:',
|
||||||
|
p99: 'p99:',
|
||||||
|
avg: 'avg:',
|
||||||
|
max: 'max:',
|
||||||
|
qps: 'QPS',
|
||||||
|
requests: 'Requests',
|
||||||
|
upstream: 'Upstream',
|
||||||
|
client: 'Client',
|
||||||
|
system: 'System',
|
||||||
|
other: 'Other',
|
||||||
|
errorsSla: 'Errors (SLA scope)',
|
||||||
|
upstreamExcl429529: 'Upstream (excl 429/529)',
|
||||||
|
failedToLoadData: 'Failed to load ops data.',
|
||||||
|
failedToLoadOverview: 'Failed to load overview',
|
||||||
|
failedToLoadThroughputTrend: 'Failed to load throughput trend',
|
||||||
|
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
|
||||||
|
failedToLoadErrorTrend: 'Failed to load error trend',
|
||||||
|
failedToLoadErrorDistribution: 'Failed to load error distribution',
|
||||||
|
failedToLoadErrorDetail: 'Failed to load error detail',
|
||||||
|
retryFailed: 'Retry failed',
|
||||||
|
tpsK: 'TPS (K)',
|
||||||
|
top: 'Top:',
|
||||||
|
throughputTrend: 'Throughput Trend',
|
||||||
|
latencyHistogram: 'Latency Histogram',
|
||||||
|
errorTrend: 'Error Trend',
|
||||||
|
errorDistribution: 'Error Distribution',
|
||||||
|
// Health Score & Diagnosis
|
||||||
|
health: 'Health',
|
||||||
|
healthCondition: 'Health Condition',
|
||||||
|
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
|
||||||
|
healthyStatus: 'Healthy',
|
||||||
|
riskyStatus: 'At Risk',
|
||||||
|
idleStatus: 'Idle',
|
||||||
|
timeRange: {
|
||||||
|
'5m': 'Last 5 minutes',
|
||||||
|
'30m': 'Last 30 minutes',
|
||||||
|
'1h': 'Last 1 hour',
|
||||||
|
'6h': 'Last 6 hours',
|
||||||
|
'24h': 'Last 24 hours'
|
||||||
|
},
|
||||||
|
diagnosis: {
|
||||||
|
title: 'Smart Diagnosis',
|
||||||
|
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||||
|
idle: 'System is currently idle',
|
||||||
|
idleImpact: 'No active traffic',
|
||||||
|
// Resource diagnostics
|
||||||
|
dbDown: 'Database connection failed',
|
||||||
|
dbDownImpact: 'All database operations will fail',
|
||||||
|
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
|
||||||
|
redisDown: 'Redis connection failed',
|
||||||
|
redisDownImpact: 'Cache functionality degraded, performance may decline',
|
||||||
|
redisDownAction: 'Check Redis service status and network connectivity',
|
||||||
|
cpuCritical: 'CPU usage critically high ({usage}%)',
|
||||||
|
cpuCriticalImpact: 'System response slowing, may affect all requests',
|
||||||
|
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
|
||||||
|
cpuHigh: 'CPU usage elevated ({usage}%)',
|
||||||
|
cpuHighImpact: 'System load is high, needs attention',
|
||||||
|
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
|
||||||
|
memoryCritical: 'Memory usage critically high ({usage}%)',
|
||||||
|
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
|
||||||
|
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
|
||||||
|
memoryHigh: 'Memory usage elevated ({usage}%)',
|
||||||
|
memoryHighImpact: 'Memory pressure is high, needs attention',
|
||||||
|
memoryHighAction: 'Monitor memory trends, check for memory leaks',
|
||||||
|
// Latency diagnostics
|
||||||
|
latencyCritical: 'Response latency critically high ({latency}ms)',
|
||||||
|
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
|
||||||
|
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
|
||||||
|
latencyHigh: 'Response latency elevated ({latency}ms)',
|
||||||
|
latencyHighImpact: 'User experience degraded, needs optimization',
|
||||||
|
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
|
||||||
|
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
|
||||||
|
ttftHighImpact: 'User perceived latency increased',
|
||||||
|
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
|
||||||
|
// Error rate diagnostics
|
||||||
|
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
|
||||||
|
upstreamCriticalImpact: 'May affect many user requests',
|
||||||
|
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
|
||||||
|
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
|
||||||
|
upstreamHighImpact: 'Recommend checking upstream service status',
|
||||||
|
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
|
||||||
|
errorHigh: 'Error rate too high ({rate}%)',
|
||||||
|
errorHighImpact: 'Many requests failing',
|
||||||
|
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
|
||||||
|
errorElevated: 'Error rate elevated ({rate}%)',
|
||||||
|
errorElevatedImpact: 'Recommend checking error logs',
|
||||||
|
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
|
||||||
|
// SLA diagnostics
|
||||||
|
slaCritical: 'SLA critically below target ({sla}%)',
|
||||||
|
slaCriticalImpact: 'User experience severely degraded',
|
||||||
|
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
|
||||||
|
slaLow: 'SLA below target ({sla}%)',
|
||||||
|
slaLowImpact: 'Service quality needs attention',
|
||||||
|
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
|
||||||
|
// Health score diagnostics
|
||||||
|
healthCritical: 'Overall health score critically low ({score})',
|
||||||
|
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
|
||||||
|
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
|
||||||
|
healthLow: 'Overall health score low ({score})',
|
||||||
|
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
|
||||||
|
healthLowAction: 'Monitor metric trends, prevent issue escalation',
|
||||||
|
healthy: 'All system metrics normal',
|
||||||
|
healthyImpact: 'Service running stable'
|
||||||
|
},
|
||||||
|
// Error Log
|
||||||
|
errorLog: {
|
||||||
|
timeId: 'Time / ID',
|
||||||
|
context: 'Context',
|
||||||
|
status: 'Status',
|
||||||
|
message: 'Message',
|
||||||
|
latency: 'Latency',
|
||||||
|
action: 'Action',
|
||||||
|
noErrors: 'No errors in this window.',
|
||||||
|
grp: 'GRP:',
|
||||||
|
acc: 'ACC:',
|
||||||
|
details: 'Details',
|
||||||
|
phase: 'Phase'
|
||||||
|
},
|
||||||
|
// Error Details Modal
|
||||||
|
errorDetails: {
|
||||||
|
upstreamErrors: 'Upstream Errors',
|
||||||
|
requestErrors: 'Request Errors',
|
||||||
|
total: 'Total:',
|
||||||
|
searchPlaceholder: 'Search request_id / client_request_id / message',
|
||||||
|
accountIdPlaceholder: 'account_id'
|
||||||
|
},
|
||||||
|
// Error Detail Modal
|
||||||
|
errorDetail: {
|
||||||
|
loading: 'Loading…',
|
||||||
|
requestId: 'Request ID',
|
||||||
|
time: 'Time',
|
||||||
|
phase: 'Phase',
|
||||||
|
status: 'Status',
|
||||||
|
message: 'Message',
|
||||||
|
basicInfo: 'Basic Info',
|
||||||
|
platform: 'Platform',
|
||||||
|
model: 'Model',
|
||||||
|
latency: 'Latency',
|
||||||
|
ttft: 'TTFT',
|
||||||
|
businessLimited: 'Business Limited',
|
||||||
|
requestPath: 'Request Path',
|
||||||
|
timings: 'Timings',
|
||||||
|
auth: 'Auth',
|
||||||
|
routing: 'Routing',
|
||||||
|
upstream: 'Upstream',
|
||||||
|
response: 'Response',
|
||||||
|
retry: 'Retry',
|
||||||
|
retryClient: 'Retry (Client)',
|
||||||
|
retryUpstream: 'Retry (Upstream pinned)',
|
||||||
|
pinnedAccountId: 'Pinned account_id',
|
||||||
|
retryNotes: 'Retry Notes',
|
||||||
|
requestBody: 'Request Body',
|
||||||
|
errorBody: 'Error Body',
|
||||||
|
trimmed: 'trimmed',
|
||||||
|
confirmRetry: 'Confirm Retry',
|
||||||
|
retrySuccess: 'Retry succeeded',
|
||||||
|
retryFailed: 'Retry failed',
|
||||||
|
na: 'N/A',
|
||||||
|
retryHint: 'Retry will resend the request with the same parameters',
|
||||||
|
retryClientHint: 'Use client retry (no account pinning)',
|
||||||
|
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
|
||||||
|
pinnedAccountIdHint: '(auto from error log)',
|
||||||
|
retryNote1: 'Retry will use the same request body and parameters',
|
||||||
|
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
|
||||||
|
retryNote3: 'Client retry will reselect an account',
|
||||||
|
confirmRetryMessage: 'Confirm retry this request?',
|
||||||
|
confirmRetryHint: 'Will resend with the same request parameters'
|
||||||
|
},
|
||||||
|
requestDetails: {
|
||||||
|
title: 'Request Details',
|
||||||
|
details: 'Details',
|
||||||
|
rangeLabel: 'Window: {range}',
|
||||||
|
rangeMinutes: '{n} minutes',
|
||||||
|
rangeHours: '{n} hours',
|
||||||
|
empty: 'No requests in this window.',
|
||||||
|
emptyHint: 'Try a different time range or remove filters.',
|
||||||
|
failedToLoad: 'Failed to load request details',
|
||||||
|
requestIdCopied: 'Request ID copied',
|
||||||
|
copyFailed: 'Copy failed',
|
||||||
|
copy: 'Copy',
|
||||||
|
viewError: 'View Error',
|
||||||
|
kind: {
|
||||||
|
success: 'SUCCESS',
|
||||||
|
error: 'ERROR'
|
||||||
|
},
|
||||||
|
table: {
|
||||||
|
time: 'Time',
|
||||||
|
kind: 'Kind',
|
||||||
|
platform: 'Platform',
|
||||||
|
model: 'Model',
|
||||||
|
duration: 'Duration',
|
||||||
|
status: 'Status',
|
||||||
|
requestId: 'Request ID',
|
||||||
|
actions: 'Actions'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
alertEvents: {
|
||||||
|
title: 'Alert Events',
|
||||||
|
description: 'Recent alert firing/resolution records (email-only)',
|
||||||
|
loading: 'Loading...',
|
||||||
|
empty: 'No alert events',
|
||||||
|
loadFailed: 'Failed to load alert events',
|
||||||
|
table: {
|
||||||
|
time: 'Time',
|
||||||
|
status: 'Status',
|
||||||
|
severity: 'Severity',
|
||||||
|
title: 'Title',
|
||||||
|
metric: 'Metric / Threshold',
|
||||||
|
email: 'Email Sent'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
alertRules: {
|
||||||
|
title: 'Alert Rules',
|
||||||
|
description: 'Create and manage threshold-based system alerts (email-only)',
|
||||||
|
loading: 'Loading...',
|
||||||
|
empty: 'No alert rules',
|
||||||
|
loadFailed: 'Failed to load alert rules',
|
||||||
|
saveFailed: 'Failed to save alert rule',
|
||||||
|
deleteFailed: 'Failed to delete alert rule',
|
||||||
|
create: 'Create Rule',
|
||||||
|
createTitle: 'Create Alert Rule',
|
||||||
|
editTitle: 'Edit Alert Rule',
|
||||||
|
deleteConfirmTitle: 'Delete this rule?',
|
||||||
|
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
|
||||||
|
metricGroups: {
|
||||||
|
system: 'System Metrics',
|
||||||
|
group: 'Group-level Metrics (requires group_id)',
|
||||||
|
account: 'Account-level Metrics'
|
||||||
|
},
|
||||||
|
metrics: {
|
||||||
|
successRate: 'Success Rate (%)',
|
||||||
|
errorRate: 'Error Rate (%)',
|
||||||
|
upstreamErrorRate: 'Upstream Error Rate (%)',
|
||||||
|
p95: 'P95 Latency (ms)',
|
||||||
|
p99: 'P99 Latency (ms)',
|
||||||
|
cpu: 'CPU Usage (%)',
|
||||||
|
memory: 'Memory Usage (%)',
|
||||||
|
queueDepth: 'Concurrency Queue Depth',
|
||||||
|
groupAvailableAccounts: 'Group Available Accounts',
|
||||||
|
groupAvailableRatio: 'Group Available Ratio (%)',
|
||||||
|
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
|
||||||
|
accountRateLimitedCount: 'Rate-limited Accounts',
|
||||||
|
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
|
||||||
|
accountErrorRatio: 'Error Account Ratio (%)',
|
||||||
|
overloadAccountCount: 'Overloaded Accounts'
|
||||||
|
},
|
||||||
|
metricDescriptions: {
|
||||||
|
successRate: 'Percentage of successful requests in the window (0-100).',
|
||||||
|
errorRate: 'Percentage of failed requests in the window (0-100).',
|
||||||
|
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
|
||||||
|
p95: 'P95 request latency within the window (ms).',
|
||||||
|
p99: 'P99 request latency within the window (ms).',
|
||||||
|
cpu: 'Current instance CPU usage (0-100).',
|
||||||
|
memory: 'Current instance memory usage (0-100).',
|
||||||
|
queueDepth: 'Concurrency queue depth within the window (queued requests).',
|
||||||
|
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
|
||||||
|
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
|
||||||
|
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
|
||||||
|
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
|
||||||
|
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
|
||||||
|
accountErrorRatio: 'Error account ratio within the window (0-100).',
|
||||||
|
overloadAccountCount: 'Number of overloaded accounts within the window.'
|
||||||
|
},
|
||||||
|
hints: {
|
||||||
|
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
|
||||||
|
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
|
||||||
|
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
|
||||||
|
},
|
||||||
|
table: {
|
||||||
|
name: 'Name',
|
||||||
|
metric: 'Metric',
|
||||||
|
severity: 'Severity',
|
||||||
|
enabled: 'Enabled',
|
||||||
|
actions: 'Actions'
|
||||||
|
},
|
||||||
|
form: {
|
||||||
|
name: 'Name',
|
||||||
|
description: 'Description',
|
||||||
|
metric: 'Metric',
|
||||||
|
operator: 'Operator',
|
||||||
|
groupId: 'Group (group_id)',
|
||||||
|
groupPlaceholder: 'Select a group',
|
||||||
|
allGroups: 'All groups',
|
||||||
|
threshold: 'Threshold',
|
||||||
|
severity: 'Severity',
|
||||||
|
window: 'Window (minutes)',
|
||||||
|
sustained: 'Sustained (samples)',
|
||||||
|
cooldown: 'Cooldown (minutes)',
|
||||||
|
enabled: 'Enabled',
|
||||||
|
notifyEmail: 'Send email notifications'
|
||||||
|
},
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid rule',
|
||||||
|
nameRequired: 'Name is required',
|
||||||
|
metricRequired: 'Metric is required',
|
||||||
|
groupIdRequired: 'group_id is required for group-level metrics',
|
||||||
|
operatorRequired: 'Operator is required',
|
||||||
|
thresholdRequired: 'Threshold must be a number',
|
||||||
|
windowRange: 'Window must be one of: 1, 5, 60 minutes',
|
||||||
|
sustainedRange: 'Sustained must be between 1 and 1440 samples',
|
||||||
|
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
runtime: {
|
||||||
|
title: 'Ops Runtime Settings',
|
||||||
|
description: 'Stored in database; changes take effect without editing config files.',
|
||||||
|
loading: 'Loading...',
|
||||||
|
noData: 'No runtime settings available',
|
||||||
|
loadFailed: 'Failed to load runtime settings',
|
||||||
|
saveSuccess: 'Runtime settings saved',
|
||||||
|
saveFailed: 'Failed to save runtime settings',
|
||||||
|
alertTitle: 'Alert Evaluator',
|
||||||
|
groupAvailabilityTitle: 'Group Availability Monitor',
|
||||||
|
evalIntervalSeconds: 'Evaluation Interval (seconds)',
|
||||||
|
silencing: {
|
||||||
|
title: 'Alert Silencing (Maintenance Mode)',
|
||||||
|
enabled: 'Enable silencing',
|
||||||
|
globalUntil: 'Silence until (RFC3339)',
|
||||||
|
untilPlaceholder: '2026-01-05T00:00:00Z',
|
||||||
|
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
|
||||||
|
reason: 'Reason',
|
||||||
|
reasonPlaceholder: 'e.g., planned maintenance',
|
||||||
|
entries: {
|
||||||
|
title: 'Advanced: targeted silencing',
|
||||||
|
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
|
||||||
|
add: 'Add Entry',
|
||||||
|
empty: 'No targeted entries',
|
||||||
|
entryTitle: 'Entry #{n}',
|
||||||
|
ruleId: 'Rule ID (optional)',
|
||||||
|
ruleIdPlaceholder: 'e.g., 1',
|
||||||
|
severities: 'Severities (optional)',
|
||||||
|
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
|
||||||
|
until: 'Until (RFC3339)',
|
||||||
|
reason: 'Reason',
|
||||||
|
validation: {
|
||||||
|
untilRequired: 'Entry until time is required',
|
||||||
|
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
|
||||||
|
ruleIdPositive: 'Entry rule_id must be a positive integer',
|
||||||
|
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
validation: {
|
||||||
|
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
lockEnabled: 'Distributed Lock Enabled',
|
||||||
|
lockKey: 'Distributed Lock Key',
|
||||||
|
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
|
||||||
|
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
|
||||||
|
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
|
||||||
|
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid settings',
|
||||||
|
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
|
||||||
|
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
|
||||||
|
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
|
||||||
|
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
|
||||||
|
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
email: {
|
||||||
|
title: 'Email Notification',
|
||||||
|
description: 'Configure alert/report email notifications (stored in database).',
|
||||||
|
loading: 'Loading...',
|
||||||
|
noData: 'No email notification config',
|
||||||
|
loadFailed: 'Failed to load email notification config',
|
||||||
|
saveSuccess: 'Email notification config saved',
|
||||||
|
saveFailed: 'Failed to save email notification config',
|
||||||
|
alertTitle: 'Alert Emails',
|
||||||
|
reportTitle: 'Report Emails',
|
||||||
|
recipients: 'Recipients',
|
||||||
|
recipientsHint: 'If empty, the system may fallback to the first admin email.',
|
||||||
|
minSeverity: 'Min Severity',
|
||||||
|
minSeverityAll: 'All severities',
|
||||||
|
rateLimitPerHour: 'Rate limit per hour',
|
||||||
|
batchWindowSeconds: 'Batch window (seconds)',
|
||||||
|
includeResolved: 'Include resolved alerts',
|
||||||
|
dailySummary: 'Daily summary',
|
||||||
|
weeklySummary: 'Weekly summary',
|
||||||
|
errorDigest: 'Error digest',
|
||||||
|
errorDigestMinCount: 'Min errors for digest',
|
||||||
|
accountHealth: 'Account health',
|
||||||
|
accountHealthThreshold: 'Error rate threshold (%)',
|
||||||
|
cronPlaceholder: 'Cron expression',
|
||||||
|
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
|
||||||
|
validation: {
|
||||||
|
title: 'Please fix the following issues',
|
||||||
|
invalid: 'Invalid email notification config',
|
||||||
|
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
|
||||||
|
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
|
||||||
|
invalidRecipients: 'One or more recipient emails are invalid',
|
||||||
|
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
|
||||||
|
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
|
||||||
|
cronRequired: 'A cron expression is required when schedule is enabled',
|
||||||
|
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
|
||||||
|
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
|
||||||
|
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
concurrency: {
|
||||||
|
title: 'Concurrency / Queue',
|
||||||
|
byPlatform: 'By Platform',
|
||||||
|
byGroup: 'By Group',
|
||||||
|
byAccount: 'By Account',
|
||||||
|
totalRows: '{count} rows',
|
||||||
|
disabledHint: 'Realtime monitoring is disabled in settings.',
|
||||||
|
empty: 'No data',
|
||||||
|
queued: 'Queue {count}',
|
||||||
|
rateLimited: 'Rate-limited {count}',
|
||||||
|
errorAccounts: 'Errors {count}',
|
||||||
|
loadFailed: 'Failed to load concurrency data'
|
||||||
|
},
|
||||||
|
realtime: {
|
||||||
|
title: 'Realtime',
|
||||||
|
connected: 'Realtime connected',
|
||||||
|
connecting: 'Realtime connecting',
|
||||||
|
reconnecting: 'Realtime reconnecting',
|
||||||
|
offline: 'Realtime offline',
|
||||||
|
closed: 'Realtime closed',
|
||||||
|
reconnectIn: 'retry in {seconds}s'
|
||||||
|
},
|
||||||
|
queryMode: {
|
||||||
|
auto: 'Auto',
|
||||||
|
raw: 'Raw',
|
||||||
|
preagg: 'Preagg'
|
||||||
|
},
|
||||||
|
accountAvailability: {
|
||||||
|
available: 'Available',
|
||||||
|
unavailable: 'Unavailable',
|
||||||
|
accountError: 'Error'
|
||||||
|
},
|
||||||
|
tooltips: {
|
||||||
|
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
|
||||||
|
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
|
||||||
|
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
|
||||||
|
errorDistribution: 'Error distribution by status code.',
|
||||||
|
goroutines:
|
||||||
|
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
|
||||||
|
cpu: 'CPU usage percentage, showing system processor load.',
|
||||||
|
memory: 'Memory usage, including used and total available memory.',
|
||||||
|
db: 'Database connection pool status, including active, idle, and waiting connections.',
|
||||||
|
redis: 'Redis connection pool status, showing active and idle connections.',
|
||||||
|
jobs: 'Background job execution status, including last run time, success time, and error information.',
|
||||||
|
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
|
||||||
|
tokens: 'Total number of tokens processed in the current time window.',
|
||||||
|
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
|
||||||
|
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
|
||||||
|
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
|
||||||
|
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
|
||||||
|
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
|
||||||
|
},
|
||||||
|
charts: {
|
||||||
|
emptyRequest: 'No requests in this window.',
|
||||||
|
emptyError: 'No errors in this window.',
|
||||||
|
resetZoom: 'Reset',
|
||||||
|
resetZoomHint: 'Reset zoom (if enabled)',
|
||||||
|
downloadChart: 'Download',
|
||||||
|
downloadChartHint: 'Download chart as image'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
// Settings
|
// Settings
|
||||||
settings: {
|
settings: {
|
||||||
title: 'System Settings',
|
title: 'System Settings',
|
||||||
@@ -1940,6 +2473,22 @@ export default {
|
|||||||
sending: 'Sending...',
|
sending: 'Sending...',
|
||||||
enterRecipientHint: 'Please enter a recipient email address'
|
enterRecipientHint: 'Please enter a recipient email address'
|
||||||
},
|
},
|
||||||
|
opsMonitoring: {
|
||||||
|
title: 'Ops Monitoring',
|
||||||
|
description: 'Enable ops monitoring for troubleshooting and health visibility',
|
||||||
|
disabled: 'Ops monitoring is disabled',
|
||||||
|
enabled: 'Enable Ops Monitoring',
|
||||||
|
enabledHint: 'Enable the ops monitoring module (admin only)',
|
||||||
|
realtimeEnabled: 'Enable Realtime Monitoring',
|
||||||
|
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
|
||||||
|
queryMode: 'Default Query Mode',
|
||||||
|
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
|
||||||
|
queryModeAuto: 'Auto (recommended)',
|
||||||
|
queryModeRaw: 'Raw (most accurate, slower)',
|
||||||
|
queryModePreagg: 'Preagg (fastest, requires aggregation)',
|
||||||
|
metricsInterval: 'Metrics Collection Interval (seconds)',
|
||||||
|
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
|
||||||
|
},
|
||||||
adminApiKey: {
|
adminApiKey: {
|
||||||
title: 'Admin API Key',
|
title: 'Admin API Key',
|
||||||
description: 'Global API key for external system integration with full admin access',
|
description: 'Global API key for external system integration with full admin access',
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user