From 5baa8b5673e77a5ebb3453094c8b13bef83e71f6 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:53:44 +0800
Subject: [PATCH] feat(service): implement ops monitoring business logic layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add ops main service (ops_service.go) and port definitions (ops_port.go)
- Implement account availability check service (ops_account_availability.go)
- Implement data aggregation service (ops_aggregation_service.go)
- Implement alert evaluator service (ops_alert_evaluator_service.go)
- Implement alert management service (ops_alerts.go)
- Implement data cleanup service (ops_cleanup_service.go)
- Implement concurrency control service (ops_concurrency.go)
- Implement dashboard service (ops_dashboard.go)
- Implement error handling service (ops_errors.go)
- Implement histogram service (ops_histograms.go)
- Implement metrics collector service (ops_metrics_collector.go)
- Implement query mode service (ops_query_mode.go)
- Implement realtime monitoring service (ops_realtime.go)
- Implement request details service (ops_request_details.go)
- Implement retry mechanism service (ops_retry.go)
- Implement settings management service (ops_settings.go)
- Implement trend analysis service (ops_trends.go)
- Implement window stats service (ops_window_stats.go)
- Add ops-related domain constants
- Register service dependency injection
---
 backend/internal/service/domain_constants.go | 22 +
 .../service/ops_account_availability.go | 157 ++++
 .../service/ops_aggregation_service.go | 434 +++++++++
 .../service/ops_alert_evaluator_service.go | 839 +++++++++++++++++
 backend/internal/service/ops_alerts.go | 162 ++++
 .../internal/service/ops_cleanup_service.go | 361 ++++++++
 backend/internal/service/ops_concurrency.go | 257 ++++++
 backend/internal/service/ops_dashboard.go | 77 ++
 backend/internal/service/ops_errors.go | 45 +
 backend/internal/service/ops_histograms.go | 26 +
 .../internal/service/ops_metrics_collector.go | 861 ++++++++++++++++++
 backend/internal/service/ops_port.go | 226 +++++
 backend/internal/service/ops_query_mode.go | 40 +
 backend/internal/service/ops_realtime.go | 36 +
 .../internal/service/ops_request_details.go | 152 ++++
 backend/internal/service/ops_retry.go | 635 +++++++++++++
 backend/internal/service/ops_service.go | 451 +++++++++
 backend/internal/service/ops_settings.go | 354 +++++++
 backend/internal/service/ops_trends.go | 27 +
 backend/internal/service/ops_window_stats.go | 24 +
 backend/internal/service/wire.go | 58 ++
 21 files changed, 5244 insertions(+)
 create mode 100644 backend/internal/service/ops_account_availability.go
 create mode 100644 backend/internal/service/ops_aggregation_service.go
 create mode 100644 backend/internal/service/ops_alert_evaluator_service.go
 create mode 100644 backend/internal/service/ops_alerts.go
 create mode 100644 backend/internal/service/ops_cleanup_service.go
 create mode 100644 backend/internal/service/ops_concurrency.go
 create mode 100644 backend/internal/service/ops_dashboard.go
 create mode 100644 backend/internal/service/ops_errors.go
 create mode 100644 backend/internal/service/ops_histograms.go
 create mode 100644 backend/internal/service/ops_metrics_collector.go
 create mode 100644 backend/internal/service/ops_port.go
 create mode 100644 backend/internal/service/ops_query_mode.go
 create mode 100644 backend/internal/service/ops_realtime.go
 create mode 100644 backend/internal/service/ops_request_details.go
 create mode 100644 backend/internal/service/ops_retry.go
 create mode 100644 backend/internal/service/ops_service.go
 create mode 100644 backend/internal/service/ops_settings.go
 create mode 100644 backend/internal/service/ops_trends.go
 create mode 100644 backend/internal/service/ops_window_stats.go

diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go
index 9c61ea2e..04f80dbe 100644
--- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -105,6 +105,28 @@ const ( // Request identity patch (Claude -> Gemini systemInstruction injection) SettingKeyEnableIdentityPatch = "enable_identity_patch" SettingKeyIdentityPatchPrompt = "identity_patch_prompt" + + // ========================= + // Ops Monitoring (vNext) + // ========================= + + // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime. + SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled" + + // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push). + SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled" + + // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg). + SettingKeyOpsQueryModeDefault = "ops_query_mode_default" + + // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications. + SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config" + + // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings. + SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings" + + // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60). + SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds" ) // AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys). diff --git a/backend/internal/service/ops_account_availability.go b/backend/internal/service/ops_account_availability.go new file mode 100644 index 00000000..d0cbbe5c --- /dev/null +++ b/backend/internal/service/ops_account_availability.go @@ -0,0 +1,157 @@ +package service + +import ( + "context" + "time" +) + +// GetAccountAvailabilityStats returns current account availability stats. +// +// Query-level filtering is intentionally limited to platform/group to match the dashboard scope. +func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) ( + map[string]*PlatformAvailability, + map[int64]*GroupAvailability, + map[int64]*AccountAvailability, + *time.Time, + error, +) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + if groupIDFilter != nil && *groupIDFilter > 0 { + filtered := make([]Account, 0, len(accounts)) + for _, acc := range accounts { + for _, grp := range acc.Groups { + if grp != nil && grp.ID == *groupIDFilter { + filtered = append(filtered, acc) + break + } + } + } + accounts = filtered + } + + now := time.Now() + collectedAt := now + + platform := make(map[string]*PlatformAvailability) + group := make(map[int64]*GroupAvailability) + account := make(map[int64]*AccountAvailability) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + isTempUnsched := false + if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) { + isTempUnsched = true + } + + isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt) + isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil) + hasError := acc.Status == StatusError + + // Normalize exclusive status flags so the UI doesn't show conflicting badges. 
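+ // e.g. an account in StatusError is only counted under ErrorCount, even if
+ // its rate-limit or overload window has not expired yet.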
+ if hasError { + isRateLimited = false + isOverloaded = false + } + + isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched + + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformAvailability{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.TotalAccounts++ + if isAvailable { + p.AvailableCount++ + } + if isRateLimited { + p.RateLimitCount++ + } + if hasError { + p.ErrorCount++ + } + } + + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupAvailability{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + g.TotalAccounts++ + if isAvailable { + g.AvailableCount++ + } + if isRateLimited { + g.RateLimitCount++ + } + if hasError { + g.ErrorCount++ + } + } + + displayGroupID := int64(0) + displayGroupName := "" + if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + item := &AccountAvailability{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + Status: acc.Status, + + IsAvailable: isAvailable, + IsRateLimited: isRateLimited, + IsOverloaded: isOverloaded, + HasError: hasError, + + ErrorMessage: acc.ErrorMessage, + } + + if isRateLimited && acc.RateLimitResetAt != nil { + item.RateLimitResetAt = acc.RateLimitResetAt + remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds()) + if remainingSec > 0 { + item.RateLimitRemainingSec = &remainingSec + } + } + if isOverloaded && acc.OverloadUntil != nil { + item.OverloadUntil = acc.OverloadUntil + remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds()) + if remainingSec > 0 { + item.OverloadRemainingSec = &remainingSec + } + } + if isTempUnsched && acc.TempUnschedulableUntil != nil { + item.TempUnschedulableUntil = acc.TempUnschedulableUntil + } + + account[acc.ID] = item + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go new file mode 100644 index 00000000..04dbb11b --- /dev/null +++ b/backend/internal/service/ops_aggregation_service.go @@ -0,0 +1,434 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAggHourlyJobName = "ops_preaggregation_hourly" + opsAggDailyJobName = "ops_preaggregation_daily" + + opsAggHourlyInterval = 10 * time.Minute + opsAggDailyInterval = 1 * time.Hour + + // Keep in sync with ops retention target (vNext default 30d). + opsAggBackfillWindow = 30 * 24 * time.Hour + + // Recompute overlap to absorb late-arriving rows near boundaries. + opsAggHourlyOverlap = 2 * time.Hour + opsAggDailyOverlap = 48 * time.Hour + + opsAggHourlyChunk = 24 * time.Hour + opsAggDailyChunk = 7 * 24 * time.Hour + + // Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets + // that may still receive late inserts. 
+ opsAggSafeDelay = 5 * time.Minute + + opsAggMaxQueryTimeout = 3 * time.Second + opsAggHourlyTimeout = 5 * time.Minute + opsAggDailyTimeout = 2 * time.Minute + + opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader" + opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader" + + opsAggHourlyLeaderLockTTL = 15 * time.Minute + opsAggDailyLeaderLockTTL = 10 * time.Minute +) + +// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily +// for stable long-window dashboard queries. +// +// It is safe to run in multi-replica deployments when Redis is available (leader lock). +type OpsAggregationService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + hourlyMu sync.Mutex + dailyMu sync.Mutex + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + return &OpsAggregationService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (s *OpsAggregationService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.hourlyLoop() + go s.dailyLoop() + }) +} + +func (s *OpsAggregationService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) +} + +func (s *OpsAggregationService) hourlyLoop() { + // First run immediately. + s.aggregateHourly() + + ticker := time.NewTicker(opsAggHourlyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateHourly() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) dailyLoop() { + // First run immediately. + s.aggregateDaily() + + ticker := time.NewTicker(opsAggDailyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateDaily() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) aggregateHourly() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.hourlyMu.Lock() + defer s.hourlyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + // Aggregate stable full hours only. + end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay)) + start := end.Add(-opsAggBackfillWindow) + + // Resume from the latest bucket with overlap. 
+ { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggHourlyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToHour(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) { + chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end) + if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) aggregateDaily() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.dailyMu.Lock() + defer s.dailyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + end := utcFloorToDay(time.Now().UTC()) + start := end.Add(-opsAggBackfillWindow) + + { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggDailyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToDay(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) { + chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end) + if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := 
context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool { + if s == nil { + return false + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +var opsAggReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { + if s == nil || s.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Fail-open: do not block single-instance deployments. + return nil, true + } + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true +} + +func (s *OpsAggregationService) maybeLogSkip(prefix string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute { + return + } + s.skipLogAt = now + if prefix == "" { + prefix = "[OpsAggregation]" + } + log.Printf("%s leader lock held by another instance; skipping", prefix) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func utcFloorToDay(t time.Time) time.Time { + u := t.UTC() + y, m, d := u.Date() + return time.Date(y, m, d, 0, 0, 0, 0, time.UTC) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go new file mode 100644 index 00000000..b970c720 --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -0,0 +1,839 @@ +package service + +import ( + "context" + "fmt" + "log" + "math" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAlertEvaluatorJobName = "ops_alert_evaluator" + + opsAlertEvaluatorTimeout = 45 * time.Second + opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTL = 90 * time.Second + opsAlertEvaluatorSkipLogInterval = 1 * time.Minute +) + +var opsAlertEvaluatorReleaseScript 
= redis.NewScript(`
+if redis.call("GET", KEYS[1]) == ARGV[1] then
+ return redis.call("DEL", KEYS[1])
+end
+return 0
+`)

+type OpsAlertEvaluatorService struct {
+ opsService *OpsService
+ opsRepo OpsRepository
+ emailService *EmailService
+
+ redisClient *redis.Client
+ cfg *config.Config
+ instanceID string
+
+ stopCh chan struct{}
+ startOnce sync.Once
+ stopOnce sync.Once
+ wg sync.WaitGroup
+
+ mu sync.Mutex
+ ruleStates map[int64]*opsAlertRuleState
+
+ emailLimiter *slidingWindowLimiter
+
+ skipLogMu sync.Mutex
+ skipLogAt time.Time
+
+ warnNoRedisOnce sync.Once
+}
+
+type opsAlertRuleState struct {
+ LastEvaluatedAt time.Time
+ ConsecutiveBreaches int
+}
+
+func NewOpsAlertEvaluatorService(
+ opsService *OpsService,
+ opsRepo OpsRepository,
+ emailService *EmailService,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsAlertEvaluatorService {
+ return &OpsAlertEvaluatorService{
+ opsService: opsService,
+ opsRepo: opsRepo,
+ emailService: emailService,
+ redisClient: redisClient,
+ cfg: cfg,
+ instanceID: uuid.NewString(),
+ ruleStates: map[int64]*opsAlertRuleState{},
+ emailLimiter: newSlidingWindowLimiter(0, time.Hour),
+ }
+}
+
+func (s *OpsAlertEvaluatorService) Start() {
+ if s == nil {
+ return
+ }
+ s.startOnce.Do(func() {
+ if s.stopCh == nil {
+ s.stopCh = make(chan struct{})
+ }
+ // Count the worker before the goroutine starts so Stop's Wait cannot race it.
+ s.wg.Add(1)
+ go s.run()
+ })
+}
+
+func (s *OpsAlertEvaluatorService) Stop() {
+ if s == nil {
+ return
+ }
+ s.stopOnce.Do(func() {
+ if s.stopCh != nil {
+ close(s.stopCh)
+ }
+ })
+ s.wg.Wait()
+}
+
+func (s *OpsAlertEvaluatorService) run() {
+ defer s.wg.Done()
+
+ // Start immediately to produce early feedback in ops dashboard.
+ timer := time.NewTimer(0)
+ defer timer.Stop()
+
+ for {
+ select {
+ case <-timer.C:
+ interval := s.getInterval()
+ s.evaluateOnce(interval)
+ timer.Reset(interval)
+ case <-s.stopCh:
+ return
+ }
+ }
+}
+
+func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
+ // Default.
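+ // 60s unless runtime settings provide a value in (0s, 24h]; anything
+ // outside that range falls back to this default.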
+ interval := 60 * time.Second + + if s == nil || s.opsService == nil { + return interval + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx) + if err != nil || cfg == nil { + return interval + } + if cfg.EvaluationIntervalSeconds <= 0 { + return interval + } + if cfg.EvaluationIntervalSeconds < 1 { + return interval + } + if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) { + return interval + } + return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second +} + +func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout) + defer cancel() + + if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + runtimeCfg := defaultOpsAlertRuntimeSettings() + if s.opsService != nil { + if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil { + runtimeCfg = loaded + } + } + + release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + rules, err := s.opsRepo.ListAlertRules(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsAlertEvaluator] list rules failed: %v", err) + return + } + + now := time.Now().UTC() + safeEnd := now.Truncate(time.Minute) + if safeEnd.IsZero() { + safeEnd = now + } + + systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1) + + // Cleanup stale state for removed rules. 
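+ // Without pruning, ruleStates would grow without bound as rules are
+ // created and deleted over the process lifetime.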
+ s.pruneRuleStates(rules) + + for _, rule := range rules { + if rule == nil || !rule.Enabled || rule.ID <= 0 { + continue + } + + scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters) + + windowMinutes := rule.WindowMinutes + if windowMinutes <= 0 { + windowMinutes = 1 + } + windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute) + windowEnd := safeEnd + + metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID) + if !ok { + s.resetRuleState(rule.ID, now) + continue + } + + breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold) + required := requiredSustainedBreaches(rule.SustainedMinutes, interval) + consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow) + + activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err) + continue + } + + if breachedNow && consecutive >= required { + if activeEvent != nil { + continue + } + + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) + continue + } + if latestEvent != nil && rule.CooldownMinutes > 0 { + cooldown := time.Duration(rule.CooldownMinutes) * time.Minute + if now.Sub(latestEvent.FiredAt) < cooldown { + continue + } + } + + firedEvent := &OpsAlertEvent{ + RuleID: rule.ID, + Severity: strings.TrimSpace(rule.Severity), + Status: OpsAlertStatusFiring, + Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)), + Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID), + MetricValue: float64Ptr(metricValue), + ThresholdValue: float64Ptr(rule.Threshold), + Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID), + FiredAt: now, + CreatedAt: now, + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent) + if err != nil { + log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err) + continue + } + + if created != nil && created.ID > 0 { + s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) + } + continue + } + + // Not breached: resolve active event if present. 
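+ // Recovery is intentionally immediate: one healthy evaluation resolves the
+ // event, whereas firing requires `required` consecutive breaches.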
+ if activeEvent != nil { + resolvedAt := now + if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { + log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err) + } + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) { + s.mu.Lock() + defer s.mu.Unlock() + + live := map[int64]struct{}{} + for _, r := range rules { + if r != nil && r.ID > 0 { + live[r.ID] = struct{}{} + } + } + for id := range s.ruleStates { + if _, ok := live[id]; !ok { + delete(s.ruleStates, id) + } + } +} + +func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) { + if ruleID <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + state.LastEvaluatedAt = now + state.ConsecutiveBreaches = 0 +} + +func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int { + if ruleID <= 0 { + return 0 + } + s.mu.Lock() + defer s.mu.Unlock() + + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + + if !state.LastEvaluatedAt.IsZero() && interval > 0 { + if now.Sub(state.LastEvaluatedAt) > interval*2 { + state.ConsecutiveBreaches = 0 + } + } + + state.LastEvaluatedAt = now + if breached { + state.ConsecutiveBreaches++ + } else { + state.ConsecutiveBreaches = 0 + } + return state.ConsecutiveBreaches +} + +func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int { + if sustainedMinutes <= 0 { + return 1 + } + if interval <= 0 { + return sustainedMinutes + } + required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds())) + if required < 1 { + return 1 + } + return required +} + +func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) { + if filters == nil { + return "", nil + } + if v, ok := filters["platform"]; ok { + if s, ok := v.(string); ok { + platform = strings.TrimSpace(s) + } + } + if v, ok := filters["group_id"]; ok { + switch t := v.(type) { + case float64: + if t > 0 { + id := int64(t) + groupID = &id + } + case int64: + if t > 0 { + id := t + groupID = &id + } + case int: + if t > 0 { + id := int64(t) + groupID = &id + } + case string: + n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64) + if err == nil && n > 0 { + groupID = &n + } + } + } + return platform, groupID +} + +func (s *OpsAlertEvaluatorService) computeRuleMetric( + ctx context.Context, + rule *OpsAlertRule, + systemMetrics *OpsSystemMetricsSnapshot, + start time.Time, + end time.Time, + platform string, + groupID *int64, +) (float64, bool) { + if rule == nil { + return 0, false + } + switch strings.TrimSpace(rule.MetricType) { + case "cpu_usage_percent": + if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil { + return *systemMetrics.CPUUsagePercent, true + } + return 0, false + case "memory_usage_percent": + if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil { + return *systemMetrics.MemoryUsagePercent, true + } + return 0, false + case "concurrency_queue_depth": + if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil { + return float64(*systemMetrics.ConcurrencyQueueDepth), true + } + return 0, false + } + + overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, 
+ EndTime: end, + Platform: platform, + GroupID: groupID, + QueryMode: OpsQueryModeRaw, + }) + if err != nil { + return 0, false + } + if overview == nil { + return 0, false + } + + switch strings.TrimSpace(rule.MetricType) { + case "success_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.SLA * 100, true + case "error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.ErrorRate * 100, true + case "upstream_error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.UpstreamErrorRate * 100, true + case "p95_latency_ms": + if overview.Duration.P95 == nil { + return 0, false + } + return float64(*overview.Duration.P95), true + case "p99_latency_ms": + if overview.Duration.P99 == nil { + return 0, false + } + return float64(*overview.Duration.P99), true + default: + return 0, false + } +} + +func compareMetric(value float64, operator string, threshold float64) bool { + switch strings.TrimSpace(operator) { + case ">": + return value > threshold + case ">=": + return value >= threshold + case "<": + return value < threshold + case "<=": + return value <= threshold + case "==": + return value == threshold + case "!=": + return value != threshold + default: + return false + } +} + +func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any { + dims := map[string]any{} + if strings.TrimSpace(platform) != "" { + dims["platform"] = strings.TrimSpace(platform) + } + if groupID != nil && *groupID > 0 { + dims["group_id"] = *groupID + } + if len(dims) == 0 { + return nil + } + return dims +} + +func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string { + if rule == nil { + return "" + } + scope := "overall" + if strings.TrimSpace(platform) != "" { + scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform)) + } + if groupID != nil && *groupID > 0 { + scope = fmt.Sprintf("%s group_id=%d", scope, *groupID) + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)", + strings.TrimSpace(rule.MetricType), + strings.TrimSpace(rule.Operator), + rule.Threshold, + value, + windowMinutes, + strings.TrimSpace(scope), + ) +} + +func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) { + if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil { + return + } + if event.EmailSent { + return + } + if !rule.NotifyEmail { + return + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled { + return + } + + if len(emailCfg.Alert.Recipients) == 0 { + return + } + if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) { + return + } + + if runtimeCfg != nil && runtimeCfg.Silencing.Enabled { + if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) { + return + } + } + + // Apply/update rate limiter. 
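+ // The hourly budget is shared across all recipients (one Allow per address);
+ // RateLimitPerHour <= 0 disables throttling.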
+ s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)
+
+ subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
+ body := buildOpsAlertEmailBody(rule, event)
+
+ anySent := false
+ for _, to := range emailCfg.Alert.Recipients {
+ addr := strings.TrimSpace(to)
+ if addr == "" {
+ continue
+ }
+ if !s.emailLimiter.Allow(time.Now().UTC()) {
+ continue
+ }
+ if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
+ // Ignore per-recipient failures; continue best-effort.
+ continue
+ }
+ anySent = true
+ }
+
+ if anySent {
+ _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
+ }
+}
+
+func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
+ if rule == nil || event == nil {
+ return ""
+ }
+ metric := strings.TrimSpace(rule.MetricType)
+ value := "-"
+ threshold := fmt.Sprintf("%.2f", rule.Threshold)
+ if event.MetricValue != nil {
+ value = fmt.Sprintf("%.2f", *event.MetricValue)
+ }
+ if event.ThresholdValue != nil {
+ threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
+ }
+ return fmt.Sprintf(`
+<h2>Ops Alert</h2>
+<p>Rule: %s</p>
+<p>Severity: %s</p>
+<p>Status: %s</p>
+<p>Metric: %s %s %s</p>
+<p>Fired at: %s</p>
+<p>Description: %s</p>
+`, + htmlEscape(rule.Name), + htmlEscape(rule.Severity), + htmlEscape(event.Status), + htmlEscape(metric), + htmlEscape(rule.Operator), + htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)), + event.FiredAt.Format(time.RFC3339), + htmlEscape(event.Description), + ) +} + +func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool { + minSeverity = strings.ToLower(strings.TrimSpace(minSeverity)) + if minSeverity == "" { + return true + } + + eventLevel := opsEmailSeverityForOps(ruleSeverity) + minLevel := strings.ToLower(minSeverity) + + rank := func(level string) int { + switch level { + case "critical": + return 3 + case "warning": + return 2 + case "info": + return 1 + default: + return 0 + } + } + return rank(eventLevel) >= rank(minLevel) +} + +func opsEmailSeverityForOps(severity string) string { + switch strings.ToUpper(strings.TrimSpace(severity)) { + case "P0": + return "critical" + case "P1": + return "warning" + default: + return "info" + } +} + +func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool { + if !silencing.Enabled { + return false + } + if now.IsZero() { + now = time.Now().UTC() + } + if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" { + if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil { + if now.Before(t) { + return true + } + } + } + + for _, entry := range silencing.Entries { + untilRaw := strings.TrimSpace(entry.UntilRFC3339) + if untilRaw == "" { + continue + } + until, err := time.Parse(time.RFC3339, untilRaw) + if err != nil { + continue + } + if now.After(until) { + continue + } + if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID { + continue + } + if len(entry.Severities) > 0 { + match := false + for _, s := range entry.Severities { + if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) { + match = true + break + } + } + if !match { + continue + } + } + return true + } + + return false +} + +func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) { + if !lock.Enabled { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock") + }) + return nil, true + } + key := strings.TrimSpace(lock.Key) + if key == "" { + key = opsAlertEvaluatorLeaderLockKey + } + ttl := time.Duration(lock.TTLSeconds) * time.Second + if ttl <= 0 { + ttl = opsAlertEvaluatorLeaderLockTTL + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Fail-open for single-node environments, but warn. 
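+ // A single sync.Once guards both the nil-client and SetNX-error paths,
+ // so a flapping Redis connection logs this warning at most once per process.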
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
+ })
+ return nil, true
+ }
+ if !ok {
+ s.maybeLogSkip(key)
+ return nil, false
+ }
+ return func() {
+ _, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
+ }, true
+}
+
+func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
+ s.skipLogMu.Lock()
+ defer s.skipLogMu.Unlock()
+
+ now := time.Now()
+ if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
+ return
+ }
+ s.skipLogAt = now
+ log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAlertEvaluatorJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &now,
+ LastDurationMs: &durMs,
+ })
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
+ if s == nil || s.opsRepo == nil || err == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ msg := truncateString(err.Error(), 2048)
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAlertEvaluatorJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &now,
+ LastError: &msg,
+ LastDurationMs: &durMs,
+ })
+}
+
+func htmlEscape(s string) string {
+ replacer := strings.NewReplacer(
+ "&", "&amp;",
+ "<", "&lt;",
+ ">", "&gt;",
+ `"`, "&quot;",
+ "'", "&#39;",
+ )
+ return replacer.Replace(s)
+}
+
+type slidingWindowLimiter struct {
+ mu sync.Mutex
+ limit int
+ window time.Duration
+ sent []time.Time
+}
+
+func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
+ if window <= 0 {
+ window = time.Hour
+ }
+ return &slidingWindowLimiter{
+ limit: limit,
+ window: window,
+ sent: []time.Time{},
+ }
+}
+
+func (l *slidingWindowLimiter) SetLimit(limit int) {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+ l.limit = limit
+}
+
+func (l *slidingWindowLimiter) Allow(now time.Time) bool {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ if l.limit <= 0 {
+ return true
+ }
+ cutoff := now.Add(-l.window)
+ keep := l.sent[:0]
+ for _, t := range l.sent {
+ if t.After(cutoff) {
+ keep = append(keep, t)
+ }
+ }
+ l.sent = keep
+ if len(l.sent) >= l.limit {
+ return false
+ }
+ l.sent = append(l.sent, now)
+ return true
+}
diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go
new file mode 100644
index 00000000..b6c3d1c3
--- /dev/null
+++ b/backend/internal/service/ops_alerts.go
@@ -0,0 +1,162 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "strings"
+ "time"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return []*OpsAlertRule{}, nil
+ }
+ return s.opsRepo.ListAlertRules(ctx)
+}
+
+func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
+ if err :=
s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + created, err := s.opsRepo.CreateAlertRule(ctx, rule) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil || rule.ID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + updated, err := s.opsRepo.UpdateAlertRule(ctx, rule) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return nil, err + } + return updated, nil +} + +func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if id <= 0 { + return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return err + } + return nil +} + +func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertEvent{}, nil + } + return s.opsRepo.ListAlertEvents(ctx, filter) +} + +func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) +} + +func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetLatestAlertEvent(ctx, ruleID) +} + +func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if event == nil { + return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event") + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, event) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, 
resolvedAt *time.Time) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + if strings.TrimSpace(status) == "" { + return infraerrors.BadRequest("INVALID_STATUS", "invalid status") + } + return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) +} + +func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent) +} diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go new file mode 100644 index 00000000..ef825c04 --- /dev/null +++ b/backend/internal/service/ops_cleanup_service.go @@ -0,0 +1,361 @@ +package service + +import ( + "context" + "database/sql" + "fmt" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsCleanupJobName = "ops_cleanup" + + opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader" + opsCleanupLeaderLockTTLDefault = 30 * time.Minute +) + +var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsCleanupReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth. +// +// - Scheduling: 5-field cron spec (minute hour dom month dow). +// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup. +// - Safety: deletes in batches to avoid long transactions. 
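+//
+// With the default schedule "0 2 * * *", cleanup runs once per day at 02:00
+// in the configured timezone (cfg.Timezone, falling back to time.Local).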
+type OpsCleanupService struct { + opsRepo OpsRepository + db *sql.DB + redisClient *redis.Client + cfg *config.Config + + instanceID string + + cron *cron.Cron + entryID cron.EntryID + + startOnce sync.Once + stopOnce sync.Once + + warnNoRedisOnce sync.Once +} + +func NewOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + return &OpsCleanupService{ + opsRepo: opsRepo, + db: db, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + } +} + +func (s *OpsCleanupService) Start() { + if s == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled { + log.Printf("[OpsCleanup] not started (disabled)") + return + } + if s.opsRepo == nil || s.db == nil { + log.Printf("[OpsCleanup] not started (missing deps)") + return + } + + s.startOnce.Do(func() { + schedule := "0 2 * * *" + if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" { + schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) + } + + loc := time.Local + if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil { + loc = parsed + } + } + + c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc)) + id, err := c.AddFunc(schedule, func() { s.runScheduled() }) + if err != nil { + log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err) + return + } + s.cron = c + s.entryID = id + s.cron.Start() + log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String()) + }) +} + +func (s *OpsCleanupService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.cron != nil { + ctx := s.cron.Stop() + select { + case <-ctx.Done(): + case <-time.After(3 * time.Second): + log.Printf("[OpsCleanup] cron stop timed out") + } + } + }) +} + +func (s *OpsCleanupService) runScheduled() { + if s == nil || s.db == nil || s.opsRepo == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + counts, err := s.runCleanupOnce(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsCleanup] cleanup failed: %v", err) + return + } + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) + log.Printf("[OpsCleanup] cleanup complete: %s", counts) +} + +type opsCleanupDeletedCounts struct { + errorLogs int64 + retryAttempts int64 + alertEvents int64 + systemMetrics int64 + hourlyPreagg int64 + dailyPreagg int64 +} + +func (c opsCleanupDeletedCounts) String() string { + return fmt.Sprintf( + "error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d", + c.errorLogs, + c.retryAttempts, + c.alertEvents, + c.systemMetrics, + c.hourlyPreagg, + c.dailyPreagg, + ) +} + +func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) { + out := opsCleanupDeletedCounts{} + if s == nil || s.db == nil || s.cfg == nil { + return out, nil + } + + batchSize := 5000 + + now := time.Now().UTC() + + // Error-like tables: error logs / retry attempts / alert events. 
+ if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.errorLogs = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.retryAttempts = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.alertEvents = n + } + + // Minute-level metrics snapshots. + if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.systemMetrics = n + } + + // Pre-aggregation tables (hourly/daily). + if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.hourlyPreagg = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true) + if err != nil { + return out, err + } + out.dailyPreagg = n + } + + return out, nil +} + +func deleteOldRowsByID( + ctx context.Context, + db *sql.DB, + table string, + timeColumn string, + cutoff time.Time, + batchSize int, + castCutoffToDate bool, +) (int64, error) { + if db == nil { + return 0, nil + } + if batchSize <= 0 { + batchSize = 5000 + } + + where := fmt.Sprintf("%s < $1", timeColumn) + if castCutoffToDate { + where = fmt.Sprintf("%s < $1::date", timeColumn) + } + + q := fmt.Sprintf(` +WITH batch AS ( + SELECT id FROM %s + WHERE %s + ORDER BY id + LIMIT $2 +) +DELETE FROM %s +WHERE id IN (SELECT id FROM batch) +`, table, where, table) + + var total int64 + for { + res, err := db.ExecContext(ctx, q, cutoff, batchSize) + if err != nil { + // If ops tables aren't present yet (partial deployments), treat as no-op. + if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") { + return total, nil + } + return total, err + } + affected, err := res.RowsAffected() + if err != nil { + return total, err + } + total += affected + if affected == 0 { + break + } + } + return total, nil +} + +func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil { + return nil, false + } + // In simple run mode, assume single instance. 
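+ // (A RunModeSimple deployment is expected to run exactly one process, so
+ // skipping the Redis lock cannot cause duplicate cleanup runs.)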
+ if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple { + return nil, true + } + + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; running without distributed lock") + }) + return nil, true + } + + key := opsCleanupLeaderLockKeyDefault + ttl := opsCleanupLeaderLockTTLDefault + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) + }) + return nil, true + } + if !ok { + return nil, false + } + + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} diff --git a/backend/internal/service/ops_concurrency.go b/backend/internal/service/ops_concurrency.go new file mode 100644 index 00000000..c3b7b853 --- /dev/null +++ b/backend/internal/service/ops_concurrency.go @@ -0,0 +1,257 @@ +package service + +import ( + "context" + "log" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/pagination" +) + +const ( + opsAccountsPageSize = 100 + opsConcurrencyBatchChunkSize = 200 +) + +func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) { + if s == nil || s.accountRepo == nil { + return []Account{}, nil + } + + out := make([]Account, 0, 128) + page := 1 + for { + accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{ + Page: page, + PageSize: opsAccountsPageSize, + }, platformFilter, "", "", "") + if err != nil { + return nil, err + } + if len(accounts) == 0 { + break + } + + out = append(out, accounts...) + if pageInfo != nil && int64(len(out)) >= pageInfo.Total { + break + } + if len(accounts) < opsAccountsPageSize { + break + } + + page++ + if page > 10_000 { + log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter) + break + } + } + + return out, nil +} + +func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo { + if s == nil || s.concurrencyService == nil { + return map[int64]*AccountLoadInfo{} + } + if len(accounts) == 0 { + return map[int64]*AccountLoadInfo{} + } + + // De-duplicate IDs (and keep the max concurrency to avoid under-reporting). 
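+ // e.g. if paging returns the same account twice with Concurrency 5 and 8,
+ // it is counted once with MaxConcurrency 8.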
+ unique := make(map[int64]int, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev { + unique[acc.ID] = acc.Concurrency + } + } + + batch := make([]AccountWithConcurrency, 0, len(unique)) + for id, maxConc := range unique { + batch = append(batch, AccountWithConcurrency{ + ID: id, + MaxConcurrency: maxConc, + }) + } + + out := make(map[int64]*AccountLoadInfo, len(batch)) + for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize { + end := i + opsConcurrencyBatchChunkSize + if end > len(batch) { + end = len(batch) + } + part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end]) + if err != nil { + // Best-effort: return zeros rather than failing the ops UI. + log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err) + continue + } + for k, v := range part { + out[k] = v + } + } + + return out +} + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// +// Optional filters: +// - platformFilter: only include accounts in that platform (best-effort reduces DB load) +// - groupIDFilter: only include accounts that belong to that group +func (s *OpsService) GetConcurrencyStats( + ctx context.Context, + platformFilter string, + groupIDFilter *int64, +) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + collectedAt := time.Now() + loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts) + + platform := make(map[string]*PlatformConcurrencyInfo) + group := make(map[int64]*GroupConcurrencyInfo) + account := make(map[int64]*AccountConcurrencyInfo) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + var matchedGroup *Group + if groupIDFilter != nil && *groupIDFilter > 0 { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if grp.ID == *groupIDFilter { + matchedGroup = grp + break + } + } + // Group filter provided: skip accounts not in that group. + if matchedGroup == nil { + continue + } + } + + load := loadMap[acc.ID] + currentInUse := int64(0) + waiting := int64(0) + if load != nil { + currentInUse = int64(load.CurrentConcurrency) + waiting = int64(load.WaitingCount) + } + + // Account-level view picks one display group (the first group). + displayGroupID := int64(0) + displayGroupName := "" + if matchedGroup != nil { + displayGroupID = matchedGroup.ID + displayGroupName = matchedGroup.Name + } else if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + if _, ok := account[acc.ID]; !ok { + info := &AccountConcurrencyInfo{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + CurrentInUse: currentInUse, + MaxCapacity: int64(acc.Concurrency), + WaitingInQueue: waiting, + } + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + account[acc.ID] = info + } + + // Platform aggregation. 
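+ // Capacity, in-use and waiting counters are summed per platform here;
+ // LoadPercentage is derived once after the account loop.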
+ if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformConcurrencyInfo{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.MaxCapacity += int64(acc.Concurrency) + p.CurrentInUse += currentInUse + p.WaitingInQueue += waiting + } + + // Group aggregation (one account may contribute to multiple groups). + if matchedGroup != nil { + grp := matchedGroup + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } else { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } + } + } + + for _, info := range platform { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + for _, info := range group { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go new file mode 100644 index 00000000..23d6d82f --- /dev/null +++ b/backend/internal/service/ops_dashboard.go @@ -0,0 +1,77 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + + // Resolve query mode (requested via query param, or DB default). 
+ filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode) + + overview, err := s.opsRepo.GetDashboardOverview(ctx, filter) + if err != nil { + if errors.Is(err, ErrOpsPreaggregatedNotPopulated) { + return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet") + } + return nil, err + } + + // Best-effort system health + jobs; dashboard metrics should still render if these are missing. + if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + overview.SystemMetrics = metrics + } else if err != nil && !errors.Is(err, sql.ErrNoRows) { + log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) + } + + if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil { + overview.JobHeartbeats = heartbeats + } else { + log.Printf("[Ops] ListJobHeartbeats failed: %v", err) + } + + return overview, nil +} + +func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode { + if requested.IsValid() { + // Allow "auto" to be disabled via config until preagg is proven stable in production. + // Forced `preagg` via query param still works. + if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return requested + } + + mode := OpsQueryModeAuto + if s != nil && s.settingRepo != nil { + if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil { + mode = ParseOpsQueryMode(raw) + } + } + + if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return mode +} diff --git a/backend/internal/service/ops_errors.go b/backend/internal/service/ops_errors.go new file mode 100644 index 00000000..76b5ce8b --- /dev/null +++ b/backend/internal/service/ops_errors.go @@ -0,0 +1,45 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds) +} + +func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be 
<= end_time") + } + return s.opsRepo.GetErrorDistribution(ctx, filter) +} diff --git a/backend/internal/service/ops_histograms.go b/backend/internal/service/ops_histograms.go new file mode 100644 index 00000000..9f5b514f --- /dev/null +++ b/backend/internal/service/ops_histograms.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetLatencyHistogram(ctx, filter) +} diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go new file mode 100644 index 00000000..cd90e1bd --- /dev/null +++ b/backend/internal/service/ops_metrics_collector.go @@ -0,0 +1,861 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "fmt" + "hash/fnv" + "log" + "math" + "os" + "runtime" + "strconv" + "strings" + "sync" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" +) + +const ( + opsMetricsCollectorJobName = "ops_metrics_collector" + opsMetricsCollectorMinInterval = 60 * time.Second + opsMetricsCollectorMaxInterval = 1 * time.Hour + + opsMetricsCollectorTimeout = 10 * time.Second + + opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader" + opsMetricsCollectorLeaderLockTTL = 90 * time.Second + + opsMetricsCollectorHeartbeatTimeout = 2 * time.Second + + bytesPerMB = 1024 * 1024 +) + +var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey) + +type OpsMetricsCollector struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + lastCgroupCPUUsageNanos uint64 + lastCgroupCPUSampleAt time.Time + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + return &OpsMetricsCollector{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (c *OpsMetricsCollector) Start() { + if c == nil { + return + } + c.startOnce.Do(func() { + if c.stopCh == nil { + c.stopCh = make(chan struct{}) + } + go c.run() + }) +} + +func (c *OpsMetricsCollector) Stop() { + if c == nil { + return + } + c.stopOnce.Do(func() { + if c.stopCh != nil { + close(c.stopCh) + } + }) +} + +func (c *OpsMetricsCollector) run() { + // First run immediately so the dashboard has data soon after startup. 
+ c.collectOnce() + + for { + interval := c.getInterval() + timer := time.NewTimer(interval) + select { + case <-timer.C: + c.collectOnce() + case <-c.stopCh: + timer.Stop() + return + } + } +} + +func (c *OpsMetricsCollector) getInterval() time.Duration { + interval := opsMetricsCollectorMinInterval + + if c.settingRepo == nil { + return interval + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds) + if err != nil { + return interval + } + raw = strings.TrimSpace(raw) + if raw == "" { + return interval + } + + seconds, err := strconv.Atoi(raw) + if err != nil { + return interval + } + if seconds < int(opsMetricsCollectorMinInterval.Seconds()) { + seconds = int(opsMetricsCollectorMinInterval.Seconds()) + } + if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) { + seconds = int(opsMetricsCollectorMaxInterval.Seconds()) + } + return time.Duration(seconds) * time.Second +} + +func (c *OpsMetricsCollector) collectOnce() { + if c == nil { + return + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return + } + if c.opsRepo == nil { + return + } + if c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout) + defer cancel() + + if !c.isMonitoringEnabled(ctx) { + return + } + + release, ok := c.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + err := c.collectAndPersist(ctx) + finishedAt := time.Now().UTC() + + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + runAt := startedAt + + if err != nil { + msg := truncateString(err.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + log.Printf("[OpsMetricsCollector] collect failed: %v", err) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool { + if c == nil { + return false + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return false + } + if c.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + // Fail-open: collector should not become a hard dependency. + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { + if ctx == nil { + ctx = context.Background() + } + + // Align to stable minute boundaries to avoid partial buckets and to maximize cache hits. 
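+ // Example: a tick at 12:03:41Z yields windowEnd=12:03:00Z and
+ // windowStart=12:02:00Z, so every instance sampling within the same minute
+ // computes the identical [start, end) bucket.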
+ now := time.Now().UTC() + windowEnd := now.Truncate(time.Minute) + windowStart := windowEnd.Add(-1 * time.Minute) + + sys, err := c.collectSystemStats(ctx) + if err != nil { + // Continue; system stats are best-effort. + log.Printf("[OpsMetricsCollector] system stats error: %v", err) + } + + dbOK := c.checkDB(ctx) + redisOK := c.checkRedis(ctx) + active, idle := c.dbPoolStats() + + successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage counts: %w", err) + } + + duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage latency: %w", err) + } + + errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query error counts: %w", err) + } + + windowSeconds := windowEnd.Sub(windowStart).Seconds() + if windowSeconds <= 0 { + windowSeconds = 60 + } + requestTotal := successCount + errorTotal + qps := float64(requestTotal) / windowSeconds + tps := float64(tokenConsumed) / windowSeconds + + goroutines := runtime.NumGoroutine() + + input := &OpsInsertSystemMetricsInput{ + CreatedAt: windowEnd, + WindowMinutes: 1, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorSLA, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + TokenConsumed: tokenConsumed, + QPS: float64Ptr(roundTo1DP(qps)), + TPS: float64Ptr(roundTo1DP(tps)), + + DurationP50Ms: duration.p50, + DurationP90Ms: duration.p90, + DurationP95Ms: duration.p95, + DurationP99Ms: duration.p99, + DurationAvgMs: duration.avg, + DurationMaxMs: duration.max, + + TTFTP50Ms: ttft.p50, + TTFTP90Ms: ttft.p90, + TTFTP95Ms: ttft.p95, + TTFTP99Ms: ttft.p99, + TTFTAvgMs: ttft.avg, + TTFTMaxMs: ttft.max, + + CPUUsagePercent: sys.cpuUsagePercent, + MemoryUsedMB: sys.memoryUsedMB, + MemoryTotalMB: sys.memoryTotalMB, + MemoryUsagePercent: sys.memoryUsagePercent, + + DBOK: boolPtr(dbOK), + RedisOK: boolPtr(redisOK), + + DBConnActive: intPtr(active), + DBConnIdle: intPtr(idle), + GoroutineCount: intPtr(goroutines), + } + + return c.opsRepo.InsertSystemMetrics(ctx, input) +} + +type opsCollectedPercentiles struct { + p50 *int + p90 *int + p95 *int + p99 *int + avg *float64 + max *int +} + +func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2` + + var tokens sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) { + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + 
AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + duration.p50 = floatToIntPtr(p50) + duration.p90 = floatToIntPtr(p90) + duration.p95 = floatToIntPtr(p95) + duration.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + duration.avg = &v + } + if max.Valid { + v := int(max.Int64) + duration.max = &v + } + } + + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + ttft.p50 = floatToIntPtr(p50) + ttft.p90 = floatToIntPtr(p90) + ttft.p95 = floatToIntPtr(p95) + ttft.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + ttft.avg = &v + } + if max.Valid { + v := int(max.Int64) + ttft.max = &v + } + } + + return duration, ttft, nil +} + +func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) ( + errorTotal int64, + businessLimited int64, + errorSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +WHERE created_at >= $1 AND created_at < $2` + + if err := c.db.QueryRowContext(ctx, q, start, end).Scan( + &errorTotal, + &businessLimited, + &errorSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +type opsCollectedSystemStats struct { + cpuUsagePercent *float64 + memoryUsedMB *int64 + memoryTotalMB *int64 + memoryUsagePercent *float64 +} + +func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) { + out := &opsCollectedSystemStats{} + if ctx == nil { + ctx = context.Background() + } + + sampleAt := time.Now().UTC() + + // Prefer cgroup (container) metrics when available. 
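+ // Inside a limited container, host-wide readings over-count: the cgroup
+ // files scope CPU/memory to this container, so the gopsutil calls below
+ // serve only as a fallback.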
+ if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil { + out.cpuUsagePercent = cpuPct + } + + cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes() + if cgroupOK { + usedMB := int64(cgroupUsed / bytesPerMB) + out.memoryUsedMB = &usedMB + if cgroupTotal > 0 { + totalMB := int64(cgroupTotal / bytesPerMB) + out.memoryTotalMB = &totalMB + pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100) + out.memoryUsagePercent = &pct + } + } + + // Fallback to host metrics if cgroup metrics are unavailable (or incomplete). + if out.cpuUsagePercent == nil { + if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 { + v := roundTo1DP(cpuPercents[0]) + out.cpuUsagePercent = &v + } + } + + // If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host. + if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil { + if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil { + if out.memoryUsedMB == nil { + usedMB := int64(vm.Used / bytesPerMB) + out.memoryUsedMB = &usedMB + } + if out.memoryTotalMB == nil { + totalMB := int64(vm.Total / bytesPerMB) + out.memoryTotalMB = &totalMB + } + if out.memoryUsagePercent == nil { + if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 { + pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100) + out.memoryUsagePercent = &pct + } else { + pct := roundTo1DP(vm.UsedPercent) + out.memoryUsagePercent = &pct + } + } + } + } + + return out, nil +} + +func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 { + usageNanos, ok := readCgroupCPUUsageNanos() + if !ok { + return nil + } + + // Initialize baseline sample. + if c.lastCgroupCPUSampleAt.IsZero() { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + elapsed := now.Sub(c.lastCgroupCPUSampleAt) + if elapsed <= 0 { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + prev := c.lastCgroupCPUUsageNanos + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + + if usageNanos < prev { + // Counter reset (container restarted). + return nil + } + + deltaUsageSec := float64(usageNanos-prev) / 1e9 + elapsedSec := elapsed.Seconds() + if elapsedSec <= 0 { + return nil + } + + cores := readCgroupCPULimitCores() + if cores <= 0 { + // Can't reliably normalize; skip and fall back to gopsutil. + return nil + } + + pct := (deltaUsageSec / (elapsedSec * cores)) * 100 + if pct < 0 { + pct = 0 + } + // Clamp to avoid noise/jitter showing impossible values. + if pct > 100 { + pct = 100 + } + v := roundTo1DP(pct) + return &v +} + +func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) { + // cgroup v2 (most common in modern containers) + if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 { + usedBytes = used + rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max") + if err == nil { + s := strings.TrimSpace(string(rawMax)) + if s != "" && s != "max" { + if v, err := strconv.ParseUint(s, 10, 64); err == nil { + totalBytes = v + } + } + } + return usedBytes, totalBytes, true + } + + // cgroup v1 fallback + if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 { + usedBytes = used + if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 { + // Some environments report a very large number when unlimited. 
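+ // cgroup v1 commonly reports ~2^63 (PAGE_COUNTER_MAX) when unlimited;
+ // rejecting values >= 1<<60 (1 EiB) filters that sentinel while still
+ // accepting any realistic limit.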
+ if limit > 0 && limit < (1<<60) { + totalBytes = limit + } + } + return usedBytes, totalBytes, true + } + + return 0, 0, false +} + +func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) { + // cgroup v2: cpu.stat has usage_usec + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil { + lines := strings.Split(string(raw), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) != 2 { + continue + } + if fields[0] != "usage_usec" { + continue + } + v, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + continue + } + return v * 1000, true + } + } + + // cgroup v1: cpuacct.usage is in nanoseconds + if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok { + return v, true + } + + return 0, false +} + +func readCgroupCPULimitCores() float64 { + // cgroup v2: cpu.max => " " or "max " + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil { + fields := strings.Fields(string(raw)) + if len(fields) >= 2 && fields[0] != "max" { + quota, err1 := strconv.ParseFloat(fields[0], 64) + period, err2 := strconv.ParseFloat(fields[1], 64) + if err1 == nil && err2 == nil && quota > 0 && period > 0 { + return quota / period + } + } + } + + // cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us + quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + if okQuota && okPeriod && quota > 0 && period > 0 { + return float64(quota) / float64(period) + } + + return 0 +} + +func readUintFile(path string) (uint64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func readIntFile(path string) (int64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool { + if c == nil || c.db == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + var one int + if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil { + return false + } + return one == 1 +} + +func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { + if c == nil || c.redisClient == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + return c.redisClient.Ping(ctx).Err() == nil +} + +func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { + if c == nil || c.db == nil { + return 0, 0 + } + stats := c.db.Stats() + return stats.InUse, stats.Idle +} + +var opsMetricsCollectorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if c == nil || c.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result() + if err != nil { + // Prefer fail-closed to avoid stampeding the database when Redis is flaky. + // Fallback to a DB advisory lock when Redis is present but unavailable. 
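+ // Both paths contend for one logical leader slot: the advisory lock ID is
+ // an FNV-1a hash of the same key string used for Redis (hashAdvisoryLockID
+ // at the bottom of this file).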
+ release, ok := c.tryAcquireDBAdvisoryLock(ctx) + if !ok { + c.maybeLogSkip() + return nil, false + } + return release, true + } + if !ok { + c.maybeLogSkip() + return nil, false + } + + release := func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result() + } + return release, true +} + +func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) { + if c == nil || c.db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := c.db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID) + _ = conn.Close() + } + return release, true +} + +func (c *OpsMetricsCollector) maybeLogSkip() { + c.skipLogMu.Lock() + defer c.skipLogMu.Unlock() + + now := time.Now() + if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute { + return + } + c.skipLogAt = now + log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping") +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func boolPtr(v bool) *bool { + out := v + return &out +} + +func intPtr(v int) *int { + out := v + return &out +} + +func float64Ptr(v float64) *float64 { + out := v + return &out +} + +func hashAdvisoryLockID(s string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(s)) + return int64(h.Sum64()) +} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go new file mode 100644 index 00000000..a3d847e0 --- /dev/null +++ b/backend/internal/service/ops_port.go @@ -0,0 +1,226 @@ +package service + +import ( + "context" + "time" +) + +type OpsRepository interface { + InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error) + ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) + GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) + ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error) + + InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) + UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error + GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error) + + // Lightweight window stats (for realtime WS / quick sampling). 
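+ // Returns only counters (success/error totals, tokens consumed) rather than
+ // the full overview, so it is cheap enough to call on every push cycle.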
+ GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) + + GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) + GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) + GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) + GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) + GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) + + InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error + GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error) + + UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error + ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error) + + // Alerts (rules + events) + ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) + CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + DeleteAlertRule(ctx context.Context, id int64) error + + ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) + UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error + UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. + UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error + UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error + GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) + GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) +} + +type OpsInsertErrorLogInput struct { + RequestID string + ClientRequestID string + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + GroupID *int64 + ClientIP *string + + Platform string + Model string + RequestPath string + Stream bool + UserAgent string + + ErrorPhase string + ErrorType string + Severity string + StatusCode int + IsBusinessLimited bool + + ErrorMessage string + ErrorBody string + + ErrorSource string + ErrorOwner string + + UpstreamStatusCode *int + UpstreamErrorMessage *string + UpstreamErrorDetail *string + + DurationMs *int + TimeToFirstTokenMs *int64 + + RequestBodyJSON *string // sanitized json string (not raw bytes) + RequestBodyTruncated bool + RequestBodyBytes *int + RequestHeadersJSON *string // optional json string + + IsRetryable bool + RetryCount int + + CreatedAt time.Time +} + +type OpsInsertRetryAttemptInput struct { + RequestedByUserID int64 + SourceErrorID int64 + Mode string + PinnedAccountID *int64 + + // running|queued etc. 
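+ // The service writes "running" on insert and "succeeded"/"failed" on
+ // completion; see the opsRetryStatus* constants in ops_retry.go.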
+ Status string + StartedAt time.Time +} + +type OpsUpdateRetryAttemptInput struct { + ID int64 + + // succeeded|failed + Status string + FinishedAt time.Time + DurationMs int64 + + // Optional correlation + ResultRequestID *string + ResultErrorID *int64 + + ErrorMessage *string +} + +type OpsInsertSystemMetricsInput struct { + CreatedAt time.Time + WindowMinutes int + + Platform *string + GroupID *int64 + + SuccessCount int64 + ErrorCountTotal int64 + BusinessLimitedCount int64 + ErrorCountSLA int64 + + UpstreamErrorCountExcl429529 int64 + Upstream429Count int64 + Upstream529Count int64 + + TokenConsumed int64 + + QPS *float64 + TPS *float64 + + DurationP50Ms *int + DurationP90Ms *int + DurationP95Ms *int + DurationP99Ms *int + DurationAvgMs *float64 + DurationMaxMs *int + + TTFTP50Ms *int + TTFTP90Ms *int + TTFTP95Ms *int + TTFTP99Ms *int + TTFTAvgMs *float64 + TTFTMaxMs *int + + CPUUsagePercent *float64 + MemoryUsedMB *int64 + MemoryTotalMB *int64 + MemoryUsagePercent *float64 + + DBOK *bool + RedisOK *bool + + DBConnActive *int + DBConnIdle *int + DBConnWaiting *int + + GoroutineCount *int + ConcurrencyQueueDepth *int +} + +type OpsSystemMetricsSnapshot struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + WindowMinutes int `json:"window_minutes"` + + CPUUsagePercent *float64 `json:"cpu_usage_percent"` + MemoryUsedMB *int64 `json:"memory_used_mb"` + MemoryTotalMB *int64 `json:"memory_total_mb"` + MemoryUsagePercent *float64 `json:"memory_usage_percent"` + + DBOK *bool `json:"db_ok"` + RedisOK *bool `json:"redis_ok"` + + DBConnActive *int `json:"db_conn_active"` + DBConnIdle *int `json:"db_conn_idle"` + DBConnWaiting *int `json:"db_conn_waiting"` + + GoroutineCount *int `json:"goroutine_count"` + ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"` +} + +type OpsUpsertJobHeartbeatInput struct { + JobName string + + LastRunAt *time.Time + LastSuccessAt *time.Time + LastErrorAt *time.Time + LastError *string + LastDurationMs *int64 +} + +type OpsJobHeartbeat struct { + JobName string `json:"job_name"` + + LastRunAt *time.Time `json:"last_run_at"` + LastSuccessAt *time.Time `json:"last_success_at"` + LastErrorAt *time.Time `json:"last_error_at"` + LastError *string `json:"last_error"` + LastDurationMs *int64 `json:"last_duration_ms"` + + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsWindowStats struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + TokenConsumed int64 `json:"token_consumed"` +} diff --git a/backend/internal/service/ops_query_mode.go b/backend/internal/service/ops_query_mode.go new file mode 100644 index 00000000..e6fa9c1e --- /dev/null +++ b/backend/internal/service/ops_query_mode.go @@ -0,0 +1,40 @@ +package service + +import ( + "errors" + "strings" +) + +type OpsQueryMode string + +const ( + OpsQueryModeAuto OpsQueryMode = "auto" + OpsQueryModeRaw OpsQueryMode = "raw" + OpsQueryModePreagg OpsQueryMode = "preagg" +) + +// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the +// pre-aggregation tables are not populated yet. This is primarily used to implement +// the forced `preagg` mode UX. 
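+// GetDashboardOverview maps it to an HTTP 409 Conflict so clients can render a
+// "not ready yet" state instead of a hard failure.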
+var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated") + +func ParseOpsQueryMode(raw string) OpsQueryMode { + v := strings.ToLower(strings.TrimSpace(raw)) + switch v { + case string(OpsQueryModeRaw): + return OpsQueryModeRaw + case string(OpsQueryModePreagg): + return OpsQueryModePreagg + default: + return OpsQueryModeAuto + } +} + +func (m OpsQueryMode) IsValid() bool { + switch m { + case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg: + return true + default: + return false + } +} diff --git a/backend/internal/service/ops_realtime.go b/backend/internal/service/ops_realtime.go new file mode 100644 index 00000000..479b9482 --- /dev/null +++ b/backend/internal/service/ops_realtime.go @@ -0,0 +1,36 @@ +package service + +import ( + "context" + "errors" + "strings" +) + +// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled. +// +// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`, +// and it is also gated by the hard switch/soft switch of overall ops monitoring. +func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool { + if !s.IsMonitoringEnabled(ctx) { + return false + } + if s.settingRepo == nil { + return true + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled) + if err != nil { + // Default enabled when key is missing; fail-open on transient errors. + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} diff --git a/backend/internal/service/ops_request_details.go b/backend/internal/service/ops_request_details.go new file mode 100644 index 00000000..e33e6f38 --- /dev/null +++ b/backend/internal/service/ops_request_details.go @@ -0,0 +1,152 @@ +package service + +import ( + "context" + "time" +) + +type OpsRequestKind string + +const ( + OpsRequestKindSuccess OpsRequestKind = "success" + OpsRequestKindError OpsRequestKind = "error" +) + +// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs). +// It powers "request drilldown" UIs without exposing full request bodies for successful requests. +type OpsRequestDetail struct { + Kind OpsRequestKind `json:"kind"` + CreatedAt time.Time `json:"created_at"` + RequestID string `json:"request_id"` + + Platform string `json:"platform,omitempty"` + Model string `json:"model,omitempty"` + + DurationMs *int `json:"duration_ms,omitempty"` + StatusCode *int `json:"status_code,omitempty"` + + // When Kind == "error", ErrorID links to /admin/ops/errors/:id. + ErrorID *int64 `json:"error_id,omitempty"` + + Phase string `json:"phase,omitempty"` + Severity string `json:"severity,omitempty"` + Message string `json:"message,omitempty"` + + UserID *int64 `json:"user_id,omitempty"` + APIKeyID *int64 `json:"api_key_id,omitempty"` + AccountID *int64 `json:"account_id,omitempty"` + GroupID *int64 `json:"group_id,omitempty"` + + Stream bool `json:"stream"` +} + +type OpsRequestDetailFilter struct { + StartTime *time.Time + EndTime *time.Time + + // kind: success|error|all + Kind string + + Platform string + GroupID *int64 + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + + Model string + RequestID string + Query string + + MinDurationMs *int + MaxDurationMs *int + + // Sort: created_at_desc (default) or duration_desc. 
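+ // Repository implementations are expected to fall back to created_at_desc
+ // for any unrecognized value rather than reject the request.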
+ Sort string + + Page int + PageSize int +} + +func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) { + page = 1 + pageSize = 50 + endTime = time.Now() + startTime = endTime.Add(-1 * time.Hour) + + if f == nil { + return page, pageSize, startTime, endTime + } + + if f.Page > 0 { + page = f.Page + } + if f.PageSize > 0 { + pageSize = f.PageSize + } + if pageSize > 100 { + pageSize = 100 + } + + if f.EndTime != nil { + endTime = *f.EndTime + } + if f.StartTime != nil { + startTime = *f.StartTime + } else if f.EndTime != nil { + startTime = endTime.Add(-1 * time.Hour) + } + + if startTime.After(endTime) { + startTime, endTime = endTime, startTime + } + + return page, pageSize, startTime, endTime +} + +type OpsRequestDetailList struct { + Items []*OpsRequestDetail `json:"items"` + Total int64 `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsRequestDetailList{ + Items: []*OpsRequestDetail{}, + Total: 0, + Page: 1, + PageSize: 50, + }, nil + } + + page, pageSize, startTime, endTime := filter.Normalize() + filterCopy := &OpsRequestDetailFilter{} + if filter != nil { + *filterCopy = *filter + } + filterCopy.Page = page + filterCopy.PageSize = pageSize + filterCopy.StartTime = &startTime + filterCopy.EndTime = &endTime + + items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy) + if err != nil { + return nil, err + } + if items == nil { + items = []*OpsRequestDetail{} + } + + return &OpsRequestDetailList{ + Items: items, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} + diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go new file mode 100644 index 00000000..3232e708 --- /dev/null +++ b/backend/internal/service/ops_retry.go @@ -0,0 +1,635 @@ +package service + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" + "github.com/gin-gonic/gin" + "github.com/lib/pq" +) + +const ( + OpsRetryModeClient = "client" + OpsRetryModeUpstream = "upstream" +) + +const ( + opsRetryStatusRunning = "running" + opsRetryStatusSucceeded = "succeeded" + opsRetryStatusFailed = "failed" +) + +const ( + opsRetryTimeout = 60 * time.Second + opsRetryCaptureBytesLimit = 64 * 1024 + opsRetryResponsePreviewMax = 8 * 1024 + opsRetryMinIntervalPerError = 10 * time.Second + opsRetryMaxAccountSwitches = 3 +) + +var opsRetryRequestHeaderAllowlist = map[string]bool{ + "anthropic-beta": true, + "anthropic-version": true, +} + +type opsRetryRequestType string + +const ( + opsRetryTypeMessages opsRetryRequestType = "messages" + opsRetryTypeOpenAI opsRetryRequestType = "openai_responses" + opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta" +) + +type limitedResponseWriter struct { + header http.Header + status int + wroteHeader bool + + limit int + totalWritten int64 + buf bytes.Buffer +} + +func newLimitedResponseWriter(limit int) *limitedResponseWriter { + if limit <= 0 { + limit = 1 + } + return &limitedResponseWriter{ + header: make(http.Header), + status: http.StatusOK, + limit: limit, + } +} + +func (w *limitedResponseWriter) Header() http.Header { + return w.header +} + +func (w *limitedResponseWriter) 
WriteHeader(statusCode int) { + if w.wroteHeader { + return + } + w.wroteHeader = true + w.status = statusCode +} + +func (w *limitedResponseWriter) Write(p []byte) (int, error) { + if !w.wroteHeader { + w.WriteHeader(http.StatusOK) + } + w.totalWritten += int64(len(p)) + + if w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(p) > remaining { + _, _ = w.buf.Write(p[:remaining]) + } else { + _, _ = w.buf.Write(p) + } + } + + // Pretend we wrote everything to avoid upstream/client code treating it as an error. + return len(p), nil +} + +func (w *limitedResponseWriter) Flush() {} + +func (w *limitedResponseWriter) bodyBytes() []byte { + return w.buf.Bytes() +} + +func (w *limitedResponseWriter) truncated() bool { + return w.totalWritten > int64(w.limit) +} + +func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + + mode = strings.ToLower(strings.TrimSpace(mode)) + switch mode { + case OpsRetryModeClient, OpsRetryModeUpstream: + default: + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") + } + + latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) + } + if latest != nil { + if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + + lastAttemptAt := latest.CreatedAt + if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() { + lastAttemptAt = *latest.FinishedAt + } else if latest.StartedAt != nil && !latest.StartedAt.IsZero() { + lastAttemptAt = *latest.StartedAt + } + + if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError { + return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again") + } + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + startedAt := time.Now() + attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{ + RequestedByUserID: requestedByUserID, + SourceErrorID: errorID, + Mode: mode, + PinnedAccountID: pinned, + Status: opsRetryStatusRunning, + StartedAt: startedAt, + }) + if err != nil { + var pqErr *pq.Error + if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err) 
+ } + + result := &OpsRetryResult{ + AttemptID: attemptID, + Mode: mode, + Status: opsRetryStatusFailed, + PinnedAccountID: pinned, + HTTPStatusCode: 0, + UpstreamRequestID: "", + ResponsePreview: "", + ResponseTruncated: false, + ErrorMessage: "", + StartedAt: startedAt, + } + + execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) + defer cancel() + + execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + + finishedAt := time.Now() + result.FinishedAt = finishedAt + result.DurationMs = finishedAt.Sub(startedAt).Milliseconds() + + if execRes != nil { + result.Status = execRes.status + result.UsedAccountID = execRes.usedAccountID + result.HTTPStatusCode = execRes.httpStatusCode + result.UpstreamRequestID = execRes.upstreamRequestID + result.ResponsePreview = execRes.responsePreview + result.ResponseTruncated = execRes.responseTruncated + result.ErrorMessage = execRes.errorMessage + } + + updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second) + defer updateCancel() + + var updateErrMsg *string + if strings.TrimSpace(result.ErrorMessage) != "" { + msg := result.ErrorMessage + updateErrMsg = &msg + } + var resultRequestID *string + if strings.TrimSpace(result.UpstreamRequestID) != "" { + v := result.UpstreamRequestID + resultRequestID = &v + } + + finalStatus := result.Status + if strings.TrimSpace(finalStatus) == "" { + finalStatus = opsRetryStatusFailed + } + + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, + }); err != nil { + // Best-effort: retry itself already executed; do not fail the API response. + log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } + + return result, nil +} + +type opsRetryExecution struct { + status string + + usedAccountID *int64 + httpStatusCode int + upstreamRequestID string + + responsePreview string + responseTruncated bool + + errorMessage string +} + +func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution { + if errorLog == nil { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "missing error log", + } + } + + reqType := detectOpsRetryType(errorLog.RequestPath) + bodyBytes := []byte(errorLog.RequestBody) + + switch reqType { + case opsRetryTypeMessages: + bodyBytes = FilterThinkingBlocksForRetry(bodyBytes) + case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B: + // No-op + } + + switch strings.ToLower(strings.TrimSpace(mode)) { + case OpsRetryModeUpstream: + if pinnedAccountID == nil || *pinnedAccountID <= 0 { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "pinned_account_id required for upstream retry", + } + } + return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID) + case OpsRetryModeClient: + return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes) + default: + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "invalid retry mode", + } + } +} + +func detectOpsRetryType(path string) opsRetryRequestType { + p := strings.ToLower(strings.TrimSpace(path)) + switch { + case strings.Contains(p, "/responses"): + return opsRetryTypeOpenAI + case strings.Contains(p, "/v1beta/"): + return opsRetryTypeGeminiV1B + default: + return opsRetryTypeMessages + } +} + +func (s *OpsService) executePinnedRetry(ctx context.Context, reqType 
opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution { + if s.accountRepo == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"} + } + + account, err := s.accountRepo.GetByID(ctx, pinnedAccountID) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)} + } + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"} + } + if !account.IsSchedulable() { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"} + } + if errorLog.GroupID != nil && *errorLog.GroupID > 0 { + if !containsInt64(account.GroupIDs, *errorLog.GroupID) { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"} + } + } + + var release func() + if s.concurrencyService != nil { + acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)} + } + if acq == nil || !acq.Acquired { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"} + } + release = acq.ReleaseFunc + } + if release != nil { + defer release() + } + + usedID := account.ID + exec := s.executeWithAccount(ctx, reqType, errorLog, body, account) + exec.usedAccountID = &usedID + if exec.status == "" { + exec.status = opsRetryStatusFailed + } + return exec +} + +func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution { + groupID := errorLog.GroupID + if groupID == nil || *groupID <= 0 { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"} + } + + model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body) + if parsedErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()} + } + _ = stream + + excluded := make(map[int64]struct{}) + switches := 0 + + for { + if switches >= opsRetryMaxAccountSwitches { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"} + } + + selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded) + if selErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()} + } + if selection == nil || selection.Account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"} + } + + account := selection.Account + if !selection.Acquired || selection.ReleaseFunc == nil { + excluded[account.ID] = struct{}{} + switches++ + continue + } + + exec := func() *opsRetryExecution { + defer selection.ReleaseFunc() + return s.executeWithAccount(ctx, reqType, errorLog, body, account) + }() + + if exec != nil { + if exec.status == opsRetryStatusSucceeded { + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + // If the gateway services ask for failover, try another account. 
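+ // Detection is string-based by necessity: Forward returns plain errors
+ // across service boundaries, so isFailoverError (bottom of this file) looks
+ // for the "upstream error:" and "failover" markers in the message.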
+ if s.isFailoverError(exec.errorMessage) { + excluded[account.ID] = struct{}{} + switches++ + continue + } + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + + excluded[account.ID] = struct{}{} + switches++ + } +} + +func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) { + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return nil, fmt.Errorf("openai gateway service not available") + } + return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + case opsRetryTypeGeminiV1B, opsRetryTypeMessages: + if s.gatewayService == nil { + return nil, fmt.Errorf("gateway service not available") + } + return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + default: + return nil, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) { + switch reqType { + case opsRetryTypeMessages: + parsed, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr) + } + return parsed.Model, parsed.Stream, nil + case opsRetryTypeOpenAI: + var v struct { + Model string `json:"model"` + Stream bool `json:"stream"` + } + if err := json.Unmarshal(body, &v); err != nil { + return "", false, fmt.Errorf("failed to parse openai request body: %w", err) + } + return strings.TrimSpace(v.Model), v.Stream, nil + case opsRetryTypeGeminiV1B: + if strings.TrimSpace(errorLog.Model) == "" { + return "", false, fmt.Errorf("missing model for gemini v1beta retry") + } + return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil + default: + return "", false, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution { + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"} + } + + c, w := newOpsRetryContext(ctx, errorLog) + + var err error + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"} + } + _, err = s.openAIGatewayService.Forward(ctx, c, account, body) + case opsRetryTypeGeminiV1B: + if s.geminiCompatService == nil || s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"} + } + modelName := strings.TrimSpace(errorLog.Model) + action := "generateContent" + if errorLog.Stream { + action = "streamGenerateContent" + } + if account.Platform == PlatformAntigravity { + _, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body) + } else { + _, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body) + } + case opsRetryTypeMessages: + switch account.Platform { + case PlatformAntigravity: + if s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"} + } + _, err = 
s.antigravityGatewayService.Forward(ctx, c, account, body) + case PlatformGemini: + if s.geminiCompatService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"} + } + _, err = s.geminiCompatService.Forward(ctx, c, account, body) + default: + if s.gatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"} + } + parsedReq, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"} + } + _, err = s.gatewayService.Forward(ctx, c, account, parsedReq) + } + default: + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"} + } + + statusCode := http.StatusOK + if c != nil && c.Writer != nil { + statusCode = c.Writer.Status() + } + + upstreamReqID := extractUpstreamRequestID(c) + preview, truncated := extractResponsePreview(w) + + exec := &opsRetryExecution{ + status: opsRetryStatusFailed, + httpStatusCode: statusCode, + upstreamRequestID: upstreamReqID, + responsePreview: preview, + responseTruncated: truncated, + errorMessage: "", + } + + if err == nil && statusCode < 400 { + exec.status = opsRetryStatusSucceeded + return exec + } + + if err != nil { + exec.errorMessage = err.Error() + } else { + exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode) + } + + return exec +} + +func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) { + w := newLimitedResponseWriter(opsRetryCaptureBytesLimit) + c, _ := gin.CreateTestContext(w) + + path := "/" + if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" { + path = errorLog.RequestPath + } + + req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil)) + req.Header.Set("content-type", "application/json") + if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" { + req.Header.Set("user-agent", errorLog.UserAgent) + } + // Restore a minimal, whitelisted subset of request headers to improve retry fidelity + // (e.g. anthropic-beta / anthropic-version). Never replay auth credentials. 
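+	// opsRetryRequestHeaderAllowlist is keyed by lowercased header name; a
+	// plausible definition (declared elsewhere in this file — exact contents are
+	// an assumption) looks like:
+	//   var opsRetryRequestHeaderAllowlist = map[string]bool{
+	//       "anthropic-beta":    true,
+	//       "anthropic-version": true,
+	//   }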
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
+ var stored map[string]string
+ if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
+ for k, v := range stored {
+ key := strings.TrimSpace(k)
+ if key == "" {
+ continue
+ }
+ if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
+ continue
+ }
+ val := strings.TrimSpace(v)
+ if val == "" {
+ continue
+ }
+ req.Header.Set(key, val)
+ }
+ }
+ }
+
+ c.Request = req
+ return c, w
+}
+
+func extractUpstreamRequestID(c *gin.Context) string {
+ if c == nil || c.Writer == nil {
+ return ""
+ }
+ h := c.Writer.Header()
+ if h == nil {
+ return ""
+ }
+ // http.Header.Get canonicalizes its argument, so a single lookup already
+ // covers every case variant of X-Request-Id.
+ if v := strings.TrimSpace(h.Get("X-Request-Id")); v != "" {
+ return v
+ }
+ return ""
+}
+
+func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
+ if w == nil {
+ return "", false
+ }
+ b := bytes.TrimSpace(w.bodyBytes())
+ if len(b) == 0 {
+ return "", w.truncated()
+ }
+ if len(b) > opsRetryResponsePreviewMax {
+ return string(b[:opsRetryResponsePreviewMax]), true
+ }
+ return string(b), w.truncated()
+}
+
+func containsInt64(items []int64, needle int64) bool {
+ for _, v := range items {
+ if v == needle {
+ return true
+ }
+ }
+ return false
+}
+
+func (s *OpsService) isFailoverError(message string) bool {
+ msg := strings.ToLower(strings.TrimSpace(message))
+ if msg == "" {
+ return false
+ }
+ return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
+}
diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go
new file mode 100644
index 00000000..169c523a
--- /dev/null
+++ b/backend/internal/service/ops_service.go
@@ -0,0 +1,451 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "errors"
+ "log"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
+
+const (
+ opsMaxStoredRequestBodyBytes = 10 * 1024
+ opsMaxStoredErrorBodyBytes = 20 * 1024
+)
+
+// OpsService provides ingestion and query APIs for the Ops monitoring module.
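+//
+// Read APIs gate on RequireMonitoringEnabled and surface ErrOpsDisabled when
+// the module is switched off, while ingestion (RecordError) fails soft so the
+// gateway hot path is never blocked. Minimal read-path sketch (names
+// illustrative, error handling elided):
+//
+//	stats, err := opsService.GetWindowStats(ctx, start, end)
+//	// err is ErrOpsDisabled when monitoring is off; hide the ops UI then.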
+type OpsService struct {
+ opsRepo OpsRepository
+ settingRepo SettingRepository
+ cfg *config.Config
+
+ accountRepo AccountRepository
+
+ concurrencyService *ConcurrencyService
+ gatewayService *GatewayService
+ openAIGatewayService *OpenAIGatewayService
+ geminiCompatService *GeminiMessagesCompatService
+ antigravityGatewayService *AntigravityGatewayService
+}
+
+func NewOpsService(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ cfg *config.Config,
+ accountRepo AccountRepository,
+ concurrencyService *ConcurrencyService,
+ gatewayService *GatewayService,
+ openAIGatewayService *OpenAIGatewayService,
+ geminiCompatService *GeminiMessagesCompatService,
+ antigravityGatewayService *AntigravityGatewayService,
+) *OpsService {
+ return &OpsService{
+ opsRepo: opsRepo,
+ settingRepo: settingRepo,
+ cfg: cfg,
+
+ accountRepo: accountRepo,
+
+ concurrencyService: concurrencyService,
+ gatewayService: gatewayService,
+ openAIGatewayService: openAIGatewayService,
+ geminiCompatService: geminiCompatService,
+ antigravityGatewayService: antigravityGatewayService,
+ }
+}
+
+func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
+ if s.IsMonitoringEnabled(ctx) {
+ return nil
+ }
+ return ErrOpsDisabled
+}
+
+func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
+ // Hard switch: disable ops entirely.
+ if s.cfg != nil && !s.cfg.Ops.Enabled {
+ return false
+ }
+ if s.settingRepo == nil {
+ return true
+ }
+ value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
+ if err != nil {
+ // Fail open: default to enabled when the key is missing, and on transient
+ // read errors (ops should never block gateway traffic).
+ return true
+ }
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return false
+ default:
+ return true
+ }
+}
+
+func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
+ if entry == nil {
+ return nil
+ }
+ if !s.IsMonitoringEnabled(ctx) {
+ return nil
+ }
+ if s.opsRepo == nil {
+ return nil
+ }
+
+ // Ensure timestamps are always populated.
+ if entry.CreatedAt.IsZero() {
+ entry.CreatedAt = time.Now()
+ }
+
+ // Ensure required fields exist (DB has NOT NULL constraints).
+ entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
+ entry.ErrorType = strings.TrimSpace(entry.ErrorType)
+ if entry.ErrorPhase == "" {
+ entry.ErrorPhase = "internal"
+ }
+ if entry.ErrorType == "" {
+ entry.ErrorType = "api_error"
+ }
+
+ // Sanitize + trim request body (errors only).
+ if len(rawRequestBody) > 0 {
+ sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
+ if sanitized != "" {
+ entry.RequestBodyJSON = &sanitized
+ }
+ entry.RequestBodyTruncated = truncated
+ entry.RequestBodyBytes = &bytesLen
+ }
+
+ // Sanitize + truncate error_body to avoid storing sensitive data.
+ if strings.TrimSpace(entry.ErrorBody) != "" {
+ sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
+ entry.ErrorBody = sanitized
+ }
+
+ if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
+ // Never bubble up to gateway; best-effort logging.
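+ // The error is still returned so internal callers and tests can observe
+ // failures; gateway call sites are expected to ignore it.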
+ log.Printf("[Ops] RecordError failed: %v", err) + return err + } + return nil +} + +func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil + } + return s.opsRepo.ListErrorLogs(ctx, filter) +} + +func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + detail, err := s.opsRepo.GetErrorLogByID(ctx, id) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return detail, nil +} + +func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { + bytesLen = len(raw) + if len(raw) == 0 { + return "", false, 0 + } + + var decoded any + if err := json.Unmarshal(raw, &decoded); err != nil { + // If it's not valid JSON, don't store (retry would not be reliable anyway). + return "", false, bytesLen + } + + decoded = redactSensitiveJSON(decoded) + + encoded, err := json.Marshal(decoded) + if err != nil { + return "", false, bytesLen + } + if len(encoded) <= maxBytes { + return string(encoded), false, bytesLen + } + + // Trim conversation history to keep the most recent context. + if root, ok := decoded.(map[string]any); ok { + if trimmed, ok := trimConversationArrays(root, maxBytes); ok { + encoded2, err2 := json.Marshal(trimmed) + if err2 == nil && len(encoded2) <= maxBytes { + return string(encoded2), true, bytesLen + } + // Fallthrough: keep shrinking. + decoded = trimmed + } + + essential := shrinkToEssentials(root) + encoded3, err3 := json.Marshal(essential) + if err3 == nil && len(encoded3) <= maxBytes { + return string(encoded3), true, bytesLen + } + } + + // Last resort: store a minimal placeholder (still valid JSON). + placeholder := map[string]any{ + "request_body_truncated": true, + } + if model := extractString(decoded, "model"); model != "" { + placeholder["model"] = model + } + encoded4, err4 := json.Marshal(placeholder) + if err4 != nil { + return "", true, bytesLen + } + return string(encoded4), true, bytesLen +} + +func redactSensitiveJSON(v any) any { + switch t := v.(type) { + case map[string]any: + out := make(map[string]any, len(t)) + for k, vv := range t { + if isSensitiveKey(k) { + out[k] = "[REDACTED]" + continue + } + out[k] = redactSensitiveJSON(vv) + } + return out + case []any: + out := make([]any, 0, len(t)) + for _, vv := range t { + out = append(out, redactSensitiveJSON(vv)) + } + return out + default: + return v + } +} + +func isSensitiveKey(key string) bool { + k := strings.ToLower(strings.TrimSpace(key)) + if k == "" { + return false + } + + // Exact matches (common credential fields). 
+ switch k { + case "authorization", + "proxy-authorization", + "x-api-key", + "api_key", + "apikey", + "access_token", + "refresh_token", + "id_token", + "session_token", + "token", + "password", + "passwd", + "passphrase", + "secret", + "client_secret", + "private_key", + "jwt", + "signature", + "accesskeyid", + "secretaccesskey": + return true + } + + // Suffix matches. + for _, suffix := range []string{ + "_secret", + "_token", + "_id_token", + "_session_token", + "_password", + "_passwd", + "_passphrase", + "_key", + "secret_key", + "private_key", + } { + if strings.HasSuffix(k, suffix) { + return true + } + } + + // Substring matches (conservative, but errs on the side of privacy). + for _, sub := range []string{ + "secret", + "token", + "password", + "passwd", + "passphrase", + "privatekey", + "private_key", + "apikey", + "api_key", + "accesskeyid", + "secretaccesskey", + "bearer", + "cookie", + "credential", + "session", + "jwt", + "signature", + } { + if strings.Contains(k, sub) { + return true + } + } + + return false +} + +func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) { + // Supported: anthropic/openai: messages; gemini: contents. + if out, ok := trimArrayField(root, "messages", maxBytes); ok { + return out, true + } + if out, ok := trimArrayField(root, "contents", maxBytes); ok { + return out, true + } + return root, false +} + +func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) { + raw, ok := root[field] + if !ok { + return nil, false + } + arr, ok := raw.([]any) + if !ok || len(arr) == 0 { + return nil, false + } + + // Keep at least the last message/content. Use binary search so we don't marshal O(n) times. + // We are dropping from the *front* of the array (oldest context first). + lo := 0 + hi := len(arr) - 1 // inclusive; hi ensures at least one item remains + + var best map[string]any + found := false + + for lo <= hi { + mid := (lo + hi) / 2 + candidateArr := arr[mid:] + if len(candidateArr) == 0 { + lo = mid + 1 + continue + } + + next := shallowCopyMap(root) + next[field] = candidateArr + encoded, err := json.Marshal(next) + if err != nil { + // If marshal fails, try dropping more. + lo = mid + 1 + continue + } + + if len(encoded) <= maxBytes { + best = next + found = true + // Try to keep more context by dropping fewer items. + hi = mid - 1 + continue + } + + // Need to drop more. + lo = mid + 1 + } + + if found { + return best, true + } + + // Nothing fit (even with only one element); return the smallest slice and let the + // caller fall back to shrinkToEssentials(). + next := shallowCopyMap(root) + next[field] = arr[len(arr)-1:] + return next, true +} + +func shrinkToEssentials(root map[string]any) map[string]any { + out := make(map[string]any) + for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} { + if v, ok := root[key]; ok { + out[key] = v + } + } + + // Keep only the last element of the conversation array. 
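+ // E.g. {"model":"m","messages":[m1,m2,m3]} shrinks to
+ // {"model":"m","messages":[m3]} (same for gemini-style "contents").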
+ if v, ok := root["messages"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["messages"] = []any{arr[len(arr)-1]} + } + } + if v, ok := root["contents"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["contents"] = []any{arr[len(arr)-1]} + } + } + return out +} + +func shallowCopyMap(m map[string]any) map[string]any { + out := make(map[string]any, len(m)) + for k, v := range m { + out[k] = v + } + return out +} + +func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", false + } + + // Prefer JSON-safe sanitization when possible. + if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" { + return out, trunc + } + + // Non-JSON: best-effort truncate. + if maxBytes > 0 && len(raw) > maxBytes { + return truncateString(raw, maxBytes), true + } + return raw, false +} + +func extractString(v any, key string) string { + root, ok := v.(map[string]any) + if !ok { + return "" + } + s, _ := root[key].(string) + return strings.TrimSpace(s) +} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go new file mode 100644 index 00000000..2f15bc79 --- /dev/null +++ b/backend/internal/service/ops_settings.go @@ -0,0 +1,354 @@ +package service + +import ( + "context" + "encoding/json" + "errors" + "strings" + "time" +) + +const ( + opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second +) + +// ========================= +// Email notification config +// ========================= + +func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) { + defaultCfg := defaultOpsEmailNotificationConfig() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + // Initialize defaults on first read (best-effort). + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsEmailNotificationConfig{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + // Corrupted JSON should not break ops UI; fall back to defaults. 
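+ // Note the stored value is not overwritten on this path, so the corrupted
+ // payload remains available for inspection and manual repair.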
+ return defaultCfg, nil + } + normalizeOpsEmailNotificationConfig(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if req == nil { + return nil, errors.New("invalid request") + } + + cfg, err := s.GetEmailNotificationConfig(ctx) + if err != nil { + return nil, err + } + + if req.Alert != nil { + cfg.Alert.Enabled = req.Alert.Enabled + if req.Alert.Recipients != nil { + cfg.Alert.Recipients = req.Alert.Recipients + } + cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity) + cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour + cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds + cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts + } + + if req.Report != nil { + cfg.Report.Enabled = req.Report.Enabled + if req.Report.Recipients != nil { + cfg.Report.Recipients = req.Report.Recipients + } + cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled + cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule) + cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled + cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule) + cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount + cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled + cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule) + cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold + } + + if err := validateOpsEmailNotificationConfig(cfg); err != nil { + return nil, err + } + + normalizeOpsEmailNotificationConfig(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil { + return nil, err + } + return cfg, nil +} + +func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig { + return &OpsEmailNotificationConfig{ + Alert: OpsEmailAlertConfig{ + Enabled: true, + Recipients: []string{}, + MinSeverity: "", + RateLimitPerHour: 0, + BatchingWindowSeconds: 0, + IncludeResolvedAlerts: false, + }, + Report: OpsEmailReportConfig{ + Enabled: false, + Recipients: []string{}, + DailySummaryEnabled: false, + DailySummarySchedule: "0 9 * * *", + WeeklySummaryEnabled: false, + WeeklySummarySchedule: "0 9 * * 1", + ErrorDigestEnabled: false, + ErrorDigestSchedule: "0 9 * * *", + ErrorDigestMinCount: 10, + AccountHealthEnabled: false, + AccountHealthSchedule: "0 9 * * *", + AccountHealthErrorRateThreshold: 10.0, + }, + } +} + +func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) { + if cfg == nil { + return + } + if cfg.Alert.Recipients == nil { + cfg.Alert.Recipients = []string{} + } + if cfg.Report.Recipients == nil { + cfg.Report.Recipients = []string{} + } + + cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity) + cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule) + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestSchedule = 
strings.TrimSpace(cfg.Report.ErrorDigestSchedule) + cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule) + + // Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings. + if cfg.Report.DailySummarySchedule == "" { + cfg.Report.DailySummarySchedule = "0 9 * * *" + } + if cfg.Report.WeeklySummarySchedule == "" { + cfg.Report.WeeklySummarySchedule = "0 9 * * 1" + } + if cfg.Report.ErrorDigestSchedule == "" { + cfg.Report.ErrorDigestSchedule = "0 9 * * *" + } + if cfg.Report.AccountHealthSchedule == "" { + cfg.Report.AccountHealthSchedule = "0 9 * * *" + } +} + +func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error { + if cfg == nil { + return errors.New("invalid config") + } + + if cfg.Alert.RateLimitPerHour < 0 { + return errors.New("alert.rate_limit_per_hour must be >= 0") + } + if cfg.Alert.BatchingWindowSeconds < 0 { + return errors.New("alert.batching_window_seconds must be >= 0") + } + switch strings.TrimSpace(cfg.Alert.MinSeverity) { + case "", "critical", "warning", "info": + default: + return errors.New("alert.min_severity must be one of: critical, warning, info, or empty") + } + + if cfg.Report.ErrorDigestMinCount < 0 { + return errors.New("report.error_digest_min_count must be >= 0") + } + if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 { + return errors.New("report.account_health_error_rate_threshold must be between 0 and 100") + } + return nil +} + +// ========================= +// Alert runtime settings +// ========================= + +func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings { + return &OpsAlertRuntimeSettings{ + EvaluationIntervalSeconds: 60, + DistributedLock: OpsDistributedLockSettings{ + Enabled: true, + Key: opsAlertEvaluatorLeaderLockKeyDefault, + TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()), + }, + Silencing: OpsAlertSilencingSettings{ + Enabled: false, + GlobalUntilRFC3339: "", + GlobalReason: "", + Entries: []OpsAlertSilenceEntry{}, + }, + } +} + +func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) { + if s == nil { + return + } + s.Key = strings.TrimSpace(s.Key) + if s.Key == "" { + s.Key = defaultKey + } + if s.TTLSeconds <= 0 { + s.TTLSeconds = defaultTTLSeconds + } +} + +func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) { + if s == nil { + return + } + s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339) + s.GlobalReason = strings.TrimSpace(s.GlobalReason) + if s.Entries == nil { + s.Entries = []OpsAlertSilenceEntry{} + } + for i := range s.Entries { + s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339) + s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason) + } +} + +func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error { + if strings.TrimSpace(s.Key) == "" { + return errors.New("distributed_lock.key is required") + } + if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) { + return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400") + } + return nil +} + +func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error { + parse := func(raw string) error { + if strings.TrimSpace(raw) == "" { + return nil + } + if _, err := time.Parse(time.RFC3339, raw); err != nil { + return errors.New("silencing time must be RFC3339") + } + return nil + } + + if err := parse(s.GlobalUntilRFC3339); err != 
nil { + return err + } + for _, entry := range s.Entries { + if strings.TrimSpace(entry.UntilRFC3339) == "" { + return errors.New("silencing.entries.until_rfc3339 is required") + } + if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil { + return errors.New("silencing.entries.until_rfc3339 must be RFC3339") + } + } + return nil +} + +func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) { + defaultCfg := defaultOpsAlertRuntimeSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAlertRuntimeSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + if cfg.EvaluationIntervalSeconds <= 0 { + cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds + } + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + return cfg, nil +} + +func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) { + return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400") + } + if cfg.DistributedLock.Enabled { + if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil { + return nil, err + } + } + if cfg.Silencing.Enabled { + if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil { + return nil, err + } + } + + defaultCfg := defaultOpsAlertRuntimeSettings() + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil { + return nil, err + } + + // Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated). 
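+ // Round-tripping through the just-persisted JSON also guarantees the
+ // returned value matches exactly what was stored.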
+ updated := &OpsAlertRuntimeSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} + diff --git a/backend/internal/service/ops_trends.go b/backend/internal/service/ops_trends.go new file mode 100644 index 00000000..9237544c --- /dev/null +++ b/backend/internal/service/ops_trends.go @@ -0,0 +1,27 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds) +} + diff --git a/backend/internal/service/ops_window_stats.go b/backend/internal/service/ops_window_stats.go new file mode 100644 index 00000000..71021d15 --- /dev/null +++ b/backend/internal/service/ops_window_stats.go @@ -0,0 +1,24 @@ +package service + +import ( + "context" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +// GetWindowStats returns lightweight request/token counts for the provided window. +// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks. +func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + filter := &OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + } + return s.opsRepo.GetWindowStats(ctx, filter) +} diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go index d4b984d6..bf78601f 100644 --- a/backend/internal/service/wire.go +++ b/backend/internal/service/wire.go @@ -1,10 +1,12 @@ package service import ( + "database/sql" "time" "github.com/Wei-Shaw/sub2api/internal/config" "github.com/google/wire" + "github.com/redis/go-redis/v9" ) // BuildInfo contains build information @@ -70,6 +72,57 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi return svc } +// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector. +func ProvideOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg) + collector.Start() + return collector +} + +// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation). 
+func ProvideOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService. +func ProvideOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled). +func ProvideOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg) + svc.Start() + return svc +} + // ProviderSet is the Wire provider set for all services var ProviderSet = wire.NewSet( // Core services @@ -101,6 +154,11 @@ var ProviderSet = wire.NewSet( NewAccountUsageService, NewAccountTestService, NewSettingService, + NewOpsService, + ProvideOpsMetricsCollector, + ProvideOpsAggregationService, + ProvideOpsAlertEvaluatorService, + ProvideOpsCleanupService, NewEmailService, ProvideEmailQueueService, NewTurnstileService,