feat(service): implement the ops monitoring business-logic layer

- Add the main ops service (ops_service.go) and port definitions (ops_port.go)
- Implement the account availability check service (ops_account_availability.go)
- Implement the data aggregation service (ops_aggregation_service.go)
- Implement the alert evaluator service (ops_alert_evaluator_service.go)
- Implement the alert management service (ops_alerts.go)
- Implement the data cleanup service (ops_cleanup_service.go)
- Implement the concurrency control service (ops_concurrency.go)
- Implement the dashboard service (ops_dashboard.go)
- Implement the error handling service (ops_errors.go)
- Implement the histogram service (ops_histograms.go)
- Implement the metrics collector service (ops_metrics_collector.go)
- Implement the query mode service (ops_query_mode.go)
- Implement the real-time monitoring service (ops_realtime.go)
- Implement the request details service (ops_request_details.go)
- Implement the retry mechanism service (ops_retry.go)
- Implement the configuration management service (ops_settings.go)
- Implement the trend analysis service (ops_trends.go)
- Implement the window statistics service (ops_window_stats.go)
- Add ops-related domain constants
- Register service dependency injection
IanShaw027
2026-01-09 20:53:44 +08:00
parent bb5303272b
commit 5baa8b5673
21 changed files with 5244 additions and 0 deletions

View File

@@ -105,6 +105,28 @@ const (
// Request identity patch (Claude -> Gemini systemInstruction injection)
SettingKeyEnableIdentityPatch = "enable_identity_patch"
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
// =========================
// Ops Monitoring (vNext)
// =========================
// SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"
// SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"
// SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
SettingKeyOpsQueryModeDefault = "ops_query_mode_default"
// SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"
// SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"
// SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
)
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
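The DB-backed switches above are read through SettingRepository at runtime. As a minimal sketch (placed wherever strings is in scope; not part of this commit), this is how the ops_monitoring_enabled value is interpreted, mirroring the parsing used by OpsAggregationService.isMonitoringEnabled further down, where anything other than an explicit "off" value keeps monitoring enabled:

// Sketch only: interpreting the ops_monitoring_enabled soft switch.
func opsMonitoringEnabledValue(raw string) bool {
    switch strings.ToLower(strings.TrimSpace(raw)) {
    case "false", "0", "off", "disabled":
        return false
    default:
        // Missing or unrecognized values fail open (monitoring stays enabled).
        return true
    }
}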

View File

@@ -0,0 +1,157 @@
package service
import (
"context"
"time"
)
// GetAccountAvailabilityStats returns current account availability stats.
//
// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
map[string]*PlatformAvailability,
map[int64]*GroupAvailability,
map[int64]*AccountAvailability,
*time.Time,
error,
) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, nil, nil, nil, err
}
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
if err != nil {
return nil, nil, nil, nil, err
}
if groupIDFilter != nil && *groupIDFilter > 0 {
filtered := make([]Account, 0, len(accounts))
for _, acc := range accounts {
for _, grp := range acc.Groups {
if grp != nil && grp.ID == *groupIDFilter {
filtered = append(filtered, acc)
break
}
}
}
accounts = filtered
}
now := time.Now()
collectedAt := now
platform := make(map[string]*PlatformAvailability)
group := make(map[int64]*GroupAvailability)
account := make(map[int64]*AccountAvailability)
for _, acc := range accounts {
if acc.ID <= 0 {
continue
}
isTempUnsched := false
if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
isTempUnsched = true
}
isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
hasError := acc.Status == StatusError
// Normalize exclusive status flags so the UI doesn't show conflicting badges.
if hasError {
isRateLimited = false
isOverloaded = false
}
isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched
if acc.Platform != "" {
if _, ok := platform[acc.Platform]; !ok {
platform[acc.Platform] = &PlatformAvailability{
Platform: acc.Platform,
}
}
p := platform[acc.Platform]
p.TotalAccounts++
if isAvailable {
p.AvailableCount++
}
if isRateLimited {
p.RateLimitCount++
}
if hasError {
p.ErrorCount++
}
}
for _, grp := range acc.Groups {
if grp == nil || grp.ID <= 0 {
continue
}
if _, ok := group[grp.ID]; !ok {
group[grp.ID] = &GroupAvailability{
GroupID: grp.ID,
GroupName: grp.Name,
Platform: grp.Platform,
}
}
g := group[grp.ID]
g.TotalAccounts++
if isAvailable {
g.AvailableCount++
}
if isRateLimited {
g.RateLimitCount++
}
if hasError {
g.ErrorCount++
}
}
displayGroupID := int64(0)
displayGroupName := ""
if len(acc.Groups) > 0 && acc.Groups[0] != nil {
displayGroupID = acc.Groups[0].ID
displayGroupName = acc.Groups[0].Name
}
item := &AccountAvailability{
AccountID: acc.ID,
AccountName: acc.Name,
Platform: acc.Platform,
GroupID: displayGroupID,
GroupName: displayGroupName,
Status: acc.Status,
IsAvailable: isAvailable,
IsRateLimited: isRateLimited,
IsOverloaded: isOverloaded,
HasError: hasError,
ErrorMessage: acc.ErrorMessage,
}
if isRateLimited && acc.RateLimitResetAt != nil {
item.RateLimitResetAt = acc.RateLimitResetAt
remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
if remainingSec > 0 {
item.RateLimitRemainingSec = &remainingSec
}
}
if isOverloaded && acc.OverloadUntil != nil {
item.OverloadUntil = acc.OverloadUntil
remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
if remainingSec > 0 {
item.OverloadRemainingSec = &remainingSec
}
}
if isTempUnsched && acc.TempUnschedulableUntil != nil {
item.TempUnschedulableUntil = acc.TempUnschedulableUntil
}
account[acc.ID] = item
}
return platform, group, account, &collectedAt, nil
}
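A hedged usage sketch of the method above, e.g. from an admin-facing caller. The PlatformAvailability fields (TotalAccounts, AvailableCount, RateLimitCount) come from this file; the platform name, helper name, and logging are illustrative only, and context/log/time are assumed to be imported as in the neighboring files.

// Sketch only: summarizing per-platform availability from GetAccountAvailabilityStats.
func logPlatformAvailability(ctx context.Context, svc *OpsService) error {
    platform, _, _, collectedAt, err := svc.GetAccountAvailabilityStats(ctx, "claude", nil)
    if err != nil {
        return err
    }
    for name, p := range platform {
        log.Printf("platform=%s available=%d/%d rate_limited=%d (collected at %s)",
            name, p.AvailableCount, p.TotalAccounts, p.RateLimitCount,
            collectedAt.Format(time.RFC3339))
    }
    return nil
}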

View File

@@ -0,0 +1,434 @@
package service
import (
"context"
"database/sql"
"errors"
"log"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
)
const (
opsAggHourlyJobName = "ops_preaggregation_hourly"
opsAggDailyJobName = "ops_preaggregation_daily"
opsAggHourlyInterval = 10 * time.Minute
opsAggDailyInterval = 1 * time.Hour
// Keep in sync with ops retention target (vNext default 30d).
opsAggBackfillWindow = 30 * 24 * time.Hour
// Recompute overlap to absorb late-arriving rows near boundaries.
opsAggHourlyOverlap = 2 * time.Hour
opsAggDailyOverlap = 48 * time.Hour
opsAggHourlyChunk = 24 * time.Hour
opsAggDailyChunk = 7 * 24 * time.Hour
// Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
// that may still receive late inserts.
opsAggSafeDelay = 5 * time.Minute
opsAggMaxQueryTimeout = 3 * time.Second
opsAggHourlyTimeout = 5 * time.Minute
opsAggDailyTimeout = 2 * time.Minute
opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
opsAggHourlyLeaderLockTTL = 15 * time.Minute
opsAggDailyLeaderLockTTL = 10 * time.Minute
)
// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
// for stable long-window dashboard queries.
//
// It is safe to run in multi-replica deployments when Redis is available (leader lock).
type OpsAggregationService struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
db *sql.DB
redisClient *redis.Client
instanceID string
stopCh chan struct{}
startOnce sync.Once
stopOnce sync.Once
hourlyMu sync.Mutex
dailyMu sync.Mutex
skipLogMu sync.Mutex
skipLogAt time.Time
}
func NewOpsAggregationService(
opsRepo OpsRepository,
settingRepo SettingRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsAggregationService {
return &OpsAggregationService{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
db: db,
redisClient: redisClient,
instanceID: uuid.NewString(),
}
}
func (s *OpsAggregationService) Start() {
if s == nil {
return
}
s.startOnce.Do(func() {
if s.stopCh == nil {
s.stopCh = make(chan struct{})
}
go s.hourlyLoop()
go s.dailyLoop()
})
}
func (s *OpsAggregationService) Stop() {
if s == nil {
return
}
s.stopOnce.Do(func() {
if s.stopCh != nil {
close(s.stopCh)
}
})
}
func (s *OpsAggregationService) hourlyLoop() {
// First run immediately.
s.aggregateHourly()
ticker := time.NewTicker(opsAggHourlyInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
s.aggregateHourly()
case <-s.stopCh:
return
}
}
}
func (s *OpsAggregationService) dailyLoop() {
// First run immediately.
s.aggregateDaily()
ticker := time.NewTicker(opsAggDailyInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
s.aggregateDaily()
case <-s.stopCh:
return
}
}
}
func (s *OpsAggregationService) aggregateHourly() {
if s == nil || s.opsRepo == nil {
return
}
if s.cfg != nil {
if !s.cfg.Ops.Enabled {
return
}
if !s.cfg.Ops.Aggregation.Enabled {
return
}
}
ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
defer cancel()
if !s.isMonitoringEnabled(ctx) {
return
}
release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
if !ok {
return
}
if release != nil {
defer release()
}
s.hourlyMu.Lock()
defer s.hourlyMu.Unlock()
startedAt := time.Now().UTC()
runAt := startedAt
// Aggregate stable full hours only.
end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
start := end.Add(-opsAggBackfillWindow)
// Resume from the latest bucket with overlap.
{
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
cancelMax()
if err != nil {
log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
} else if ok {
candidate := latest.Add(-opsAggHourlyOverlap)
if candidate.After(start) {
start = candidate
}
}
}
start = utcFloorToHour(start)
if !start.Before(end) {
return
}
var aggErr error
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
aggErr = err
log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
break
}
}
finishedAt := time.Now().UTC()
durationMs := finishedAt.Sub(startedAt).Milliseconds()
dur := durationMs
if aggErr != nil {
msg := truncateString(aggErr.Error(), 2048)
errAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggHourlyJobName,
LastRunAt: &runAt,
LastErrorAt: &errAt,
LastError: &msg,
LastDurationMs: &dur,
})
return
}
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggHourlyJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
})
}
func (s *OpsAggregationService) aggregateDaily() {
if s == nil || s.opsRepo == nil {
return
}
if s.cfg != nil {
if !s.cfg.Ops.Enabled {
return
}
if !s.cfg.Ops.Aggregation.Enabled {
return
}
}
ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
defer cancel()
if !s.isMonitoringEnabled(ctx) {
return
}
release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
if !ok {
return
}
if release != nil {
defer release()
}
s.dailyMu.Lock()
defer s.dailyMu.Unlock()
startedAt := time.Now().UTC()
runAt := startedAt
end := utcFloorToDay(time.Now().UTC())
start := end.Add(-opsAggBackfillWindow)
{
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
cancelMax()
if err != nil {
log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
} else if ok {
candidate := latest.Add(-opsAggDailyOverlap)
if candidate.After(start) {
start = candidate
}
}
}
start = utcFloorToDay(start)
if !start.Before(end) {
return
}
var aggErr error
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
aggErr = err
log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
break
}
}
finishedAt := time.Now().UTC()
durationMs := finishedAt.Sub(startedAt).Milliseconds()
dur := durationMs
if aggErr != nil {
msg := truncateString(aggErr.Error(), 2048)
errAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggDailyJobName,
LastRunAt: &runAt,
LastErrorAt: &errAt,
LastError: &msg,
LastDurationMs: &dur,
})
return
}
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggDailyJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
})
}
func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
if s == nil {
return false
}
if s.cfg != nil && !s.cfg.Ops.Enabled {
return false
}
if s.settingRepo == nil {
return true
}
if ctx == nil {
ctx = context.Background()
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// The switch has never been written: default to enabled.
return true
}
// Any other lookup error fails open so a transient settings issue
// does not silently stop aggregation.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
var opsAggReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
if s == nil || s.redisClient == nil {
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
// Fail-open: do not block single-instance deployments.
return nil, true
}
if !ok {
s.maybeLogSkip(logPrefix)
return nil, false
}
release := func() {
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
}
return release, true
}
func (s *OpsAggregationService) maybeLogSkip(prefix string) {
s.skipLogMu.Lock()
defer s.skipLogMu.Unlock()
now := time.Now()
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
return
}
s.skipLogAt = now
if prefix == "" {
prefix = "[OpsAggregation]"
}
log.Printf("%s leader lock held by another instance; skipping", prefix)
}
func utcFloorToHour(t time.Time) time.Time {
return t.UTC().Truncate(time.Hour)
}
func utcFloorToDay(t time.Time) time.Time {
u := t.UTC()
y, m, d := u.Date()
return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
}
func minTime(a, b time.Time) time.Time {
if a.Before(b) {
return a
}
return b
}
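To make the window constants above concrete, here is the hourly window selection worked through for one case. This is a sketch assuming the opsAgg* constants and helpers in this file are in scope; the timestamps are invented for illustration.

// Sketch only: the window math from aggregateHourly for now = 2026-01-09T10:03Z
// with the latest aggregated bucket at 08:00Z.
func exampleHourlyWindow() (start, end time.Time) {
    now := time.Date(2026, 1, 9, 10, 3, 0, 0, time.UTC)
    latest := time.Date(2026, 1, 9, 8, 0, 0, 0, time.UTC)

    end = utcFloorToHour(now.Add(-opsAggSafeDelay)) // 09:00: skip the hour that may still receive rows
    start = end.Add(-opsAggBackfillWindow)          // default: 30 days back
    if candidate := latest.Add(-opsAggHourlyOverlap); candidate.After(start) {
        start = candidate // resume with a 2h overlap -> 06:00
    }
    return utcFloorToHour(start), end // [06:00, 09:00), covered by a single 24h chunk
}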

View File

@@ -0,0 +1,839 @@
package service
import (
"context"
"fmt"
"log"
"math"
"strconv"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
)
const (
opsAlertEvaluatorJobName = "ops_alert_evaluator"
opsAlertEvaluatorTimeout = 45 * time.Second
opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
)
var opsAlertEvaluatorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
type OpsAlertEvaluatorService struct {
opsService *OpsService
opsRepo OpsRepository
emailService *EmailService
redisClient *redis.Client
cfg *config.Config
instanceID string
stopCh chan struct{}
startOnce sync.Once
stopOnce sync.Once
wg sync.WaitGroup
mu sync.Mutex
ruleStates map[int64]*opsAlertRuleState
emailLimiter *slidingWindowLimiter
skipLogMu sync.Mutex
skipLogAt time.Time
warnNoRedisOnce sync.Once
}
type opsAlertRuleState struct {
LastEvaluatedAt time.Time
ConsecutiveBreaches int
}
func NewOpsAlertEvaluatorService(
opsService *OpsService,
opsRepo OpsRepository,
emailService *EmailService,
redisClient *redis.Client,
cfg *config.Config,
) *OpsAlertEvaluatorService {
return &OpsAlertEvaluatorService{
opsService: opsService,
opsRepo: opsRepo,
emailService: emailService,
redisClient: redisClient,
cfg: cfg,
instanceID: uuid.NewString(),
ruleStates: map[int64]*opsAlertRuleState{},
emailLimiter: newSlidingWindowLimiter(0, time.Hour),
}
}
func (s *OpsAlertEvaluatorService) Start() {
if s == nil {
return
}
s.startOnce.Do(func() {
if s.stopCh == nil {
s.stopCh = make(chan struct{})
}
go s.run()
})
}
func (s *OpsAlertEvaluatorService) Stop() {
if s == nil {
return
}
s.stopOnce.Do(func() {
if s.stopCh != nil {
close(s.stopCh)
}
})
s.wg.Wait()
}
func (s *OpsAlertEvaluatorService) run() {
s.wg.Add(1)
defer s.wg.Done()
// Start immediately to produce early feedback in the ops dashboard.
timer := time.NewTimer(0)
defer timer.Stop()
for {
select {
case <-timer.C:
interval := s.getInterval()
s.evaluateOnce(interval)
timer.Reset(interval)
case <-s.stopCh:
return
}
}
}
func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
// Default.
interval := 60 * time.Second
if s == nil || s.opsService == nil {
return interval
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
if err != nil || cfg == nil {
return interval
}
if cfg.EvaluationIntervalSeconds <= 0 {
return interval
}
if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) {
return interval
}
return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second
}
func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
if s == nil || s.opsRepo == nil {
return
}
if s.cfg != nil && !s.cfg.Ops.Enabled {
return
}
ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
defer cancel()
if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
return
}
runtimeCfg := defaultOpsAlertRuntimeSettings()
if s.opsService != nil {
if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
runtimeCfg = loaded
}
}
release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
if !ok {
return
}
if release != nil {
defer release()
}
startedAt := time.Now().UTC()
runAt := startedAt
rules, err := s.opsRepo.ListAlertRules(ctx)
if err != nil {
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
return
}
now := time.Now().UTC()
safeEnd := now.Truncate(time.Minute)
if safeEnd.IsZero() {
safeEnd = now
}
systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)
// Clean up stale state for removed rules.
s.pruneRuleStates(rules)
for _, rule := range rules {
if rule == nil || !rule.Enabled || rule.ID <= 0 {
continue
}
scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
windowMinutes := rule.WindowMinutes
if windowMinutes <= 0 {
windowMinutes = 1
}
windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
windowEnd := safeEnd
metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
if !ok {
s.resetRuleState(rule.ID, now)
continue
}
breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)
activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
if err != nil {
log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
continue
}
if breachedNow && consecutive >= required {
if activeEvent != nil {
continue
}
latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
if err != nil {
log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
continue
}
if latestEvent != nil && rule.CooldownMinutes > 0 {
cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
if now.Sub(latestEvent.FiredAt) < cooldown {
continue
}
}
firedEvent := &OpsAlertEvent{
RuleID: rule.ID,
Severity: strings.TrimSpace(rule.Severity),
Status: OpsAlertStatusFiring,
Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
MetricValue: float64Ptr(metricValue),
ThresholdValue: float64Ptr(rule.Threshold),
Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID),
FiredAt: now,
CreatedAt: now,
}
created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
if err != nil {
log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
continue
}
if created != nil && created.ID > 0 {
s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
}
continue
}
// Not breached: resolve active event if present.
if activeEvent != nil {
resolvedAt := now
if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
}
}
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
s.mu.Lock()
defer s.mu.Unlock()
live := map[int64]struct{}{}
for _, r := range rules {
if r != nil && r.ID > 0 {
live[r.ID] = struct{}{}
}
}
for id := range s.ruleStates {
if _, ok := live[id]; !ok {
delete(s.ruleStates, id)
}
}
}
func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
if ruleID <= 0 {
return
}
s.mu.Lock()
defer s.mu.Unlock()
state, ok := s.ruleStates[ruleID]
if !ok {
state = &opsAlertRuleState{}
s.ruleStates[ruleID] = state
}
state.LastEvaluatedAt = now
state.ConsecutiveBreaches = 0
}
func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
if ruleID <= 0 {
return 0
}
s.mu.Lock()
defer s.mu.Unlock()
state, ok := s.ruleStates[ruleID]
if !ok {
state = &opsAlertRuleState{}
s.ruleStates[ruleID] = state
}
if !state.LastEvaluatedAt.IsZero() && interval > 0 {
if now.Sub(state.LastEvaluatedAt) > interval*2 {
state.ConsecutiveBreaches = 0
}
}
state.LastEvaluatedAt = now
if breached {
state.ConsecutiveBreaches++
} else {
state.ConsecutiveBreaches = 0
}
return state.ConsecutiveBreaches
}
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
if sustainedMinutes <= 0 {
return 1
}
if interval <= 0 {
return sustainedMinutes
}
required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
if required < 1 {
return 1
}
return required
}
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
if filters == nil {
return "", nil
}
if v, ok := filters["platform"]; ok {
if s, ok := v.(string); ok {
platform = strings.TrimSpace(s)
}
}
if v, ok := filters["group_id"]; ok {
switch t := v.(type) {
case float64:
if t > 0 {
id := int64(t)
groupID = &id
}
case int64:
if t > 0 {
id := t
groupID = &id
}
case int:
if t > 0 {
id := int64(t)
groupID = &id
}
case string:
n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64)
if err == nil && n > 0 {
groupID = &n
}
}
}
return platform, groupID
}
func (s *OpsAlertEvaluatorService) computeRuleMetric(
ctx context.Context,
rule *OpsAlertRule,
systemMetrics *OpsSystemMetricsSnapshot,
start time.Time,
end time.Time,
platform string,
groupID *int64,
) (float64, bool) {
if rule == nil {
return 0, false
}
switch strings.TrimSpace(rule.MetricType) {
case "cpu_usage_percent":
if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
return *systemMetrics.CPUUsagePercent, true
}
return 0, false
case "memory_usage_percent":
if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
return *systemMetrics.MemoryUsagePercent, true
}
return 0, false
case "concurrency_queue_depth":
if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
return float64(*systemMetrics.ConcurrencyQueueDepth), true
}
return 0, false
}
overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
StartTime: start,
EndTime: end,
Platform: platform,
GroupID: groupID,
QueryMode: OpsQueryModeRaw,
})
if err != nil {
return 0, false
}
if overview == nil {
return 0, false
}
switch strings.TrimSpace(rule.MetricType) {
case "success_rate":
if overview.RequestCountSLA <= 0 {
return 0, false
}
return overview.SLA * 100, true
case "error_rate":
if overview.RequestCountSLA <= 0 {
return 0, false
}
return overview.ErrorRate * 100, true
case "upstream_error_rate":
if overview.RequestCountSLA <= 0 {
return 0, false
}
return overview.UpstreamErrorRate * 100, true
case "p95_latency_ms":
if overview.Duration.P95 == nil {
return 0, false
}
return float64(*overview.Duration.P95), true
case "p99_latency_ms":
if overview.Duration.P99 == nil {
return 0, false
}
return float64(*overview.Duration.P99), true
default:
return 0, false
}
}
func compareMetric(value float64, operator string, threshold float64) bool {
switch strings.TrimSpace(operator) {
case ">":
return value > threshold
case ">=":
return value >= threshold
case "<":
return value < threshold
case "<=":
return value <= threshold
case "==":
return value == threshold
case "!=":
return value != threshold
default:
return false
}
}
func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
dims := map[string]any{}
if strings.TrimSpace(platform) != "" {
dims["platform"] = strings.TrimSpace(platform)
}
if groupID != nil && *groupID > 0 {
dims["group_id"] = *groupID
}
if len(dims) == 0 {
return nil
}
return dims
}
func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
if rule == nil {
return ""
}
scope := "overall"
if strings.TrimSpace(platform) != "" {
scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform))
}
if groupID != nil && *groupID > 0 {
scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
}
if windowMinutes <= 0 {
windowMinutes = 1
}
return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
strings.TrimSpace(rule.MetricType),
strings.TrimSpace(rule.Operator),
rule.Threshold,
value,
windowMinutes,
strings.TrimSpace(scope),
)
}
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
return
}
if event.EmailSent {
return
}
if !rule.NotifyEmail {
return
}
emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
return
}
if len(emailCfg.Alert.Recipients) == 0 {
return
}
if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
return
}
if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
return
}
}
// Apply/update rate limiter.
s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)
subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
body := buildOpsAlertEmailBody(rule, event)
anySent := false
for _, to := range emailCfg.Alert.Recipients {
addr := strings.TrimSpace(to)
if addr == "" {
continue
}
if !s.emailLimiter.Allow(time.Now().UTC()) {
continue
}
if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
// Ignore per-recipient failures; continue best-effort.
continue
}
anySent = true
}
if anySent {
_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
}
}
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
if rule == nil || event == nil {
return ""
}
metric := strings.TrimSpace(rule.MetricType)
value := "-"
threshold := fmt.Sprintf("%.2f", rule.Threshold)
if event.MetricValue != nil {
value = fmt.Sprintf("%.2f", *event.MetricValue)
}
if event.ThresholdValue != nil {
threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
}
return fmt.Sprintf(`
<h2>Ops Alert</h2>
<p><b>Rule</b>: %s</p>
<p><b>Severity</b>: %s</p>
<p><b>Status</b>: %s</p>
<p><b>Metric</b>: %s %s %s</p>
<p><b>Fired at</b>: %s</p>
<p><b>Description</b>: %s</p>
`,
htmlEscape(rule.Name),
htmlEscape(rule.Severity),
htmlEscape(event.Status),
htmlEscape(metric),
htmlEscape(rule.Operator),
htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
event.FiredAt.Format(time.RFC3339),
htmlEscape(event.Description),
)
}
func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
minSeverity = strings.ToLower(strings.TrimSpace(minSeverity))
if minSeverity == "" {
return true
}
eventLevel := opsEmailSeverityForOps(ruleSeverity)
minLevel := minSeverity // already trimmed and lowercased above
rank := func(level string) int {
switch level {
case "critical":
return 3
case "warning":
return 2
case "info":
return 1
default:
return 0
}
}
return rank(eventLevel) >= rank(minLevel)
}
func opsEmailSeverityForOps(severity string) string {
switch strings.ToUpper(strings.TrimSpace(severity)) {
case "P0":
return "critical"
case "P1":
return "warning"
default:
return "info"
}
}
func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
if !silencing.Enabled {
return false
}
if now.IsZero() {
now = time.Now().UTC()
}
if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
if now.Before(t) {
return true
}
}
}
for _, entry := range silencing.Entries {
untilRaw := strings.TrimSpace(entry.UntilRFC3339)
if untilRaw == "" {
continue
}
until, err := time.Parse(time.RFC3339, untilRaw)
if err != nil {
continue
}
if now.After(until) {
continue
}
if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
continue
}
if len(entry.Severities) > 0 {
match := false
for _, s := range entry.Severities {
if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
match = true
break
}
}
if !match {
continue
}
}
return true
}
return false
}
func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
if !lock.Enabled {
return nil, true
}
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
})
return nil, true
}
key := strings.TrimSpace(lock.Key)
if key == "" {
key = opsAlertEvaluatorLeaderLockKey
}
ttl := time.Duration(lock.TTLSeconds) * time.Second
if ttl <= 0 {
ttl = opsAlertEvaluatorLeaderLockTTL
}
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
// Fail-open for single-node environments, but warn.
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
})
return nil, true
}
if !ok {
s.maybeLogSkip(key)
return nil, false
}
return func() {
_, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
s.skipLogMu.Lock()
defer s.skipLogMu.Unlock()
now := time.Now()
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
return
}
s.skipLogAt = now
log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
}
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
if s == nil || s.opsRepo == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsAlertEvaluatorJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
})
}
func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
if s == nil || s.opsRepo == nil || err == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
msg := truncateString(err.Error(), 2048)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsAlertEvaluatorJobName,
LastRunAt: &runAt,
LastErrorAt: &now,
LastError: &msg,
LastDurationMs: &durMs,
})
}
func htmlEscape(s string) string {
replacer := strings.NewReplacer(
"&", "&amp;",
"<", "&lt;",
">", "&gt;",
`"`, "&quot;",
"'", "&#39;",
)
return replacer.Replace(s)
}
type slidingWindowLimiter struct {
mu sync.Mutex
limit int
window time.Duration
sent []time.Time
}
func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
if window <= 0 {
window = time.Hour
}
return &slidingWindowLimiter{
limit: limit,
window: window,
sent: []time.Time{},
}
}
func (l *slidingWindowLimiter) SetLimit(limit int) {
l.mu.Lock()
defer l.mu.Unlock()
l.limit = limit
}
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
l.mu.Lock()
defer l.mu.Unlock()
if l.limit <= 0 {
return true
}
cutoff := now.Add(-l.window)
keep := l.sent[:0]
for _, t := range l.sent {
if t.After(cutoff) {
keep = append(keep, t)
}
}
l.sent = keep
if len(l.sent) >= l.limit {
return false
}
l.sent = append(l.sent, now)
return true
}
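The sustained-breach logic above is easiest to see with numbers. requiredSustainedBreaches converts a rule's SustainedMinutes into a count of consecutive breaching evaluations at the current interval; the values below are worked examples of the existing behavior, not new behavior.

// Worked examples for requiredSustainedBreaches:
//   requiredSustainedBreaches(5, 60*time.Second)  == 5  // ceil(300s / 60s)
//   requiredSustainedBreaches(5, 120*time.Second) == 3  // ceil(300s / 120s)
//   requiredSustainedBreaches(0, 60*time.Second)  == 1  // fire on the first breach
// With the default 60s evaluation interval, a rule with SustainedMinutes=5 therefore
// needs five consecutive breaching evaluations before an event is created;
// updateRuleBreaches resets the streak whenever evaluations are missed for more
// than two intervals or the metric stops breaching.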

View File

@@ -0,0 +1,162 @@
package service
import (
"context"
"database/sql"
"errors"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return []*OpsAlertRule{}, nil
}
return s.opsRepo.ListAlertRules(ctx)
}
func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if rule == nil {
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
}
created, err := s.opsRepo.CreateAlertRule(ctx, rule)
if err != nil {
return nil, err
}
return created, nil
}
func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if rule == nil || rule.ID <= 0 {
return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
}
updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
}
return nil, err
}
return updated, nil
}
func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return err
}
if s.opsRepo == nil {
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if id <= 0 {
return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
}
if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
if errors.Is(err, sql.ErrNoRows) {
return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
}
return err
}
return nil
}
func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return []*OpsAlertEvent{}, nil
}
return s.opsRepo.ListAlertEvents(ctx, filter)
}
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if ruleID <= 0 {
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
}
return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
}
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if ruleID <= 0 {
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
}
return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
}
func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if event == nil {
return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
}
created, err := s.opsRepo.CreateAlertEvent(ctx, event)
if err != nil {
return nil, err
}
return created, nil
}
func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return err
}
if s.opsRepo == nil {
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if eventID <= 0 {
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
}
if strings.TrimSpace(status) == "" {
return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
}
return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
}
func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return err
}
if s.opsRepo == nil {
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if eventID <= 0 {
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
}
return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
}
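A hedged sketch of creating a rule through this service. The OpsAlertRule fields shown are the ones the evaluator reads (Name, Severity, MetricType, Operator, Threshold, WindowMinutes, SustainedMinutes, CooldownMinutes, NotifyEmail, Enabled, Filters); the concrete values and the helper name are illustrative only.

// Sketch only: a P1 error-rate rule scoped to one platform.
func createExampleErrorRateRule(ctx context.Context, svc *OpsService) (*OpsAlertRule, error) {
    rule := &OpsAlertRule{
        Name:             "High error rate (claude)",
        Severity:         "P1",         // mapped to "warning" for email min-severity checks
        MetricType:       "error_rate", // evaluated as a percentage against Threshold
        Operator:         ">=",
        Threshold:        5,
        WindowMinutes:    5,
        SustainedMinutes: 5,
        CooldownMinutes:  30,
        NotifyEmail:      true,
        Enabled:          true,
        // parseOpsAlertRuleScope understands the "platform" and "group_id" keys.
        Filters: map[string]any{"platform": "claude"},
    }
    return svc.CreateAlertRule(ctx, rule)
}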

View File

@@ -0,0 +1,361 @@
package service
import (
"context"
"database/sql"
"fmt"
"log"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/robfig/cron/v3"
)
const (
opsCleanupJobName = "ops_cleanup"
opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
opsCleanupLeaderLockTTLDefault = 30 * time.Minute
)
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
var opsCleanupReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
//
// - Scheduling: 5-field cron spec (minute hour dom month dow).
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
// - Safety: deletes in batches to avoid long transactions.
type OpsCleanupService struct {
opsRepo OpsRepository
db *sql.DB
redisClient *redis.Client
cfg *config.Config
instanceID string
cron *cron.Cron
entryID cron.EntryID
startOnce sync.Once
stopOnce sync.Once
warnNoRedisOnce sync.Once
}
func NewOpsCleanupService(
opsRepo OpsRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsCleanupService {
return &OpsCleanupService{
opsRepo: opsRepo,
db: db,
redisClient: redisClient,
cfg: cfg,
instanceID: uuid.NewString(),
}
}
func (s *OpsCleanupService) Start() {
if s == nil {
return
}
if s.cfg != nil && !s.cfg.Ops.Enabled {
return
}
if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
log.Printf("[OpsCleanup] not started (disabled)")
return
}
if s.opsRepo == nil || s.db == nil {
log.Printf("[OpsCleanup] not started (missing deps)")
return
}
s.startOnce.Do(func() {
schedule := "0 2 * * *"
if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
}
loc := time.Local
if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
loc = parsed
}
}
c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
id, err := c.AddFunc(schedule, func() { s.runScheduled() })
if err != nil {
log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
return
}
s.cron = c
s.entryID = id
s.cron.Start()
log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
})
}
func (s *OpsCleanupService) Stop() {
if s == nil {
return
}
s.stopOnce.Do(func() {
if s.cron != nil {
ctx := s.cron.Stop()
select {
case <-ctx.Done():
case <-time.After(3 * time.Second):
log.Printf("[OpsCleanup] cron stop timed out")
}
}
})
}
func (s *OpsCleanupService) runScheduled() {
if s == nil || s.db == nil || s.opsRepo == nil {
return
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
defer cancel()
release, ok := s.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
startedAt := time.Now().UTC()
runAt := startedAt
counts, err := s.runCleanupOnce(ctx)
if err != nil {
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
log.Printf("[OpsCleanup] cleanup failed: %v", err)
return
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
log.Printf("[OpsCleanup] cleanup complete: %s", counts)
}
type opsCleanupDeletedCounts struct {
errorLogs int64
retryAttempts int64
alertEvents int64
systemMetrics int64
hourlyPreagg int64
dailyPreagg int64
}
func (c opsCleanupDeletedCounts) String() string {
return fmt.Sprintf(
"error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
c.errorLogs,
c.retryAttempts,
c.alertEvents,
c.systemMetrics,
c.hourlyPreagg,
c.dailyPreagg,
)
}
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
out := opsCleanupDeletedCounts{}
if s == nil || s.db == nil || s.cfg == nil {
return out, nil
}
batchSize := 5000
now := time.Now().UTC()
// Error-like tables: error logs / retry attempts / alert events.
if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
cutoff := now.AddDate(0, 0, -days)
n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
if err != nil {
return out, err
}
out.errorLogs = n
n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
if err != nil {
return out, err
}
out.retryAttempts = n
n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
if err != nil {
return out, err
}
out.alertEvents = n
}
// Minute-level metrics snapshots.
if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
cutoff := now.AddDate(0, 0, -days)
n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
if err != nil {
return out, err
}
out.systemMetrics = n
}
// Pre-aggregation tables (hourly/daily).
if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
cutoff := now.AddDate(0, 0, -days)
n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
if err != nil {
return out, err
}
out.hourlyPreagg = n
n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
if err != nil {
return out, err
}
out.dailyPreagg = n
}
return out, nil
}
func deleteOldRowsByID(
ctx context.Context,
db *sql.DB,
table string,
timeColumn string,
cutoff time.Time,
batchSize int,
castCutoffToDate bool,
) (int64, error) {
if db == nil {
return 0, nil
}
if batchSize <= 0 {
batchSize = 5000
}
where := fmt.Sprintf("%s < $1", timeColumn)
if castCutoffToDate {
where = fmt.Sprintf("%s < $1::date", timeColumn)
}
q := fmt.Sprintf(`
WITH batch AS (
SELECT id FROM %s
WHERE %s
ORDER BY id
LIMIT $2
)
DELETE FROM %s
WHERE id IN (SELECT id FROM batch)
`, table, where, table)
var total int64
for {
res, err := db.ExecContext(ctx, q, cutoff, batchSize)
if err != nil {
// If ops tables aren't present yet (partial deployments), treat as no-op.
if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
return total, nil
}
return total, err
}
affected, err := res.RowsAffected()
if err != nil {
return total, err
}
total += affected
if affected == 0 {
break
}
}
return total, nil
}
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if s == nil {
return nil, false
}
// In simple run mode, assume single instance.
if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
return nil, true
}
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
})
return nil, true
}
key := opsCleanupLeaderLockKeyDefault
ttl := opsCleanupLeaderLockTTLDefault
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err)
})
return nil, true
}
if !ok {
return nil, false
}
return func() {
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
if s == nil || s.opsRepo == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsCleanupJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
})
}
func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
if s == nil || s.opsRepo == nil || err == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
msg := truncateString(err.Error(), 2048)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsCleanupJobName,
LastRunAt: &runAt,
LastErrorAt: &now,
LastError: &msg,
LastDurationMs: &durMs,
})
}
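For reference, the batched delete built by deleteOldRowsByID renders to the statement below for the error-log table (cutoff bound as $1, batch size as $2); the loop re-runs it until RowsAffected drops to zero. Only the daily pre-aggregation table passes castCutoffToDate=true, which turns the predicate into bucket_date < $1::date.

// Rendered statement for deleteOldRowsByID(ctx, db, "ops_error_logs", "created_at", cutoff, 5000, false):
//
//   WITH batch AS (
//       SELECT id FROM ops_error_logs
//       WHERE created_at < $1
//       ORDER BY id
//       LIMIT $2
//   )
//   DELETE FROM ops_error_logs
//   WHERE id IN (SELECT id FROM batch)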

View File

@@ -0,0 +1,257 @@
package service
import (
"context"
"log"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
)
const (
opsAccountsPageSize = 100
opsConcurrencyBatchChunkSize = 200
)
func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
if s == nil || s.accountRepo == nil {
return []Account{}, nil
}
out := make([]Account, 0, 128)
page := 1
for {
accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
Page: page,
PageSize: opsAccountsPageSize,
}, platformFilter, "", "", "")
if err != nil {
return nil, err
}
if len(accounts) == 0 {
break
}
out = append(out, accounts...)
if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
break
}
if len(accounts) < opsAccountsPageSize {
break
}
page++
if page > 10_000 {
log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
break
}
}
return out, nil
}
func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
if s == nil || s.concurrencyService == nil {
return map[int64]*AccountLoadInfo{}
}
if len(accounts) == 0 {
return map[int64]*AccountLoadInfo{}
}
// De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
unique := make(map[int64]int, len(accounts))
for _, acc := range accounts {
if acc.ID <= 0 {
continue
}
if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev {
unique[acc.ID] = acc.Concurrency
}
}
batch := make([]AccountWithConcurrency, 0, len(unique))
for id, maxConc := range unique {
batch = append(batch, AccountWithConcurrency{
ID: id,
MaxConcurrency: maxConc,
})
}
out := make(map[int64]*AccountLoadInfo, len(batch))
for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize {
end := i + opsConcurrencyBatchChunkSize
if end > len(batch) {
end = len(batch)
}
part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end])
if err != nil {
// Best-effort: return zeros rather than failing the ops UI.
log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
continue
}
for k, v := range part {
out[k] = v
}
}
return out
}
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
//
// Optional filters:
// - platformFilter: only include accounts on that platform (best-effort; reduces DB load)
// - groupIDFilter: only include accounts that belong to that group
func (s *OpsService) GetConcurrencyStats(
ctx context.Context,
platformFilter string,
groupIDFilter *int64,
) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, nil, nil, nil, err
}
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
if err != nil {
return nil, nil, nil, nil, err
}
collectedAt := time.Now()
loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
platform := make(map[string]*PlatformConcurrencyInfo)
group := make(map[int64]*GroupConcurrencyInfo)
account := make(map[int64]*AccountConcurrencyInfo)
for _, acc := range accounts {
if acc.ID <= 0 {
continue
}
var matchedGroup *Group
if groupIDFilter != nil && *groupIDFilter > 0 {
for _, grp := range acc.Groups {
if grp == nil || grp.ID <= 0 {
continue
}
if grp.ID == *groupIDFilter {
matchedGroup = grp
break
}
}
// Group filter provided: skip accounts not in that group.
if matchedGroup == nil {
continue
}
}
load := loadMap[acc.ID]
currentInUse := int64(0)
waiting := int64(0)
if load != nil {
currentInUse = int64(load.CurrentConcurrency)
waiting = int64(load.WaitingCount)
}
// Account-level view picks one display group (the first group).
displayGroupID := int64(0)
displayGroupName := ""
if matchedGroup != nil {
displayGroupID = matchedGroup.ID
displayGroupName = matchedGroup.Name
} else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
displayGroupID = acc.Groups[0].ID
displayGroupName = acc.Groups[0].Name
}
if _, ok := account[acc.ID]; !ok {
info := &AccountConcurrencyInfo{
AccountID: acc.ID,
AccountName: acc.Name,
Platform: acc.Platform,
GroupID: displayGroupID,
GroupName: displayGroupName,
CurrentInUse: currentInUse,
MaxCapacity: int64(acc.Concurrency),
WaitingInQueue: waiting,
}
if info.MaxCapacity > 0 {
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
}
account[acc.ID] = info
}
// Platform aggregation.
if acc.Platform != "" {
if _, ok := platform[acc.Platform]; !ok {
platform[acc.Platform] = &PlatformConcurrencyInfo{
Platform: acc.Platform,
}
}
p := platform[acc.Platform]
p.MaxCapacity += int64(acc.Concurrency)
p.CurrentInUse += currentInUse
p.WaitingInQueue += waiting
}
// Group aggregation (one account may contribute to multiple groups).
if matchedGroup != nil {
grp := matchedGroup
if _, ok := group[grp.ID]; !ok {
group[grp.ID] = &GroupConcurrencyInfo{
GroupID: grp.ID,
GroupName: grp.Name,
Platform: grp.Platform,
}
}
g := group[grp.ID]
if g.GroupName == "" && grp.Name != "" {
g.GroupName = grp.Name
}
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
g.Platform = ""
}
g.MaxCapacity += int64(acc.Concurrency)
g.CurrentInUse += currentInUse
g.WaitingInQueue += waiting
} else {
for _, grp := range acc.Groups {
if grp == nil || grp.ID <= 0 {
continue
}
if _, ok := group[grp.ID]; !ok {
group[grp.ID] = &GroupConcurrencyInfo{
GroupID: grp.ID,
GroupName: grp.Name,
Platform: grp.Platform,
}
}
g := group[grp.ID]
if g.GroupName == "" && grp.Name != "" {
g.GroupName = grp.Name
}
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
g.Platform = ""
}
g.MaxCapacity += int64(acc.Concurrency)
g.CurrentInUse += currentInUse
g.WaitingInQueue += waiting
}
}
}
for _, info := range platform {
if info.MaxCapacity > 0 {
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
}
}
for _, info := range group {
if info.MaxCapacity > 0 {
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
}
}
return platform, group, account, &collectedAt, nil
}
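Two details above are worth spelling out: getAccountsLoadMapBestEffort de-duplicates by account ID and keeps the maximum concurrency, and load percentages are derived from the aggregated counters. A worked illustration with invented values:

// Worked example for the de-duplication and load math above:
//   pagination returns the same account twice: {ID: 7, Concurrency: 3} and {ID: 7, Concurrency: 5}
//   -> the batch sent to GetAccountsLoadBatch contains {ID: 7, MaxConcurrency: 5}
//   with CurrentInUse = 3 and MaxCapacity = 5, LoadPercentage = 3 / 5 * 100 = 60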

View File

@@ -0,0 +1,77 @@
package service
import (
"context"
"database/sql"
"errors"
"log"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
// Resolve query mode (requested via query param, or DB default).
filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
if err != nil {
if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
}
return nil, err
}
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
overview.SystemMetrics = metrics
} else if !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
}
if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
overview.JobHeartbeats = heartbeats
} else {
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
}
return overview, nil
}
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
if requested.IsValid() {
// Allow "auto" to be disabled via config until preagg is proven stable in production.
// Forced `preagg` via query param still works.
if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return requested
}
mode := OpsQueryModeAuto
if s != nil && s.settingRepo != nil {
if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
mode = ParseOpsQueryMode(raw)
}
}
if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return mode
}
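The query-mode resolution above follows a fixed precedence; a short summary of resolveOpsQueryMode for reference:

// Resolution order in resolveOpsQueryMode:
//   1. A valid mode on the request wins (raw / preagg / auto).
//   2. Otherwise the DB default ops_query_mode_default is parsed (falling back to auto).
//   3. "auto" is downgraded to raw while cfg.Ops.UsePreaggregatedTables is false;
//      an explicitly requested preagg mode is still honored.
// Example: request unset, setting "auto", UsePreaggregatedTables=false -> OpsQueryModeRaw.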

View File

@@ -0,0 +1,45 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
}
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorDistribution(ctx, filter)
}

View File

@@ -0,0 +1,26 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetLatencyHistogram(ctx, filter)
}

View File

@@ -0,0 +1,861 @@
package service
import (
"context"
"database/sql"
"errors"
"fmt"
"hash/fnv"
"log"
"math"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/mem"
)
const (
opsMetricsCollectorJobName = "ops_metrics_collector"
opsMetricsCollectorMinInterval = 60 * time.Second
opsMetricsCollectorMaxInterval = 1 * time.Hour
opsMetricsCollectorTimeout = 10 * time.Second
opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
opsMetricsCollectorLeaderLockTTL = 90 * time.Second
opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
bytesPerMB = 1024 * 1024
)
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
type OpsMetricsCollector struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
db *sql.DB
redisClient *redis.Client
instanceID string
lastCgroupCPUUsageNanos uint64
lastCgroupCPUSampleAt time.Time
stopCh chan struct{}
startOnce sync.Once
stopOnce sync.Once
skipLogMu sync.Mutex
skipLogAt time.Time
}
func NewOpsMetricsCollector(
opsRepo OpsRepository,
settingRepo SettingRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsMetricsCollector {
return &OpsMetricsCollector{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
db: db,
redisClient: redisClient,
instanceID: uuid.NewString(),
}
}
func (c *OpsMetricsCollector) Start() {
if c == nil {
return
}
c.startOnce.Do(func() {
if c.stopCh == nil {
c.stopCh = make(chan struct{})
}
go c.run()
})
}
func (c *OpsMetricsCollector) Stop() {
if c == nil {
return
}
c.stopOnce.Do(func() {
if c.stopCh != nil {
close(c.stopCh)
}
})
}
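// Illustrative wiring sketch (the constructor argument names are placeholders):
//
//	collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg)
//	collector.Start()      // idempotent; spawns the background collection loop
//	defer collector.Stop() // idempotent; closes stopCh and ends the loop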
func (c *OpsMetricsCollector) run() {
// First run immediately so the dashboard has data soon after startup.
c.collectOnce()
for {
interval := c.getInterval()
timer := time.NewTimer(interval)
select {
case <-timer.C:
c.collectOnce()
case <-c.stopCh:
timer.Stop()
return
}
}
}
func (c *OpsMetricsCollector) getInterval() time.Duration {
interval := opsMetricsCollectorMinInterval
if c.settingRepo == nil {
return interval
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
if err != nil {
return interval
}
raw = strings.TrimSpace(raw)
if raw == "" {
return interval
}
seconds, err := strconv.Atoi(raw)
if err != nil {
return interval
}
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
seconds = int(opsMetricsCollectorMinInterval.Seconds())
}
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
}
return time.Duration(seconds) * time.Second
}
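// Illustrative clamping of the DB-backed interval setting:
//
//	ops_metrics_interval_seconds = "30"    -> 60s   (raised to the minimum)
//	ops_metrics_interval_seconds = "300"   -> 300s
//	ops_metrics_interval_seconds = "86400" -> 3600s (capped at the maximum)
//	missing, empty or non-numeric          -> 60s   (default)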
func (c *OpsMetricsCollector) collectOnce() {
if c == nil {
return
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return
}
if c.opsRepo == nil {
return
}
if c.db == nil {
return
}
ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
defer cancel()
if !c.isMonitoringEnabled(ctx) {
return
}
release, ok := c.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
startedAt := time.Now().UTC()
err := c.collectAndPersist(ctx)
finishedAt := time.Now().UTC()
durationMs := finishedAt.Sub(startedAt).Milliseconds()
dur := durationMs
runAt := startedAt
if err != nil {
msg := truncateString(err.Error(), 2048)
errAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastErrorAt: &errAt,
LastError: &msg,
LastDurationMs: &dur,
})
log.Printf("[OpsMetricsCollector] collect failed: %v", err)
return
}
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
})
}
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
if c == nil {
return false
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return false
}
if c.settingRepo == nil {
return true
}
if ctx == nil {
ctx = context.Background()
}
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
return true
}
// Fail-open: collector should not become a hard dependency.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
if ctx == nil {
ctx = context.Background()
}
// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
now := time.Now().UTC()
windowEnd := now.Truncate(time.Minute)
windowStart := windowEnd.Add(-1 * time.Minute)
sys, err := c.collectSystemStats(ctx)
if err != nil {
// Continue; system stats are best-effort.
log.Printf("[OpsMetricsCollector] system stats error: %v", err)
}
dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage counts: %w", err)
}
duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage latency: %w", err)
}
errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query error counts: %w", err)
}
windowSeconds := windowEnd.Sub(windowStart).Seconds()
if windowSeconds <= 0 {
windowSeconds = 60
}
requestTotal := successCount + errorTotal
qps := float64(requestTotal) / windowSeconds
tps := float64(tokenConsumed) / windowSeconds
goroutines := runtime.NumGoroutine()
input := &OpsInsertSystemMetricsInput{
CreatedAt: windowEnd,
WindowMinutes: 1,
SuccessCount: successCount,
ErrorCountTotal: errorTotal,
BusinessLimitedCount: businessLimited,
ErrorCountSLA: errorSLA,
UpstreamErrorCountExcl429529: upstreamExcl,
Upstream429Count: upstream429,
Upstream529Count: upstream529,
TokenConsumed: tokenConsumed,
QPS: float64Ptr(roundTo1DP(qps)),
TPS: float64Ptr(roundTo1DP(tps)),
DurationP50Ms: duration.p50,
DurationP90Ms: duration.p90,
DurationP95Ms: duration.p95,
DurationP99Ms: duration.p99,
DurationAvgMs: duration.avg,
DurationMaxMs: duration.max,
TTFTP50Ms: ttft.p50,
TTFTP90Ms: ttft.p90,
TTFTP95Ms: ttft.p95,
TTFTP99Ms: ttft.p99,
TTFTAvgMs: ttft.avg,
TTFTMaxMs: ttft.max,
CPUUsagePercent: sys.cpuUsagePercent,
MemoryUsedMB: sys.memoryUsedMB,
MemoryTotalMB: sys.memoryTotalMB,
MemoryUsagePercent: sys.memoryUsagePercent,
DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK),
DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines),
}
return c.opsRepo.InsertSystemMetrics(ctx, input)
}
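// Worked example for the derived rates over a 1-minute window (windowSeconds = 60):
//
//	successCount = 300, errorTotal = 30 -> requestTotal = 330 -> QPS = 330/60 = 5.5
//	tokenConsumed = 90000               -> TPS = 90000/60 = 1500.0
//
// Both rates are rounded to one decimal place before being persisted.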
type opsCollectedPercentiles struct {
p50 *int
p90 *int
p95 *int
p99 *int
avg *float64
max *int
}
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
q := `
SELECT
COALESCE(COUNT(*), 0) AS success_count,
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2`
var tokens sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
return 0, 0, err
}
if tokens.Valid {
tokenConsumed = tokens.Int64
}
return successCount, tokenConsumed, nil
}
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
AVG(duration_ms) AS avg_ms,
MAX(duration_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND duration_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
duration.p50 = floatToIntPtr(p50)
duration.p90 = floatToIntPtr(p90)
duration.p95 = floatToIntPtr(p95)
duration.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
duration.avg = &v
}
if max.Valid {
v := int(max.Int64)
duration.max = &v
}
}
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
AVG(first_token_ms) AS avg_ms,
MAX(first_token_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND first_token_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
ttft.p50 = floatToIntPtr(p50)
ttft.p90 = floatToIntPtr(p90)
ttft.p95 = floatToIntPtr(p95)
ttft.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
ttft.avg = &v
}
if max.Valid {
v := int(max.Int64)
ttft.max = &v
}
}
return duration, ttft, nil
}
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
errorTotal int64,
businessLimited int64,
errorSLA int64,
upstreamExcl429529 int64,
upstream429 int64,
upstream529 int64,
err error,
) {
q := `
SELECT
COALESCE(COUNT(*), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2`
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
&errorTotal,
&businessLimited,
&errorSLA,
&upstreamExcl429529,
&upstream429,
&upstream529,
); err != nil {
return 0, 0, 0, 0, 0, 0, err
}
return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
type opsCollectedSystemStats struct {
cpuUsagePercent *float64
memoryUsedMB *int64
memoryTotalMB *int64
memoryUsagePercent *float64
}
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
out := &opsCollectedSystemStats{}
if ctx == nil {
ctx = context.Background()
}
sampleAt := time.Now().UTC()
// Prefer cgroup (container) metrics when available.
if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
out.cpuUsagePercent = cpuPct
}
cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
if cgroupOK {
usedMB := int64(cgroupUsed / bytesPerMB)
out.memoryUsedMB = &usedMB
if cgroupTotal > 0 {
totalMB := int64(cgroupTotal / bytesPerMB)
out.memoryTotalMB = &totalMB
pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
out.memoryUsagePercent = &pct
}
}
// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
if out.cpuUsagePercent == nil {
if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
v := roundTo1DP(cpuPercents[0])
out.cpuUsagePercent = &v
}
}
// If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host.
if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
if out.memoryUsedMB == nil {
usedMB := int64(vm.Used / bytesPerMB)
out.memoryUsedMB = &usedMB
}
if out.memoryTotalMB == nil {
totalMB := int64(vm.Total / bytesPerMB)
out.memoryTotalMB = &totalMB
}
if out.memoryUsagePercent == nil {
if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
out.memoryUsagePercent = &pct
} else {
pct := roundTo1DP(vm.UsedPercent)
out.memoryUsagePercent = &pct
}
}
}
}
return out, nil
}
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
usageNanos, ok := readCgroupCPUUsageNanos()
if !ok {
return nil
}
// Initialize baseline sample.
if c.lastCgroupCPUSampleAt.IsZero() {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
elapsed := now.Sub(c.lastCgroupCPUSampleAt)
if elapsed <= 0 {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
prev := c.lastCgroupCPUUsageNanos
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
if usageNanos < prev {
// Counter reset (container restarted).
return nil
}
deltaUsageSec := float64(usageNanos-prev) / 1e9
elapsedSec := elapsed.Seconds()
if elapsedSec <= 0 {
return nil
}
cores := readCgroupCPULimitCores()
if cores <= 0 {
// Can't reliably normalize; skip and fall back to gopsutil.
return nil
}
pct := (deltaUsageSec / (elapsedSec * cores)) * 100
if pct < 0 {
pct = 0
}
// Clamp to avoid noise/jitter showing impossible values.
if pct > 100 {
pct = 100
}
v := roundTo1DP(pct)
return &v
}
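// Worked example: with a 2-core cgroup limit, samples 60s apart and a usage delta
// of 1.2 CPU-seconds, pct = (1.2 / (60 * 2)) * 100 = 1.0. The first call only
// records a baseline and returns nil, so cgroup-based CPU% needs two cycles to appear.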
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
// cgroup v2 (most common in modern containers)
if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
usedBytes = used
rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
if err == nil {
s := strings.TrimSpace(string(rawMax))
if s != "" && s != "max" {
if v, err := strconv.ParseUint(s, 10, 64); err == nil {
totalBytes = v
}
}
}
return usedBytes, totalBytes, true
}
// cgroup v1 fallback
if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
usedBytes = used
if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
// Some environments report a very large number when unlimited.
if limit > 0 && limit < (1<<60) {
totalBytes = limit
}
}
return usedBytes, totalBytes, true
}
return 0, 0, false
}
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
// cgroup v2: cpu.stat has usage_usec
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
lines := strings.Split(string(raw), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) != 2 {
continue
}
if fields[0] != "usage_usec" {
continue
}
v, err := strconv.ParseUint(fields[1], 10, 64)
if err != nil {
continue
}
return v * 1000, true
}
}
// cgroup v1: cpuacct.usage is in nanoseconds
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
return v, true
}
return 0, false
}
func readCgroupCPULimitCores() float64 {
// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
fields := strings.Fields(string(raw))
if len(fields) >= 2 && fields[0] != "max" {
quota, err1 := strconv.ParseFloat(fields[0], 64)
period, err2 := strconv.ParseFloat(fields[1], 64)
if err1 == nil && err2 == nil && quota > 0 && period > 0 {
return quota / period
}
}
}
// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
if okQuota && okPeriod && quota > 0 && period > 0 {
return float64(quota) / float64(period)
}
return 0
}
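// Example cgroup v2 cpu.max contents and the resulting core counts:
//
//	"50000 100000"  -> 0.5 cores (quota / period)
//	"200000 100000" -> 2 cores
//	"max 100000"    -> 0 (unlimited; the caller then falls back to host CPU%)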
func readUintFile(path string) (uint64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func readIntFile(path string) (int64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
if c == nil || c.db == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
var one int
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
return false
}
return one == 1
}
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
if c == nil || c.redisClient == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
return c.redisClient.Ping(ctx).Err() == nil
}
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil {
return 0, 0
}
stats := c.db.Stats()
return stats.InUse, stats.Idle
}
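// The release script below deletes the lock only when it is still owned by this
// instance (value == instanceID); a plain DEL could otherwise remove a lock that
// another instance re-acquired after this one's TTL expired.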
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if c == nil || c.redisClient == nil {
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
if err != nil {
// Redis is configured but unreachable: fall back to a DB advisory lock, and if
// that also fails, fail closed (skip this cycle) rather than risk multiple
// instances stampeding the database.
release, ok := c.tryAcquireDBAdvisoryLock(ctx)
if !ok {
c.maybeLogSkip()
return nil, false
}
return release, true
}
if !ok {
c.maybeLogSkip()
return nil, false
}
release := func() {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
}
return release, true
}
func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
if c == nil || c.db == nil {
return nil, false
}
if ctx == nil {
ctx = context.Background()
}
conn, err := c.db.Conn(ctx)
if err != nil {
return nil, false
}
acquired := false
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
_ = conn.Close()
return nil, false
}
if !acquired {
_ = conn.Close()
return nil, false
}
release := func() {
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
_ = conn.Close()
}
return release, true
}
func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock()
defer c.skipLogMu.Unlock()
now := time.Now()
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
return
}
c.skipLogAt = now
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
}
func floatToIntPtr(v sql.NullFloat64) *int {
if !v.Valid {
return nil
}
n := int(math.Round(v.Float64))
return &n
}
func roundTo1DP(v float64) float64 {
return math.Round(v*10) / 10
}
func truncateString(s string, max int) string {
if max <= 0 {
return ""
}
if len(s) <= max {
return s
}
cut := s[:max]
for len(cut) > 0 && !utf8.ValidString(cut) {
cut = cut[:len(cut)-1]
}
return cut
}
func boolPtr(v bool) *bool {
out := v
return &out
}
func intPtr(v int) *int {
out := v
return &out
}
func float64Ptr(v float64) *float64 {
out := v
return &out
}
func hashAdvisoryLockID(s string) int64 {
h := fnv.New64a()
_, _ = h.Write([]byte(s))
return int64(h.Sum64())
}
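// Note: the FNV-1a sum is reinterpreted as int64 and may be negative, which is
// acceptable because PostgreSQL advisory lock keys are plain bigint values.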

View File

@@ -0,0 +1,226 @@
package service
import (
"context"
"time"
)
type OpsRepository interface {
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
// Lightweight window stats (for realtime WS / quick sampling).
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
// Alerts (rules + events)
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
DeleteAlertRule(ctx context.Context, id int64) error
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
type OpsInsertErrorLogInput struct {
RequestID string
ClientRequestID string
UserID *int64
APIKeyID *int64
AccountID *int64
GroupID *int64
ClientIP *string
Platform string
Model string
RequestPath string
Stream bool
UserAgent string
ErrorPhase string
ErrorType string
Severity string
StatusCode int
IsBusinessLimited bool
ErrorMessage string
ErrorBody string
ErrorSource string
ErrorOwner string
UpstreamStatusCode *int
UpstreamErrorMessage *string
UpstreamErrorDetail *string
DurationMs *int
TimeToFirstTokenMs *int64
RequestBodyJSON *string // sanitized json string (not raw bytes)
RequestBodyTruncated bool
RequestBodyBytes *int
RequestHeadersJSON *string // optional json string
IsRetryable bool
RetryCount int
CreatedAt time.Time
}
type OpsInsertRetryAttemptInput struct {
RequestedByUserID int64
SourceErrorID int64
Mode string
PinnedAccountID *int64
// running|queued etc.
Status string
StartedAt time.Time
}
type OpsUpdateRetryAttemptInput struct {
ID int64
// succeeded|failed
Status string
FinishedAt time.Time
DurationMs int64
// Optional correlation
ResultRequestID *string
ResultErrorID *int64
ErrorMessage *string
}
type OpsInsertSystemMetricsInput struct {
CreatedAt time.Time
WindowMinutes int
Platform *string
GroupID *int64
SuccessCount int64
ErrorCountTotal int64
BusinessLimitedCount int64
ErrorCountSLA int64
UpstreamErrorCountExcl429529 int64
Upstream429Count int64
Upstream529Count int64
TokenConsumed int64
QPS *float64
TPS *float64
DurationP50Ms *int
DurationP90Ms *int
DurationP95Ms *int
DurationP99Ms *int
DurationAvgMs *float64
DurationMaxMs *int
TTFTP50Ms *int
TTFTP90Ms *int
TTFTP95Ms *int
TTFTP99Ms *int
TTFTAvgMs *float64
TTFTMaxMs *int
CPUUsagePercent *float64
MemoryUsedMB *int64
MemoryTotalMB *int64
MemoryUsagePercent *float64
DBOK *bool
RedisOK *bool
DBConnActive *int
DBConnIdle *int
DBConnWaiting *int
GoroutineCount *int
ConcurrencyQueueDepth *int
}
type OpsSystemMetricsSnapshot struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
WindowMinutes int `json:"window_minutes"`
CPUUsagePercent *float64 `json:"cpu_usage_percent"`
MemoryUsedMB *int64 `json:"memory_used_mb"`
MemoryTotalMB *int64 `json:"memory_total_mb"`
MemoryUsagePercent *float64 `json:"memory_usage_percent"`
DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"`
DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"`
GoroutineCount *int `json:"goroutine_count"`
ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
type OpsUpsertJobHeartbeatInput struct {
JobName string
LastRunAt *time.Time
LastSuccessAt *time.Time
LastErrorAt *time.Time
LastError *string
LastDurationMs *int64
}
type OpsJobHeartbeat struct {
JobName string `json:"job_name"`
LastRunAt *time.Time `json:"last_run_at"`
LastSuccessAt *time.Time `json:"last_success_at"`
LastErrorAt *time.Time `json:"last_error_at"`
LastError *string `json:"last_error"`
LastDurationMs *int64 `json:"last_duration_ms"`
UpdatedAt time.Time `json:"updated_at"`
}
type OpsWindowStats struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
TokenConsumed int64 `json:"token_consumed"`
}

View File

@@ -0,0 +1,40 @@
package service
import (
"errors"
"strings"
)
type OpsQueryMode string
const (
OpsQueryModeAuto OpsQueryMode = "auto"
OpsQueryModeRaw OpsQueryMode = "raw"
OpsQueryModePreagg OpsQueryMode = "preagg"
)
// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")
func ParseOpsQueryMode(raw string) OpsQueryMode {
v := strings.ToLower(strings.TrimSpace(raw))
switch v {
case string(OpsQueryModeRaw):
return OpsQueryModeRaw
case string(OpsQueryModePreagg):
return OpsQueryModePreagg
default:
return OpsQueryModeAuto
}
}
func (m OpsQueryMode) IsValid() bool {
switch m {
case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg:
return true
default:
return false
}
}
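// Examples:
//
//	ParseOpsQueryMode("RAW")      == OpsQueryModeRaw
//	ParseOpsQueryMode(" preagg ") == OpsQueryModePreagg
//	ParseOpsQueryMode("")         == OpsQueryModeAuto // unknown values also fall back to auto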

View File

@@ -0,0 +1,36 @@
package service
import (
"context"
"errors"
"strings"
)
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is additionally gated by the overall ops monitoring switches (config hard switch and DB soft switch).
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
if !s.IsMonitoringEnabled(ctx) {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
if err != nil {
// Default enabled when key is missing; fail-open on transient errors.
if errors.Is(err, ErrSettingNotFound) {
return true
}
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}

View File

@@ -0,0 +1,152 @@
package service
import (
"context"
"time"
)
type OpsRequestKind string
const (
OpsRequestKindSuccess OpsRequestKind = "success"
OpsRequestKindError OpsRequestKind = "error"
)
// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
Kind OpsRequestKind `json:"kind"`
CreatedAt time.Time `json:"created_at"`
RequestID string `json:"request_id"`
Platform string `json:"platform,omitempty"`
Model string `json:"model,omitempty"`
DurationMs *int `json:"duration_ms,omitempty"`
StatusCode *int `json:"status_code,omitempty"`
// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
ErrorID *int64 `json:"error_id,omitempty"`
Phase string `json:"phase,omitempty"`
Severity string `json:"severity,omitempty"`
Message string `json:"message,omitempty"`
UserID *int64 `json:"user_id,omitempty"`
APIKeyID *int64 `json:"api_key_id,omitempty"`
AccountID *int64 `json:"account_id,omitempty"`
GroupID *int64 `json:"group_id,omitempty"`
Stream bool `json:"stream"`
}
type OpsRequestDetailFilter struct {
StartTime *time.Time
EndTime *time.Time
// kind: success|error|all
Kind string
Platform string
GroupID *int64
UserID *int64
APIKeyID *int64
AccountID *int64
Model string
RequestID string
Query string
MinDurationMs *int
MaxDurationMs *int
// Sort: created_at_desc (default) or duration_desc.
Sort string
Page int
PageSize int
}
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
page = 1
pageSize = 50
endTime = time.Now()
startTime = endTime.Add(-1 * time.Hour)
if f == nil {
return page, pageSize, startTime, endTime
}
if f.Page > 0 {
page = f.Page
}
if f.PageSize > 0 {
pageSize = f.PageSize
}
if pageSize > 100 {
pageSize = 100
}
if f.EndTime != nil {
endTime = *f.EndTime
}
if f.StartTime != nil {
startTime = *f.StartTime
} else if f.EndTime != nil {
startTime = endTime.Add(-1 * time.Hour)
}
if startTime.After(endTime) {
startTime, endTime = endTime, startTime
}
return page, pageSize, startTime, endTime
}
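// Illustrative defaults produced by Normalize:
//
//	(&OpsRequestDetailFilter{}).Normalize()              -> page=1, pageSize=50, last hour ending now
//	(&OpsRequestDetailFilter{PageSize: 500}).Normalize() -> pageSize capped at 100
//	StartTime after EndTime                              -> the two values are swapped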
type OpsRequestDetailList struct {
Items []*OpsRequestDetail `json:"items"`
Total int64 `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsRequestDetailList{
Items: []*OpsRequestDetail{},
Total: 0,
Page: 1,
PageSize: 50,
}, nil
}
page, pageSize, startTime, endTime := filter.Normalize()
filterCopy := &OpsRequestDetailFilter{}
if filter != nil {
*filterCopy = *filter
}
filterCopy.Page = page
filterCopy.PageSize = pageSize
filterCopy.StartTime = &startTime
filterCopy.EndTime = &endTime
items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
if err != nil {
return nil, err
}
if items == nil {
items = []*OpsRequestDetail{}
}
return &OpsRequestDetailList{
Items: items,
Total: total,
Page: page,
PageSize: pageSize,
}, nil
}

View File

@@ -0,0 +1,635 @@
package service
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
"github.com/gin-gonic/gin"
"github.com/lib/pq"
)
const (
OpsRetryModeClient = "client"
OpsRetryModeUpstream = "upstream"
)
const (
opsRetryStatusRunning = "running"
opsRetryStatusSucceeded = "succeeded"
opsRetryStatusFailed = "failed"
)
const (
opsRetryTimeout = 60 * time.Second
opsRetryCaptureBytesLimit = 64 * 1024
opsRetryResponsePreviewMax = 8 * 1024
opsRetryMinIntervalPerError = 10 * time.Second
opsRetryMaxAccountSwitches = 3
)
var opsRetryRequestHeaderAllowlist = map[string]bool{
"anthropic-beta": true,
"anthropic-version": true,
}
type opsRetryRequestType string
const (
opsRetryTypeMessages opsRetryRequestType = "messages"
opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
type limitedResponseWriter struct {
header http.Header
status int
wroteHeader bool
limit int
totalWritten int64
buf bytes.Buffer
}
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
if limit <= 0 {
limit = 1
}
return &limitedResponseWriter{
header: make(http.Header),
status: http.StatusOK,
limit: limit,
}
}
func (w *limitedResponseWriter) Header() http.Header {
return w.header
}
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
if w.wroteHeader {
return
}
w.wroteHeader = true
w.status = statusCode
}
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
if !w.wroteHeader {
w.WriteHeader(http.StatusOK)
}
w.totalWritten += int64(len(p))
if w.buf.Len() < w.limit {
remaining := w.limit - w.buf.Len()
if len(p) > remaining {
_, _ = w.buf.Write(p[:remaining])
} else {
_, _ = w.buf.Write(p)
}
}
// Pretend we wrote everything to avoid upstream/client code treating it as an error.
return len(p), nil
}
func (w *limitedResponseWriter) Flush() {}
func (w *limitedResponseWriter) bodyBytes() []byte {
return w.buf.Bytes()
}
func (w *limitedResponseWriter) truncated() bool {
return w.totalWritten > int64(w.limit)
}
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
mode = strings.ToLower(strings.TrimSpace(mode))
switch mode {
case OpsRetryModeClient, OpsRetryModeUpstream:
default:
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
}
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
}
if latest != nil {
if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
lastAttemptAt := latest.CreatedAt
if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
lastAttemptAt = *latest.FinishedAt
} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
lastAttemptAt = *latest.StartedAt
}
if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
}
}
errorLog, err := s.GetErrorLogByID(ctx, errorID)
if err != nil {
return nil, err
}
if strings.TrimSpace(errorLog.RequestBody) == "" {
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
}
var pinned *int64
if mode == OpsRetryModeUpstream {
if pinnedAccountID != nil && *pinnedAccountID > 0 {
pinned = pinnedAccountID
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
pinned = errorLog.AccountID
} else {
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
}
}
startedAt := time.Now()
attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
RequestedByUserID: requestedByUserID,
SourceErrorID: errorID,
Mode: mode,
PinnedAccountID: pinned,
Status: opsRetryStatusRunning,
StartedAt: startedAt,
})
if err != nil {
var pqErr *pq.Error
if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
}
result := &OpsRetryResult{
AttemptID: attemptID,
Mode: mode,
Status: opsRetryStatusFailed,
PinnedAccountID: pinned,
HTTPStatusCode: 0,
UpstreamRequestID: "",
ResponsePreview: "",
ResponseTruncated: false,
ErrorMessage: "",
StartedAt: startedAt,
}
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
defer cancel()
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
finishedAt := time.Now()
result.FinishedAt = finishedAt
result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
if execRes != nil {
result.Status = execRes.status
result.UsedAccountID = execRes.usedAccountID
result.HTTPStatusCode = execRes.httpStatusCode
result.UpstreamRequestID = execRes.upstreamRequestID
result.ResponsePreview = execRes.responsePreview
result.ResponseTruncated = execRes.responseTruncated
result.ErrorMessage = execRes.errorMessage
}
updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
defer updateCancel()
var updateErrMsg *string
if strings.TrimSpace(result.ErrorMessage) != "" {
msg := result.ErrorMessage
updateErrMsg = &msg
}
var resultRequestID *string
if strings.TrimSpace(result.UpstreamRequestID) != "" {
v := result.UpstreamRequestID
resultRequestID = &v
}
finalStatus := result.Status
if strings.TrimSpace(finalStatus) == "" {
finalStatus = opsRetryStatusFailed
}
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
ID: attemptID,
Status: finalStatus,
FinishedAt: finishedAt,
DurationMs: result.DurationMs,
ResultRequestID: resultRequestID,
ErrorMessage: updateErrMsg,
}); err != nil {
// Best-effort: retry itself already executed; do not fail the API response.
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
}
return result, nil
}
type opsRetryExecution struct {
status string
usedAccountID *int64
httpStatusCode int
upstreamRequestID string
responsePreview string
responseTruncated bool
errorMessage string
}
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
if errorLog == nil {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "missing error log",
}
}
reqType := detectOpsRetryType(errorLog.RequestPath)
bodyBytes := []byte(errorLog.RequestBody)
switch reqType {
case opsRetryTypeMessages:
bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
// No-op
}
switch strings.ToLower(strings.TrimSpace(mode)) {
case OpsRetryModeUpstream:
if pinnedAccountID == nil || *pinnedAccountID <= 0 {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "pinned_account_id required for upstream retry",
}
}
return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
case OpsRetryModeClient:
return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
default:
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "invalid retry mode",
}
}
}
func detectOpsRetryType(path string) opsRetryRequestType {
p := strings.ToLower(strings.TrimSpace(path))
switch {
case strings.Contains(p, "/responses"):
return opsRetryTypeOpenAI
case strings.Contains(p, "/v1beta/"):
return opsRetryTypeGeminiV1B
default:
return opsRetryTypeMessages
}
}
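// Illustrative path mapping (example paths only):
//
//	"/openai/v1/responses"                      -> openai_responses
//	"/v1beta/models/gemini-pro:generateContent" -> gemini_v1beta
//	"/v1/messages" (and anything else)          -> messages (default)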
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
if s.accountRepo == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
}
account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
}
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
}
if !account.IsSchedulable() {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
}
if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
}
}
var release func()
if s.concurrencyService != nil {
acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
}
if acq == nil || !acq.Acquired {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
}
release = acq.ReleaseFunc
}
if release != nil {
defer release()
}
usedID := account.ID
exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
exec.usedAccountID = &usedID
if exec.status == "" {
exec.status = opsRetryStatusFailed
}
return exec
}
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
groupID := errorLog.GroupID
if groupID == nil || *groupID <= 0 {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
}
model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
if parsedErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
}
_ = stream
excluded := make(map[int64]struct{})
switches := 0
for {
if switches >= opsRetryMaxAccountSwitches {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
}
selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
if selErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
}
if selection == nil || selection.Account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
}
account := selection.Account
if !selection.Acquired || selection.ReleaseFunc == nil {
excluded[account.ID] = struct{}{}
switches++
continue
}
exec := func() *opsRetryExecution {
defer selection.ReleaseFunc()
return s.executeWithAccount(ctx, reqType, errorLog, body, account)
}()
if exec != nil {
if exec.status == opsRetryStatusSucceeded {
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
// If the gateway layer reports a failover-eligible error, try another account.
if s.isFailoverError(exec.errorMessage) {
excluded[account.ID] = struct{}{}
switches++
continue
}
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
excluded[account.ID] = struct{}{}
switches++
}
}
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return nil, fmt.Errorf("openai gateway service not available")
}
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
if s.gatewayService == nil {
return nil, fmt.Errorf("gateway service not available")
}
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
default:
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
switch reqType {
case opsRetryTypeMessages:
parsed, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
}
return parsed.Model, parsed.Stream, nil
case opsRetryTypeOpenAI:
var v struct {
Model string `json:"model"`
Stream bool `json:"stream"`
}
if err := json.Unmarshal(body, &v); err != nil {
return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
}
return strings.TrimSpace(v.Model), v.Stream, nil
case opsRetryTypeGeminiV1B:
if strings.TrimSpace(errorLog.Model) == "" {
return "", false, fmt.Errorf("missing model for gemini v1beta retry")
}
return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
default:
return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
}
c, w := newOpsRetryContext(ctx, errorLog)
var err error
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
}
_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
case opsRetryTypeGeminiV1B:
if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
}
modelName := strings.TrimSpace(errorLog.Model)
action := "generateContent"
if errorLog.Stream {
action = "streamGenerateContent"
}
if account.Platform == PlatformAntigravity {
_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
} else {
_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
}
case opsRetryTypeMessages:
switch account.Platform {
case PlatformAntigravity:
if s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
}
_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
case PlatformGemini:
if s.geminiCompatService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
}
_, err = s.geminiCompatService.Forward(ctx, c, account, body)
default:
if s.gatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
}
parsedReq, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
}
_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
}
default:
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
}
statusCode := http.StatusOK
if c != nil && c.Writer != nil {
statusCode = c.Writer.Status()
}
upstreamReqID := extractUpstreamRequestID(c)
preview, truncated := extractResponsePreview(w)
exec := &opsRetryExecution{
status: opsRetryStatusFailed,
httpStatusCode: statusCode,
upstreamRequestID: upstreamReqID,
responsePreview: preview,
responseTruncated: truncated,
errorMessage: "",
}
if err == nil && statusCode < 400 {
exec.status = opsRetryStatusSucceeded
return exec
}
if err != nil {
exec.errorMessage = err.Error()
} else {
exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
}
return exec
}
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
c, _ := gin.CreateTestContext(w)
path := "/"
if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
path = errorLog.RequestPath
}
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
req.Header.Set("content-type", "application/json")
if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
req.Header.Set("user-agent", errorLog.UserAgent)
}
// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
var stored map[string]string
if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
for k, v := range stored {
key := strings.TrimSpace(k)
if key == "" {
continue
}
if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
continue
}
val := strings.TrimSpace(v)
if val == "" {
continue
}
req.Header.Set(key, val)
}
}
}
c.Request = req
return c, w
}
func extractUpstreamRequestID(c *gin.Context) string {
if c == nil || c.Writer == nil {
return ""
}
h := c.Writer.Header()
if h == nil {
return ""
}
for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
if v := strings.TrimSpace(h.Get(key)); v != "" {
return v
}
}
return ""
}
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
if w == nil {
return "", false
}
b := bytes.TrimSpace(w.bodyBytes())
if len(b) == 0 {
return "", w.truncated()
}
if len(b) > opsRetryResponsePreviewMax {
return string(b[:opsRetryResponsePreviewMax]), true
}
return string(b), w.truncated()
}
func containsInt64(items []int64, needle int64) bool {
for _, v := range items {
if v == needle {
return true
}
}
return false
}
func (s *OpsService) isFailoverError(message string) bool {
msg := strings.ToLower(strings.TrimSpace(message))
if msg == "" {
return false
}
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
}

View File

@@ -0,0 +1,451 @@
package service
import (
"context"
"database/sql"
"encoding/json"
"errors"
"log"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
const (
opsMaxStoredRequestBodyBytes = 10 * 1024
opsMaxStoredErrorBodyBytes = 20 * 1024
)
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
accountRepo AccountRepository
concurrencyService *ConcurrencyService
gatewayService *GatewayService
openAIGatewayService *OpenAIGatewayService
geminiCompatService *GeminiMessagesCompatService
antigravityGatewayService *AntigravityGatewayService
}
func NewOpsService(
opsRepo OpsRepository,
settingRepo SettingRepository,
cfg *config.Config,
accountRepo AccountRepository,
concurrencyService *ConcurrencyService,
gatewayService *GatewayService,
openAIGatewayService *OpenAIGatewayService,
geminiCompatService *GeminiMessagesCompatService,
antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
return &OpsService{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
accountRepo: accountRepo,
concurrencyService: concurrencyService,
gatewayService: gatewayService,
openAIGatewayService: openAIGatewayService,
geminiCompatService: geminiCompatService,
antigravityGatewayService: antigravityGatewayService,
}
}
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
if s.IsMonitoringEnabled(ctx) {
return nil
}
return ErrOpsDisabled
}
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
// Hard switch: disable ops entirely.
if s.cfg != nil && !s.cfg.Ops.Enabled {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
if err != nil {
// Default enabled when key is missing, and fail-open on transient errors
// (ops should never block gateway traffic).
if errors.Is(err, ErrSettingNotFound) {
return true
}
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
if entry == nil {
return nil
}
if !s.IsMonitoringEnabled(ctx) {
return nil
}
if s.opsRepo == nil {
return nil
}
// Ensure timestamps are always populated.
if entry.CreatedAt.IsZero() {
entry.CreatedAt = time.Now()
}
// Ensure required fields exist (DB has NOT NULL constraints).
entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
entry.ErrorType = strings.TrimSpace(entry.ErrorType)
if entry.ErrorPhase == "" {
entry.ErrorPhase = "internal"
}
if entry.ErrorType == "" {
entry.ErrorType = "api_error"
}
// Sanitize + trim request body (errors only).
if len(rawRequestBody) > 0 {
sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
if sanitized != "" {
entry.RequestBodyJSON = &sanitized
}
entry.RequestBodyTruncated = truncated
entry.RequestBodyBytes = &bytesLen
}
// Sanitize + truncate error_body to avoid storing sensitive data.
if strings.TrimSpace(entry.ErrorBody) != "" {
sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
entry.ErrorBody = sanitized
}
if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
// Best-effort persistence: log the failure here; callers on the gateway path must not let the returned error block traffic.
log.Printf("[Ops] RecordError failed: %v", err)
return err
}
return nil
}
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
}
return s.opsRepo.ListErrorLogs(ctx, filter)
}
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
}
return detail, nil
}
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
bytesLen = len(raw)
if len(raw) == 0 {
return "", false, 0
}
var decoded any
if err := json.Unmarshal(raw, &decoded); err != nil {
// If it's not valid JSON, don't store (retry would not be reliable anyway).
return "", false, bytesLen
}
decoded = redactSensitiveJSON(decoded)
encoded, err := json.Marshal(decoded)
if err != nil {
return "", false, bytesLen
}
if len(encoded) <= maxBytes {
return string(encoded), false, bytesLen
}
// Trim conversation history to keep the most recent context.
if root, ok := decoded.(map[string]any); ok {
if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
encoded2, err2 := json.Marshal(trimmed)
if err2 == nil && len(encoded2) <= maxBytes {
return string(encoded2), true, bytesLen
}
// Fallthrough: keep shrinking.
decoded = trimmed
}
essential := shrinkToEssentials(root)
encoded3, err3 := json.Marshal(essential)
if err3 == nil && len(encoded3) <= maxBytes {
return string(encoded3), true, bytesLen
}
}
// Last resort: store a minimal placeholder (still valid JSON).
placeholder := map[string]any{
"request_body_truncated": true,
}
if model := extractString(decoded, "model"); model != "" {
placeholder["model"] = model
}
encoded4, err4 := json.Marshal(placeholder)
if err4 != nil {
return "", true, bytesLen
}
return string(encoded4), true, bytesLen
}
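// exampleSanitizeOversizedRequestBody is an illustrative sketch (not referenced by any call
// path; the payload shape and field values are hypothetical) showing the intended behavior of
// sanitizeAndTrimRequestBody: credential-like fields are redacted, and oversized payloads are
// reported as truncated with their original byte length preserved.
func exampleSanitizeOversizedRequestBody() {
	raw := []byte(`{"model":"claude-sonnet","api_key":"sk-live-123","messages":[{"role":"user","content":"hi"}]}`)
	sanitized, truncated, originalBytes := sanitizeAndTrimRequestBody(raw, opsMaxStoredRequestBodyBytes)
	// sanitized carries "api_key":"[REDACTED]"; truncated stays false unless the payload
	// exceeded opsMaxStoredRequestBodyBytes; originalBytes is the pre-sanitization length.
	log.Printf("[Ops] example: truncated=%v original_bytes=%d sanitized=%s", truncated, originalBytes, sanitized)
}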
func redactSensitiveJSON(v any) any {
switch t := v.(type) {
case map[string]any:
out := make(map[string]any, len(t))
for k, vv := range t {
if isSensitiveKey(k) {
out[k] = "[REDACTED]"
continue
}
out[k] = redactSensitiveJSON(vv)
}
return out
case []any:
out := make([]any, 0, len(t))
for _, vv := range t {
out = append(out, redactSensitiveJSON(vv))
}
return out
default:
return v
}
}
func isSensitiveKey(key string) bool {
k := strings.ToLower(strings.TrimSpace(key))
if k == "" {
return false
}
// Exact matches (common credential fields).
switch k {
case "authorization",
"proxy-authorization",
"x-api-key",
"api_key",
"apikey",
"access_token",
"refresh_token",
"id_token",
"session_token",
"token",
"password",
"passwd",
"passphrase",
"secret",
"client_secret",
"private_key",
"jwt",
"signature",
"accesskeyid",
"secretaccesskey":
return true
}
// Suffix matches.
for _, suffix := range []string{
"_secret",
"_token",
"_id_token",
"_session_token",
"_password",
"_passwd",
"_passphrase",
"_key",
"secret_key",
"private_key",
} {
if strings.HasSuffix(k, suffix) {
return true
}
}
	// Substring matches (deliberately broad; errs on the side of over-redaction rather than leaking credentials).
for _, sub := range []string{
"secret",
"token",
"password",
"passwd",
"passphrase",
"privatekey",
"private_key",
"apikey",
"api_key",
"accesskeyid",
"secretaccesskey",
"bearer",
"cookie",
"credential",
"session",
"jwt",
"signature",
} {
if strings.Contains(k, sub) {
return true
}
}
return false
}
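// exampleRedactionRules is an illustrative sketch of the matching rules above: exact keys,
// suffix matches, and broad substring matches are all redacted, while ordinary fields pass
// through untouched. The keys below are hypothetical examples, not an exhaustive contract.
func exampleRedactionRules() {
	payload := map[string]any{
		"model":          "claude-sonnet", // kept as-is
		"Authorization":  "Bearer abc",    // exact match (case-insensitive)
		"openai_api_key": "sk-live-123",   // suffix match on "_key"
		"sessionState":   "opaque",        // substring match on "session"
	}
	redacted := redactSensitiveJSON(payload).(map[string]any)
	log.Printf("[Ops] example redaction: %v", redacted) // sensitive values become "[REDACTED]"
}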
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
// Supported: anthropic/openai: messages; gemini: contents.
if out, ok := trimArrayField(root, "messages", maxBytes); ok {
return out, true
}
if out, ok := trimArrayField(root, "contents", maxBytes); ok {
return out, true
}
return root, false
}
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
raw, ok := root[field]
if !ok {
return nil, false
}
arr, ok := raw.([]any)
if !ok || len(arr) == 0 {
return nil, false
}
// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
// We are dropping from the *front* of the array (oldest context first).
lo := 0
hi := len(arr) - 1 // inclusive; hi ensures at least one item remains
var best map[string]any
found := false
for lo <= hi {
mid := (lo + hi) / 2
candidateArr := arr[mid:]
if len(candidateArr) == 0 {
lo = mid + 1
continue
}
next := shallowCopyMap(root)
next[field] = candidateArr
encoded, err := json.Marshal(next)
if err != nil {
// If marshal fails, try dropping more.
lo = mid + 1
continue
}
if len(encoded) <= maxBytes {
best = next
found = true
// Try to keep more context by dropping fewer items.
hi = mid - 1
continue
}
// Need to drop more.
lo = mid + 1
}
if found {
return best, true
}
// Nothing fit (even with only one element); return the smallest slice and let the
// caller fall back to shrinkToEssentials().
next := shallowCopyMap(root)
next[field] = arr[len(arr)-1:]
return next, true
}
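// exampleTrimConversationHistory is an illustrative sketch of the front-dropping behavior
// above: with a small byte budget (256 here, purely illustrative) only the most recent
// messages survive, while the rest of the request map is preserved unchanged.
func exampleTrimConversationHistory() {
	root := map[string]any{
		"model": "claude-sonnet",
		"messages": []any{
			map[string]any{"role": "user", "content": strings.Repeat("old context ", 100)},
			map[string]any{"role": "assistant", "content": "short reply"},
			map[string]any{"role": "user", "content": "latest question"},
		},
	}
	if trimmed, ok := trimConversationArrays(root, 256); ok {
		// trimmed shares untouched fields with root but carries a shorter "messages" slice
		// that keeps the newest entries.
		log.Printf("[Ops] example trim: kept %d message(s)", len(trimmed["messages"].([]any)))
	}
}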
func shrinkToEssentials(root map[string]any) map[string]any {
out := make(map[string]any)
for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
if v, ok := root[key]; ok {
out[key] = v
}
}
// Keep only the last element of the conversation array.
if v, ok := root["messages"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["messages"] = []any{arr[len(arr)-1]}
}
}
if v, ok := root["contents"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["contents"] = []any{arr[len(arr)-1]}
}
}
return out
}
func shallowCopyMap(m map[string]any) map[string]any {
out := make(map[string]any, len(m))
for k, v := range m {
out[k] = v
}
return out
}
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
raw = strings.TrimSpace(raw)
if raw == "" {
return "", false
}
// Prefer JSON-safe sanitization when possible.
if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
return out, trunc
}
// Non-JSON: best-effort truncate.
if maxBytes > 0 && len(raw) > maxBytes {
return truncateString(raw, maxBytes), true
}
return raw, false
}
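// exampleSanitizeErrorBody is an illustrative sketch of the two storage paths above: JSON
// error bodies run through the same redaction pipeline as request bodies, while plain-text
// bodies are only length-capped. The payloads are hypothetical.
func exampleSanitizeErrorBody() {
	jsonBody, jsonTruncated := sanitizeErrorBodyForStorage(`{"error":{"message":"bad key","api_key":"sk-live-123"}}`, opsMaxStoredErrorBodyBytes)
	textBody, textTruncated := sanitizeErrorBodyForStorage("upstream timeout after 30s", opsMaxStoredErrorBodyBytes)
	log.Printf("[Ops] example error bodies: json=%s (truncated=%v) text=%s (truncated=%v)",
		jsonBody, jsonTruncated, textBody, textTruncated)
}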
func extractString(v any, key string) string {
root, ok := v.(map[string]any)
if !ok {
return ""
}
s, _ := root[key].(string)
return strings.TrimSpace(s)
}

View File

@@ -0,0 +1,354 @@
package service
import (
"context"
"encoding/json"
"errors"
"strings"
"time"
)
const (
opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
defaultCfg := defaultOpsEmailNotificationConfig()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// Initialize defaults on first read (best-effort).
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsEmailNotificationConfig{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupted JSON should not break ops UI; fall back to defaults.
return defaultCfg, nil
}
normalizeOpsEmailNotificationConfig(cfg)
return cfg, nil
}
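// exampleLoadEmailNotificationConfig is an illustrative sketch of the read path above: the
// first read initializes and persists defaults, so callers always receive a non-nil,
// normalized config (Recipients is never nil; an empty slice simply means "send to nobody").
func exampleLoadEmailNotificationConfig(ctx context.Context, svc *OpsService) ([]string, error) {
	cfg, err := svc.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}
	return cfg.Alert.Recipients, nil
}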
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if req == nil {
return nil, errors.New("invalid request")
}
cfg, err := s.GetEmailNotificationConfig(ctx)
if err != nil {
return nil, err
}
if req.Alert != nil {
cfg.Alert.Enabled = req.Alert.Enabled
if req.Alert.Recipients != nil {
cfg.Alert.Recipients = req.Alert.Recipients
}
cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
}
if req.Report != nil {
cfg.Report.Enabled = req.Report.Enabled
if req.Report.Recipients != nil {
cfg.Report.Recipients = req.Report.Recipients
}
cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
}
if err := validateOpsEmailNotificationConfig(cfg); err != nil {
return nil, err
}
normalizeOpsEmailNotificationConfig(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
return nil, err
}
return cfg, nil
}
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
return &OpsEmailNotificationConfig{
Alert: OpsEmailAlertConfig{
Enabled: true,
Recipients: []string{},
MinSeverity: "",
RateLimitPerHour: 0,
BatchingWindowSeconds: 0,
IncludeResolvedAlerts: false,
},
Report: OpsEmailReportConfig{
Enabled: false,
Recipients: []string{},
DailySummaryEnabled: false,
DailySummarySchedule: "0 9 * * *",
WeeklySummaryEnabled: false,
WeeklySummarySchedule: "0 9 * * 1",
ErrorDigestEnabled: false,
ErrorDigestSchedule: "0 9 * * *",
ErrorDigestMinCount: 10,
AccountHealthEnabled: false,
AccountHealthSchedule: "0 9 * * *",
AccountHealthErrorRateThreshold: 10.0,
},
}
}
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
if cfg == nil {
return
}
if cfg.Alert.Recipients == nil {
cfg.Alert.Recipients = []string{}
}
if cfg.Report.Recipients == nil {
cfg.Report.Recipients = []string{}
}
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
if cfg.Report.DailySummarySchedule == "" {
cfg.Report.DailySummarySchedule = "0 9 * * *"
}
if cfg.Report.WeeklySummarySchedule == "" {
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
}
if cfg.Report.ErrorDigestSchedule == "" {
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
}
if cfg.Report.AccountHealthSchedule == "" {
cfg.Report.AccountHealthSchedule = "0 9 * * *"
}
}
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.Alert.RateLimitPerHour < 0 {
return errors.New("alert.rate_limit_per_hour must be >= 0")
}
if cfg.Alert.BatchingWindowSeconds < 0 {
return errors.New("alert.batching_window_seconds must be >= 0")
}
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
case "", "critical", "warning", "info":
default:
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
}
if cfg.Report.ErrorDigestMinCount < 0 {
return errors.New("report.error_digest_min_count must be >= 0")
}
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
}
return nil
}
// =========================
// Alert runtime settings
// =========================
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
return &OpsAlertRuntimeSettings{
EvaluationIntervalSeconds: 60,
DistributedLock: OpsDistributedLockSettings{
Enabled: true,
Key: opsAlertEvaluatorLeaderLockKeyDefault,
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
},
Silencing: OpsAlertSilencingSettings{
Enabled: false,
GlobalUntilRFC3339: "",
GlobalReason: "",
Entries: []OpsAlertSilenceEntry{},
},
}
}
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
if s == nil {
return
}
s.Key = strings.TrimSpace(s.Key)
if s.Key == "" {
s.Key = defaultKey
}
if s.TTLSeconds <= 0 {
s.TTLSeconds = defaultTTLSeconds
}
}
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
if s == nil {
return
}
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
if s.Entries == nil {
s.Entries = []OpsAlertSilenceEntry{}
}
for i := range s.Entries {
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
}
}
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
if strings.TrimSpace(s.Key) == "" {
return errors.New("distributed_lock.key is required")
}
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
}
return nil
}
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
parse := func(raw string) error {
if strings.TrimSpace(raw) == "" {
return nil
}
if _, err := time.Parse(time.RFC3339, raw); err != nil {
return errors.New("silencing time must be RFC3339")
}
return nil
}
if err := parse(s.GlobalUntilRFC3339); err != nil {
return err
}
for _, entry := range s.Entries {
if strings.TrimSpace(entry.UntilRFC3339) == "" {
return errors.New("silencing.entries.until_rfc3339 is required")
}
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
}
}
return nil
}
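// exampleSilencingEntry is an illustrative sketch of a silencing entry that passes the
// validation above: per-entry "until" timestamps are mandatory and must be RFC3339. Only the
// fields visible in this file are set; the reason text is a placeholder.
func exampleSilencingEntry() OpsAlertSilencingSettings {
	return OpsAlertSilencingSettings{
		Enabled: true,
		Entries: []OpsAlertSilenceEntry{
			{
				UntilRFC3339: time.Now().Add(2 * time.Hour).Format(time.RFC3339),
				Reason:       "planned upstream maintenance",
			},
		},
	}
}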
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
defaultCfg := defaultOpsAlertRuntimeSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAlertRuntimeSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
if cfg.EvaluationIntervalSeconds <= 0 {
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
}
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
return cfg, nil
}
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
}
if cfg.DistributedLock.Enabled {
if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
return nil, err
}
}
if cfg.Silencing.Enabled {
if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
return nil, err
}
}
defaultCfg := defaultOpsAlertRuntimeSettings()
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
return nil, err
}
// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
updated := &OpsAlertRuntimeSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
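// exampleSlowDownAlertEvaluation is an illustrative sketch of a read-modify-write update:
// load the current (or default) settings, adjust the evaluation interval, and persist. The
// 300-second value is a placeholder.
func exampleSlowDownAlertEvaluation(ctx context.Context, svc *OpsService) (*OpsAlertRuntimeSettings, error) {
	cfg, err := svc.GetOpsAlertRuntimeSettings(ctx)
	if err != nil {
		return nil, err
	}
	cfg.EvaluationIntervalSeconds = 300 // evaluate every 5 minutes instead of the 60s default
	return svc.UpdateOpsAlertRuntimeSettings(ctx, cfg)
}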

View File

@@ -0,0 +1,27 @@
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
}
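// exampleHourlyThroughput is an illustrative sketch of a typical dashboard call: the caller
// supplies a filter whose StartTime/EndTime span the window of interest (the validation above
// rejects empty or inverted ranges), and 60-second buckets yield one point per minute.
func exampleHourlyThroughput(ctx context.Context, svc *OpsService, filter *OpsDashboardFilter) (*OpsThroughputTrendResponse, error) {
	return svc.GetThroughputTrend(ctx, filter, 60)
}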

View File

@@ -0,0 +1,24 @@
package service
import (
"context"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetWindowStats returns lightweight request/token counts for the provided window.
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
filter := &OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
}
return s.opsRepo.GetWindowStats(ctx, filter)
}
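// exampleSampleRealtimeWindow is an illustrative sketch of a realtime sampling call: a short
// trailing window (10s here, purely illustrative) keeps the query cheap, and the caller can
// derive QPS by dividing the returned request count by the window length.
func exampleSampleRealtimeWindow(ctx context.Context, svc *OpsService) (*OpsWindowStats, error) {
	end := time.Now()
	return svc.GetWindowStats(ctx, end.Add(-10*time.Second), end)
}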

View File

@@ -1,10 +1,12 @@
package service
import (
"database/sql"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire"
"github.com/redis/go-redis/v9"
)
// BuildInfo contains build information
@@ -70,6 +72,57 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc
}
// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector.
func ProvideOpsMetricsCollector(
opsRepo OpsRepository,
settingRepo SettingRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsMetricsCollector {
collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg)
collector.Start()
return collector
}
// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation).
func ProvideOpsAggregationService(
opsRepo OpsRepository,
settingRepo SettingRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsAggregationService {
svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
svc.Start()
return svc
}
// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService.
func ProvideOpsAlertEvaluatorService(
opsService *OpsService,
opsRepo OpsRepository,
emailService *EmailService,
redisClient *redis.Client,
cfg *config.Config,
) *OpsAlertEvaluatorService {
svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
svc.Start()
return svc
}
// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled).
func ProvideOpsCleanupService(
opsRepo OpsRepository,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsCleanupService {
svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
svc.Start()
return svc
}
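// exampleManualOpsWiring is an illustrative sketch of what the providers above resolve to
// without Wire (all parameters are placeholders supplied by the caller): each background
// service is constructed and started during injection, so the returned value is already running.
func exampleManualOpsWiring(opsRepo OpsRepository, settingRepo SettingRepository, db *sql.DB, redisClient *redis.Client, cfg *config.Config) *OpsMetricsCollector {
	return ProvideOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg)
}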
// ProviderSet is the Wire provider set for all services
var ProviderSet = wire.NewSet(
// Core services
@@ -101,6 +154,11 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService,
NewAccountTestService,
NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
NewEmailService,
ProvideEmailQueueService,
NewTurnstileService,