feat(service): implement the ops monitoring business-logic layer
- Add the ops main service (ops_service.go) and port definitions (ops_port.go)
- Implement the account availability check service (ops_account_availability.go)
- Implement the data aggregation service (ops_aggregation_service.go)
- Implement the alert evaluator service (ops_alert_evaluator_service.go)
- Implement the alert management service (ops_alerts.go)
- Implement the data cleanup service (ops_cleanup_service.go)
- Implement the concurrency control service (ops_concurrency.go)
- Implement the dashboard service (ops_dashboard.go)
- Implement the error handling service (ops_errors.go)
- Implement the histogram service (ops_histograms.go)
- Implement the metrics collector service (ops_metrics_collector.go)
- Implement the query mode service (ops_query_mode.go)
- Implement the realtime monitoring service (ops_realtime.go)
- Implement the request details service (ops_request_details.go)
- Implement the retry mechanism service (ops_retry.go)
- Implement the settings management service (ops_settings.go)
- Implement the trend analysis service (ops_trends.go)
- Implement the window statistics service (ops_window_stats.go)
- Add ops-related domain constants
- Register the service layer for dependency injection
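The background services added here (aggregation, alert evaluation, cleanup) all expose simple Start/Stop lifecycles and take their dependencies through constructors. As a rough, hypothetical illustration only: the sketch below assembles them the way a caller might; the constructor signatures match this diff, but the package name, parameter sources, and shutdown ordering are assumptions, and the actual dependency-injection registration is not shown in this excerpt.

package bootstrap // hypothetical wiring helper, not part of this commit

import (
    "database/sql"

    "github.com/Wei-Shaw/sub2api/internal/config"
    "github.com/Wei-Shaw/sub2api/internal/service"
    "github.com/redis/go-redis/v9"
)

// startOpsServices starts the ops background loops and returns a shutdown hook.
func startOpsServices(
    opsRepo service.OpsRepository,
    settingRepo service.SettingRepository,
    opsService *service.OpsService,
    emailService *service.EmailService,
    db *sql.DB,
    rdb *redis.Client,
    cfg *config.Config,
) func() {
    // Constructors below are the ones introduced in this commit.
    agg := service.NewOpsAggregationService(opsRepo, settingRepo, db, rdb, cfg)
    eval := service.NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, rdb, cfg)
    cleanup := service.NewOpsCleanupService(opsRepo, db, rdb, cfg)

    agg.Start()
    eval.Start()
    cleanup.Start()

    // Stop in reverse order so dependents shut down first (ordering is an assumption).
    return func() {
        cleanup.Stop()
        eval.Stop()
        agg.Stop()
    }
}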
@@ -105,6 +105,28 @@ const (
    // Request identity patch (Claude -> Gemini systemInstruction injection)
    SettingKeyEnableIdentityPatch = "enable_identity_patch"
    SettingKeyIdentityPatchPrompt = "identity_patch_prompt"

    // =========================
    // Ops Monitoring (vNext)
    // =========================

    // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
    SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"

    // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
    SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"

    // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
    SettingKeyOpsQueryModeDefault = "ops_query_mode_default"

    // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
    SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"

    // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
    SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"

    // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
    SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
)

// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
backend/internal/service/ops_account_availability.go (new file, 157 lines added)
@@ -0,0 +1,157 @@
package service

import (
    "context"
    "time"
)

// GetAccountAvailabilityStats returns current account availability stats.
//
// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
    map[string]*PlatformAvailability,
    map[int64]*GroupAvailability,
    map[int64]*AccountAvailability,
    *time.Time,
    error,
) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, nil, nil, nil, err
    }

    accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
    if err != nil {
        return nil, nil, nil, nil, err
    }

    if groupIDFilter != nil && *groupIDFilter > 0 {
        filtered := make([]Account, 0, len(accounts))
        for _, acc := range accounts {
            for _, grp := range acc.Groups {
                if grp != nil && grp.ID == *groupIDFilter {
                    filtered = append(filtered, acc)
                    break
                }
            }
        }
        accounts = filtered
    }

    now := time.Now()
    collectedAt := now

    platform := make(map[string]*PlatformAvailability)
    group := make(map[int64]*GroupAvailability)
    account := make(map[int64]*AccountAvailability)

    for _, acc := range accounts {
        if acc.ID <= 0 {
            continue
        }

        isTempUnsched := false
        if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
            isTempUnsched = true
        }

        isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
        isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
        hasError := acc.Status == StatusError

        // Normalize exclusive status flags so the UI doesn't show conflicting badges.
        if hasError {
            isRateLimited = false
            isOverloaded = false
        }

        isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched

        if acc.Platform != "" {
            if _, ok := platform[acc.Platform]; !ok {
                platform[acc.Platform] = &PlatformAvailability{
                    Platform: acc.Platform,
                }
            }
            p := platform[acc.Platform]
            p.TotalAccounts++
            if isAvailable {
                p.AvailableCount++
            }
            if isRateLimited {
                p.RateLimitCount++
            }
            if hasError {
                p.ErrorCount++
            }
        }

        for _, grp := range acc.Groups {
            if grp == nil || grp.ID <= 0 {
                continue
            }
            if _, ok := group[grp.ID]; !ok {
                group[grp.ID] = &GroupAvailability{
                    GroupID:   grp.ID,
                    GroupName: grp.Name,
                    Platform:  grp.Platform,
                }
            }
            g := group[grp.ID]
            g.TotalAccounts++
            if isAvailable {
                g.AvailableCount++
            }
            if isRateLimited {
                g.RateLimitCount++
            }
            if hasError {
                g.ErrorCount++
            }
        }

        displayGroupID := int64(0)
        displayGroupName := ""
        if len(acc.Groups) > 0 && acc.Groups[0] != nil {
            displayGroupID = acc.Groups[0].ID
            displayGroupName = acc.Groups[0].Name
        }

        item := &AccountAvailability{
            AccountID:   acc.ID,
            AccountName: acc.Name,
            Platform:    acc.Platform,
            GroupID:     displayGroupID,
            GroupName:   displayGroupName,
            Status:      acc.Status,

            IsAvailable:   isAvailable,
            IsRateLimited: isRateLimited,
            IsOverloaded:  isOverloaded,
            HasError:      hasError,

            ErrorMessage: acc.ErrorMessage,
        }

        if isRateLimited && acc.RateLimitResetAt != nil {
            item.RateLimitResetAt = acc.RateLimitResetAt
            remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
            if remainingSec > 0 {
                item.RateLimitRemainingSec = &remainingSec
            }
        }
        if isOverloaded && acc.OverloadUntil != nil {
            item.OverloadUntil = acc.OverloadUntil
            remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
            if remainingSec > 0 {
                item.OverloadRemainingSec = &remainingSec
            }
        }
        if isTempUnsched && acc.TempUnschedulableUntil != nil {
            item.TempUnschedulableUntil = acc.TempUnschedulableUntil
        }

        account[acc.ID] = item
    }

    return platform, group, account, &collectedAt, nil
}
backend/internal/service/ops_aggregation_service.go (new file, 434 lines added)
@@ -0,0 +1,434 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
"github.com/google/uuid"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
const (
|
||||
opsAggHourlyJobName = "ops_preaggregation_hourly"
|
||||
opsAggDailyJobName = "ops_preaggregation_daily"
|
||||
|
||||
opsAggHourlyInterval = 10 * time.Minute
|
||||
opsAggDailyInterval = 1 * time.Hour
|
||||
|
||||
// Keep in sync with ops retention target (vNext default 30d).
|
||||
opsAggBackfillWindow = 30 * 24 * time.Hour
|
||||
|
||||
// Recompute overlap to absorb late-arriving rows near boundaries.
|
||||
opsAggHourlyOverlap = 2 * time.Hour
|
||||
opsAggDailyOverlap = 48 * time.Hour
|
||||
|
||||
opsAggHourlyChunk = 24 * time.Hour
|
||||
opsAggDailyChunk = 7 * 24 * time.Hour
|
||||
|
||||
// Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
|
||||
// that may still receive late inserts.
|
||||
opsAggSafeDelay = 5 * time.Minute
|
||||
|
||||
opsAggMaxQueryTimeout = 3 * time.Second
|
||||
opsAggHourlyTimeout = 5 * time.Minute
|
||||
opsAggDailyTimeout = 2 * time.Minute
|
||||
|
||||
opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
|
||||
opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
|
||||
|
||||
opsAggHourlyLeaderLockTTL = 15 * time.Minute
|
||||
opsAggDailyLeaderLockTTL = 10 * time.Minute
|
||||
)
|
||||
|
||||
// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
|
||||
// for stable long-window dashboard queries.
|
||||
//
|
||||
// It is safe to run in multi-replica deployments when Redis is available (leader lock).
|
||||
type OpsAggregationService struct {
|
||||
opsRepo OpsRepository
|
||||
settingRepo SettingRepository
|
||||
cfg *config.Config
|
||||
|
||||
db *sql.DB
|
||||
redisClient *redis.Client
|
||||
instanceID string
|
||||
|
||||
stopCh chan struct{}
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
|
||||
hourlyMu sync.Mutex
|
||||
dailyMu sync.Mutex
|
||||
|
||||
skipLogMu sync.Mutex
|
||||
skipLogAt time.Time
|
||||
}
|
||||
|
||||
func NewOpsAggregationService(
|
||||
opsRepo OpsRepository,
|
||||
settingRepo SettingRepository,
|
||||
db *sql.DB,
|
||||
redisClient *redis.Client,
|
||||
cfg *config.Config,
|
||||
) *OpsAggregationService {
|
||||
return &OpsAggregationService{
|
||||
opsRepo: opsRepo,
|
||||
settingRepo: settingRepo,
|
||||
cfg: cfg,
|
||||
db: db,
|
||||
redisClient: redisClient,
|
||||
instanceID: uuid.NewString(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) Start() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.startOnce.Do(func() {
|
||||
if s.stopCh == nil {
|
||||
s.stopCh = make(chan struct{})
|
||||
}
|
||||
go s.hourlyLoop()
|
||||
go s.dailyLoop()
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) Stop() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.stopOnce.Do(func() {
|
||||
if s.stopCh != nil {
|
||||
close(s.stopCh)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) hourlyLoop() {
|
||||
// First run immediately.
|
||||
s.aggregateHourly()
|
||||
|
||||
ticker := time.NewTicker(opsAggHourlyInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
s.aggregateHourly()
|
||||
case <-s.stopCh:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) dailyLoop() {
|
||||
// First run immediately.
|
||||
s.aggregateDaily()
|
||||
|
||||
ticker := time.NewTicker(opsAggDailyInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
s.aggregateDaily()
|
||||
case <-s.stopCh:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) aggregateHourly() {
|
||||
if s == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
if s.cfg != nil {
|
||||
if !s.cfg.Ops.Enabled {
|
||||
return
|
||||
}
|
||||
if !s.cfg.Ops.Aggregation.Enabled {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
|
||||
defer cancel()
|
||||
|
||||
if !s.isMonitoringEnabled(ctx) {
|
||||
return
|
||||
}
|
||||
|
||||
release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
s.hourlyMu.Lock()
|
||||
defer s.hourlyMu.Unlock()
|
||||
|
||||
startedAt := time.Now().UTC()
|
||||
runAt := startedAt
|
||||
|
||||
// Aggregate stable full hours only.
|
||||
end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
|
||||
start := end.Add(-opsAggBackfillWindow)
|
||||
|
||||
// Resume from the latest bucket with overlap.
|
||||
{
|
||||
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||
latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
|
||||
cancelMax()
|
||||
if err != nil {
|
||||
log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
|
||||
} else if ok {
|
||||
candidate := latest.Add(-opsAggHourlyOverlap)
|
||||
if candidate.After(start) {
|
||||
start = candidate
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
start = utcFloorToHour(start)
|
||||
if !start.Before(end) {
|
||||
return
|
||||
}
|
||||
|
||||
var aggErr error
|
||||
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
|
||||
chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
|
||||
if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||
aggErr = err
|
||||
log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
finishedAt := time.Now().UTC()
|
||||
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||
dur := durationMs
|
||||
|
||||
if aggErr != nil {
|
||||
msg := truncateString(aggErr.Error(), 2048)
|
||||
errAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer hbCancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAggHourlyJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastErrorAt: &errAt,
|
||||
LastError: &msg,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
successAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer hbCancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAggHourlyJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastSuccessAt: &successAt,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) aggregateDaily() {
|
||||
if s == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
if s.cfg != nil {
|
||||
if !s.cfg.Ops.Enabled {
|
||||
return
|
||||
}
|
||||
if !s.cfg.Ops.Aggregation.Enabled {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
|
||||
defer cancel()
|
||||
|
||||
if !s.isMonitoringEnabled(ctx) {
|
||||
return
|
||||
}
|
||||
|
||||
release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
s.dailyMu.Lock()
|
||||
defer s.dailyMu.Unlock()
|
||||
|
||||
startedAt := time.Now().UTC()
|
||||
runAt := startedAt
|
||||
|
||||
end := utcFloorToDay(time.Now().UTC())
|
||||
start := end.Add(-opsAggBackfillWindow)
|
||||
|
||||
{
|
||||
ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
|
||||
latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
|
||||
cancelMax()
|
||||
if err != nil {
|
||||
log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
|
||||
} else if ok {
|
||||
candidate := latest.Add(-opsAggDailyOverlap)
|
||||
if candidate.After(start) {
|
||||
start = candidate
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
start = utcFloorToDay(start)
|
||||
if !start.Before(end) {
|
||||
return
|
||||
}
|
||||
|
||||
var aggErr error
|
||||
for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
|
||||
chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
|
||||
if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
|
||||
aggErr = err
|
||||
log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
finishedAt := time.Now().UTC()
|
||||
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||
dur := durationMs
|
||||
|
||||
if aggErr != nil {
|
||||
msg := truncateString(aggErr.Error(), 2048)
|
||||
errAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer hbCancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAggDailyJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastErrorAt: &errAt,
|
||||
LastError: &msg,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
successAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer hbCancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAggDailyJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastSuccessAt: &successAt,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
|
||||
if s == nil {
|
||||
return false
|
||||
}
|
||||
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||
return false
|
||||
}
|
||||
if s.settingRepo == nil {
|
||||
return true
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrSettingNotFound) {
|
||||
return true
|
||||
}
|
||||
return true
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||
case "false", "0", "off", "disabled":
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
var opsAggReleaseScript = redis.NewScript(`
|
||||
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||
return redis.call("DEL", KEYS[1])
|
||||
end
|
||||
return 0
|
||||
`)
|
||||
|
||||
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
|
||||
if s == nil || s.redisClient == nil {
|
||||
return nil, true
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
|
||||
if err != nil {
|
||||
// Fail-open: do not block single-instance deployments.
|
||||
return nil, true
|
||||
}
|
||||
if !ok {
|
||||
s.maybeLogSkip(logPrefix)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
release := func() {
|
||||
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
|
||||
}
|
||||
return release, true
|
||||
}
|
||||
|
||||
func (s *OpsAggregationService) maybeLogSkip(prefix string) {
|
||||
s.skipLogMu.Lock()
|
||||
defer s.skipLogMu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
|
||||
return
|
||||
}
|
||||
s.skipLogAt = now
|
||||
if prefix == "" {
|
||||
prefix = "[OpsAggregation]"
|
||||
}
|
||||
log.Printf("%s leader lock held by another instance; skipping", prefix)
|
||||
}
|
||||
|
||||
func utcFloorToHour(t time.Time) time.Time {
|
||||
return t.UTC().Truncate(time.Hour)
|
||||
}
|
||||
|
||||
func utcFloorToDay(t time.Time) time.Time {
|
||||
u := t.UTC()
|
||||
y, m, d := u.Date()
|
||||
return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
|
||||
}
|
||||
|
||||
func minTime(a, b time.Time) time.Time {
|
||||
if a.Before(b) {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
backend/internal/service/ops_alert_evaluator_service.go (new file, 839 lines added)
@@ -0,0 +1,839 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
"github.com/google/uuid"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
const (
|
||||
opsAlertEvaluatorJobName = "ops_alert_evaluator"
|
||||
|
||||
opsAlertEvaluatorTimeout = 45 * time.Second
|
||||
opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
|
||||
opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
|
||||
opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
|
||||
)
|
||||
|
||||
var opsAlertEvaluatorReleaseScript = redis.NewScript(`
|
||||
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||
return redis.call("DEL", KEYS[1])
|
||||
end
|
||||
return 0
|
||||
`)
|
||||
|
||||
type OpsAlertEvaluatorService struct {
|
||||
opsService *OpsService
|
||||
opsRepo OpsRepository
|
||||
emailService *EmailService
|
||||
|
||||
redisClient *redis.Client
|
||||
cfg *config.Config
|
||||
instanceID string
|
||||
|
||||
stopCh chan struct{}
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
wg sync.WaitGroup
|
||||
|
||||
mu sync.Mutex
|
||||
ruleStates map[int64]*opsAlertRuleState
|
||||
|
||||
emailLimiter *slidingWindowLimiter
|
||||
|
||||
skipLogMu sync.Mutex
|
||||
skipLogAt time.Time
|
||||
|
||||
warnNoRedisOnce sync.Once
|
||||
}
|
||||
|
||||
type opsAlertRuleState struct {
|
||||
LastEvaluatedAt time.Time
|
||||
ConsecutiveBreaches int
|
||||
}
|
||||
|
||||
func NewOpsAlertEvaluatorService(
|
||||
opsService *OpsService,
|
||||
opsRepo OpsRepository,
|
||||
emailService *EmailService,
|
||||
redisClient *redis.Client,
|
||||
cfg *config.Config,
|
||||
) *OpsAlertEvaluatorService {
|
||||
return &OpsAlertEvaluatorService{
|
||||
opsService: opsService,
|
||||
opsRepo: opsRepo,
|
||||
emailService: emailService,
|
||||
redisClient: redisClient,
|
||||
cfg: cfg,
|
||||
instanceID: uuid.NewString(),
|
||||
ruleStates: map[int64]*opsAlertRuleState{},
|
||||
emailLimiter: newSlidingWindowLimiter(0, time.Hour),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) Start() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.startOnce.Do(func() {
|
||||
if s.stopCh == nil {
|
||||
s.stopCh = make(chan struct{})
|
||||
}
|
||||
go s.run()
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) Stop() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.stopOnce.Do(func() {
|
||||
if s.stopCh != nil {
|
||||
close(s.stopCh)
|
||||
}
|
||||
})
|
||||
s.wg.Wait()
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) run() {
|
||||
s.wg.Add(1)
|
||||
defer s.wg.Done()
|
||||
|
||||
// Start immediately to produce early feedback in ops dashboard.
|
||||
timer := time.NewTimer(0)
|
||||
defer timer.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-timer.C:
|
||||
interval := s.getInterval()
|
||||
s.evaluateOnce(interval)
|
||||
timer.Reset(interval)
|
||||
case <-s.stopCh:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
|
||||
// Default.
|
||||
interval := 60 * time.Second
|
||||
|
||||
if s == nil || s.opsService == nil {
|
||||
return interval
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
|
||||
if err != nil || cfg == nil {
|
||||
return interval
|
||||
}
|
||||
if cfg.EvaluationIntervalSeconds <= 0 {
|
||||
return interval
|
||||
}
|
||||
if cfg.EvaluationIntervalSeconds < 1 {
|
||||
return interval
|
||||
}
|
||||
if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) {
|
||||
return interval
|
||||
}
|
||||
return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
|
||||
if s == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
|
||||
defer cancel()
|
||||
|
||||
if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
|
||||
return
|
||||
}
|
||||
|
||||
runtimeCfg := defaultOpsAlertRuntimeSettings()
|
||||
if s.opsService != nil {
|
||||
if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
|
||||
runtimeCfg = loaded
|
||||
}
|
||||
}
|
||||
|
||||
release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
startedAt := time.Now().UTC()
|
||||
runAt := startedAt
|
||||
|
||||
rules, err := s.opsRepo.ListAlertRules(ctx)
|
||||
if err != nil {
|
||||
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
|
||||
log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
safeEnd := now.Truncate(time.Minute)
|
||||
if safeEnd.IsZero() {
|
||||
safeEnd = now
|
||||
}
|
||||
|
||||
systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)
|
||||
|
||||
// Cleanup stale state for removed rules.
|
||||
s.pruneRuleStates(rules)
|
||||
|
||||
for _, rule := range rules {
|
||||
if rule == nil || !rule.Enabled || rule.ID <= 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
|
||||
|
||||
windowMinutes := rule.WindowMinutes
|
||||
if windowMinutes <= 0 {
|
||||
windowMinutes = 1
|
||||
}
|
||||
windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
|
||||
windowEnd := safeEnd
|
||||
|
||||
metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
|
||||
if !ok {
|
||||
s.resetRuleState(rule.ID, now)
|
||||
continue
|
||||
}
|
||||
|
||||
breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
|
||||
required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
|
||||
consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)
|
||||
|
||||
activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
|
||||
if err != nil {
|
||||
log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if breachedNow && consecutive >= required {
|
||||
if activeEvent != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
|
||||
if err != nil {
|
||||
log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
|
||||
continue
|
||||
}
|
||||
if latestEvent != nil && rule.CooldownMinutes > 0 {
|
||||
cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
|
||||
if now.Sub(latestEvent.FiredAt) < cooldown {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
firedEvent := &OpsAlertEvent{
|
||||
RuleID: rule.ID,
|
||||
Severity: strings.TrimSpace(rule.Severity),
|
||||
Status: OpsAlertStatusFiring,
|
||||
Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
|
||||
Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
|
||||
MetricValue: float64Ptr(metricValue),
|
||||
ThresholdValue: float64Ptr(rule.Threshold),
|
||||
Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID),
|
||||
FiredAt: now,
|
||||
CreatedAt: now,
|
||||
}
|
||||
|
||||
created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
|
||||
if err != nil {
|
||||
log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if created != nil && created.ID > 0 {
|
||||
s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Not breached: resolve active event if present.
|
||||
if activeEvent != nil {
|
||||
resolvedAt := now
|
||||
if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
|
||||
log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
live := map[int64]struct{}{}
|
||||
for _, r := range rules {
|
||||
if r != nil && r.ID > 0 {
|
||||
live[r.ID] = struct{}{}
|
||||
}
|
||||
}
|
||||
for id := range s.ruleStates {
|
||||
if _, ok := live[id]; !ok {
|
||||
delete(s.ruleStates, id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
|
||||
if ruleID <= 0 {
|
||||
return
|
||||
}
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
state, ok := s.ruleStates[ruleID]
|
||||
if !ok {
|
||||
state = &opsAlertRuleState{}
|
||||
s.ruleStates[ruleID] = state
|
||||
}
|
||||
state.LastEvaluatedAt = now
|
||||
state.ConsecutiveBreaches = 0
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
|
||||
if ruleID <= 0 {
|
||||
return 0
|
||||
}
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
state, ok := s.ruleStates[ruleID]
|
||||
if !ok {
|
||||
state = &opsAlertRuleState{}
|
||||
s.ruleStates[ruleID] = state
|
||||
}
|
||||
|
||||
if !state.LastEvaluatedAt.IsZero() && interval > 0 {
|
||||
if now.Sub(state.LastEvaluatedAt) > interval*2 {
|
||||
state.ConsecutiveBreaches = 0
|
||||
}
|
||||
}
|
||||
|
||||
state.LastEvaluatedAt = now
|
||||
if breached {
|
||||
state.ConsecutiveBreaches++
|
||||
} else {
|
||||
state.ConsecutiveBreaches = 0
|
||||
}
|
||||
return state.ConsecutiveBreaches
|
||||
}
|
||||
|
||||
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
|
||||
if sustainedMinutes <= 0 {
|
||||
return 1
|
||||
}
|
||||
if interval <= 0 {
|
||||
return sustainedMinutes
|
||||
}
|
||||
required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
|
||||
if required < 1 {
|
||||
return 1
|
||||
}
|
||||
return required
|
||||
}
|
||||
|
||||
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
|
||||
if filters == nil {
|
||||
return "", nil
|
||||
}
|
||||
if v, ok := filters["platform"]; ok {
|
||||
if s, ok := v.(string); ok {
|
||||
platform = strings.TrimSpace(s)
|
||||
}
|
||||
}
|
||||
if v, ok := filters["group_id"]; ok {
|
||||
switch t := v.(type) {
|
||||
case float64:
|
||||
if t > 0 {
|
||||
id := int64(t)
|
||||
groupID = &id
|
||||
}
|
||||
case int64:
|
||||
if t > 0 {
|
||||
id := t
|
||||
groupID = &id
|
||||
}
|
||||
case int:
|
||||
if t > 0 {
|
||||
id := int64(t)
|
||||
groupID = &id
|
||||
}
|
||||
case string:
|
||||
n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64)
|
||||
if err == nil && n > 0 {
|
||||
groupID = &n
|
||||
}
|
||||
}
|
||||
}
|
||||
return platform, groupID
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) computeRuleMetric(
|
||||
ctx context.Context,
|
||||
rule *OpsAlertRule,
|
||||
systemMetrics *OpsSystemMetricsSnapshot,
|
||||
start time.Time,
|
||||
end time.Time,
|
||||
platform string,
|
||||
groupID *int64,
|
||||
) (float64, bool) {
|
||||
if rule == nil {
|
||||
return 0, false
|
||||
}
|
||||
switch strings.TrimSpace(rule.MetricType) {
|
||||
case "cpu_usage_percent":
|
||||
if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
|
||||
return *systemMetrics.CPUUsagePercent, true
|
||||
}
|
||||
return 0, false
|
||||
case "memory_usage_percent":
|
||||
if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
|
||||
return *systemMetrics.MemoryUsagePercent, true
|
||||
}
|
||||
return 0, false
|
||||
case "concurrency_queue_depth":
|
||||
if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
|
||||
return float64(*systemMetrics.ConcurrencyQueueDepth), true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||
StartTime: start,
|
||||
EndTime: end,
|
||||
Platform: platform,
|
||||
GroupID: groupID,
|
||||
QueryMode: OpsQueryModeRaw,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
if overview == nil {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
switch strings.TrimSpace(rule.MetricType) {
|
||||
case "success_rate":
|
||||
if overview.RequestCountSLA <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return overview.SLA * 100, true
|
||||
case "error_rate":
|
||||
if overview.RequestCountSLA <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return overview.ErrorRate * 100, true
|
||||
case "upstream_error_rate":
|
||||
if overview.RequestCountSLA <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return overview.UpstreamErrorRate * 100, true
|
||||
case "p95_latency_ms":
|
||||
if overview.Duration.P95 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P95), true
|
||||
case "p99_latency_ms":
|
||||
if overview.Duration.P99 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P99), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
func compareMetric(value float64, operator string, threshold float64) bool {
|
||||
switch strings.TrimSpace(operator) {
|
||||
case ">":
|
||||
return value > threshold
|
||||
case ">=":
|
||||
return value >= threshold
|
||||
case "<":
|
||||
return value < threshold
|
||||
case "<=":
|
||||
return value <= threshold
|
||||
case "==":
|
||||
return value == threshold
|
||||
case "!=":
|
||||
return value != threshold
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
|
||||
dims := map[string]any{}
|
||||
if strings.TrimSpace(platform) != "" {
|
||||
dims["platform"] = strings.TrimSpace(platform)
|
||||
}
|
||||
if groupID != nil && *groupID > 0 {
|
||||
dims["group_id"] = *groupID
|
||||
}
|
||||
if len(dims) == 0 {
|
||||
return nil
|
||||
}
|
||||
return dims
|
||||
}
|
||||
|
||||
func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
|
||||
if rule == nil {
|
||||
return ""
|
||||
}
|
||||
scope := "overall"
|
||||
if strings.TrimSpace(platform) != "" {
|
||||
scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform))
|
||||
}
|
||||
if groupID != nil && *groupID > 0 {
|
||||
scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
|
||||
}
|
||||
if windowMinutes <= 0 {
|
||||
windowMinutes = 1
|
||||
}
|
||||
return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
|
||||
strings.TrimSpace(rule.MetricType),
|
||||
strings.TrimSpace(rule.Operator),
|
||||
rule.Threshold,
|
||||
value,
|
||||
windowMinutes,
|
||||
strings.TrimSpace(scope),
|
||||
)
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
|
||||
if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
|
||||
return
|
||||
}
|
||||
if event.EmailSent {
|
||||
return
|
||||
}
|
||||
if !rule.NotifyEmail {
|
||||
return
|
||||
}
|
||||
|
||||
emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
|
||||
if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
|
||||
return
|
||||
}
|
||||
|
||||
if len(emailCfg.Alert.Recipients) == 0 {
|
||||
return
|
||||
}
|
||||
if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
|
||||
return
|
||||
}
|
||||
|
||||
if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
|
||||
if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Apply/update rate limiter.
|
||||
s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)
|
||||
|
||||
subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
|
||||
body := buildOpsAlertEmailBody(rule, event)
|
||||
|
||||
anySent := false
|
||||
for _, to := range emailCfg.Alert.Recipients {
|
||||
addr := strings.TrimSpace(to)
|
||||
if addr == "" {
|
||||
continue
|
||||
}
|
||||
if !s.emailLimiter.Allow(time.Now().UTC()) {
|
||||
continue
|
||||
}
|
||||
if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
|
||||
// Ignore per-recipient failures; continue best-effort.
|
||||
continue
|
||||
}
|
||||
anySent = true
|
||||
}
|
||||
|
||||
if anySent {
|
||||
_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
|
||||
}
|
||||
}
|
||||
|
||||
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
|
||||
if rule == nil || event == nil {
|
||||
return ""
|
||||
}
|
||||
metric := strings.TrimSpace(rule.MetricType)
|
||||
value := "-"
|
||||
threshold := fmt.Sprintf("%.2f", rule.Threshold)
|
||||
if event.MetricValue != nil {
|
||||
value = fmt.Sprintf("%.2f", *event.MetricValue)
|
||||
}
|
||||
if event.ThresholdValue != nil {
|
||||
threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
|
||||
}
|
||||
return fmt.Sprintf(`
|
||||
<h2>Ops Alert</h2>
|
||||
<p><b>Rule</b>: %s</p>
|
||||
<p><b>Severity</b>: %s</p>
|
||||
<p><b>Status</b>: %s</p>
|
||||
<p><b>Metric</b>: %s %s %s</p>
|
||||
<p><b>Fired at</b>: %s</p>
|
||||
<p><b>Description</b>: %s</p>
|
||||
`,
|
||||
htmlEscape(rule.Name),
|
||||
htmlEscape(rule.Severity),
|
||||
htmlEscape(event.Status),
|
||||
htmlEscape(metric),
|
||||
htmlEscape(rule.Operator),
|
||||
htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
|
||||
event.FiredAt.Format(time.RFC3339),
|
||||
htmlEscape(event.Description),
|
||||
)
|
||||
}
|
||||
|
||||
func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
|
||||
minSeverity = strings.ToLower(strings.TrimSpace(minSeverity))
|
||||
if minSeverity == "" {
|
||||
return true
|
||||
}
|
||||
|
||||
eventLevel := opsEmailSeverityForOps(ruleSeverity)
|
||||
minLevel := strings.ToLower(minSeverity)
|
||||
|
||||
rank := func(level string) int {
|
||||
switch level {
|
||||
case "critical":
|
||||
return 3
|
||||
case "warning":
|
||||
return 2
|
||||
case "info":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
return rank(eventLevel) >= rank(minLevel)
|
||||
}
|
||||
|
||||
func opsEmailSeverityForOps(severity string) string {
|
||||
switch strings.ToUpper(strings.TrimSpace(severity)) {
|
||||
case "P0":
|
||||
return "critical"
|
||||
case "P1":
|
||||
return "warning"
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
|
||||
func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
|
||||
if !silencing.Enabled {
|
||||
return false
|
||||
}
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
|
||||
if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
|
||||
if now.Before(t) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, entry := range silencing.Entries {
|
||||
untilRaw := strings.TrimSpace(entry.UntilRFC3339)
|
||||
if untilRaw == "" {
|
||||
continue
|
||||
}
|
||||
until, err := time.Parse(time.RFC3339, untilRaw)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if now.After(until) {
|
||||
continue
|
||||
}
|
||||
if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
|
||||
continue
|
||||
}
|
||||
if len(entry.Severities) > 0 {
|
||||
match := false
|
||||
for _, s := range entry.Severities {
|
||||
if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
|
||||
match = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !match {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
|
||||
if !lock.Enabled {
|
||||
return nil, true
|
||||
}
|
||||
if s.redisClient == nil {
|
||||
s.warnNoRedisOnce.Do(func() {
|
||||
log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
|
||||
})
|
||||
return nil, true
|
||||
}
|
||||
key := strings.TrimSpace(lock.Key)
|
||||
if key == "" {
|
||||
key = opsAlertEvaluatorLeaderLockKey
|
||||
}
|
||||
ttl := time.Duration(lock.TTLSeconds) * time.Second
|
||||
if ttl <= 0 {
|
||||
ttl = opsAlertEvaluatorLeaderLockTTL
|
||||
}
|
||||
|
||||
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
|
||||
if err != nil {
|
||||
// Fail-open for single-node environments, but warn.
|
||||
s.warnNoRedisOnce.Do(func() {
|
||||
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
|
||||
})
|
||||
return nil, true
|
||||
}
|
||||
if !ok {
|
||||
s.maybeLogSkip(key)
|
||||
return nil, false
|
||||
}
|
||||
return func() {
|
||||
_, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
|
||||
}, true
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
|
||||
s.skipLogMu.Lock()
|
||||
defer s.skipLogMu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
|
||||
return
|
||||
}
|
||||
s.skipLogAt = now
|
||||
log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||
if s == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
durMs := duration.Milliseconds()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAlertEvaluatorJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastSuccessAt: &now,
|
||||
LastDurationMs: &durMs,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||
if s == nil || s.opsRepo == nil || err == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
durMs := duration.Milliseconds()
|
||||
msg := truncateString(err.Error(), 2048)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsAlertEvaluatorJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastErrorAt: &now,
|
||||
LastError: &msg,
|
||||
LastDurationMs: &durMs,
|
||||
})
|
||||
}
|
||||
|
||||
func htmlEscape(s string) string {
|
||||
    replacer := strings.NewReplacer(
        "&", "&amp;",
        "<", "&lt;",
        ">", "&gt;",
        `"`, "&quot;",
        "'", "&#39;",
    )
|
||||
return replacer.Replace(s)
|
||||
}
|
||||
|
||||
type slidingWindowLimiter struct {
|
||||
mu sync.Mutex
|
||||
limit int
|
||||
window time.Duration
|
||||
sent []time.Time
|
||||
}
|
||||
|
||||
func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
|
||||
if window <= 0 {
|
||||
window = time.Hour
|
||||
}
|
||||
return &slidingWindowLimiter{
|
||||
limit: limit,
|
||||
window: window,
|
||||
sent: []time.Time{},
|
||||
}
|
||||
}
|
||||
|
||||
func (l *slidingWindowLimiter) SetLimit(limit int) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.limit = limit
|
||||
}
|
||||
|
||||
func (l *slidingWindowLimiter) Allow(now time.Time) bool {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
|
||||
if l.limit <= 0 {
|
||||
return true
|
||||
}
|
||||
cutoff := now.Add(-l.window)
|
||||
keep := l.sent[:0]
|
||||
for _, t := range l.sent {
|
||||
if t.After(cutoff) {
|
||||
keep = append(keep, t)
|
||||
}
|
||||
}
|
||||
l.sent = keep
|
||||
if len(l.sent) >= l.limit {
|
||||
return false
|
||||
}
|
||||
l.sent = append(l.sent, now)
|
||||
return true
|
||||
}
|
||||
backend/internal/service/ops_alerts.go (new file, 162 lines added)
@@ -0,0 +1,162 @@
package service

import (
    "context"
    "database/sql"
    "errors"
    "strings"
    "time"

    infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return []*OpsAlertRule{}, nil
    }
    return s.opsRepo.ListAlertRules(ctx)
}

func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if rule == nil {
        return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
    }

    created, err := s.opsRepo.CreateAlertRule(ctx, rule)
    if err != nil {
        return nil, err
    }
    return created, nil
}

func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if rule == nil || rule.ID <= 0 {
        return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
    }

    updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
    if err != nil {
        if errors.Is(err, sql.ErrNoRows) {
            return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
        }
        return nil, err
    }
    return updated, nil
}

func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return err
    }
    if s.opsRepo == nil {
        return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if id <= 0 {
        return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
    }
    if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
        if errors.Is(err, sql.ErrNoRows) {
            return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
        }
        return err
    }
    return nil
}

func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return []*OpsAlertEvent{}, nil
    }
    return s.opsRepo.ListAlertEvents(ctx, filter)
}

func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if ruleID <= 0 {
        return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
    }
    return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
}

func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if ruleID <= 0 {
        return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
    }
    return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
}

func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if event == nil {
        return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
    }

    created, err := s.opsRepo.CreateAlertEvent(ctx, event)
    if err != nil {
        return nil, err
    }
    return created, nil
}

func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return err
    }
    if s.opsRepo == nil {
        return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if eventID <= 0 {
        return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
    }
    if strings.TrimSpace(status) == "" {
        return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
    }
    return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
}

func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return err
    }
    if s.opsRepo == nil {
        return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if eventID <= 0 {
        return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
    }
    return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
}
backend/internal/service/ops_cleanup_service.go (new file, 361 lines added)
@@ -0,0 +1,361 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
"github.com/google/uuid"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/robfig/cron/v3"
|
||||
)
|
||||
|
||||
const (
|
||||
opsCleanupJobName = "ops_cleanup"
|
||||
|
||||
opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
|
||||
opsCleanupLeaderLockTTLDefault = 30 * time.Minute
|
||||
)
|
||||
|
||||
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
|
||||
|
||||
var opsCleanupReleaseScript = redis.NewScript(`
|
||||
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||
return redis.call("DEL", KEYS[1])
|
||||
end
|
||||
return 0
|
||||
`)
|
||||
|
||||
// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
|
||||
//
|
||||
// - Scheduling: 5-field cron spec (minute hour dom month dow).
|
||||
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
|
||||
// - Safety: deletes in batches to avoid long transactions.
|
||||
type OpsCleanupService struct {
|
||||
opsRepo OpsRepository
|
||||
db *sql.DB
|
||||
redisClient *redis.Client
|
||||
cfg *config.Config
|
||||
|
||||
instanceID string
|
||||
|
||||
cron *cron.Cron
|
||||
entryID cron.EntryID
|
||||
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
|
||||
warnNoRedisOnce sync.Once
|
||||
}
|
||||
|
||||
func NewOpsCleanupService(
|
||||
opsRepo OpsRepository,
|
||||
db *sql.DB,
|
||||
redisClient *redis.Client,
|
||||
cfg *config.Config,
|
||||
) *OpsCleanupService {
|
||||
return &OpsCleanupService{
|
||||
opsRepo: opsRepo,
|
||||
db: db,
|
||||
redisClient: redisClient,
|
||||
cfg: cfg,
|
||||
instanceID: uuid.NewString(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) Start() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
if s.cfg != nil && !s.cfg.Ops.Enabled {
|
||||
return
|
||||
}
|
||||
if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
|
||||
log.Printf("[OpsCleanup] not started (disabled)")
|
||||
return
|
||||
}
|
||||
if s.opsRepo == nil || s.db == nil {
|
||||
log.Printf("[OpsCleanup] not started (missing deps)")
|
||||
return
|
||||
}
|
||||
|
||||
s.startOnce.Do(func() {
|
||||
schedule := "0 2 * * *"
|
||||
if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
|
||||
schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
|
||||
}
|
||||
|
||||
loc := time.Local
|
||||
if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
|
||||
if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
|
||||
loc = parsed
|
||||
}
|
||||
}
|
||||
|
||||
c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
|
||||
id, err := c.AddFunc(schedule, func() { s.runScheduled() })
|
||||
if err != nil {
|
||||
log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
|
||||
return
|
||||
}
|
||||
s.cron = c
|
||||
s.entryID = id
|
||||
s.cron.Start()
|
||||
log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) Stop() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.stopOnce.Do(func() {
|
||||
if s.cron != nil {
|
||||
ctx := s.cron.Stop()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(3 * time.Second):
|
||||
log.Printf("[OpsCleanup] cron stop timed out")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) runScheduled() {
|
||||
if s == nil || s.db == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
release, ok := s.tryAcquireLeaderLock(ctx)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
startedAt := time.Now().UTC()
|
||||
runAt := startedAt
|
||||
|
||||
counts, err := s.runCleanupOnce(ctx)
|
||||
if err != nil {
|
||||
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
|
||||
log.Printf("[OpsCleanup] cleanup failed: %v", err)
|
||||
return
|
||||
}
|
||||
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
|
||||
log.Printf("[OpsCleanup] cleanup complete: %s", counts)
|
||||
}
|
||||
|
||||
type opsCleanupDeletedCounts struct {
|
||||
errorLogs int64
|
||||
retryAttempts int64
|
||||
alertEvents int64
|
||||
systemMetrics int64
|
||||
hourlyPreagg int64
|
||||
dailyPreagg int64
|
||||
}
|
||||
|
||||
func (c opsCleanupDeletedCounts) String() string {
|
||||
return fmt.Sprintf(
|
||||
"error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
|
||||
c.errorLogs,
|
||||
c.retryAttempts,
|
||||
c.alertEvents,
|
||||
c.systemMetrics,
|
||||
c.hourlyPreagg,
|
||||
c.dailyPreagg,
|
||||
)
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
|
||||
out := opsCleanupDeletedCounts{}
|
||||
if s == nil || s.db == nil || s.cfg == nil {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
batchSize := 5000
|
||||
|
||||
now := time.Now().UTC()
|
||||
|
||||
// Error-like tables: error logs / retry attempts / alert events.
|
||||
if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.errorLogs = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.retryAttempts = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.alertEvents = n
|
||||
}
|
||||
|
||||
// Minute-level metrics snapshots.
|
||||
if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.systemMetrics = n
|
||||
}
|
||||
|
||||
// Pre-aggregation tables (hourly/daily).
|
||||
if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.hourlyPreagg = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.dailyPreagg = n
|
||||
}
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func deleteOldRowsByID(
|
||||
ctx context.Context,
|
||||
db *sql.DB,
|
||||
table string,
|
||||
timeColumn string,
|
||||
cutoff time.Time,
|
||||
batchSize int,
|
||||
castCutoffToDate bool,
|
||||
) (int64, error) {
|
||||
if db == nil {
|
||||
return 0, nil
|
||||
}
|
||||
if batchSize <= 0 {
|
||||
batchSize = 5000
|
||||
}
|
||||
|
||||
where := fmt.Sprintf("%s < $1", timeColumn)
|
||||
if castCutoffToDate {
|
||||
where = fmt.Sprintf("%s < $1::date", timeColumn)
|
||||
}
|
||||
|
||||
q := fmt.Sprintf(`
|
||||
WITH batch AS (
|
||||
SELECT id FROM %s
|
||||
WHERE %s
|
||||
ORDER BY id
|
||||
LIMIT $2
|
||||
)
|
||||
DELETE FROM %s
|
||||
WHERE id IN (SELECT id FROM batch)
|
||||
`, table, where, table)
|
||||
|
||||
var total int64
|
||||
for {
|
||||
res, err := db.ExecContext(ctx, q, cutoff, batchSize)
|
||||
if err != nil {
|
||||
// If ops tables aren't present yet (partial deployments), treat as no-op.
|
||||
if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
|
||||
return total, nil
|
||||
}
|
||||
return total, err
|
||||
}
|
||||
affected, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return total, err
|
||||
}
|
||||
total += affected
|
||||
if affected == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return total, nil
|
||||
}
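// For reference, the batched delete above renders SQL of this shape for, e.g.,
// table "ops_error_logs" ($1 = cutoff timestamp, $2 = batch size), and repeats
// it until a batch deletes zero rows:
//
//	WITH batch AS (
//		SELECT id FROM ops_error_logs
//		WHERE created_at < $1
//		ORDER BY id
//		LIMIT $2
//	)
//	DELETE FROM ops_error_logs
//	WHERE id IN (SELECT id FROM batch)
//
// When castCutoffToDate is true (daily pre-aggregation), the predicate becomes
// "bucket_date < $1::date" instead.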
|
||||
|
||||
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
|
||||
if s == nil {
|
||||
return nil, false
|
||||
}
|
||||
// In simple run mode, assume single instance.
|
||||
if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
|
||||
return nil, true
|
||||
}
|
||||
|
||||
if s.redisClient == nil {
|
||||
s.warnNoRedisOnce.Do(func() {
|
||||
log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
|
||||
})
|
||||
return nil, true
|
||||
}
|
||||
|
||||
key := opsCleanupLeaderLockKeyDefault
|
||||
ttl := opsCleanupLeaderLockTTLDefault
|
||||
|
||||
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
|
||||
if err != nil {
|
||||
s.warnNoRedisOnce.Do(func() {
|
||||
log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err)
|
||||
})
|
||||
return nil, true
|
||||
}
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
return func() {
|
||||
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
|
||||
}, true
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
|
||||
if s == nil || s.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
durMs := duration.Milliseconds()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsCleanupJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastSuccessAt: &now,
|
||||
LastDurationMs: &durMs,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
|
||||
if s == nil || s.opsRepo == nil || err == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
durMs := duration.Milliseconds()
|
||||
msg := truncateString(err.Error(), 2048)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsCleanupJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastErrorAt: &now,
|
||||
LastError: &msg,
|
||||
LastDurationMs: &durMs,
|
||||
})
|
||||
}
|
||||
257
backend/internal/service/ops_concurrency.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
|
||||
)
|
||||
|
||||
const (
|
||||
opsAccountsPageSize = 100
|
||||
opsConcurrencyBatchChunkSize = 200
|
||||
)
|
||||
|
||||
func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
|
||||
if s == nil || s.accountRepo == nil {
|
||||
return []Account{}, nil
|
||||
}
|
||||
|
||||
out := make([]Account, 0, 128)
|
||||
page := 1
|
||||
for {
|
||||
accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
|
||||
Page: page,
|
||||
PageSize: opsAccountsPageSize,
|
||||
}, platformFilter, "", "", "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(accounts) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
out = append(out, accounts...)
|
||||
if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
|
||||
break
|
||||
}
|
||||
if len(accounts) < opsAccountsPageSize {
|
||||
break
|
||||
}
|
||||
|
||||
page++
|
||||
if page > 10_000 {
|
||||
log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
|
||||
if s == nil || s.concurrencyService == nil {
|
||||
return map[int64]*AccountLoadInfo{}
|
||||
}
|
||||
if len(accounts) == 0 {
|
||||
return map[int64]*AccountLoadInfo{}
|
||||
}
|
||||
|
||||
// De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
|
||||
unique := make(map[int64]int, len(accounts))
|
||||
for _, acc := range accounts {
|
||||
if acc.ID <= 0 {
|
||||
continue
|
||||
}
|
||||
if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev {
|
||||
unique[acc.ID] = acc.Concurrency
|
||||
}
|
||||
}
|
||||
|
||||
batch := make([]AccountWithConcurrency, 0, len(unique))
|
||||
for id, maxConc := range unique {
|
||||
batch = append(batch, AccountWithConcurrency{
|
||||
ID: id,
|
||||
MaxConcurrency: maxConc,
|
||||
})
|
||||
}
|
||||
|
||||
out := make(map[int64]*AccountLoadInfo, len(batch))
|
||||
for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize {
|
||||
end := i + opsConcurrencyBatchChunkSize
|
||||
if end > len(batch) {
|
||||
end = len(batch)
|
||||
}
|
||||
part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end])
|
||||
if err != nil {
|
||||
// Best-effort: skip this chunk and leave those accounts without load info (treated as zero) rather than failing the ops UI.
|
||||
log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
|
||||
continue
|
||||
}
|
||||
for k, v := range part {
|
||||
out[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
|
||||
//
|
||||
// Optional filters:
|
||||
// - platformFilter: only include accounts on that platform (best-effort; mainly reduces DB load)
|
||||
// - groupIDFilter: only include accounts that belong to that group
|
||||
func (s *OpsService) GetConcurrencyStats(
|
||||
ctx context.Context,
|
||||
platformFilter string,
|
||||
groupIDFilter *int64,
|
||||
) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, nil, nil, nil, err
|
||||
}
|
||||
|
||||
accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
|
||||
if err != nil {
|
||||
return nil, nil, nil, nil, err
|
||||
}
|
||||
|
||||
collectedAt := time.Now()
|
||||
loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
|
||||
|
||||
platform := make(map[string]*PlatformConcurrencyInfo)
|
||||
group := make(map[int64]*GroupConcurrencyInfo)
|
||||
account := make(map[int64]*AccountConcurrencyInfo)
|
||||
|
||||
for _, acc := range accounts {
|
||||
if acc.ID <= 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var matchedGroup *Group
|
||||
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||
for _, grp := range acc.Groups {
|
||||
if grp == nil || grp.ID <= 0 {
|
||||
continue
|
||||
}
|
||||
if grp.ID == *groupIDFilter {
|
||||
matchedGroup = grp
|
||||
break
|
||||
}
|
||||
}
|
||||
// Group filter provided: skip accounts not in that group.
|
||||
if matchedGroup == nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
load := loadMap[acc.ID]
|
||||
currentInUse := int64(0)
|
||||
waiting := int64(0)
|
||||
if load != nil {
|
||||
currentInUse = int64(load.CurrentConcurrency)
|
||||
waiting = int64(load.WaitingCount)
|
||||
}
|
||||
|
||||
// Account-level view picks one display group (the first group).
|
||||
displayGroupID := int64(0)
|
||||
displayGroupName := ""
|
||||
if matchedGroup != nil {
|
||||
displayGroupID = matchedGroup.ID
|
||||
displayGroupName = matchedGroup.Name
|
||||
} else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
|
||||
displayGroupID = acc.Groups[0].ID
|
||||
displayGroupName = acc.Groups[0].Name
|
||||
}
|
||||
|
||||
if _, ok := account[acc.ID]; !ok {
|
||||
info := &AccountConcurrencyInfo{
|
||||
AccountID: acc.ID,
|
||||
AccountName: acc.Name,
|
||||
Platform: acc.Platform,
|
||||
GroupID: displayGroupID,
|
||||
GroupName: displayGroupName,
|
||||
CurrentInUse: currentInUse,
|
||||
MaxCapacity: int64(acc.Concurrency),
|
||||
WaitingInQueue: waiting,
|
||||
}
|
||||
if info.MaxCapacity > 0 {
|
||||
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||
}
|
||||
account[acc.ID] = info
|
||||
}
|
||||
|
||||
// Platform aggregation.
|
||||
if acc.Platform != "" {
|
||||
if _, ok := platform[acc.Platform]; !ok {
|
||||
platform[acc.Platform] = &PlatformConcurrencyInfo{
|
||||
Platform: acc.Platform,
|
||||
}
|
||||
}
|
||||
p := platform[acc.Platform]
|
||||
p.MaxCapacity += int64(acc.Concurrency)
|
||||
p.CurrentInUse += currentInUse
|
||||
p.WaitingInQueue += waiting
|
||||
}
|
||||
|
||||
// Group aggregation (one account may contribute to multiple groups).
|
||||
if matchedGroup != nil {
|
||||
grp := matchedGroup
|
||||
if _, ok := group[grp.ID]; !ok {
|
||||
group[grp.ID] = &GroupConcurrencyInfo{
|
||||
GroupID: grp.ID,
|
||||
GroupName: grp.Name,
|
||||
Platform: grp.Platform,
|
||||
}
|
||||
}
|
||||
g := group[grp.ID]
|
||||
if g.GroupName == "" && grp.Name != "" {
|
||||
g.GroupName = grp.Name
|
||||
}
|
||||
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||
g.Platform = ""
|
||||
}
|
||||
g.MaxCapacity += int64(acc.Concurrency)
|
||||
g.CurrentInUse += currentInUse
|
||||
g.WaitingInQueue += waiting
|
||||
} else {
|
||||
for _, grp := range acc.Groups {
|
||||
if grp == nil || grp.ID <= 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := group[grp.ID]; !ok {
|
||||
group[grp.ID] = &GroupConcurrencyInfo{
|
||||
GroupID: grp.ID,
|
||||
GroupName: grp.Name,
|
||||
Platform: grp.Platform,
|
||||
}
|
||||
}
|
||||
g := group[grp.ID]
|
||||
if g.GroupName == "" && grp.Name != "" {
|
||||
g.GroupName = grp.Name
|
||||
}
|
||||
if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
|
||||
// Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
|
||||
g.Platform = ""
|
||||
}
|
||||
g.MaxCapacity += int64(acc.Concurrency)
|
||||
g.CurrentInUse += currentInUse
|
||||
g.WaitingInQueue += waiting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, info := range platform {
|
||||
if info.MaxCapacity > 0 {
|
||||
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||
}
|
||||
}
|
||||
for _, info := range group {
|
||||
if info.MaxCapacity > 0 {
|
||||
info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
|
||||
}
|
||||
}
|
||||
|
||||
return platform, group, account, &collectedAt, nil
|
||||
}
|
||||
77
backend/internal/service/ops_dashboard.go
Normal file
@@ -0,0 +1,77 @@
package service

import (
	"context"
	"database/sql"
	"errors"
	"log"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}

	// Resolve query mode (requested via query param, or DB default).
	filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)

	overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
	if err != nil {
		if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
			return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
		}
		return nil, err
	}

	// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
	if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
		overview.SystemMetrics = metrics
	} else if err != nil && !errors.Is(err, sql.ErrNoRows) {
		log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
	}

	if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
		overview.JobHeartbeats = heartbeats
	} else {
		log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
	}

	return overview, nil
}

func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
	if requested.IsValid() {
		// Allow "auto" to be disabled via config until preagg is proven stable in production.
		// Forced `preagg` via query param still works.
		if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
			return OpsQueryModeRaw
		}
		return requested
	}

	mode := OpsQueryModeAuto
	if s != nil && s.settingRepo != nil {
		if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
			mode = ParseOpsQueryMode(raw)
		}
	}

	if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
		return OpsQueryModeRaw
	}
	return mode
}
45
backend/internal/service/ops_errors.go
Normal file
@@ -0,0 +1,45 @@
package service

import (
	"context"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
}

func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetErrorDistribution(ctx, filter)
}
26
backend/internal/service/ops_histograms.go
Normal file
@@ -0,0 +1,26 @@
package service

import (
	"context"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetLatencyHistogram(ctx, filter)
}
861
backend/internal/service/ops_metrics_collector.go
Normal file
@@ -0,0 +1,861 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
"github.com/google/uuid"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/mem"
|
||||
)
|
||||
|
||||
const (
|
||||
opsMetricsCollectorJobName = "ops_metrics_collector"
|
||||
opsMetricsCollectorMinInterval = 60 * time.Second
|
||||
opsMetricsCollectorMaxInterval = 1 * time.Hour
|
||||
|
||||
opsMetricsCollectorTimeout = 10 * time.Second
|
||||
|
||||
opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
|
||||
opsMetricsCollectorLeaderLockTTL = 90 * time.Second
|
||||
|
||||
opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
|
||||
|
||||
bytesPerMB = 1024 * 1024
|
||||
)
|
||||
|
||||
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
|
||||
|
||||
type OpsMetricsCollector struct {
|
||||
opsRepo OpsRepository
|
||||
settingRepo SettingRepository
|
||||
cfg *config.Config
|
||||
|
||||
db *sql.DB
|
||||
redisClient *redis.Client
|
||||
instanceID string
|
||||
|
||||
lastCgroupCPUUsageNanos uint64
|
||||
lastCgroupCPUSampleAt time.Time
|
||||
|
||||
stopCh chan struct{}
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
|
||||
skipLogMu sync.Mutex
|
||||
skipLogAt time.Time
|
||||
}
|
||||
|
||||
func NewOpsMetricsCollector(
|
||||
opsRepo OpsRepository,
|
||||
settingRepo SettingRepository,
|
||||
db *sql.DB,
|
||||
redisClient *redis.Client,
|
||||
cfg *config.Config,
|
||||
) *OpsMetricsCollector {
|
||||
return &OpsMetricsCollector{
|
||||
opsRepo: opsRepo,
|
||||
settingRepo: settingRepo,
|
||||
cfg: cfg,
|
||||
db: db,
|
||||
redisClient: redisClient,
|
||||
instanceID: uuid.NewString(),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) Start() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.startOnce.Do(func() {
|
||||
if c.stopCh == nil {
|
||||
c.stopCh = make(chan struct{})
|
||||
}
|
||||
go c.run()
|
||||
})
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) Stop() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.stopOnce.Do(func() {
|
||||
if c.stopCh != nil {
|
||||
close(c.stopCh)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) run() {
|
||||
// First run immediately so the dashboard has data soon after startup.
|
||||
c.collectOnce()
|
||||
|
||||
for {
|
||||
interval := c.getInterval()
|
||||
timer := time.NewTimer(interval)
|
||||
select {
|
||||
case <-timer.C:
|
||||
c.collectOnce()
|
||||
case <-c.stopCh:
|
||||
timer.Stop()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) getInterval() time.Duration {
|
||||
interval := opsMetricsCollectorMinInterval
|
||||
|
||||
if c.settingRepo == nil {
|
||||
return interval
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
|
||||
if err != nil {
|
||||
return interval
|
||||
}
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return interval
|
||||
}
|
||||
|
||||
seconds, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
return interval
|
||||
}
|
||||
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
|
||||
seconds = int(opsMetricsCollectorMinInterval.Seconds())
|
||||
}
|
||||
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
|
||||
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
|
||||
}
|
||||
return time.Duration(seconds) * time.Second
|
||||
}
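// Examples of how getInterval interprets the ops_metrics_interval_seconds setting
// (values are illustrative):
//
//	missing key or ""  -> 60s (default)
//	"30"               -> 60s (clamped to the 60s minimum)
//	"300"              -> 5m
//	"86400"            -> 1h (clamped to the 1h maximum)
//	"abc"              -> 60s (parse errors fall back to the default)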
|
||||
|
||||
func (c *OpsMetricsCollector) collectOnce() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
if c.cfg != nil && !c.cfg.Ops.Enabled {
|
||||
return
|
||||
}
|
||||
if c.opsRepo == nil {
|
||||
return
|
||||
}
|
||||
if c.db == nil {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
|
||||
defer cancel()
|
||||
|
||||
if !c.isMonitoringEnabled(ctx) {
|
||||
return
|
||||
}
|
||||
|
||||
release, ok := c.tryAcquireLeaderLock(ctx)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
startedAt := time.Now().UTC()
|
||||
err := c.collectAndPersist(ctx)
|
||||
finishedAt := time.Now().UTC()
|
||||
|
||||
durationMs := finishedAt.Sub(startedAt).Milliseconds()
|
||||
dur := durationMs
|
||||
runAt := startedAt
|
||||
|
||||
if err != nil {
|
||||
msg := truncateString(err.Error(), 2048)
|
||||
errAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
|
||||
defer hbCancel()
|
||||
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsMetricsCollectorJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastErrorAt: &errAt,
|
||||
LastError: &msg,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
log.Printf("[OpsMetricsCollector] collect failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
successAt := finishedAt
|
||||
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
|
||||
defer hbCancel()
|
||||
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
|
||||
JobName: opsMetricsCollectorJobName,
|
||||
LastRunAt: &runAt,
|
||||
LastSuccessAt: &successAt,
|
||||
LastDurationMs: &dur,
|
||||
})
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
|
||||
if c == nil {
|
||||
return false
|
||||
}
|
||||
if c.cfg != nil && !c.cfg.Ops.Enabled {
|
||||
return false
|
||||
}
|
||||
if c.settingRepo == nil {
|
||||
return true
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrSettingNotFound) {
|
||||
return true
|
||||
}
|
||||
// Fail-open: collector should not become a hard dependency.
|
||||
return true
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(value)) {
|
||||
case "false", "0", "off", "disabled":
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
|
||||
now := time.Now().UTC()
|
||||
windowEnd := now.Truncate(time.Minute)
|
||||
windowStart := windowEnd.Add(-1 * time.Minute)
|
||||
|
||||
sys, err := c.collectSystemStats(ctx)
|
||||
if err != nil {
|
||||
// Continue; system stats are best-effort.
|
||||
log.Printf("[OpsMetricsCollector] system stats error: %v", err)
|
||||
}
|
||||
|
||||
dbOK := c.checkDB(ctx)
|
||||
redisOK := c.checkRedis(ctx)
|
||||
active, idle := c.dbPoolStats()
|
||||
|
||||
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("query usage counts: %w", err)
|
||||
}
|
||||
|
||||
duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("query usage latency: %w", err)
|
||||
}
|
||||
|
||||
errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("query error counts: %w", err)
|
||||
}
|
||||
|
||||
windowSeconds := windowEnd.Sub(windowStart).Seconds()
|
||||
if windowSeconds <= 0 {
|
||||
windowSeconds = 60
|
||||
}
|
||||
requestTotal := successCount + errorTotal
|
||||
qps := float64(requestTotal) / windowSeconds
|
||||
tps := float64(tokenConsumed) / windowSeconds
|
||||
|
||||
goroutines := runtime.NumGoroutine()
|
||||
|
||||
input := &OpsInsertSystemMetricsInput{
|
||||
CreatedAt: windowEnd,
|
||||
WindowMinutes: 1,
|
||||
|
||||
SuccessCount: successCount,
|
||||
ErrorCountTotal: errorTotal,
|
||||
BusinessLimitedCount: businessLimited,
|
||||
ErrorCountSLA: errorSLA,
|
||||
|
||||
UpstreamErrorCountExcl429529: upstreamExcl,
|
||||
Upstream429Count: upstream429,
|
||||
Upstream529Count: upstream529,
|
||||
|
||||
TokenConsumed: tokenConsumed,
|
||||
QPS: float64Ptr(roundTo1DP(qps)),
|
||||
TPS: float64Ptr(roundTo1DP(tps)),
|
||||
|
||||
DurationP50Ms: duration.p50,
|
||||
DurationP90Ms: duration.p90,
|
||||
DurationP95Ms: duration.p95,
|
||||
DurationP99Ms: duration.p99,
|
||||
DurationAvgMs: duration.avg,
|
||||
DurationMaxMs: duration.max,
|
||||
|
||||
TTFTP50Ms: ttft.p50,
|
||||
TTFTP90Ms: ttft.p90,
|
||||
TTFTP95Ms: ttft.p95,
|
||||
TTFTP99Ms: ttft.p99,
|
||||
TTFTAvgMs: ttft.avg,
|
||||
TTFTMaxMs: ttft.max,
|
||||
|
||||
CPUUsagePercent: sys.cpuUsagePercent,
|
||||
MemoryUsedMB: sys.memoryUsedMB,
|
||||
MemoryTotalMB: sys.memoryTotalMB,
|
||||
MemoryUsagePercent: sys.memoryUsagePercent,
|
||||
|
||||
DBOK: boolPtr(dbOK),
|
||||
RedisOK: boolPtr(redisOK),
|
||||
|
||||
DBConnActive: intPtr(active),
|
||||
DBConnIdle: intPtr(idle),
|
||||
GoroutineCount: intPtr(goroutines),
|
||||
}
|
||||
|
||||
return c.opsRepo.InsertSystemMetrics(ctx, input)
|
||||
}
|
||||
|
||||
type opsCollectedPercentiles struct {
|
||||
p50 *int
|
||||
p90 *int
|
||||
p95 *int
|
||||
p99 *int
|
||||
avg *float64
|
||||
max *int
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
|
||||
q := `
|
||||
SELECT
|
||||
COALESCE(COUNT(*), 0) AS success_count,
|
||||
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
|
||||
FROM usage_logs
|
||||
WHERE created_at >= $1 AND created_at < $2`
|
||||
|
||||
var tokens sql.NullInt64
|
||||
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
if tokens.Valid {
|
||||
tokenConsumed = tokens.Int64
|
||||
}
|
||||
return successCount, tokenConsumed, nil
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
|
||||
{
|
||||
q := `
|
||||
SELECT
|
||||
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
|
||||
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
|
||||
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
|
||||
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
|
||||
AVG(duration_ms) AS avg_ms,
|
||||
MAX(duration_ms) AS max_ms
|
||||
FROM usage_logs
|
||||
WHERE created_at >= $1 AND created_at < $2
|
||||
AND duration_ms IS NOT NULL`
|
||||
|
||||
var p50, p90, p95, p99 sql.NullFloat64
|
||||
var avg sql.NullFloat64
|
||||
var max sql.NullInt64
|
||||
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||
}
|
||||
duration.p50 = floatToIntPtr(p50)
|
||||
duration.p90 = floatToIntPtr(p90)
|
||||
duration.p95 = floatToIntPtr(p95)
|
||||
duration.p99 = floatToIntPtr(p99)
|
||||
if avg.Valid {
|
||||
v := roundTo1DP(avg.Float64)
|
||||
duration.avg = &v
|
||||
}
|
||||
if max.Valid {
|
||||
v := int(max.Int64)
|
||||
duration.max = &v
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
q := `
|
||||
SELECT
|
||||
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
|
||||
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
|
||||
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
|
||||
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
|
||||
AVG(first_token_ms) AS avg_ms,
|
||||
MAX(first_token_ms) AS max_ms
|
||||
FROM usage_logs
|
||||
WHERE created_at >= $1 AND created_at < $2
|
||||
AND first_token_ms IS NOT NULL`
|
||||
|
||||
var p50, p90, p95, p99 sql.NullFloat64
|
||||
var avg sql.NullFloat64
|
||||
var max sql.NullInt64
|
||||
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
|
||||
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
|
||||
}
|
||||
ttft.p50 = floatToIntPtr(p50)
|
||||
ttft.p90 = floatToIntPtr(p90)
|
||||
ttft.p95 = floatToIntPtr(p95)
|
||||
ttft.p99 = floatToIntPtr(p99)
|
||||
if avg.Valid {
|
||||
v := roundTo1DP(avg.Float64)
|
||||
ttft.avg = &v
|
||||
}
|
||||
if max.Valid {
|
||||
v := int(max.Int64)
|
||||
ttft.max = &v
|
||||
}
|
||||
}
|
||||
|
||||
return duration, ttft, nil
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
|
||||
errorTotal int64,
|
||||
businessLimited int64,
|
||||
errorSLA int64,
|
||||
upstreamExcl429529 int64,
|
||||
upstream429 int64,
|
||||
upstream529 int64,
|
||||
err error,
|
||||
) {
|
||||
q := `
|
||||
SELECT
|
||||
COALESCE(COUNT(*), 0) AS error_total,
|
||||
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
|
||||
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529
|
||||
FROM ops_error_logs
|
||||
WHERE created_at >= $1 AND created_at < $2`
|
||||
|
||||
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
|
||||
&errorTotal,
|
||||
&businessLimited,
|
||||
&errorSLA,
|
||||
&upstreamExcl429529,
|
||||
&upstream429,
|
||||
&upstream529,
|
||||
); err != nil {
|
||||
return 0, 0, 0, 0, 0, 0, err
|
||||
}
|
||||
return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
|
||||
}
|
||||
|
||||
type opsCollectedSystemStats struct {
|
||||
cpuUsagePercent *float64
|
||||
memoryUsedMB *int64
|
||||
memoryTotalMB *int64
|
||||
memoryUsagePercent *float64
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
|
||||
out := &opsCollectedSystemStats{}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
sampleAt := time.Now().UTC()
|
||||
|
||||
// Prefer cgroup (container) metrics when available.
|
||||
if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
|
||||
out.cpuUsagePercent = cpuPct
|
||||
}
|
||||
|
||||
cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
|
||||
if cgroupOK {
|
||||
usedMB := int64(cgroupUsed / bytesPerMB)
|
||||
out.memoryUsedMB = &usedMB
|
||||
if cgroupTotal > 0 {
|
||||
totalMB := int64(cgroupTotal / bytesPerMB)
|
||||
out.memoryTotalMB = &totalMB
|
||||
pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
|
||||
out.memoryUsagePercent = &pct
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
|
||||
if out.cpuUsagePercent == nil {
|
||||
if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
|
||||
v := roundTo1DP(cpuPercents[0])
|
||||
out.cpuUsagePercent = &v
|
||||
}
|
||||
}
|
||||
|
||||
// If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host.
|
||||
if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
|
||||
if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
|
||||
if out.memoryUsedMB == nil {
|
||||
usedMB := int64(vm.Used / bytesPerMB)
|
||||
out.memoryUsedMB = &usedMB
|
||||
}
|
||||
if out.memoryTotalMB == nil {
|
||||
totalMB := int64(vm.Total / bytesPerMB)
|
||||
out.memoryTotalMB = &totalMB
|
||||
}
|
||||
if out.memoryUsagePercent == nil {
|
||||
if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
|
||||
pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
|
||||
out.memoryUsagePercent = &pct
|
||||
} else {
|
||||
pct := roundTo1DP(vm.UsedPercent)
|
||||
out.memoryUsagePercent = &pct
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
|
||||
usageNanos, ok := readCgroupCPUUsageNanos()
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Initialize baseline sample.
|
||||
if c.lastCgroupCPUSampleAt.IsZero() {
|
||||
c.lastCgroupCPUUsageNanos = usageNanos
|
||||
c.lastCgroupCPUSampleAt = now
|
||||
return nil
|
||||
}
|
||||
|
||||
elapsed := now.Sub(c.lastCgroupCPUSampleAt)
|
||||
if elapsed <= 0 {
|
||||
c.lastCgroupCPUUsageNanos = usageNanos
|
||||
c.lastCgroupCPUSampleAt = now
|
||||
return nil
|
||||
}
|
||||
|
||||
prev := c.lastCgroupCPUUsageNanos
|
||||
c.lastCgroupCPUUsageNanos = usageNanos
|
||||
c.lastCgroupCPUSampleAt = now
|
||||
|
||||
if usageNanos < prev {
|
||||
// Counter reset (container restarted).
|
||||
return nil
|
||||
}
|
||||
|
||||
deltaUsageSec := float64(usageNanos-prev) / 1e9
|
||||
elapsedSec := elapsed.Seconds()
|
||||
if elapsedSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
cores := readCgroupCPULimitCores()
|
||||
if cores <= 0 {
|
||||
// Can't reliably normalize; skip and fall back to gopsutil.
|
||||
return nil
|
||||
}
|
||||
|
||||
pct := (deltaUsageSec / (elapsedSec * cores)) * 100
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
// Clamp to avoid noise/jitter showing impossible values.
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
v := roundTo1DP(pct)
|
||||
return &v
|
||||
}
|
||||
|
||||
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
|
||||
// cgroup v2 (most common in modern containers)
|
||||
if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
|
||||
usedBytes = used
|
||||
rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
|
||||
if err == nil {
|
||||
s := strings.TrimSpace(string(rawMax))
|
||||
if s != "" && s != "max" {
|
||||
if v, err := strconv.ParseUint(s, 10, 64); err == nil {
|
||||
totalBytes = v
|
||||
}
|
||||
}
|
||||
}
|
||||
return usedBytes, totalBytes, true
|
||||
}
|
||||
|
||||
// cgroup v1 fallback
|
||||
if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
|
||||
usedBytes = used
|
||||
if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
|
||||
// Some environments report a very large number when unlimited.
|
||||
if limit > 0 && limit < (1<<60) {
|
||||
totalBytes = limit
|
||||
}
|
||||
}
|
||||
return usedBytes, totalBytes, true
|
||||
}
|
||||
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
|
||||
// cgroup v2: cpu.stat has usage_usec
|
||||
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
|
||||
lines := strings.Split(string(raw), "\n")
|
||||
for _, line := range lines {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) != 2 {
|
||||
continue
|
||||
}
|
||||
if fields[0] != "usage_usec" {
|
||||
continue
|
||||
}
|
||||
v, err := strconv.ParseUint(fields[1], 10, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
return v * 1000, true
|
||||
}
|
||||
}
|
||||
|
||||
// cgroup v1: cpuacct.usage is in nanoseconds
|
||||
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
|
||||
return v, true
|
||||
}
|
||||
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func readCgroupCPULimitCores() float64 {
|
||||
// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
|
||||
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
|
||||
fields := strings.Fields(string(raw))
|
||||
if len(fields) >= 2 && fields[0] != "max" {
|
||||
quota, err1 := strconv.ParseFloat(fields[0], 64)
|
||||
period, err2 := strconv.ParseFloat(fields[1], 64)
|
||||
if err1 == nil && err2 == nil && quota > 0 && period > 0 {
|
||||
return quota / period
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
|
||||
quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
|
||||
period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
|
||||
if okQuota && okPeriod && quota > 0 && period > 0 {
|
||||
return float64(quota) / float64(period)
|
||||
}
|
||||
|
||||
return 0
|
||||
}
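// Examples of the cgroup CPU limit translation above (paths/values illustrative):
//
//	cgroup v2: /sys/fs/cgroup/cpu.max = "200000 100000"         -> 2.0 cores
//	cgroup v2: /sys/fs/cgroup/cpu.max = "max 100000"            -> 0 (unlimited; caller falls back to gopsutil)
//	cgroup v1: cpu.cfs_quota_us=50000, cpu.cfs_period_us=100000 -> 0.5 cores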
|
||||
|
||||
func readUintFile(path string) (uint64, bool) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
s := strings.TrimSpace(string(raw))
|
||||
if s == "" {
|
||||
return 0, false
|
||||
}
|
||||
v, err := strconv.ParseUint(s, 10, 64)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return v, true
|
||||
}
|
||||
|
||||
func readIntFile(path string) (int64, bool) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
s := strings.TrimSpace(string(raw))
|
||||
if s == "" {
|
||||
return 0, false
|
||||
}
|
||||
v, err := strconv.ParseInt(s, 10, 64)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return v, true
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
|
||||
if c == nil || c.db == nil {
|
||||
return false
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
var one int
|
||||
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
|
||||
return false
|
||||
}
|
||||
return one == 1
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
|
||||
if c == nil || c.redisClient == nil {
|
||||
return false
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
return c.redisClient.Ping(ctx).Err() == nil
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
|
||||
if c == nil || c.db == nil {
|
||||
return 0, 0
|
||||
}
|
||||
stats := c.db.Stats()
|
||||
return stats.InUse, stats.Idle
|
||||
}
|
||||
|
||||
var opsMetricsCollectorReleaseScript = redis.NewScript(`
|
||||
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||
return redis.call("DEL", KEYS[1])
|
||||
end
|
||||
return 0
|
||||
`)
|
||||
|
||||
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
|
||||
if c == nil || c.redisClient == nil {
|
||||
return nil, true
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
|
||||
if err != nil {
|
||||
// Redis is configured but currently unavailable: fall back to a Postgres advisory
// lock, and fail closed (skip this run) if that also fails, so a flaky Redis does
// not let multiple instances stampede the database.
|
||||
release, ok := c.tryAcquireDBAdvisoryLock(ctx)
|
||||
if !ok {
|
||||
c.maybeLogSkip()
|
||||
return nil, false
|
||||
}
|
||||
return release, true
|
||||
}
|
||||
if !ok {
|
||||
c.maybeLogSkip()
|
||||
return nil, false
|
||||
}
|
||||
|
||||
release := func() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
|
||||
}
|
||||
return release, true
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
|
||||
if c == nil || c.db == nil {
|
||||
return nil, false
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
conn, err := c.db.Conn(ctx)
|
||||
if err != nil {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
acquired := false
|
||||
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
|
||||
_ = conn.Close()
|
||||
return nil, false
|
||||
}
|
||||
if !acquired {
|
||||
_ = conn.Close()
|
||||
return nil, false
|
||||
}
|
||||
|
||||
release := func() {
|
||||
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
|
||||
_ = conn.Close()
|
||||
}
|
||||
return release, true
|
||||
}
|
||||
|
||||
func (c *OpsMetricsCollector) maybeLogSkip() {
|
||||
c.skipLogMu.Lock()
|
||||
defer c.skipLogMu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
|
||||
return
|
||||
}
|
||||
c.skipLogAt = now
|
||||
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
|
||||
}
|
||||
|
||||
func floatToIntPtr(v sql.NullFloat64) *int {
|
||||
if !v.Valid {
|
||||
return nil
|
||||
}
|
||||
n := int(math.Round(v.Float64))
|
||||
return &n
|
||||
}
|
||||
|
||||
func roundTo1DP(v float64) float64 {
|
||||
return math.Round(v*10) / 10
|
||||
}
|
||||
|
||||
func truncateString(s string, max int) string {
|
||||
if max <= 0 {
|
||||
return ""
|
||||
}
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
cut := s[:max]
|
||||
for len(cut) > 0 && !utf8.ValidString(cut) {
|
||||
cut = cut[:len(cut)-1]
|
||||
}
|
||||
return cut
|
||||
}
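// truncateString cuts at a byte boundary and then trims trailing bytes until the
// result is valid UTF-8, so multi-byte runes are never split. Illustrative examples:
//
//	truncateString("abcdef", 4) == "abcd"
//	truncateString("日本語", 4)  == "日" // byte 4 would split the second rune, so one byte is trimmed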
|
||||
|
||||
func boolPtr(v bool) *bool {
|
||||
out := v
|
||||
return &out
|
||||
}
|
||||
|
||||
func intPtr(v int) *int {
|
||||
out := v
|
||||
return &out
|
||||
}
|
||||
|
||||
func float64Ptr(v float64) *float64 {
|
||||
out := v
|
||||
return &out
|
||||
}
|
||||
|
||||
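// hashAdvisoryLockID maps a string lock key to a stable signed 64-bit ID via FNV-64a,
// which is the form pg_try_advisory_lock / pg_advisory_unlock expect. Any process that
// hashes the same key string therefore contends on the same Postgres advisory lock, so
// the Redis leader-lock key name doubles as the advisory-lock namespace.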
func hashAdvisoryLockID(s string) int64 {
|
||||
h := fnv.New64a()
|
||||
_, _ = h.Write([]byte(s))
|
||||
return int64(h.Sum64())
|
||||
}
|
||||
226
backend/internal/service/ops_port.go
Normal file
@@ -0,0 +1,226 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
type OpsRepository interface {
|
||||
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
|
||||
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
|
||||
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
|
||||
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
|
||||
|
||||
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
|
||||
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
|
||||
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
|
||||
|
||||
// Lightweight window stats (for realtime WS / quick sampling).
|
||||
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
|
||||
|
||||
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
|
||||
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
|
||||
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
|
||||
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
|
||||
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
|
||||
|
||||
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
|
||||
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
|
||||
|
||||
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
|
||||
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
|
||||
|
||||
// Alerts (rules + events)
|
||||
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
|
||||
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
||||
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
||||
DeleteAlertRule(ctx context.Context, id int64) error
|
||||
|
||||
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
|
||||
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
|
||||
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
|
||||
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
|
||||
|
||||
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
|
||||
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
|
||||
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
|
||||
}
|
||||
|
||||
type OpsInsertErrorLogInput struct {
|
||||
RequestID string
|
||||
ClientRequestID string
|
||||
|
||||
UserID *int64
|
||||
APIKeyID *int64
|
||||
AccountID *int64
|
||||
GroupID *int64
|
||||
ClientIP *string
|
||||
|
||||
Platform string
|
||||
Model string
|
||||
RequestPath string
|
||||
Stream bool
|
||||
UserAgent string
|
||||
|
||||
ErrorPhase string
|
||||
ErrorType string
|
||||
Severity string
|
||||
StatusCode int
|
||||
IsBusinessLimited bool
|
||||
|
||||
ErrorMessage string
|
||||
ErrorBody string
|
||||
|
||||
ErrorSource string
|
||||
ErrorOwner string
|
||||
|
||||
UpstreamStatusCode *int
|
||||
UpstreamErrorMessage *string
|
||||
UpstreamErrorDetail *string
|
||||
|
||||
DurationMs *int
|
||||
TimeToFirstTokenMs *int64
|
||||
|
||||
RequestBodyJSON *string // sanitized json string (not raw bytes)
|
||||
RequestBodyTruncated bool
|
||||
RequestBodyBytes *int
|
||||
RequestHeadersJSON *string // optional json string
|
||||
|
||||
IsRetryable bool
|
||||
RetryCount int
|
||||
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
type OpsInsertRetryAttemptInput struct {
|
||||
RequestedByUserID int64
|
||||
SourceErrorID int64
|
||||
Mode string
|
||||
PinnedAccountID *int64
|
||||
|
||||
// running|queued etc.
|
||||
Status string
|
||||
StartedAt time.Time
|
||||
}
|
||||
|
||||
type OpsUpdateRetryAttemptInput struct {
|
||||
ID int64
|
||||
|
||||
// succeeded|failed
|
||||
Status string
|
||||
FinishedAt time.Time
|
||||
DurationMs int64
|
||||
|
||||
// Optional correlation
|
||||
ResultRequestID *string
|
||||
ResultErrorID *int64
|
||||
|
||||
ErrorMessage *string
|
||||
}
|
||||
|
||||
type OpsInsertSystemMetricsInput struct {
|
||||
CreatedAt time.Time
|
||||
WindowMinutes int
|
||||
|
||||
Platform *string
|
||||
GroupID *int64
|
||||
|
||||
SuccessCount int64
|
||||
ErrorCountTotal int64
|
||||
BusinessLimitedCount int64
|
||||
ErrorCountSLA int64
|
||||
|
||||
UpstreamErrorCountExcl429529 int64
|
||||
Upstream429Count int64
|
||||
Upstream529Count int64
|
||||
|
||||
TokenConsumed int64
|
||||
|
||||
QPS *float64
|
||||
TPS *float64
|
||||
|
||||
DurationP50Ms *int
|
||||
DurationP90Ms *int
|
||||
DurationP95Ms *int
|
||||
DurationP99Ms *int
|
||||
DurationAvgMs *float64
|
||||
DurationMaxMs *int
|
||||
|
||||
TTFTP50Ms *int
|
||||
TTFTP90Ms *int
|
||||
TTFTP95Ms *int
|
||||
TTFTP99Ms *int
|
||||
TTFTAvgMs *float64
|
||||
TTFTMaxMs *int
|
||||
|
||||
CPUUsagePercent *float64
|
||||
MemoryUsedMB *int64
|
||||
MemoryTotalMB *int64
|
||||
MemoryUsagePercent *float64
|
||||
|
||||
DBOK *bool
|
||||
RedisOK *bool
|
||||
|
||||
DBConnActive *int
|
||||
DBConnIdle *int
|
||||
DBConnWaiting *int
|
||||
|
||||
GoroutineCount *int
|
||||
ConcurrencyQueueDepth *int
|
||||
}
|
||||
|
||||
type OpsSystemMetricsSnapshot struct {
|
||||
ID int64 `json:"id"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
WindowMinutes int `json:"window_minutes"`
|
||||
|
||||
CPUUsagePercent *float64 `json:"cpu_usage_percent"`
|
||||
MemoryUsedMB *int64 `json:"memory_used_mb"`
|
||||
MemoryTotalMB *int64 `json:"memory_total_mb"`
|
||||
MemoryUsagePercent *float64 `json:"memory_usage_percent"`
|
||||
|
||||
DBOK *bool `json:"db_ok"`
|
||||
RedisOK *bool `json:"redis_ok"`
|
||||
|
||||
DBConnActive *int `json:"db_conn_active"`
|
||||
DBConnIdle *int `json:"db_conn_idle"`
|
||||
DBConnWaiting *int `json:"db_conn_waiting"`
|
||||
|
||||
GoroutineCount *int `json:"goroutine_count"`
|
||||
ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
|
||||
}
|
||||
|
||||
type OpsUpsertJobHeartbeatInput struct {
|
||||
JobName string
|
||||
|
||||
LastRunAt *time.Time
|
||||
LastSuccessAt *time.Time
|
||||
LastErrorAt *time.Time
|
||||
LastError *string
|
||||
LastDurationMs *int64
|
||||
}
|
||||
|
||||
type OpsJobHeartbeat struct {
|
||||
JobName string `json:"job_name"`
|
||||
|
||||
LastRunAt *time.Time `json:"last_run_at"`
|
||||
LastSuccessAt *time.Time `json:"last_success_at"`
|
||||
LastErrorAt *time.Time `json:"last_error_at"`
|
||||
LastError *string `json:"last_error"`
|
||||
LastDurationMs *int64 `json:"last_duration_ms"`
|
||||
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
type OpsWindowStats struct {
|
||||
StartTime time.Time `json:"start_time"`
|
||||
EndTime time.Time `json:"end_time"`
|
||||
|
||||
SuccessCount int64 `json:"success_count"`
|
||||
ErrorCountTotal int64 `json:"error_count_total"`
|
||||
TokenConsumed int64 `json:"token_consumed"`
|
||||
}
|
||||
40
backend/internal/service/ops_query_mode.go
Normal file
@@ -0,0 +1,40 @@
package service

import (
	"errors"
	"strings"
)

type OpsQueryMode string

const (
	OpsQueryModeAuto   OpsQueryMode = "auto"
	OpsQueryModeRaw    OpsQueryMode = "raw"
	OpsQueryModePreagg OpsQueryMode = "preagg"
)

// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")

func ParseOpsQueryMode(raw string) OpsQueryMode {
	v := strings.ToLower(strings.TrimSpace(raw))
	switch v {
	case string(OpsQueryModeRaw):
		return OpsQueryModeRaw
	case string(OpsQueryModePreagg):
		return OpsQueryModePreagg
	default:
		return OpsQueryModeAuto
	}
}

func (m OpsQueryMode) IsValid() bool {
	switch m {
	case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg:
		return true
	default:
		return false
	}
}
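// Illustrative examples of the parsing/validation above:
//
//	ParseOpsQueryMode(" RAW ")  == OpsQueryModeRaw
//	ParseOpsQueryMode("preagg") == OpsQueryModePreagg
//	ParseOpsQueryMode("bogus")  == OpsQueryModeAuto // unknown values fall back to auto
//	OpsQueryMode("bogus").IsValid() == false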
36
backend/internal/service/ops_realtime.go
Normal file
@@ -0,0 +1,36 @@
package service

import (
	"context"
	"errors"
	"strings"
)

// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is additionally gated by the overall ops monitoring switches (the config hard
// switch and the DB soft switch).
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
	if !s.IsMonitoringEnabled(ctx) {
		return false
	}
	if s.settingRepo == nil {
		return true
	}

	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
	if err != nil {
		// Default enabled when the key is missing; fail open on transient errors.
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		return true
	}

	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}
152
backend/internal/service/ops_request_details.go
Normal file
@@ -0,0 +1,152 @@
package service

import (
	"context"
	"time"
)

type OpsRequestKind string

const (
	OpsRequestKindSuccess OpsRequestKind = "success"
	OpsRequestKindError   OpsRequestKind = "error"
)

// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
	Kind      OpsRequestKind `json:"kind"`
	CreatedAt time.Time      `json:"created_at"`
	RequestID string         `json:"request_id"`

	Platform string `json:"platform,omitempty"`
	Model    string `json:"model,omitempty"`

	DurationMs *int `json:"duration_ms,omitempty"`
	StatusCode *int `json:"status_code,omitempty"`

	// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
	ErrorID *int64 `json:"error_id,omitempty"`

	Phase    string `json:"phase,omitempty"`
	Severity string `json:"severity,omitempty"`
	Message  string `json:"message,omitempty"`

	UserID    *int64 `json:"user_id,omitempty"`
	APIKeyID  *int64 `json:"api_key_id,omitempty"`
	AccountID *int64 `json:"account_id,omitempty"`
	GroupID   *int64 `json:"group_id,omitempty"`

	Stream bool `json:"stream"`
}

type OpsRequestDetailFilter struct {
	StartTime *time.Time
	EndTime   *time.Time

	// Kind: success|error|all.
	Kind string

	Platform string
	GroupID  *int64

	UserID    *int64
	APIKeyID  *int64
	AccountID *int64

	Model     string
	RequestID string
	Query     string

	MinDurationMs *int
	MaxDurationMs *int

	// Sort: created_at_desc (default) or duration_desc.
	Sort string

	Page     int
	PageSize int
}

func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
	page = 1
	pageSize = 50
	endTime = time.Now()
	startTime = endTime.Add(-1 * time.Hour)

	if f == nil {
		return page, pageSize, startTime, endTime
	}

	if f.Page > 0 {
		page = f.Page
	}
	if f.PageSize > 0 {
		pageSize = f.PageSize
	}
	if pageSize > 100 {
		pageSize = 100
	}

	if f.EndTime != nil {
		endTime = *f.EndTime
	}
	if f.StartTime != nil {
		startTime = *f.StartTime
	} else if f.EndTime != nil {
		startTime = endTime.Add(-1 * time.Hour)
	}

	if startTime.After(endTime) {
		startTime, endTime = endTime, startTime
	}

	return page, pageSize, startTime, endTime
}

type OpsRequestDetailList struct {
	Items    []*OpsRequestDetail `json:"items"`
	Total    int64               `json:"total"`
	Page     int                 `json:"page"`
	PageSize int                 `json:"page_size"`
}

func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return &OpsRequestDetailList{
			Items:    []*OpsRequestDetail{},
			Total:    0,
			Page:     1,
			PageSize: 50,
		}, nil
	}

	page, pageSize, startTime, endTime := filter.Normalize()
	filterCopy := &OpsRequestDetailFilter{}
	if filter != nil {
		*filterCopy = *filter
	}
	filterCopy.Page = page
	filterCopy.PageSize = pageSize
	filterCopy.StartTime = &startTime
	filterCopy.EndTime = &endTime

	items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
	if err != nil {
		return nil, err
	}
	if items == nil {
		items = []*OpsRequestDetail{}
	}

	return &OpsRequestDetailList{
		Items:    items,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}
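Usage sketch (illustrative, not part of the diff above): a typical drilldown query built from the filter fields defined here, e.g. slow requests of any kind from the last 15 minutes, slowest first. The wrapper function name is hypothetical.

	// listSlowRequests returns success and error requests slower than 10s.
	func listSlowRequests(ctx context.Context, s *OpsService) (*OpsRequestDetailList, error) {
		end := time.Now()
		start := end.Add(-15 * time.Minute)
		minMs := 10_000
		return s.ListRequestDetails(ctx, &OpsRequestDetailFilter{
			StartTime:     &start,
			EndTime:       &end,
			Kind:          "all",
			MinDurationMs: &minMs,
			Sort:          "duration_desc",
			Page:          1,
			PageSize:      50,
		})
	}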
635
backend/internal/service/ops_retry.go
Normal file
@@ -0,0 +1,635 @@
package service

import (
	"bytes"
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"strings"
	"time"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
	"github.com/gin-gonic/gin"
	"github.com/lib/pq"
)

const (
	OpsRetryModeClient   = "client"
	OpsRetryModeUpstream = "upstream"
)

const (
	opsRetryStatusRunning   = "running"
	opsRetryStatusSucceeded = "succeeded"
	opsRetryStatusFailed    = "failed"
)

const (
	opsRetryTimeout             = 60 * time.Second
	opsRetryCaptureBytesLimit   = 64 * 1024
	opsRetryResponsePreviewMax  = 8 * 1024
	opsRetryMinIntervalPerError = 10 * time.Second
	opsRetryMaxAccountSwitches  = 3
)

var opsRetryRequestHeaderAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}

type opsRetryRequestType string

const (
	opsRetryTypeMessages  opsRetryRequestType = "messages"
	opsRetryTypeOpenAI    opsRetryRequestType = "openai_responses"
	opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)

type limitedResponseWriter struct {
	header      http.Header
	status      int
	wroteHeader bool

	limit        int
	totalWritten int64
	buf          bytes.Buffer
}

func newLimitedResponseWriter(limit int) *limitedResponseWriter {
	if limit <= 0 {
		limit = 1
	}
	return &limitedResponseWriter{
		header: make(http.Header),
		status: http.StatusOK,
		limit:  limit,
	}
}

func (w *limitedResponseWriter) Header() http.Header {
	return w.header
}

func (w *limitedResponseWriter) WriteHeader(statusCode int) {
	if w.wroteHeader {
		return
	}
	w.wroteHeader = true
	w.status = statusCode
}

func (w *limitedResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.WriteHeader(http.StatusOK)
	}
	w.totalWritten += int64(len(p))

	if w.buf.Len() < w.limit {
		remaining := w.limit - w.buf.Len()
		if len(p) > remaining {
			_, _ = w.buf.Write(p[:remaining])
		} else {
			_, _ = w.buf.Write(p)
		}
	}

	// Pretend we wrote everything to avoid upstream/client code treating it as an error.
	return len(p), nil
}

func (w *limitedResponseWriter) Flush() {}

func (w *limitedResponseWriter) bodyBytes() []byte {
	return w.buf.Bytes()
}

func (w *limitedResponseWriter) truncated() bool {
	return w.totalWritten > int64(w.limit)
}

func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}

	mode = strings.ToLower(strings.TrimSpace(mode))
	switch mode {
	case OpsRetryModeClient, OpsRetryModeUpstream:
	default:
		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
	}

	latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
	}
	if latest != nil {
		if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}

		lastAttemptAt := latest.CreatedAt
		if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
			lastAttemptAt = *latest.FinishedAt
		} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
			lastAttemptAt = *latest.StartedAt
		}

		if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
			return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
		}
	}

	errorLog, err := s.GetErrorLogByID(ctx, errorID)
	if err != nil {
		return nil, err
	}
	if strings.TrimSpace(errorLog.RequestBody) == "" {
		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
	}

	var pinned *int64
	if mode == OpsRetryModeUpstream {
		if pinnedAccountID != nil && *pinnedAccountID > 0 {
			pinned = pinnedAccountID
		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
			pinned = errorLog.AccountID
		} else {
			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
		}
	}

	startedAt := time.Now()
	attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
		RequestedByUserID: requestedByUserID,
		SourceErrorID:     errorID,
		Mode:              mode,
		PinnedAccountID:   pinned,
		Status:            opsRetryStatusRunning,
		StartedAt:         startedAt,
	})
	if err != nil {
		var pqErr *pq.Error
		if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
			return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
		}
		return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
	}

	result := &OpsRetryResult{
		AttemptID:         attemptID,
		Mode:              mode,
		Status:            opsRetryStatusFailed,
		PinnedAccountID:   pinned,
		HTTPStatusCode:    0,
		UpstreamRequestID: "",
		ResponsePreview:   "",
		ResponseTruncated: false,
		ErrorMessage:      "",
		StartedAt:         startedAt,
	}

	execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
	defer cancel()

	execRes := s.executeRetry(execCtx, errorLog, mode, pinned)

	finishedAt := time.Now()
	result.FinishedAt = finishedAt
	result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()

	if execRes != nil {
		result.Status = execRes.status
		result.UsedAccountID = execRes.usedAccountID
		result.HTTPStatusCode = execRes.httpStatusCode
		result.UpstreamRequestID = execRes.upstreamRequestID
		result.ResponsePreview = execRes.responsePreview
		result.ResponseTruncated = execRes.responseTruncated
		result.ErrorMessage = execRes.errorMessage
	}

	updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer updateCancel()

	var updateErrMsg *string
	if strings.TrimSpace(result.ErrorMessage) != "" {
		msg := result.ErrorMessage
		updateErrMsg = &msg
	}
	var resultRequestID *string
	if strings.TrimSpace(result.UpstreamRequestID) != "" {
		v := result.UpstreamRequestID
		resultRequestID = &v
	}

	finalStatus := result.Status
	if strings.TrimSpace(finalStatus) == "" {
		finalStatus = opsRetryStatusFailed
	}

	if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
		ID:              attemptID,
		Status:          finalStatus,
		FinishedAt:      finishedAt,
		DurationMs:      result.DurationMs,
		ResultRequestID: resultRequestID,
		ErrorMessage:    updateErrMsg,
	}); err != nil {
		// Best-effort: retry itself already executed; do not fail the API response.
		log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
	}

	return result, nil
}

type opsRetryExecution struct {
	status string

	usedAccountID     *int64
	httpStatusCode    int
	upstreamRequestID string

	responsePreview   string
	responseTruncated bool

	errorMessage string
}

func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
	if errorLog == nil {
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "missing error log",
		}
	}

	reqType := detectOpsRetryType(errorLog.RequestPath)
	bodyBytes := []byte(errorLog.RequestBody)

	switch reqType {
	case opsRetryTypeMessages:
		bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
	case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
		// No-op
	}

	switch strings.ToLower(strings.TrimSpace(mode)) {
	case OpsRetryModeUpstream:
		if pinnedAccountID == nil || *pinnedAccountID <= 0 {
			return &opsRetryExecution{
				status:       opsRetryStatusFailed,
				errorMessage: "pinned_account_id required for upstream retry",
			}
		}
		return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
	case OpsRetryModeClient:
		return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
	default:
		return &opsRetryExecution{
			status:       opsRetryStatusFailed,
			errorMessage: "invalid retry mode",
		}
	}
}

func detectOpsRetryType(path string) opsRetryRequestType {
	p := strings.ToLower(strings.TrimSpace(path))
	switch {
	case strings.Contains(p, "/responses"):
		return opsRetryTypeOpenAI
	case strings.Contains(p, "/v1beta/"):
		return opsRetryTypeGeminiV1B
	default:
		return opsRetryTypeMessages
	}
}

func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
	if s.accountRepo == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
	}

	account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
	if err != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
	}
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
	}
	if !account.IsSchedulable() {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
	}
	if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
		if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
		}
	}

	var release func()
	if s.concurrencyService != nil {
		acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
		if err != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
		}
		if acq == nil || !acq.Acquired {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
		}
		release = acq.ReleaseFunc
	}
	if release != nil {
		defer release()
	}

	usedID := account.ID
	exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
	exec.usedAccountID = &usedID
	if exec.status == "" {
		exec.status = opsRetryStatusFailed
	}
	return exec
}

func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
	groupID := errorLog.GroupID
	if groupID == nil || *groupID <= 0 {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
	}

	model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
	if parsedErr != nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
	}
	_ = stream

	excluded := make(map[int64]struct{})
	switches := 0

	for {
		if switches >= opsRetryMaxAccountSwitches {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
		}

		selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
		if selErr != nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
		}
		if selection == nil || selection.Account == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
		}

		account := selection.Account
		if !selection.Acquired || selection.ReleaseFunc == nil {
			excluded[account.ID] = struct{}{}
			switches++
			continue
		}

		exec := func() *opsRetryExecution {
			defer selection.ReleaseFunc()
			return s.executeWithAccount(ctx, reqType, errorLog, body, account)
		}()

		if exec != nil {
			if exec.status == opsRetryStatusSucceeded {
				usedID := account.ID
				exec.usedAccountID = &usedID
				return exec
			}
			// If the gateway services ask for failover, try another account.
			if s.isFailoverError(exec.errorMessage) {
				excluded[account.ID] = struct{}{}
				switches++
				continue
			}
			usedID := account.ID
			exec.usedAccountID = &usedID
			return exec
		}

		excluded[account.ID] = struct{}{}
		switches++
	}
}

func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
	switch reqType {
	case opsRetryTypeOpenAI:
		if s.openAIGatewayService == nil {
			return nil, fmt.Errorf("openai gateway service not available")
		}
		return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
	case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
		if s.gatewayService == nil {
			return nil, fmt.Errorf("gateway service not available")
		}
		return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
	default:
		return nil, fmt.Errorf("unsupported retry type: %s", reqType)
	}
}

func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
	switch reqType {
	case opsRetryTypeMessages:
		parsed, parseErr := ParseGatewayRequest(body)
		if parseErr != nil {
			return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
		}
		return parsed.Model, parsed.Stream, nil
	case opsRetryTypeOpenAI:
		var v struct {
			Model  string `json:"model"`
			Stream bool   `json:"stream"`
		}
		if err := json.Unmarshal(body, &v); err != nil {
			return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
		}
		return strings.TrimSpace(v.Model), v.Stream, nil
	case opsRetryTypeGeminiV1B:
		if strings.TrimSpace(errorLog.Model) == "" {
			return "", false, fmt.Errorf("missing model for gemini v1beta retry")
		}
		return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
	default:
		return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
	}
}

func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
	if account == nil {
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
	}

	c, w := newOpsRetryContext(ctx, errorLog)

	var err error
	switch reqType {
	case opsRetryTypeOpenAI:
		if s.openAIGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
		}
		_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
	case opsRetryTypeGeminiV1B:
		if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
			return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
		}
		modelName := strings.TrimSpace(errorLog.Model)
		action := "generateContent"
		if errorLog.Stream {
			action = "streamGenerateContent"
		}
		if account.Platform == PlatformAntigravity {
			_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
		} else {
			_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
		}
	case opsRetryTypeMessages:
		switch account.Platform {
		case PlatformAntigravity:
			if s.antigravityGatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
			}
			_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
		case PlatformGemini:
			if s.geminiCompatService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
			}
			_, err = s.geminiCompatService.Forward(ctx, c, account, body)
		default:
			if s.gatewayService == nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
			}
			parsedReq, parseErr := ParseGatewayRequest(body)
			if parseErr != nil {
				return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
			}
			_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
		}
	default:
		return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
	}

	statusCode := http.StatusOK
	if c != nil && c.Writer != nil {
		statusCode = c.Writer.Status()
	}

	upstreamReqID := extractUpstreamRequestID(c)
	preview, truncated := extractResponsePreview(w)

	exec := &opsRetryExecution{
		status:            opsRetryStatusFailed,
		httpStatusCode:    statusCode,
		upstreamRequestID: upstreamReqID,
		responsePreview:   preview,
		responseTruncated: truncated,
		errorMessage:      "",
	}

	if err == nil && statusCode < 400 {
		exec.status = opsRetryStatusSucceeded
		return exec
	}

	if err != nil {
		exec.errorMessage = err.Error()
	} else {
		exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
	}

	return exec
}

func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
	w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
	c, _ := gin.CreateTestContext(w)

	path := "/"
	if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
		path = errorLog.RequestPath
	}

	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
	req.Header.Set("content-type", "application/json")
	if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
		req.Header.Set("user-agent", errorLog.UserAgent)
	}
	// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
	// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
	if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
		var stored map[string]string
		if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
			for k, v := range stored {
				key := strings.TrimSpace(k)
				if key == "" {
					continue
				}
				if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
					continue
				}
				val := strings.TrimSpace(v)
				if val == "" {
					continue
				}
				req.Header.Set(key, val)
			}
		}
	}

	c.Request = req
	return c, w
}

func extractUpstreamRequestID(c *gin.Context) string {
	if c == nil || c.Writer == nil {
		return ""
	}
	h := c.Writer.Header()
	if h == nil {
		return ""
	}
	for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
		if v := strings.TrimSpace(h.Get(key)); v != "" {
			return v
		}
	}
	return ""
}

func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
	if w == nil {
		return "", false
	}
	b := bytes.TrimSpace(w.bodyBytes())
	if len(b) == 0 {
		return "", w.truncated()
	}
	if len(b) > opsRetryResponsePreviewMax {
		return string(b[:opsRetryResponsePreviewMax]), true
	}
	return string(b), w.truncated()
}

func containsInt64(items []int64, needle int64) bool {
	for _, v := range items {
		if v == needle {
			return true
		}
	}
	return false
}

func (s *OpsService) isFailoverError(message string) bool {
	msg := strings.ToLower(strings.TrimSpace(message))
	if msg == "" {
		return false
	}
	return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
}
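A small illustration (not part of the commit) of the capture semantics above: limitedResponseWriter reports a full write to the caller while buffering at most `limit` bytes, and extractResponsePreview surfaces that buffer plus a truncation flag.

	// Illustrative only: write 100 bytes through a 16-byte limited writer.
	func exampleLimitedWriter() {
		w := newLimitedResponseWriter(16)
		n, _ := w.Write([]byte(strings.Repeat("a", 100)))
		fmt.Println(n)                  // 100 — the caller sees a full write
		fmt.Println(len(w.bodyBytes())) // 16  — only the first 16 bytes are kept
		fmt.Println(w.truncated())      // true — more was written than buffered

		preview, truncated := extractResponsePreview(w)
		fmt.Println(len(preview), truncated) // 16 true
	}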
451
backend/internal/service/ops_service.go
Normal file
@@ -0,0 +1,451 @@
package service

import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"log"
	"strings"
	"time"

	"github.com/Wei-Shaw/sub2api/internal/config"
	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")

const (
	opsMaxStoredRequestBodyBytes = 10 * 1024
	opsMaxStoredErrorBodyBytes   = 20 * 1024
)

// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
	opsRepo     OpsRepository
	settingRepo SettingRepository
	cfg         *config.Config

	accountRepo AccountRepository

	concurrencyService        *ConcurrencyService
	gatewayService            *GatewayService
	openAIGatewayService      *OpenAIGatewayService
	geminiCompatService       *GeminiMessagesCompatService
	antigravityGatewayService *AntigravityGatewayService
}

func NewOpsService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	cfg *config.Config,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	gatewayService *GatewayService,
	openAIGatewayService *OpenAIGatewayService,
	geminiCompatService *GeminiMessagesCompatService,
	antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
	return &OpsService{
		opsRepo:     opsRepo,
		settingRepo: settingRepo,
		cfg:         cfg,

		accountRepo: accountRepo,

		concurrencyService:        concurrencyService,
		gatewayService:            gatewayService,
		openAIGatewayService:      openAIGatewayService,
		geminiCompatService:       geminiCompatService,
		antigravityGatewayService: antigravityGatewayService,
	}
}

func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
	if s.IsMonitoringEnabled(ctx) {
		return nil
	}
	return ErrOpsDisabled
}

func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
	// Hard switch: disable ops entirely.
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return false
	}
	if s.settingRepo == nil {
		return true
	}
	value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		// Default enabled when key is missing, and fail-open on transient errors
		// (ops should never block gateway traffic).
		if errors.Is(err, ErrSettingNotFound) {
			return true
		}
		return true
	}
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "false", "0", "off", "disabled":
		return false
	default:
		return true
	}
}

func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
	if entry == nil {
		return nil
	}
	if !s.IsMonitoringEnabled(ctx) {
		return nil
	}
	if s.opsRepo == nil {
		return nil
	}

	// Ensure timestamps are always populated.
	if entry.CreatedAt.IsZero() {
		entry.CreatedAt = time.Now()
	}

	// Ensure required fields exist (DB has NOT NULL constraints).
	entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
	entry.ErrorType = strings.TrimSpace(entry.ErrorType)
	if entry.ErrorPhase == "" {
		entry.ErrorPhase = "internal"
	}
	if entry.ErrorType == "" {
		entry.ErrorType = "api_error"
	}

	// Sanitize + trim request body (errors only).
	if len(rawRequestBody) > 0 {
		sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
		if sanitized != "" {
			entry.RequestBodyJSON = &sanitized
		}
		entry.RequestBodyTruncated = truncated
		entry.RequestBodyBytes = &bytesLen
	}

	// Sanitize + truncate error_body to avoid storing sensitive data.
	if strings.TrimSpace(entry.ErrorBody) != "" {
		sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
		entry.ErrorBody = sanitized
	}

	if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Never bubble up to gateway; best-effort logging.
		log.Printf("[Ops] RecordError failed: %v", err)
		return err
	}
	return nil
}

func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
	}
	return s.opsRepo.ListErrorLogs(ctx, filter)
}

func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
	}
	detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
	if err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
		}
		return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
	}
	return detail, nil
}

func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
	bytesLen = len(raw)
	if len(raw) == 0 {
		return "", false, 0
	}

	var decoded any
	if err := json.Unmarshal(raw, &decoded); err != nil {
		// If it's not valid JSON, don't store (retry would not be reliable anyway).
		return "", false, bytesLen
	}

	decoded = redactSensitiveJSON(decoded)

	encoded, err := json.Marshal(decoded)
	if err != nil {
		return "", false, bytesLen
	}
	if len(encoded) <= maxBytes {
		return string(encoded), false, bytesLen
	}

	// Trim conversation history to keep the most recent context.
	if root, ok := decoded.(map[string]any); ok {
		if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
			encoded2, err2 := json.Marshal(trimmed)
			if err2 == nil && len(encoded2) <= maxBytes {
				return string(encoded2), true, bytesLen
			}
			// Fallthrough: keep shrinking.
			decoded = trimmed
		}

		essential := shrinkToEssentials(root)
		encoded3, err3 := json.Marshal(essential)
		if err3 == nil && len(encoded3) <= maxBytes {
			return string(encoded3), true, bytesLen
		}
	}

	// Last resort: store a minimal placeholder (still valid JSON).
	placeholder := map[string]any{
		"request_body_truncated": true,
	}
	if model := extractString(decoded, "model"); model != "" {
		placeholder["model"] = model
	}
	encoded4, err4 := json.Marshal(placeholder)
	if err4 != nil {
		return "", true, bytesLen
	}
	return string(encoded4), true, bytesLen
}

func redactSensitiveJSON(v any) any {
	switch t := v.(type) {
	case map[string]any:
		out := make(map[string]any, len(t))
		for k, vv := range t {
			if isSensitiveKey(k) {
				out[k] = "[REDACTED]"
				continue
			}
			out[k] = redactSensitiveJSON(vv)
		}
		return out
	case []any:
		out := make([]any, 0, len(t))
		for _, vv := range t {
			out = append(out, redactSensitiveJSON(vv))
		}
		return out
	default:
		return v
	}
}

func isSensitiveKey(key string) bool {
	k := strings.ToLower(strings.TrimSpace(key))
	if k == "" {
		return false
	}

	// Exact matches (common credential fields).
	switch k {
	case "authorization",
		"proxy-authorization",
		"x-api-key",
		"api_key",
		"apikey",
		"access_token",
		"refresh_token",
		"id_token",
		"session_token",
		"token",
		"password",
		"passwd",
		"passphrase",
		"secret",
		"client_secret",
		"private_key",
		"jwt",
		"signature",
		"accesskeyid",
		"secretaccesskey":
		return true
	}

	// Suffix matches.
	for _, suffix := range []string{
		"_secret",
		"_token",
		"_id_token",
		"_session_token",
		"_password",
		"_passwd",
		"_passphrase",
		"_key",
		"secret_key",
		"private_key",
	} {
		if strings.HasSuffix(k, suffix) {
			return true
		}
	}

	// Substring matches (conservative, but errs on the side of privacy).
	for _, sub := range []string{
		"secret",
		"token",
		"password",
		"passwd",
		"passphrase",
		"privatekey",
		"private_key",
		"apikey",
		"api_key",
		"accesskeyid",
		"secretaccesskey",
		"bearer",
		"cookie",
		"credential",
		"session",
		"jwt",
		"signature",
	} {
		if strings.Contains(k, sub) {
			return true
		}
	}

	return false
}

func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
	// Supported: anthropic/openai: messages; gemini: contents.
	if out, ok := trimArrayField(root, "messages", maxBytes); ok {
		return out, true
	}
	if out, ok := trimArrayField(root, "contents", maxBytes); ok {
		return out, true
	}
	return root, false
}

func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
	raw, ok := root[field]
	if !ok {
		return nil, false
	}
	arr, ok := raw.([]any)
	if !ok || len(arr) == 0 {
		return nil, false
	}

	// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
	// We are dropping from the *front* of the array (oldest context first).
	lo := 0
	hi := len(arr) - 1 // inclusive; hi ensures at least one item remains

	var best map[string]any
	found := false

	for lo <= hi {
		mid := (lo + hi) / 2
		candidateArr := arr[mid:]
		if len(candidateArr) == 0 {
			lo = mid + 1
			continue
		}

		next := shallowCopyMap(root)
		next[field] = candidateArr
		encoded, err := json.Marshal(next)
		if err != nil {
			// If marshal fails, try dropping more.
			lo = mid + 1
			continue
		}

		if len(encoded) <= maxBytes {
			best = next
			found = true
			// Try to keep more context by dropping fewer items.
			hi = mid - 1
			continue
		}

		// Need to drop more.
		lo = mid + 1
	}

	if found {
		return best, true
	}

	// Nothing fit (even with only one element); return the smallest slice and let the
	// caller fall back to shrinkToEssentials().
	next := shallowCopyMap(root)
	next[field] = arr[len(arr)-1:]
	return next, true
}

func shrinkToEssentials(root map[string]any) map[string]any {
	out := make(map[string]any)
	for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
		if v, ok := root[key]; ok {
			out[key] = v
		}
	}

	// Keep only the last element of the conversation array.
	if v, ok := root["messages"]; ok {
		if arr, ok := v.([]any); ok && len(arr) > 0 {
			out["messages"] = []any{arr[len(arr)-1]}
		}
	}
	if v, ok := root["contents"]; ok {
		if arr, ok := v.([]any); ok && len(arr) > 0 {
			out["contents"] = []any{arr[len(arr)-1]}
		}
	}
	return out
}

func shallowCopyMap(m map[string]any) map[string]any {
	out := make(map[string]any, len(m))
	for k, v := range m {
		out[k] = v
	}
	return out
}

func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return "", false
	}

	// Prefer JSON-safe sanitization when possible.
	if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
		return out, trunc
	}

	// Non-JSON: best-effort truncate.
	if maxBytes > 0 && len(raw) > maxBytes {
		return truncateString(raw, maxBytes), true
	}
	return raw, false
}

func extractString(v any, key string) string {
	root, ok := v.(map[string]any)
	if !ok {
		return ""
	}
	s, _ := root[key].(string)
	return strings.TrimSpace(s)
}
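A small illustration (not part of the commit) of the sanitization path above: credential-looking keys are replaced with "[REDACTED]", and bodies within the size budget are stored untrimmed.

	// Illustrative only: redaction and size handling for a small request body.
	func exampleSanitizeRequestBody() {
		raw := []byte(`{"model":"claude-3-5-sonnet","api_key":"sk-secret","messages":[{"role":"user","content":"hi"}]}`)
		out, truncated, size := sanitizeAndTrimRequestBody(raw, opsMaxStoredRequestBodyBytes)
		fmt.Println(truncated, size) // false, len(raw) — well under the 10 KiB budget
		fmt.Println(out)             // "api_key" value is stored as "[REDACTED]"
	}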
354
backend/internal/service/ops_settings.go
Normal file
@@ -0,0 +1,354 @@
package service

import (
	"context"
	"encoding/json"
	"errors"
	"strings"
	"time"
)

const (
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)

// =========================
// Email notification config
// =========================

func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
	defaultCfg := defaultOpsEmailNotificationConfig()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			// Initialize defaults on first read (best-effort).
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := &OpsEmailNotificationConfig{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		// Corrupted JSON should not break ops UI; fall back to defaults.
		return defaultCfg, nil
	}
	normalizeOpsEmailNotificationConfig(cfg)
	return cfg, nil
}

func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if req == nil {
		return nil, errors.New("invalid request")
	}

	cfg, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}

	if req.Alert != nil {
		cfg.Alert.Enabled = req.Alert.Enabled
		if req.Alert.Recipients != nil {
			cfg.Alert.Recipients = req.Alert.Recipients
		}
		cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
		cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
		cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
		cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
	}

	if req.Report != nil {
		cfg.Report.Enabled = req.Report.Enabled
		if req.Report.Recipients != nil {
			cfg.Report.Recipients = req.Report.Recipients
		}
		cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
		cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
		cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
		cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
		cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
		cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
		cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
		cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
		cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
		cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
	}

	if err := validateOpsEmailNotificationConfig(cfg); err != nil {
		return nil, err
	}

	normalizeOpsEmailNotificationConfig(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
		return nil, err
	}
	return cfg, nil
}

func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
	return &OpsEmailNotificationConfig{
		Alert: OpsEmailAlertConfig{
			Enabled:               true,
			Recipients:            []string{},
			MinSeverity:           "",
			RateLimitPerHour:      0,
			BatchingWindowSeconds: 0,
			IncludeResolvedAlerts: false,
		},
		Report: OpsEmailReportConfig{
			Enabled:                         false,
			Recipients:                      []string{},
			DailySummaryEnabled:             false,
			DailySummarySchedule:            "0 9 * * *",
			WeeklySummaryEnabled:            false,
			WeeklySummarySchedule:           "0 9 * * 1",
			ErrorDigestEnabled:              false,
			ErrorDigestSchedule:             "0 9 * * *",
			ErrorDigestMinCount:             10,
			AccountHealthEnabled:            false,
			AccountHealthSchedule:           "0 9 * * *",
			AccountHealthErrorRateThreshold: 10.0,
		},
	}
}

func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
	if cfg == nil {
		return
	}
	if cfg.Alert.Recipients == nil {
		cfg.Alert.Recipients = []string{}
	}
	if cfg.Report.Recipients == nil {
		cfg.Report.Recipients = []string{}
	}

	cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
	cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
	cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
	cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
	cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)

	// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
	if cfg.Report.DailySummarySchedule == "" {
		cfg.Report.DailySummarySchedule = "0 9 * * *"
	}
	if cfg.Report.WeeklySummarySchedule == "" {
		cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
	}
	if cfg.Report.ErrorDigestSchedule == "" {
		cfg.Report.ErrorDigestSchedule = "0 9 * * *"
	}
	if cfg.Report.AccountHealthSchedule == "" {
		cfg.Report.AccountHealthSchedule = "0 9 * * *"
	}
}

func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
	if cfg == nil {
		return errors.New("invalid config")
	}

	if cfg.Alert.RateLimitPerHour < 0 {
		return errors.New("alert.rate_limit_per_hour must be >= 0")
	}
	if cfg.Alert.BatchingWindowSeconds < 0 {
		return errors.New("alert.batching_window_seconds must be >= 0")
	}
	switch strings.TrimSpace(cfg.Alert.MinSeverity) {
	case "", "critical", "warning", "info":
	default:
		return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
	}

	if cfg.Report.ErrorDigestMinCount < 0 {
		return errors.New("report.error_digest_min_count must be >= 0")
	}
	if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
		return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
	}
	return nil
}

// =========================
// Alert runtime settings
// =========================

func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
	return &OpsAlertRuntimeSettings{
		EvaluationIntervalSeconds: 60,
		DistributedLock: OpsDistributedLockSettings{
			Enabled:    true,
			Key:        opsAlertEvaluatorLeaderLockKeyDefault,
			TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
		},
		Silencing: OpsAlertSilencingSettings{
			Enabled:            false,
			GlobalUntilRFC3339: "",
			GlobalReason:       "",
			Entries:            []OpsAlertSilenceEntry{},
		},
	}
}

func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
	if s == nil {
		return
	}
	s.Key = strings.TrimSpace(s.Key)
	if s.Key == "" {
		s.Key = defaultKey
	}
	if s.TTLSeconds <= 0 {
		s.TTLSeconds = defaultTTLSeconds
	}
}

func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
	if s == nil {
		return
	}
	s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
	s.GlobalReason = strings.TrimSpace(s.GlobalReason)
	if s.Entries == nil {
		s.Entries = []OpsAlertSilenceEntry{}
	}
	for i := range s.Entries {
		s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
		s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
	}
}

func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
	if strings.TrimSpace(s.Key) == "" {
		return errors.New("distributed_lock.key is required")
	}
	if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
		return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
	}
	return nil
}

func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
	parse := func(raw string) error {
		if strings.TrimSpace(raw) == "" {
			return nil
		}
		if _, err := time.Parse(time.RFC3339, raw); err != nil {
			return errors.New("silencing time must be RFC3339")
		}
		return nil
	}

	if err := parse(s.GlobalUntilRFC3339); err != nil {
		return err
	}
	for _, entry := range s.Entries {
		if strings.TrimSpace(entry.UntilRFC3339) == "" {
			return errors.New("silencing.entries.until_rfc3339 is required")
		}
		if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
			return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
		}
	}
	return nil
}

func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
	defaultCfg := defaultOpsAlertRuntimeSettings()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := &OpsAlertRuntimeSettings{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		return defaultCfg, nil
	}

	if cfg.EvaluationIntervalSeconds <= 0 {
		cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
	}
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	return cfg, nil
}

func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if cfg.DistributedLock.Enabled {
		if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
			return nil, err
		}
	}
	if cfg.Silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
			return nil, err
		}
	}

	defaultCfg := defaultOpsAlertRuntimeSettings()
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
		return nil, err
	}

	// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
	updated := &OpsAlertRuntimeSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
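Usage sketch (illustrative, not part of the diff above): the read-modify-write flow an admin handler might use for the alert runtime settings; the wrapper name is hypothetical.

	// exampleTightenAlertEvaluator lowers the evaluation cadence to 30s while keeping
	// the leader-lock and silencing fields as currently stored.
	func exampleTightenAlertEvaluator(ctx context.Context, s *OpsService) error {
		cfg, err := s.GetOpsAlertRuntimeSettings(ctx)
		if err != nil {
			return err
		}
		cfg.EvaluationIntervalSeconds = 30
		_, err = s.UpdateOpsAlertRuntimeSettings(ctx, cfg)
		return err
	}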
27
backend/internal/service/ops_trends.go
Normal file
@@ -0,0 +1,27 @@
package service

import (
	"context"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	if filter == nil {
		return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
	}
	if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
	}
	if filter.StartTime.After(filter.EndTime) {
		return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
	}
	return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
}
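Usage sketch (illustrative, not part of the diff above): a dashboard caller requesting the last hour of throughput in 5-minute buckets; the wrapper name is hypothetical.

	// exampleThroughputLastHour queries a one-hour window with 300-second buckets.
	func exampleThroughputLastHour(ctx context.Context, s *OpsService) (*OpsThroughputTrendResponse, error) {
		end := time.Now()
		return s.GetThroughputTrend(ctx, &OpsDashboardFilter{
			StartTime: end.Add(-1 * time.Hour),
			EndTime:   end,
		}, 300)
	}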
24
backend/internal/service/ops_window_stats.go
Normal file
@@ -0,0 +1,24 @@
package service

import (
	"context"
	"time"

	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)

// GetWindowStats returns lightweight request/token counts for the provided window.
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	filter := &OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
	}
	return s.opsRepo.GetWindowStats(ctx, filter)
}
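Usage sketch (illustrative, not part of the diff above): a realtime sampler would typically pair this with IsRealtimeMonitoringEnabled and a short trailing window; the function name is hypothetical.

	// sampleRecentWindow fetches counts for the trailing 10 seconds, suitable for a QPS push.
	func sampleRecentWindow(ctx context.Context, s *OpsService) (*OpsWindowStats, error) {
		end := time.Now()
		return s.GetWindowStats(ctx, end.Add(-10*time.Second), end)
	}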
@@ -1,10 +1,12 @@
package service

import (
	"database/sql"
	"time"

	"github.com/Wei-Shaw/sub2api/internal/config"
	"github.com/google/wire"
	"github.com/redis/go-redis/v9"
)

// BuildInfo contains build information
@@ -70,6 +72,57 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
	return svc
}

// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg)
	collector.Start()
	return collector
}

// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation).
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	svc.Start()
	return svc
}

// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	svc.Start()
	return svc
}

// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled).
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	svc.Start()
	return svc
}

// ProviderSet is the Wire provider set for all services
var ProviderSet = wire.NewSet(
	// Core services
@@ -101,6 +154,11 @@ var ProviderSet = wire.NewSet(
	NewAccountUsageService,
	NewAccountTestService,
	NewSettingService,
	NewOpsService,
	ProvideOpsMetricsCollector,
	ProvideOpsAggregationService,
	ProvideOpsAlertEvaluatorService,
	ProvideOpsCleanupService,
	NewEmailService,
	ProvideEmailQueueService,
	NewTurnstileService,

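Because the new Provide* constructors start their background loops eagerly, an injector only needs to include ProviderSet. A minimal sketch, assuming the usual Wire injector pattern; the injector name, import path, and the other provider sets are assumptions, not part of this commit.

	//go:build wireinject

	package main

	import (
		"github.com/Wei-Shaw/sub2api/internal/service"
		"github.com/google/wire"
	)

	// initOpsService is a hypothetical Wire injector; repository/config provider sets
	// would be added alongside service.ProviderSet in the real wiring.
	func initOpsService() (*service.OpsService, error) {
		panic(wire.Build(service.ProviderSet /*, repository.ProviderSet, config.ProviderSet */))
	}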