feat(channel-monitor): aggregate history to daily rollups + soft delete
明细只保留 1 天,超过 1 天聚合到新表 channel_monitor_daily_rollups(按 monitor_id/model/bucket_date 维度),聚合保留 30 天。两张表都用 SoftDeleteMixin 软删除(DELETE 自动改为 UPDATE deleted_at = NOW())。 聚合 + 清理任务由 OpsCleanupService 的 cron 统一调度,与运维监控的清理共享 schedule(默认 0 2 * * *)和 leader lock。ChannelMonitorRunner 的 cleanupLoop 被移除,只保留 dueCheckLoop。 读取路径 ComputeAvailability* 改为 UNION 明细(今天 deleted_at IS NULL)+ 聚合(过去 windowDays 天 deleted_at IS NULL),SUM(ok)/SUM(total) 自然加权计算可用率,AVG latency 用 SUM(sum_latency_ms)/SUM(count_latency)。 watermark 表 channel_monitor_aggregation_watermark 单行(id=1),记录 last_aggregated_date,重启后从该日期 +1 继续聚合,首次为 nil 则从 today - 30d 开始回填,单次最多 35 天上限避免长事务。 raw SQL 的 ListLatestPerModel / ListLatestForMonitorIDs / ListRecentHistoryForMonitors 都补上 deleted_at IS NULL 过滤(SoftDeleteMixin interceptor 只对 ent query 生效)。 bump version to 0.1.114.28。GroupBadge 在 MonitorKeyPickerDialog 中复用平台主题色 + 倍率/专属倍率(顺手优化)。
This commit is contained in:
@@ -41,6 +41,20 @@ type ChannelMonitorRepository interface {
|
||||
// ListRecentHistoryForMonitors 批量取多个 monitor 各自主模型(primaryModels[monitorID])最近 perMonitorLimit 条历史。
|
||||
// 返回的 entry 已按 checked_at DESC 排序(最新在前),不含 message 字段。
|
||||
ListRecentHistoryForMonitors(ctx context.Context, ids []int64, primaryModels map[int64]string, perMonitorLimit int) (map[int64][]*ChannelMonitorHistoryEntry, error)
|
||||
|
||||
// ---------- 聚合维护(OpsCleanupService 调用) ----------
|
||||
|
||||
// UpsertDailyRollupsFor 把 targetDate 当天的明细按 (monitor_id, model, bucket_date)
|
||||
// 聚合到 channel_monitor_daily_rollups。targetDate 会被截断到日期;
|
||||
// 用 ON CONFLICT DO UPDATE 实现幂等回填,返回 upsert 影响的行数。
|
||||
UpsertDailyRollupsFor(ctx context.Context, targetDate time.Time) (int64, error)
|
||||
// DeleteRollupsBefore 软删 bucket_date < beforeDate 的聚合行,返回删除行数。
|
||||
DeleteRollupsBefore(ctx context.Context, beforeDate time.Time) (int64, error)
|
||||
// LoadAggregationWatermark 读 watermark(id=1)。
|
||||
// 返回 nil 表示从未聚合过;watermark 表本身预期已存在单行(migration 110 写入)。
|
||||
LoadAggregationWatermark(ctx context.Context) (*time.Time, error)
|
||||
// UpdateAggregationWatermark 写 watermark(UPSERT 到 id=1)。
|
||||
UpdateAggregationWatermark(ctx context.Context, date time.Time) error
|
||||
}
|
||||
|
||||
// ChannelMonitorService 渠道监控管理服务。
|
||||
@@ -300,9 +314,10 @@ func (s *ChannelMonitorService) listDueForCheck(ctx context.Context) ([]*Channel
|
||||
return due, nil
|
||||
}
|
||||
|
||||
// cleanupOldHistory 删除 monitorHistoryRetentionDays 天之前的历史记录。
|
||||
// cleanupOldHistory 删除 monitorHistoryRetentionDays 天之前的明细历史记录。
|
||||
// 由 RunDailyMaintenance 调用;SoftDeleteMixin 自动把 DELETE 改为 UPDATE deleted_at。
|
||||
func (s *ChannelMonitorService) cleanupOldHistory(ctx context.Context) error {
|
||||
before := time.Now().AddDate(0, 0, -monitorHistoryRetentionDays)
|
||||
before := time.Now().UTC().AddDate(0, 0, -monitorHistoryRetentionDays)
|
||||
deleted, err := s.repo.DeleteHistoryBefore(ctx, before)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete history before %s: %w", before.Format(time.RFC3339), err)
|
||||
@@ -314,6 +329,94 @@ func (s *ChannelMonitorService) cleanupOldHistory(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunDailyMaintenance 每日维护任务:聚合昨天之前未聚合的明细,软删过期明细和聚合。
|
||||
// 由 OpsCleanupService 的 cron 调度触发(共享 schedule 和 leader lock)。
|
||||
//
|
||||
// 幂等性:
|
||||
// - watermark 保证已聚合的日期不会重复处理;
|
||||
// - UpsertDailyRollupsFor 内部使用 ON CONFLICT DO UPDATE,同一日重复跑结果一致。
|
||||
//
|
||||
// 每一步失败都只记 slog.Warn,整体函数始终返回 nil 让后续步骤能继续跑
|
||||
// (与 OpsCleanupService.runCleanupOnce 风格一致)。
|
||||
func (s *ChannelMonitorService) RunDailyMaintenance(ctx context.Context) error {
|
||||
now := time.Now().UTC()
|
||||
today := now.Truncate(24 * time.Hour)
|
||||
|
||||
if err := s.runDailyAggregation(ctx, today); err != nil {
|
||||
slog.Warn("channel_monitor: maintenance step failed",
|
||||
"step", "aggregate", "error", err)
|
||||
}
|
||||
if err := s.cleanupOldHistory(ctx); err != nil {
|
||||
slog.Warn("channel_monitor: maintenance step failed",
|
||||
"step", "prune_history", "error", err)
|
||||
}
|
||||
if err := s.cleanupOldRollups(ctx, today); err != nil {
|
||||
slog.Warn("channel_monitor: maintenance step failed",
|
||||
"step", "prune_rollups", "error", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// runDailyAggregation 从 watermark+1 聚合到昨天(UTC)。
|
||||
// 首次跑(watermark nil):从 today-monitorRollupRetentionDays 开始回填。
|
||||
// 每次最多聚合 monitorMaintenanceMaxDaysPerRun 天,避免长事务。
|
||||
func (s *ChannelMonitorService) runDailyAggregation(ctx context.Context, today time.Time) error {
|
||||
watermark, err := s.repo.LoadAggregationWatermark(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("load watermark: %w", err)
|
||||
}
|
||||
|
||||
start := s.resolveAggregationStart(watermark, today)
|
||||
if !start.Before(today) {
|
||||
return nil // 没有需要聚合的日期
|
||||
}
|
||||
|
||||
iterations := 0
|
||||
for d := start; d.Before(today); d = d.Add(24 * time.Hour) {
|
||||
if iterations >= monitorMaintenanceMaxDaysPerRun {
|
||||
slog.Info("channel_monitor: maintenance aggregation capped",
|
||||
"max_days", monitorMaintenanceMaxDaysPerRun,
|
||||
"next_resume", d.Format("2006-01-02"))
|
||||
break
|
||||
}
|
||||
affected, upErr := s.repo.UpsertDailyRollupsFor(ctx, d)
|
||||
if upErr != nil {
|
||||
return fmt.Errorf("upsert rollups for %s: %w", d.Format("2006-01-02"), upErr)
|
||||
}
|
||||
if err := s.repo.UpdateAggregationWatermark(ctx, d); err != nil {
|
||||
return fmt.Errorf("update watermark to %s: %w", d.Format("2006-01-02"), err)
|
||||
}
|
||||
slog.Info("channel_monitor: rollups upserted",
|
||||
"date", d.Format("2006-01-02"), "affected_rows", affected)
|
||||
iterations++
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveAggregationStart 计算本次聚合起点:
|
||||
// - watermark == nil:today - monitorRollupRetentionDays(首次回填最多 30 天)
|
||||
// - watermark != nil:*watermark + 1 day
|
||||
func (s *ChannelMonitorService) resolveAggregationStart(watermark *time.Time, today time.Time) time.Time {
|
||||
if watermark == nil {
|
||||
return today.AddDate(0, 0, -monitorRollupRetentionDays)
|
||||
}
|
||||
return watermark.UTC().Truncate(24 * time.Hour).Add(24 * time.Hour)
|
||||
}
|
||||
|
||||
// cleanupOldRollups 软删 bucket_date < today - monitorRollupRetentionDays 的日聚合行。
|
||||
func (s *ChannelMonitorService) cleanupOldRollups(ctx context.Context, today time.Time) error {
|
||||
cutoff := today.AddDate(0, 0, -monitorRollupRetentionDays)
|
||||
deleted, err := s.repo.DeleteRollupsBefore(ctx, cutoff)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete rollups before %s: %w", cutoff.Format("2006-01-02"), err)
|
||||
}
|
||||
if deleted > 0 {
|
||||
slog.Info("channel_monitor: rollups cleanup",
|
||||
"deleted_rows", deleted, "before", cutoff.Format("2006-01-02"))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------- helpers ----------
|
||||
|
||||
// decryptInPlace 把 ChannelMonitor.APIKey 从密文解密为明文。
|
||||
|
||||
Reference in New Issue
Block a user