feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup
Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
This commit is contained in:
@@ -184,6 +184,25 @@ func (c opsCleanupDeletedCounts) String() string {
|
||||
)
|
||||
}
|
||||
|
||||
// opsCleanupPlan 把"保留天数"翻译成具体的清理动作。
|
||||
// - days < 0 → 跳过该项清理(ok=false),保留兼容老数据
|
||||
// - days == 0 → TRUNCATE TABLE(O(1) 全清),truncate=true
|
||||
// - days > 0 → 批量 DELETE 早于 now-N天 的行,cutoff = now - N 天
|
||||
//
|
||||
// 之所以 days==0 走 TRUNCATE 而非"now+24h cutoff + DELETE":
|
||||
// - 速度从 O(N) 降到 O(1),对百万行级表毫秒完成
|
||||
// - 无 WAL 写入、无后续 VACUUM 压力
|
||||
// - 这些 ops 表只有 cleanup 任务自己写,TRUNCATE 的 ACCESS EXCLUSIVE 锁影响可忽略
|
||||
func opsCleanupPlan(now time.Time, days int) (cutoff time.Time, truncate, ok bool) {
|
||||
if days < 0 {
|
||||
return time.Time{}, false, false
|
||||
}
|
||||
if days == 0 {
|
||||
return time.Time{}, true, true
|
||||
}
|
||||
return now.AddDate(0, 0, -days), false, true
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
|
||||
out := opsCleanupDeletedCounts{}
|
||||
if s == nil || s.db == nil || s.cfg == nil {
|
||||
@@ -194,34 +213,42 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
|
||||
|
||||
now := time.Now().UTC()
|
||||
|
||||
// Error-like tables: error logs / retry attempts / alert events.
|
||||
if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
|
||||
// runOne 把"truncate? cutoff? batched delete?"封装到一处,
|
||||
// 让三组清理(错误日志类 / 分钟指标 / 小时+日预聚合)调用方只关心表名和列名。
|
||||
runOne := func(truncate bool, cutoff time.Time, table, timeCol string, castDate bool) (int64, error) {
|
||||
if truncate {
|
||||
return truncateOpsTable(ctx, s.db, table)
|
||||
}
|
||||
return deleteOldRowsByID(ctx, s.db, table, timeCol, cutoff, batchSize, castDate)
|
||||
}
|
||||
|
||||
// Error-like tables: error logs / retry attempts / alert events / system logs / cleanup audits.
|
||||
if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.ErrorLogRetentionDays); ok {
|
||||
n, err := runOne(truncate, cutoff, "ops_error_logs", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.errorLogs = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
|
||||
n, err = runOne(truncate, cutoff, "ops_retry_attempts", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.retryAttempts = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
|
||||
n, err = runOne(truncate, cutoff, "ops_alert_events", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.alertEvents = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_system_logs", "created_at", cutoff, batchSize, false)
|
||||
n, err = runOne(truncate, cutoff, "ops_system_logs", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.systemLogs = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_system_log_cleanup_audits", "created_at", cutoff, batchSize, false)
|
||||
n, err = runOne(truncate, cutoff, "ops_system_log_cleanup_audits", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
@@ -229,9 +256,8 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
|
||||
}
|
||||
|
||||
// Minute-level metrics snapshots.
|
||||
if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
|
||||
if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays); ok {
|
||||
n, err := runOne(truncate, cutoff, "ops_system_metrics", "created_at", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
@@ -239,15 +265,14 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
|
||||
}
|
||||
|
||||
// Pre-aggregation tables (hourly/daily).
|
||||
if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
|
||||
cutoff := now.AddDate(0, 0, -days)
|
||||
n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
|
||||
if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays); ok {
|
||||
n, err := runOne(truncate, cutoff, "ops_metrics_hourly", "bucket_start", false)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
out.hourlyPreagg = n
|
||||
|
||||
n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
|
||||
n, err = runOne(truncate, cutoff, "ops_metrics_daily", "bucket_date", true)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
@@ -303,7 +328,7 @@ WHERE id IN (SELECT id FROM batch)
|
||||
res, err := db.ExecContext(ctx, q, cutoff, batchSize)
|
||||
if err != nil {
|
||||
// If ops tables aren't present yet (partial deployments), treat as no-op.
|
||||
if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
|
||||
if isMissingRelationError(err) {
|
||||
return total, nil
|
||||
}
|
||||
return total, err
|
||||
@@ -320,6 +345,46 @@ WHERE id IN (SELECT id FROM batch)
|
||||
return total, nil
|
||||
}
|
||||
|
||||
// truncateOpsTable 用 TRUNCATE TABLE 清空指定表,先 SELECT COUNT(*) 取得清空前行数用于 heartbeat。
|
||||
//
|
||||
// 与 deleteOldRowsByID 的差异:
|
||||
// - 不可指定 WHERE 条件,仅用于 days==0 的"清空全部"语义
|
||||
// - O(1) 释放表的物理存储页,毫秒级完成,无 WAL 写入、无 VACUUM 压力
|
||||
// - 需要 ACCESS EXCLUSIVE 锁,但 ops 表只有清理任务自己写入,瞬间锁影响可忽略
|
||||
//
|
||||
// 表不存在(部分部署)静默返回 0,与 deleteOldRowsByID 保持一致。
|
||||
func truncateOpsTable(ctx context.Context, db *sql.DB, table string) (int64, error) {
|
||||
if db == nil {
|
||||
return 0, nil
|
||||
}
|
||||
var count int64
|
||||
if err := db.QueryRowContext(ctx, fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&count); err != nil {
|
||||
if isMissingRelationError(err) {
|
||||
return 0, nil
|
||||
}
|
||||
return 0, fmt.Errorf("count %s: %w", table, err)
|
||||
}
|
||||
if count == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
if _, err := db.ExecContext(ctx, fmt.Sprintf("TRUNCATE TABLE %s", table)); err != nil {
|
||||
if isMissingRelationError(err) {
|
||||
return 0, nil
|
||||
}
|
||||
return 0, fmt.Errorf("truncate %s: %w", table, err)
|
||||
}
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// isMissingRelationError 判断 PG 报错是否为"表不存在",用于让清理任务在部分部署场景静默跳过。
|
||||
func isMissingRelationError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
s := strings.ToLower(err.Error())
|
||||
return strings.Contains(s, "does not exist") && strings.Contains(s, "relation")
|
||||
}
|
||||
|
||||
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
|
||||
if s == nil {
|
||||
return nil, false
|
||||
|
||||
64
backend/internal/service/ops_cleanup_service_test.go
Normal file
64
backend/internal/service/ops_cleanup_service_test.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestOpsCleanupPlan(t *testing.T) {
|
||||
now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
days int
|
||||
wantOK bool
|
||||
wantTruncate bool
|
||||
wantCutoff time.Time
|
||||
}{
|
||||
{name: "negative skips", days: -1, wantOK: false},
|
||||
{name: "zero truncates", days: 0, wantOK: true, wantTruncate: true},
|
||||
{name: "positive yields past cutoff", days: 7, wantOK: true, wantCutoff: now.AddDate(0, 0, -7)},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cutoff, truncate, ok := opsCleanupPlan(now, tc.days)
|
||||
if ok != tc.wantOK {
|
||||
t.Fatalf("ok = %v, want %v", ok, tc.wantOK)
|
||||
}
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if truncate != tc.wantTruncate {
|
||||
t.Fatalf("truncate = %v, want %v", truncate, tc.wantTruncate)
|
||||
}
|
||||
if !tc.wantTruncate && !cutoff.Equal(tc.wantCutoff) {
|
||||
t.Fatalf("cutoff = %v, want %v", cutoff, tc.wantCutoff)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsMissingRelationError(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{name: "nil is not missing", err: nil, want: false},
|
||||
{name: "match relation does not exist", err: fakeErr(`pq: relation "ops_error_logs" does not exist`), want: true},
|
||||
{name: "match case-insensitive", err: fakeErr(`ERROR: Relation "x" Does Not Exist`), want: true},
|
||||
{name: "non-matching error", err: fakeErr("connection refused"), want: false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isMissingRelationError(tc.err); got != tc.want {
|
||||
t.Fatalf("got %v, want %v", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type fakeErr string
|
||||
|
||||
func (e fakeErr) Error() string { return string(e) }
|
||||
@@ -387,13 +387,15 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
|
||||
if cfg.DataRetention.CleanupSchedule == "" {
|
||||
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
|
||||
}
|
||||
if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
|
||||
// 保留天数:0 表示每次定时清理全部(清空所有),> 0 表示按天数保留;
|
||||
// 仅在拿到非法的负数时回填默认值,避免覆盖用户主动设的 0。
|
||||
if cfg.DataRetention.ErrorLogRetentionDays < 0 {
|
||||
cfg.DataRetention.ErrorLogRetentionDays = 30
|
||||
}
|
||||
if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
|
||||
if cfg.DataRetention.MinuteMetricsRetentionDays < 0 {
|
||||
cfg.DataRetention.MinuteMetricsRetentionDays = 30
|
||||
}
|
||||
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
|
||||
if cfg.DataRetention.HourlyMetricsRetentionDays < 0 {
|
||||
cfg.DataRetention.HourlyMetricsRetentionDays = 30
|
||||
}
|
||||
// Normalize auto refresh interval (default 30 seconds)
|
||||
@@ -406,14 +408,15 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
||||
if cfg == nil {
|
||||
return errors.New("invalid config")
|
||||
}
|
||||
if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
|
||||
return errors.New("error_log_retention_days must be between 1 and 365")
|
||||
// 保留天数:0 表示每次清理全部,1-365 表示按天数保留。
|
||||
if cfg.DataRetention.ErrorLogRetentionDays < 0 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
|
||||
return errors.New("error_log_retention_days must be between 0 and 365")
|
||||
}
|
||||
if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
|
||||
return errors.New("minute_metrics_retention_days must be between 1 and 365")
|
||||
if cfg.DataRetention.MinuteMetricsRetentionDays < 0 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
|
||||
return errors.New("minute_metrics_retention_days must be between 0 and 365")
|
||||
}
|
||||
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
||||
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
|
||||
if cfg.DataRetention.HourlyMetricsRetentionDays < 0 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
||||
return errors.New("hourly_metrics_retention_days must be between 0 and 365")
|
||||
}
|
||||
if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
|
||||
return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
|
||||
|
||||
Reference in New Issue
Block a user