feat(monitor): 30-day raw retention + timeline 4-tier style + CC template seed + JSON format button

- History retention 1d → 30d(60s × 30d ≈ 43200 行/model,PG 无压力);
  ComputeAvailability* 不再 UNION rollup 表,直接扫 histories 精度更高。
- Timeline bar 四级高度+颜色双重编码:operational 高+绿 / degraded 中+黄 /
  failed+error 短+红 / 未测试 很短+灰。
- migration 113 seed「Claude Code 伪装」模板(ON CONFLICT DO NOTHING)。
  user_id 用 legacy 格式(user_<64hex>_account_<uuid>_session_<uuid>),
  避免新版 JSON 字符串内嵌 JSON 在编辑器里一长串 \" 难读。
- MonitorAdvancedRequestConfig 加「格式化」按钮 + white-space:pre
  让 body textarea 对长字符串不压扁。
This commit is contained in:
erio
2026-04-21 15:24:48 +08:00
parent 6925ac25c4
commit a7415d4d2e
7 changed files with 102 additions and 64 deletions

View File

@@ -297,41 +297,22 @@ func assignNullInt(dst **int, n sql.NullInt64) {
// "可用" = status IN (operational, degraded)。
//
// 数据来源:明细表只保留 1 天;窗口前其余天数走聚合表。
// - raw = 今天CURRENT_DATE 起)的未软删明细,按 model 累加
// - rollup = [CURRENT_DATE - windowDays, CURRENT_DATE) 区间的聚合行
//
// 总窗口为 "今天 + 过去 windowDays 天",比 windowDays 字面值大 1 天,但因为聚合
// 是按整 UTC 日切的,这是聚合化无法避免的精度损失,且偏宽不偏窄(数据更全)。
// 明细保留 30 天monitorHistoryRetentionDays窗口 <= 30 天时直接扫 histories
// 精度到秒,避免与聚合表 UNION 带来的 UTC 日切精度损失。
func (r *channelMonitorRepository) ComputeAvailability(ctx context.Context, monitorID int64, windowDays int) ([]*service.ChannelMonitorAvailability, error) {
if windowDays <= 0 {
windowDays = 7
}
const q = `
WITH raw AS (
SELECT model,
COUNT(*) AS total_checks,
COUNT(*) FILTER (WHERE status IN ('operational','degraded')) AS ok_count,
COALESCE(SUM(latency_ms) FILTER (WHERE latency_ms IS NOT NULL), 0) AS sum_latency_ms,
COUNT(latency_ms) AS count_latency
FROM channel_monitor_histories
WHERE monitor_id = $1
AND checked_at >= CURRENT_DATE
GROUP BY model
),
rollup AS (
SELECT model, total_checks, ok_count, sum_latency_ms, count_latency
FROM channel_monitor_daily_rollups
WHERE monitor_id = $1
AND bucket_date >= (CURRENT_DATE - $2::int)
AND bucket_date < CURRENT_DATE
)
SELECT model,
SUM(total_checks) AS total,
SUM(ok_count) AS ok,
CASE WHEN SUM(count_latency) > 0
THEN SUM(sum_latency_ms)::float8 / SUM(count_latency)
ELSE NULL END AS avg_latency_ms
FROM (SELECT * FROM raw UNION ALL SELECT * FROM rollup) combined
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status IN ('operational','degraded')) AS ok,
CASE WHEN COUNT(latency_ms) > 0
THEN SUM(latency_ms) FILTER (WHERE latency_ms IS NOT NULL)::float8 / COUNT(latency_ms)
ELSE NULL END AS avg_latency_ms
FROM channel_monitor_histories
WHERE monitor_id = $1
AND checked_at >= NOW() - ($2::int || ' days')::interval
GROUP BY model
`
rows, err := r.db.QueryContext(ctx, q, monitorID, windowDays)
@@ -514,7 +495,7 @@ func clampTimelineLimit(n int) int {
}
// ComputeAvailabilityForMonitors 一次性计算多个监控在某个窗口内的每模型可用率与平均延迟。
// 与单 monitor 版本同构:明细只覆盖今天,更早走聚合表 UNION 合并
// 明细保留 30 天,直接扫 histories窗口 <= 30 天时无需聚合)
func (r *channelMonitorRepository) ComputeAvailabilityForMonitors(ctx context.Context, ids []int64, windowDays int) (map[int64][]*service.ChannelMonitorAvailability, error) {
out := make(map[int64][]*service.ChannelMonitorAvailability, len(ids))
if len(ids) == 0 {
@@ -524,33 +505,16 @@ func (r *channelMonitorRepository) ComputeAvailabilityForMonitors(ctx context.Co
windowDays = 7
}
const q = `
WITH raw AS (
SELECT monitor_id,
model,
COUNT(*) AS total_checks,
COUNT(*) FILTER (WHERE status IN ('operational','degraded')) AS ok_count,
COALESCE(SUM(latency_ms) FILTER (WHERE latency_ms IS NOT NULL), 0) AS sum_latency_ms,
COUNT(latency_ms) AS count_latency
FROM channel_monitor_histories
WHERE monitor_id = ANY($1)
AND checked_at >= CURRENT_DATE
GROUP BY monitor_id, model
),
rollup AS (
SELECT monitor_id, model, total_checks, ok_count, sum_latency_ms, count_latency
FROM channel_monitor_daily_rollups
WHERE monitor_id = ANY($1)
AND bucket_date >= (CURRENT_DATE - $2::int)
AND bucket_date < CURRENT_DATE
)
SELECT monitor_id,
model,
SUM(total_checks) AS total,
SUM(ok_count) AS ok,
CASE WHEN SUM(count_latency) > 0
THEN SUM(sum_latency_ms)::float8 / SUM(count_latency)
ELSE NULL END AS avg_latency_ms
FROM (SELECT * FROM raw UNION ALL SELECT * FROM rollup) combined
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status IN ('operational','degraded')) AS ok,
CASE WHEN COUNT(latency_ms) > 0
THEN SUM(latency_ms) FILTER (WHERE latency_ms IS NOT NULL)::float8 / COUNT(latency_ms)
ELSE NULL END AS avg_latency_ms
FROM channel_monitor_histories
WHERE monitor_id = ANY($1)
AND checked_at >= NOW() - ($2::int || ' days')::interval
GROUP BY monitor_id, model
`
rows, err := r.db.QueryContext(ctx, q, pq.Array(ids), windowDays)