feat(ops): 增强上游错误追踪和新增定时报告服务

- 优化错误日志中间件,即使请求成功也记录上游重试/故障转移事件
- 新增OpsScheduledReportService支持定时报告功能
- 使用Redis分布式锁确保定时任务单实例执行
- 完善依赖注入配置
- 优化前端错误趋势图表展示
This commit is contained in:
IanShaw027
2026-01-11 23:00:31 +08:00
parent 8fffcd8091
commit 73b62bb15c
13 changed files with 1021 additions and 30 deletions

View File

@@ -149,7 +149,7 @@ SELECT
error_phase,
error_type,
severity,
COALESCE(status_code, 0),
COALESCE(upstream_status_code, status_code, 0),
COALESCE(platform, ''),
COALESCE(model, ''),
duration_ms,
@@ -261,7 +261,7 @@ SELECT
error_phase,
error_type,
severity,
COALESCE(status_code, 0),
COALESCE(upstream_status_code, status_code, 0),
COALESCE(platform, ''),
COALESCE(model, ''),
duration_ms,
@@ -605,6 +605,17 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
args := make([]any, 0, 8)
clauses = append(clauses, "1=1")
phaseFilter := ""
if filter != nil {
phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase))
}
// ops_error_logs primarily stores client-visible error requests (status>=400),
// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
if phaseFilter != "upstream" {
clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
}
if filter.StartTime != nil && !filter.StartTime.IsZero() {
args = append(args, filter.StartTime.UTC())
clauses = append(clauses, "created_at >= $"+itoa(len(args)))
@@ -626,13 +637,13 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
args = append(args, *filter.AccountID)
clauses = append(clauses, "account_id = $"+itoa(len(args)))
}
if phase := strings.TrimSpace(filter.Phase); phase != "" {
if phase := phaseFilter; phase != "" {
args = append(args, phase)
clauses = append(clauses, "error_phase = $"+itoa(len(args)))
}
if len(filter.StatusCodes) > 0 {
args = append(args, pq.Array(filter.StatusCodes))
clauses = append(clauses, "status_code = ANY($"+itoa(len(args))+")")
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
}
if q := strings.TrimSpace(filter.Query); q != "" {
like := "%" + q + "%"

View File

@@ -815,9 +815,9 @@ func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.Op
q := `
SELECT
COALESCE(COUNT(*), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
@@ -870,6 +870,7 @@ error_buckets AS (
SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt
FROM ops_error_logs
` + errorWhere + `
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
combined AS (

View File

@@ -75,7 +75,8 @@ error_base AS (
group_id AS group_id,
is_business_limited AS is_business_limited,
error_owner AS error_owner,
COALESCE(upstream_status_code, status_code) AS status_code
status_code AS client_status_code,
COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2
),
@@ -84,12 +85,12 @@ error_agg AS (
bucket_start,
CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
COUNT(*) AS error_count_total,
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited_count,
COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_count_sla,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429_count,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529_count
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
FROM error_base
GROUP BY GROUPING SETS (
(bucket_start),

View File

@@ -131,6 +131,7 @@ WITH combined AS (
LEFT JOIN groups g ON g.id = o.group_id
LEFT JOIN accounts a ON a.id = o.account_id
WHERE o.created_at >= $1 AND o.created_at < $2
AND COALESCE(o.status_code, 0) >= 400
)
`

View File

@@ -53,6 +53,7 @@ error_buckets AS (
COUNT(*) AS error_count
FROM ops_error_logs
` + errorWhere + `
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
combined AS (
@@ -168,6 +169,7 @@ error_totals AS (
COUNT(*) AS error_count
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
combined AS (
@@ -240,6 +242,7 @@ error_totals AS (
WHERE created_at >= $1 AND created_at < $2
AND platform = $3
AND group_id IS NOT NULL
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
combined AS (
@@ -413,9 +416,9 @@ func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDa
q := `
SELECT
` + bucketExpr + ` AS bucket,
COUNT(*) AS error_total,
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited,
COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_sla,
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total,
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited,
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429,
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529
@@ -524,12 +527,13 @@ func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *servic
q := `
SELECT
COALESCE(status_code, 0) AS status_code,
COALESCE(upstream_status_code, status_code, 0) AS status_code,
COUNT(*) AS total,
COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
FROM ops_error_logs
` + where + `
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
ORDER BY total DESC
LIMIT 20`