feat(ops): 增强上游错误追踪和新增定时报告服务
- 优化错误日志中间件,即使请求成功也记录上游重试/故障转移事件 - 新增OpsScheduledReportService支持定时报告功能 - 使用Redis分布式锁确保定时任务单实例执行 - 完善依赖注入配置 - 优化前端错误趋势图表展示
This commit is contained in:
@@ -149,7 +149,7 @@ SELECT
|
||||
error_phase,
|
||||
error_type,
|
||||
severity,
|
||||
COALESCE(status_code, 0),
|
||||
COALESCE(upstream_status_code, status_code, 0),
|
||||
COALESCE(platform, ''),
|
||||
COALESCE(model, ''),
|
||||
duration_ms,
|
||||
@@ -261,7 +261,7 @@ SELECT
|
||||
error_phase,
|
||||
error_type,
|
||||
severity,
|
||||
COALESCE(status_code, 0),
|
||||
COALESCE(upstream_status_code, status_code, 0),
|
||||
COALESCE(platform, ''),
|
||||
COALESCE(model, ''),
|
||||
duration_ms,
|
||||
@@ -605,6 +605,17 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||
args := make([]any, 0, 8)
|
||||
clauses = append(clauses, "1=1")
|
||||
|
||||
phaseFilter := ""
|
||||
if filter != nil {
|
||||
phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase))
|
||||
}
|
||||
// ops_error_logs primarily stores client-visible error requests (status>=400),
|
||||
// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
|
||||
// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
|
||||
if phaseFilter != "upstream" {
|
||||
clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
|
||||
}
|
||||
|
||||
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||
args = append(args, filter.StartTime.UTC())
|
||||
clauses = append(clauses, "created_at >= $"+itoa(len(args)))
|
||||
@@ -626,13 +637,13 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||
args = append(args, *filter.AccountID)
|
||||
clauses = append(clauses, "account_id = $"+itoa(len(args)))
|
||||
}
|
||||
if phase := strings.TrimSpace(filter.Phase); phase != "" {
|
||||
if phase := phaseFilter; phase != "" {
|
||||
args = append(args, phase)
|
||||
clauses = append(clauses, "error_phase = $"+itoa(len(args)))
|
||||
}
|
||||
if len(filter.StatusCodes) > 0 {
|
||||
args = append(args, pq.Array(filter.StatusCodes))
|
||||
clauses = append(clauses, "status_code = ANY($"+itoa(len(args))+")")
|
||||
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
|
||||
}
|
||||
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||
like := "%" + q + "%"
|
||||
|
||||
@@ -815,9 +815,9 @@ func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.Op
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
COALESCE(COUNT(*), 0) AS error_total,
|
||||
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
|
||||
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
|
||||
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
|
||||
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
|
||||
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
|
||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
|
||||
@@ -870,6 +870,7 @@ error_buckets AS (
|
||||
SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt
|
||||
FROM ops_error_logs
|
||||
` + errorWhere + `
|
||||
AND COALESCE(status_code, 0) >= 400
|
||||
GROUP BY 1
|
||||
),
|
||||
combined AS (
|
||||
|
||||
@@ -75,7 +75,8 @@ error_base AS (
|
||||
group_id AS group_id,
|
||||
is_business_limited AS is_business_limited,
|
||||
error_owner AS error_owner,
|
||||
COALESCE(upstream_status_code, status_code) AS status_code
|
||||
status_code AS client_status_code,
|
||||
COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
|
||||
FROM ops_error_logs
|
||||
WHERE created_at >= $1 AND created_at < $2
|
||||
),
|
||||
@@ -84,12 +85,12 @@ error_agg AS (
|
||||
bucket_start,
|
||||
CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
|
||||
CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
|
||||
COUNT(*) AS error_count_total,
|
||||
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited_count,
|
||||
COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_count_sla,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429_count,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529_count
|
||||
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
|
||||
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
|
||||
COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
|
||||
FROM error_base
|
||||
GROUP BY GROUPING SETS (
|
||||
(bucket_start),
|
||||
|
||||
@@ -131,6 +131,7 @@ WITH combined AS (
|
||||
LEFT JOIN groups g ON g.id = o.group_id
|
||||
LEFT JOIN accounts a ON a.id = o.account_id
|
||||
WHERE o.created_at >= $1 AND o.created_at < $2
|
||||
AND COALESCE(o.status_code, 0) >= 400
|
||||
)
|
||||
`
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ error_buckets AS (
|
||||
COUNT(*) AS error_count
|
||||
FROM ops_error_logs
|
||||
` + errorWhere + `
|
||||
AND COALESCE(status_code, 0) >= 400
|
||||
GROUP BY 1
|
||||
),
|
||||
combined AS (
|
||||
@@ -168,6 +169,7 @@ error_totals AS (
|
||||
COUNT(*) AS error_count
|
||||
FROM ops_error_logs
|
||||
WHERE created_at >= $1 AND created_at < $2
|
||||
AND COALESCE(status_code, 0) >= 400
|
||||
GROUP BY 1
|
||||
),
|
||||
combined AS (
|
||||
@@ -240,6 +242,7 @@ error_totals AS (
|
||||
WHERE created_at >= $1 AND created_at < $2
|
||||
AND platform = $3
|
||||
AND group_id IS NOT NULL
|
||||
AND COALESCE(status_code, 0) >= 400
|
||||
GROUP BY 1
|
||||
),
|
||||
combined AS (
|
||||
@@ -413,9 +416,9 @@ func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDa
|
||||
q := `
|
||||
SELECT
|
||||
` + bucketExpr + ` AS bucket,
|
||||
COUNT(*) AS error_total,
|
||||
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited,
|
||||
COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_sla,
|
||||
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400) AS error_total,
|
||||
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited) AS business_limited,
|
||||
COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited) AS error_sla,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)) AS upstream_excl,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429) AS upstream_429,
|
||||
COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529) AS upstream_529
|
||||
@@ -524,12 +527,13 @@ func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *servic
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
COALESCE(status_code, 0) AS status_code,
|
||||
COALESCE(upstream_status_code, status_code, 0) AS status_code,
|
||||
COUNT(*) AS total,
|
||||
COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
|
||||
COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
|
||||
FROM ops_error_logs
|
||||
` + where + `
|
||||
AND COALESCE(status_code, 0) >= 400
|
||||
GROUP BY 1
|
||||
ORDER BY total DESC
|
||||
LIMIT 20`
|
||||
|
||||
Reference in New Issue
Block a user