## Problem
The scheduler snapshot is updated with a 0.5-1 second delay (the Outbox polling interval). In the short window after an account becomes rate-limited or overloaded, it can therefore still be selected, causing request failures.
## Root Cause
Account selection relies on the scheduler snapshot (listSchedulableAccounts), but the snapshot lags behind account state (a refresh-loop sketch follows this list):
- Outbox polling: change events are checked once per second
- Full rebuild: the snapshot is fully rebuilt every 300 seconds
- Window: for 0.5-1 seconds after an account's state changes, the snapshot may still list it as schedulable
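For context, a minimal sketch of the two refresh paths, assuming a `Scheduler` type with `applyOutboxEvents`/`rebuildSnapshot` methods (names are hypothetical; the intervals come from the list above):

```go
package scheduler

import (
	"context"
	"time"
)

// Scheduler is a stand-in for the real scheduler; only the refresh loop matters here.
type Scheduler struct{}

func (s *Scheduler) applyOutboxEvents() { /* apply pending change events to the snapshot */ }
func (s *Scheduler) rebuildSnapshot()   { /* full reload of schedulable accounts */ }

// runSnapshotRefresher drives the two refresh paths. Between outbox ticks the
// snapshot can be up to ~1s stale, which is the window this fix closes.
func (s *Scheduler) runSnapshotRefresher(ctx context.Context) {
	outboxTick := time.NewTicker(1 * time.Second)    // poll change events
	rebuildTick := time.NewTicker(300 * time.Second) // periodic full rebuild
	defer outboxTick.Stop()
	defer rebuildTick.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-outboxTick.C:
			s.applyOutboxEvents()
		case <-rebuildTick.C:
			s.rebuildSnapshot()
		}
	}
}
```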
## Solution
Add a real-time IsSchedulable() check inside the account selection loop as a second line of defense (a sketch follows the lists below):
1. First line of defense: scheduler snapshot filtering (may lag)
2. Second line of defense: real-time IsSchedulable() check (this fix)

IsSchedulable() checks:
- RateLimitResetAt: the rate-limit reset time
- OverloadUntil: the overload expiry time
- TempUnschedulableUntil: the temporary unschedulable-until time
- Status: the account status
- Schedulable: the schedulable flag
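A minimal sketch of the check and where it slots into the selection loop; the `Account` shape, the status constant, and the `IsSchedulable` signature are assumptions based on the field list above, not the actual implementation:

```go
package scheduler

import "time"

// AccountStatus and StatusActive are assumed names for the account status enum.
type AccountStatus string

const StatusActive AccountStatus = "active"

// Account carries only the fields IsSchedulable inspects (names from the list above).
type Account struct {
	Schedulable            bool
	Status                 AccountStatus
	RateLimitResetAt       *time.Time
	OverloadUntil          *time.Time
	TempUnschedulableUntil *time.Time
}

// IsSchedulable is the real-time second line of defense: unlike the snapshot,
// it reads the account's current state at selection time.
func (a *Account) IsSchedulable(now time.Time) bool {
	if a == nil || !a.Schedulable || a.Status != StatusActive {
		return false
	}
	if a.RateLimitResetAt != nil && now.Before(*a.RateLimitResetAt) {
		return false // still rate-limited
	}
	if a.OverloadUntil != nil && now.Before(*a.OverloadUntil) {
		return false // still overloaded
	}
	if a.TempUnschedulableUntil != nil && now.Before(*a.TempUnschedulableUntil) {
		return false // temporarily unschedulable
	}
	return true
}

// selectAccount shows where the check sits: candidates already passed the
// (possibly stale) snapshot filter; the real-time check catches stragglers.
func selectAccount(candidates []*Account) *Account {
	now := time.Now()
	for _, a := range candidates {
		if !a.IsSchedulable(now) {
			continue // snapshot was stale; skip this account
		}
		return a
	}
	return nil
}
```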
## Scope of Changes
### OpenAI Gateway Service
- SelectAccountForModelWithExclusions: add an IsSchedulable() check
- SelectAccountWithLoadAwareness: add an IsSchedulable() check
### Gateway Service (Claude/Gemini/Antigravity)
- Load-aware candidate filtering: add an IsSchedulable() check
- selectAccountForModelWithPlatform: add an IsSchedulable() check
- selectAccountWithMixedScheduling: add an IsSchedulable() check
### Test Cases
- OpenAI: two new test cases verifying that rate-limited accounts are filtered out
- Gateway: two new test cases verifying that rate-limited and overloaded accounts are filtered out (a sketch of their shape follows this list)
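The shape of the new tests, sketched against the hypothetical `Account` type and `selectAccount` helper above (the real tests exercise the actual selection functions):

```go
package scheduler

import (
	"testing"
	"time"
)

func TestSelectAccountSkipsRateLimitedAndOverloaded(t *testing.T) {
	now := time.Now()
	reset := now.Add(5 * time.Minute)
	until := now.Add(30 * time.Second)

	limited := &Account{Schedulable: true, Status: StatusActive, RateLimitResetAt: &reset}
	overloaded := &Account{Schedulable: true, Status: StatusActive, OverloadUntil: &until}
	healthy := &Account{Schedulable: true, Status: StatusActive}

	// All three would appear in a stale snapshot, but only the healthy
	// account should survive the real-time check.
	if got := selectAccount([]*Account{limited, overloaded, healthy}); got != healthy {
		t.Fatalf("expected healthy account, got %+v", got)
	}
}
```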
### Other Fixes
- ops_repo_preagg.go: fix pre-aggregation when platform is NULL (NULL platforms are now mapped to the 'unknown' sentinel so they no longer collide with the overall platform=NULL rows; see the file below)
## Test Results
All unit tests pass ✅
Updated `ops_repo_preagg.go`:
```go
package repository

import (
	"context"
	"database/sql"
	"fmt"
	"time"
)

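// UpsertHourlyMetrics recomputes the hourly pre-aggregates for the
// [startTime, endTime) window from usage_logs and ops_error_logs and
// upserts them into ops_metrics_hourly.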
func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	// NOTE:
	// - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly.
	// - We emit three dimension granularities via GROUPING SETS:
	//   1) overall: (bucket_start)
	//   2) platform: (bucket_start, platform)
	//   3) group: (bucket_start, platform, group_id)
	//
	// IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based
	// unique index; our ON CONFLICT target must match that expression set.
	q := `
	WITH usage_base AS (
		SELECT
			date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			g.platform AS platform,
			ul.group_id AS group_id,
			ul.duration_ms AS duration_ms,
			ul.first_token_ms AS first_token_ms,
			(ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens
		FROM usage_logs ul
		JOIN groups g ON g.id = ul.group_id
		WHERE ul.created_at >= $1 AND ul.created_at < $2
	),
	usage_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) AS success_count,
			COALESCE(SUM(tokens), 0) AS token_consumed,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms,
			AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms,
			MAX(duration_ms) AS duration_max_ms,

			percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms,
			percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms,
			percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms,
			AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms,
			MAX(first_token_ms) AS ttft_max_ms
		FROM usage_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
	),
	error_base AS (
		SELECT
			date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
			-- platform is NULL for some early-phase errors (e.g. before routing); map to a sentinel
			-- value so platform-level GROUPING SETS don't collide with the overall (platform=NULL) row.
			COALESCE(platform, 'unknown') AS platform,
			group_id AS group_id,
			is_business_limited AS is_business_limited,
			error_owner AS error_owner,
			status_code AS client_status_code,
			COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
		FROM ops_error_logs
		-- Exclude count_tokens requests from error metrics as they are informational probes
		WHERE created_at >= $1 AND created_at < $2
			AND is_count_tokens = FALSE
	),
	error_agg AS (
		SELECT
			bucket_start,
			CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
			CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400) AS error_count_total,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND is_business_limited) AS business_limited_count,
			COUNT(*) FILTER (WHERE COALESCE(client_status_code, 0) >= 400 AND NOT is_business_limited) AS error_count_sla,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 429) AS upstream_429_count,
			COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(effective_status_code, 0) = 529) AS upstream_529_count
		FROM error_base
		GROUP BY GROUPING SETS (
			(bucket_start),
			(bucket_start, platform),
			(bucket_start, platform, group_id)
		)
		HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL
	),
	combined AS (
		SELECT
			COALESCE(u.bucket_start, e.bucket_start) AS bucket_start,
			COALESCE(u.platform, e.platform) AS platform,
			COALESCE(u.group_id, e.group_id) AS group_id,

			COALESCE(u.success_count, 0) AS success_count,
			COALESCE(e.error_count_total, 0) AS error_count_total,
			COALESCE(e.business_limited_count, 0) AS business_limited_count,
			COALESCE(e.error_count_sla, 0) AS error_count_sla,
			COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529,
			COALESCE(e.upstream_429_count, 0) AS upstream_429_count,
			COALESCE(e.upstream_529_count, 0) AS upstream_529_count,

			COALESCE(u.token_consumed, 0) AS token_consumed,

			u.duration_p50_ms,
			u.duration_p90_ms,
			u.duration_p95_ms,
			u.duration_p99_ms,
			u.duration_avg_ms,
			u.duration_max_ms,

			u.ttft_p50_ms,
			u.ttft_p90_ms,
			u.ttft_p95_ms,
			u.ttft_p99_ms,
			u.ttft_avg_ms,
			u.ttft_max_ms
		FROM usage_agg u
		FULL OUTER JOIN error_agg e
			ON u.bucket_start = e.bucket_start
			AND COALESCE(u.platform, '') = COALESCE(e.platform, '')
			AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0)
	)
	INSERT INTO ops_metrics_hourly (
		bucket_start,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		bucket_start,
		NULLIF(platform, '') AS platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms::int,
		duration_p90_ms::int,
		duration_p95_ms::int,
		duration_p99_ms::int,
		duration_avg_ms,
		duration_max_ms::int,
		ttft_p50_ms::int,
		ttft_p90_ms::int,
		ttft_p95_ms::int,
		ttft_p99_ms::int,
		ttft_avg_ms,
		ttft_max_ms::int,
		NOW()
	FROM combined
	WHERE bucket_start IS NOT NULL
		AND (platform IS NULL OR platform <> '')
	ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}

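// UpsertDailyMetrics rolls the hourly rows for the [startTime, endTime)
// window up into ops_metrics_daily; percentiles are approximated as
// documented inline in the query.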
func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error {
	if r == nil || r.db == nil {
		return fmt.Errorf("nil ops repository")
	}
	if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
		return nil
	}

	start := startTime.UTC()
	end := endTime.UTC()

	q := `
	INSERT INTO ops_metrics_daily (
		bucket_date,
		platform,
		group_id,
		success_count,
		error_count_total,
		business_limited_count,
		error_count_sla,
		upstream_error_count_excl_429_529,
		upstream_429_count,
		upstream_529_count,
		token_consumed,
		duration_p50_ms,
		duration_p90_ms,
		duration_p95_ms,
		duration_p99_ms,
		duration_avg_ms,
		duration_max_ms,
		ttft_p50_ms,
		ttft_p90_ms,
		ttft_p95_ms,
		ttft_p99_ms,
		ttft_avg_ms,
		ttft_max_ms,
		computed_at
	)
	SELECT
		(bucket_start AT TIME ZONE 'UTC')::date AS bucket_date,
		platform,
		group_id,

		COALESCE(SUM(success_count), 0) AS success_count,
		COALESCE(SUM(error_count_total), 0) AS error_count_total,
		COALESCE(SUM(business_limited_count), 0) AS business_limited_count,
		COALESCE(SUM(error_count_sla), 0) AS error_count_sla,
		COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529,
		COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count,
		COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count,
		COALESCE(SUM(token_consumed), 0) AS token_consumed,

		-- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail).
		ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms,
		ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms,
		MAX(duration_p95_ms) AS duration_p95_ms,
		MAX(duration_p99_ms) AS duration_p99_ms,
		SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms,
		MAX(duration_max_ms) AS duration_max_ms,

		ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms,
		ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms,
		MAX(ttft_p95_ms) AS ttft_p95_ms,
		MAX(ttft_p99_ms) AS ttft_p99_ms,
		SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL)
			/ NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms,
		MAX(ttft_max_ms) AS ttft_max_ms,

		NOW()
	FROM ops_metrics_hourly
	WHERE bucket_start >= $1 AND bucket_start < $2
	GROUP BY 1, 2, 3
	ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
		success_count = EXCLUDED.success_count,
		error_count_total = EXCLUDED.error_count_total,
		business_limited_count = EXCLUDED.business_limited_count,
		error_count_sla = EXCLUDED.error_count_sla,
		upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
		upstream_429_count = EXCLUDED.upstream_429_count,
		upstream_529_count = EXCLUDED.upstream_529_count,
		token_consumed = EXCLUDED.token_consumed,

		duration_p50_ms = EXCLUDED.duration_p50_ms,
		duration_p90_ms = EXCLUDED.duration_p90_ms,
		duration_p95_ms = EXCLUDED.duration_p95_ms,
		duration_p99_ms = EXCLUDED.duration_p99_ms,
		duration_avg_ms = EXCLUDED.duration_avg_ms,
		duration_max_ms = EXCLUDED.duration_max_ms,

		ttft_p50_ms = EXCLUDED.ttft_p50_ms,
		ttft_p90_ms = EXCLUDED.ttft_p90_ms,
		ttft_p95_ms = EXCLUDED.ttft_p95_ms,
		ttft_p99_ms = EXCLUDED.ttft_p99_ms,
		ttft_avg_ms = EXCLUDED.ttft_avg_ms,
		ttft_max_ms = EXCLUDED.ttft_max_ms,

		computed_at = NOW()
	`

	_, err := r.db.ExecContext(ctx, q, start, end)
	return err
}

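// GetLatestHourlyBucketStart returns the most recent hourly bucket present in
// ops_metrics_hourly, or false if the table is empty.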
func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) {
	if r == nil || r.db == nil {
		return time.Time{}, false, fmt.Errorf("nil ops repository")
	}

	var value sql.NullTime
	if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil {
		return time.Time{}, false, err
	}
	if !value.Valid {
		return time.Time{}, false, nil
	}
	return value.Time.UTC(), true, nil
}

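// GetLatestDailyBucketDate returns the most recent daily bucket present in
// ops_metrics_daily (normalized to midnight UTC), or false if the table is empty.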
func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) {
	if r == nil || r.db == nil {
		return time.Time{}, false, fmt.Errorf("nil ops repository")
	}

	var value sql.NullTime
	if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil {
		return time.Time{}, false, err
	}
	if !value.Valid {
		return time.Time{}, false, nil
	}
	t := value.Time.UTC()
	return time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil
}
```