fix(网关): 修复账号选择中的调度器快照延迟问题

## 问题描述
调度器快照更新存在0.5-1秒的延迟(Outbox轮询间隔),导致在账号被限流或过载后的短时间窗口内,
可能仍会被选中,造成请求失败。

## 根本原因
账号选择逻辑依赖调度器快照(listSchedulableAccounts),但快照更新有延迟:
- Outbox轮询: 每1秒检查一次变更事件
- 全量重建: 每300秒重建一次
- 时间窗口: 账号状态变更后0.5-1秒内,快照可能未更新

## 解决方案
在账号选择循环中添加IsSchedulable()实时检查,作为第二道防线:
1. 第一道防线: 调度器快照过滤(可能有延迟)
2. 第二道防线: IsSchedulable()实时检查(本次修复)

IsSchedulable()会检查:
- RateLimitResetAt: 限流重置时间
- OverloadUntil: 过载持续时间
- TempUnschedulableUntil: 临时不可调度时间
- Status: 账号状态
- Schedulable: 可调度标志

## 修改范围
### OpenAI Gateway Service
- SelectAccountForModelWithExclusions: 添加IsSchedulable()检查
- SelectAccountWithLoadAwareness: 添加IsSchedulable()检查

### Gateway Service (Claude/Gemini/Antigravity)
- 负载感知选择候选账号筛选: 添加IsSchedulable()检查
- selectAccountForModelWithPlatform: 添加IsSchedulable()检查
- selectAccountWithMixedScheduling: 添加IsSchedulable()检查

### 测试用例
- OpenAI: 添加2个测试用例验证限流账号过滤
- Gateway: 添加2个测试用例验证限流和过载账号过滤

### 其他修复
- ops_repo_preagg.go: 修复platform为NULL时的聚合问题

## 测试结果
所有单元测试通过 
This commit is contained in:
ianshaw
2026-01-13 22:49:26 -08:00
parent 55fced3942
commit 25b00abca1
5 changed files with 220 additions and 1 deletions

View File

@@ -1211,6 +1211,72 @@ func TestGatewayService_SelectAccountWithLoadAwareness(t *testing.T) {
require.Nil(t, result)
require.Contains(t, err.Error(), "no available accounts")
})
t.Run("过滤不可调度账号-限流账号被跳过", func(t *testing.T) {
now := time.Now()
resetAt := now.Add(10 * time.Minute)
repo := &mockAccountRepoForPlatform{
accounts: []Account{
{ID: 1, Platform: PlatformAnthropic, Priority: 1, Status: StatusActive, Schedulable: true, Concurrency: 5, RateLimitResetAt: &resetAt},
{ID: 2, Platform: PlatformAnthropic, Priority: 2, Status: StatusActive, Schedulable: true, Concurrency: 5},
},
accountsByID: map[int64]*Account{},
}
for i := range repo.accounts {
repo.accountsByID[repo.accounts[i].ID] = &repo.accounts[i]
}
cache := &mockGatewayCacheForPlatform{}
cfg := testConfig()
cfg.Gateway.Scheduling.LoadBatchEnabled = false
svc := &GatewayService{
accountRepo: repo,
cache: cache,
cfg: cfg,
concurrencyService: nil,
}
result, err := svc.SelectAccountWithLoadAwareness(ctx, nil, "", "claude-3-5-sonnet-20241022", nil)
require.NoError(t, err)
require.NotNil(t, result)
require.NotNil(t, result.Account)
require.Equal(t, int64(2), result.Account.ID, "应跳过限流账号,选择可用账号")
})
t.Run("过滤不可调度账号-过载账号被跳过", func(t *testing.T) {
now := time.Now()
overloadUntil := now.Add(10 * time.Minute)
repo := &mockAccountRepoForPlatform{
accounts: []Account{
{ID: 1, Platform: PlatformAnthropic, Priority: 1, Status: StatusActive, Schedulable: true, Concurrency: 5, OverloadUntil: &overloadUntil},
{ID: 2, Platform: PlatformAnthropic, Priority: 2, Status: StatusActive, Schedulable: true, Concurrency: 5},
},
accountsByID: map[int64]*Account{},
}
for i := range repo.accounts {
repo.accountsByID[repo.accounts[i].ID] = &repo.accounts[i]
}
cache := &mockGatewayCacheForPlatform{}
cfg := testConfig()
cfg.Gateway.Scheduling.LoadBatchEnabled = false
svc := &GatewayService{
accountRepo: repo,
cache: cache,
cfg: cfg,
concurrencyService: nil,
}
result, err := svc.SelectAccountWithLoadAwareness(ctx, nil, "", "claude-3-5-sonnet-20241022", nil)
require.NoError(t, err)
require.NotNil(t, result)
require.NotNil(t, result.Account)
require.Equal(t, int64(2), result.Account.ID, "应跳过过载账号,选择可用账号")
})
}
func TestGatewayService_GroupResolution_ReusesContextGroup(t *testing.T) {