fix(antigravity): fast-fail on proxy unavailable, temp-unschedule account

## Problem

When a proxy is unreachable, token refresh retries up to 4 times with
a 30s timeout each, causing requests to hang for ~2 minutes before
failing with a generic 502 error. The failed account is not marked as
unschedulable, so subsequent requests keep being routed to it.

## Changes

### Proxy connection fast-fail
- Set TCP dial timeout to 5s and TLS handshake timeout to 5s on
  antigravity client, so proxy connectivity issues fail within 5s
  instead of 30s
- Reduce overall HTTP client timeout from 30s to 10s
- Export `IsConnectionError` for service-layer use
- Detect proxy connection errors in `RefreshToken` and return
  immediately with "proxy unavailable" error (no retries)

### Token refresh temp-unschedulable
- Add 8s context timeout for token refresh on request path
- Mark account as temp-unschedulable for 10min when refresh fails
  (both background `TokenRefreshService` and request-path
  `GetAccessToken`)
- Sync temp-unschedulable state to Redis cache for immediate
  scheduler effect
- Inject `TempUnschedCache` into `AntigravityTokenProvider`

### Account failover
- Return `UpstreamFailoverError` on `GetAccessToken` failure in
  `Forward`/`ForwardGemini` to trigger handler-level account switch
  instead of returning 502 directly

### Proxy probe alignment
- Apply same 5s dial/TLS timeout to shared `httpclient` pool
- Reduce proxy probe timeout from 30s to 10s
This commit is contained in:
erio
2026-03-19 23:48:37 +08:00
parent 0236b97d49
commit 528ff5d28c
10 changed files with 125 additions and 20 deletions

View File

@@ -1359,7 +1359,10 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
}
accessToken, err := s.tokenProvider.GetAccessToken(ctx, account)
if err != nil {
return nil, s.writeClaudeError(c, http.StatusBadGateway, "authentication_error", "Failed to get upstream access token")
return nil, &UpstreamFailoverError{
StatusCode: http.StatusBadGateway,
ResponseBody: []byte(`{"error":{"type":"authentication_error","message":"Failed to get upstream access token"},"type":"error"}`),
}
}
// Fetch project_id (some account types may not have one)
@@ -2101,7 +2104,10 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
}
accessToken, err := s.tokenProvider.GetAccessToken(ctx, account)
if err != nil {
return nil, s.writeGoogleError(c, http.StatusBadGateway, "Failed to get upstream access token")
return nil, &UpstreamFailoverError{
StatusCode: http.StatusBadGateway,
ResponseBody: []byte(`{"error":{"message":"Failed to get upstream access token","status":"UNAVAILABLE"}}`),
}
}
// Fetch project_id (some account types may not have one)

View File

@@ -192,6 +192,10 @@ func (s *AntigravityOAuthService) RefreshToken(ctx context.Context, refreshToken
if isNonRetryableAntigravityOAuthError(err) {
return nil, err
}
// Proxy connection errors (TCP timeout, connection refused, DNS failure) are not retried; return immediately.
if antigravity.IsConnectionError(err) {
return nil, fmt.Errorf("proxy unavailable: %w", err)
}
lastErr = err
}

View File

@@ -14,6 +14,10 @@ const (
antigravityTokenRefreshSkew = 3 * time.Minute
antigravityTokenCacheSkew = 5 * time.Minute
antigravityBackfillCooldown = 5 * time.Minute
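// antigravityRequestRefreshTimeout is the maximum time to wait for a token refresh on the request path.
// Once it is exceeded, the refresh is abandoned, the account is marked temporarily unschedulable and failover is triggered,
// leaving the background TokenRefreshService to retry in its next cycle.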
antigravityRequestRefreshTimeout = 8 * time.Second
)
// AntigravityTokenCache token cache interface.
@@ -28,6 +32,7 @@ type AntigravityTokenProvider struct {
refreshAPI *OAuthRefreshAPI
executor OAuthRefreshExecutor
refreshPolicy ProviderRefreshPolicy
tempUnschedCache TempUnschedCache // 用于同步更新 Redis 临时不可调度缓存
}
func NewAntigravityTokenProvider(
@@ -54,6 +59,11 @@ func (p *AntigravityTokenProvider) SetRefreshPolicy(policy ProviderRefreshPolicy
p.refreshPolicy = policy
}
// SetTempUnschedCache stores the given TempUnschedCache on the provider.
// markTempUnschedulable mirrors the temp-unschedulable state into this cache
// when it is non-nil; passing nil disables that mirroring.
func (p *AntigravityTokenProvider) SetTempUnschedCache(cache TempUnschedCache) {
	p.tempUnschedCache = cache
}
// GetAccessToken returns a valid access_token.
func (p *AntigravityTokenProvider) GetAccessToken(ctx context.Context, account *Account) (string, error) {
if account == nil {
@@ -88,8 +98,13 @@ func (p *AntigravityTokenProvider) GetAccessToken(ctx context.Context, account *
expiresAt := account.GetCredentialAsTime("expires_at")
needsRefresh := expiresAt == nil || time.Until(*expiresAt) <= antigravityTokenRefreshSkew
if needsRefresh && p.refreshAPI != nil && p.executor != nil {
result, err := p.refreshAPI.RefreshIfNeeded(ctx, account, p.executor, antigravityTokenRefreshSkew)
// 请求路径使用短超时,避免代理不通时阻塞过久(后台刷新服务会继续重试)
refreshCtx, cancel := context.WithTimeout(ctx, antigravityRequestRefreshTimeout)
defer cancel()
result, err := p.refreshAPI.RefreshIfNeeded(refreshCtx, account, p.executor, antigravityTokenRefreshSkew)
if err != nil {
// 标记账号临时不可调度,避免后续请求继续命中
p.markTempUnschedulable(account, err)
if p.refreshPolicy.OnRefreshError == ProviderRefreshErrorReturn {
return "", err
}
@@ -172,6 +187,45 @@ func (p *AntigravityTokenProvider) shouldAttemptBackfill(accountID int64) bool {
return true
}
// markTempUnschedulable marks the account as temporarily unschedulable after a
// token refresh failure on the request path.
//
// The state is written through accountRepo first; only when that write
// succeeds is it mirrored into tempUnschedCache (if one has been injected), so
// the cache never advertises a state the repository does not hold. Failures
// are logged and otherwise ignored: this is best-effort bookkeeping and must
// not mask the original refresh error.
//
// A context detached from the request is used because the request context may
// already have expired. It is bounded by a short timeout so that a stalled
// store cannot block the request path indefinitely.
//
// refreshErr may be nil, in which case the recorded reason carries no detail.
func (p *AntigravityTokenProvider) markTempUnschedulable(account *Account, refreshErr error) {
	if p.accountRepo == nil || account == nil {
		return
	}

	// Upper bound for the repository write plus the cache write combined.
	const persistTimeout = 5 * time.Second

	now := time.Now()
	until := now.Add(tokenRefreshTempUnschedDuration)

	reason := "token refresh failed on request path"
	if refreshErr != nil {
		reason += ": " + refreshErr.Error()
	}

	bgCtx, cancel := context.WithTimeout(context.Background(), persistTimeout)
	defer cancel()

	if err := p.accountRepo.SetTempUnschedulable(bgCtx, account.ID, until, reason); err != nil {
		slog.Warn("antigravity_token_provider.set_temp_unschedulable_failed",
			"account_id", account.ID,
			"error", err,
		)
		return
	}
	slog.Warn("antigravity_token_provider.temp_unschedulable_set",
		"account_id", account.ID,
		"until", until.Format(time.RFC3339),
		"reason", reason,
	)

	// Mirror the state into the cache so readers of the cache see it without
	// waiting for the repository state to propagate.
	if p.tempUnschedCache == nil {
		return
	}
	state := &TempUnschedState{
		UntilUnix:       until.Unix(),
		TriggeredAtUnix: now.Unix(),
		ErrorMessage:    reason,
	}
	if err := p.tempUnschedCache.SetTempUnsched(bgCtx, account.ID, state); err != nil {
		slog.Warn("antigravity_token_provider.temp_unsched_cache_set_failed",
			"account_id", account.ID,
			"error", err,
		)
	}
}
// markBackfillAttempted records the current time in backfillCooldown as the
// most recent backfill attempt for the given account ID.
func (p *AntigravityTokenProvider) markBackfillAttempted(accountID int64) {
	attemptedAt := time.Now()
	p.backfillCooldown.Store(accountID, attemptedAt)
}

View File

@@ -12,6 +12,9 @@ import (
"github.com/Wei-Shaw/sub2api/internal/config"
)
// tokenRefreshTempUnschedDuration is how long an account stays temporarily
// unschedulable after a token refresh failure. It is applied both when the
// background refresh exhausts its retries and when a refresh fails on the
// request path.
const tokenRefreshTempUnschedDuration time.Duration = 10 * time.Minute
// TokenRefreshService OAuth token自动刷新服务
// 定期检查并刷新即将过期的token
type TokenRefreshService struct {
@@ -317,7 +320,7 @@ func (s *TokenRefreshService) refreshWithRetry(ctx context.Context, account *Acc
}
}
// 可重试错误耗尽:仅记录日志,不标记 error可能是临时网络问题下个周期继续重试
// 可重试错误耗尽:临时标记账号不可调度,避免请求路径反复命中已知失败的账号
slog.Warn("token_refresh.retry_exhausted",
"account_id", account.ID,
"platform", account.Platform,
@@ -325,6 +328,21 @@ func (s *TokenRefreshService) refreshWithRetry(ctx context.Context, account *Acc
"error", lastErr,
)
// Mark the account temporarily unschedulable for 10 minutes (without flagging an error; status stays active so the next refresh cycle can retry).
until := time.Now().Add(tokenRefreshTempUnschedDuration)
reason := fmt.Sprintf("token refresh retry exhausted: %v", lastErr)
if setErr := s.accountRepo.SetTempUnschedulable(ctx, account.ID, until, reason); setErr != nil {
slog.Warn("token_refresh.set_temp_unschedulable_failed",
"account_id", account.ID,
"error", setErr,
)
} else {
slog.Info("token_refresh.temp_unschedulable_set",
"account_id", account.ID,
"until", until.Format(time.RFC3339),
)
}
return lastErr
}

View File

@@ -114,11 +114,13 @@ func ProvideAntigravityTokenProvider(
tokenCache GeminiTokenCache,
antigravityOAuthService *AntigravityOAuthService,
refreshAPI *OAuthRefreshAPI,
tempUnschedCache TempUnschedCache,
) *AntigravityTokenProvider {
p := NewAntigravityTokenProvider(accountRepo, tokenCache, antigravityOAuthService)
executor := NewAntigravityTokenRefresher(antigravityOAuthService)
p.SetRefreshAPI(refreshAPI, executor)
p.SetRefreshPolicy(AntigravityProviderRefreshPolicy())
p.SetTempUnschedCache(tempUnschedCache)
return p
}