feat: same-account retry before failover for transient errors

For retryable transient errors (Google 400 "invalid project resource name"
and empty stream responses), retry on the same account up to 2 times
(with 500ms delay) before switching to another account.

- Add RetryableOnSameAccount field to UpstreamFailoverError
- Add same-account retry loop in both Gemini and Claude/OpenAI handler paths
- Move temp-unschedule from service layer to handler layer (only after
  all same-account retries are exhausted)
- Reduce temp-unschedule cooldown from 30 minutes to 1 minute
This commit is contained in:
Edric Li
2026-02-10 00:53:54 +08:00
parent 61c73287dc
commit d6c2921f2b
4 changed files with 91 additions and 33 deletions

View File

@@ -908,8 +908,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
Message: upstreamMsg,
Detail: upstreamDetail,
})
-tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, "[Gemini]")
-return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: respBody}
+return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: respBody, RetryableOnSameAccount: true}
}
}
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
@@ -1387,8 +1386,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
Message: upstreamMsg,
Detail: upstreamDetail,
})
-tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, "[Gemini]")
-return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: evBody}
+return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: evBody, RetryableOnSameAccount: true}
}
}
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {