feat: failover and temp-unschedule on Google "Invalid project resource name" 400
Google 后端间歇性返回 400 "Invalid project resource name" 错误。此前该错误直接透传给客户端且不触发账号切换，导致请求失败。
- 在 Antigravity 和 Gemini 两个平台的所有转发路径中，精确匹配该错误消息后触发 failover 自动换号重试
- 命中后将账号临时封禁 1 小时，避免反复调度到同一故障账号
- 提取共享函数 isGoogleProjectConfigError / tempUnscheduleGoogleConfigError，消除跨 Service 的代码重复
This commit is contained in:
@@ -1285,6 +1285,28 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
|
|||||||
|
|
||||||
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, originalModel, 0, "", isStickySession)
|
s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, originalModel, 0, "", isStickySession)
|
||||||
|
|
||||||
|
// 精确匹配服务端配置类 400 错误,触发 failover + 临时封禁
|
||||||
|
if resp.StatusCode == http.StatusBadRequest {
|
||||||
|
msg := strings.ToLower(strings.TrimSpace(extractAntigravityErrorMessage(respBody)))
|
||||||
|
if isGoogleProjectConfigError(msg) {
|
||||||
|
upstreamMsg := sanitizeUpstreamErrorMessage(strings.TrimSpace(extractAntigravityErrorMessage(respBody)))
|
||||||
|
upstreamDetail := s.getUpstreamErrorDetail(respBody)
|
||||||
|
log.Printf("%s status=400 google_config_error failover=true upstream_message=%q account=%d", prefix, upstreamMsg, account.ID)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
AccountName: account.Name,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, prefix)
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: respBody}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
|
||||||
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
|
||||||
@@ -1825,6 +1847,23 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
|
|||||||
// Always record upstream context for Ops error logs, even when we will failover.
|
// Always record upstream context for Ops error logs, even when we will failover.
|
||||||
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail)
|
||||||
|
|
||||||
|
// 精确匹配服务端配置类 400 错误,触发 failover + 临时封禁
|
||||||
|
if resp.StatusCode == http.StatusBadRequest && isGoogleProjectConfigError(strings.ToLower(upstreamMsg)) {
|
||||||
|
log.Printf("%s status=400 google_config_error failover=true upstream_message=%q account=%d", prefix, upstreamMsg, account.ID)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
AccountName: account.Name,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, prefix)
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: unwrappedForOps}
|
||||||
|
}
|
||||||
|
|
||||||
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverUpstreamError(resp.StatusCode) {
|
||||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
Platform: account.Platform,
|
Platform: account.Platform,
|
||||||
@@ -1920,6 +1959,29 @@ func (s *AntigravityGatewayService) shouldFailoverUpstreamError(statusCode int)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isGoogleProjectConfigError reports whether an upstream error message
// describes a known Google server-side project configuration failure.
//
// Only the exact, known server-side message is matched, so ordinary client
// request errors are not retried pointlessly. It applies to every platform
// that goes through the Google backend (Antigravity, Gemini).
//
// lowerMsg is normally the already-extracted, lowercased message. The match
// is nevertheless case-insensitive, so a caller that forgets to lowercase the
// message cannot silently miss the error.
func isGoogleProjectConfigError(lowerMsg string) bool {
	// Intermittent Google bug: the project ID is valid but temporarily fails
	// to be recognized.
	return strings.Contains(strings.ToLower(lowerMsg), "invalid project resource name")
}
|
||||||
|
|
||||||
|
// googleConfigErrorCooldown 服务端配置类 400 错误的临时封禁时长
|
||||||
|
const googleConfigErrorCooldown = 60 * time.Minute
|
||||||
|
|
||||||
|
// tempUnscheduleGoogleConfigError 对服务端配置类 400 错误触发临时封禁,
|
||||||
|
// 避免短时间内反复调度到同一个有问题的账号。
|
||||||
|
func tempUnscheduleGoogleConfigError(ctx context.Context, repo AccountRepository, accountID int64, logPrefix string) {
|
||||||
|
until := time.Now().Add(googleConfigErrorCooldown)
|
||||||
|
reason := "400: invalid project resource name (auto temp-unschedule 1h)"
|
||||||
|
if err := repo.SetTempUnschedulable(ctx, accountID, until, reason); err != nil {
|
||||||
|
log.Printf("%s temp_unschedule_failed account=%d error=%v", logPrefix, accountID, err)
|
||||||
|
} else {
|
||||||
|
log.Printf("%s temp_unscheduled account=%d until=%v reason=%q", logPrefix, accountID, until.Format("15:04:05"), reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// sleepAntigravityBackoffWithContext 带 context 取消检查的退避等待
|
// sleepAntigravityBackoffWithContext 带 context 取消检查的退避等待
|
||||||
// 返回 true 表示正常完成等待,false 表示 context 已取消
|
// 返回 true 表示正常完成等待,false 表示 context 已取消
|
||||||
func sleepAntigravityBackoffWithContext(ctx context.Context, attempt int) bool {
|
func sleepAntigravityBackoffWithContext(ctx context.Context, attempt int) bool {
|
||||||
|
|||||||
@@ -880,6 +880,38 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
|||||||
|
|
||||||
// ErrorPolicyNone → 原有逻辑
|
// ErrorPolicyNone → 原有逻辑
|
||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
|
// 精确匹配服务端配置类 400 错误,触发 failover + 临时封禁
|
||||||
|
if resp.StatusCode == http.StatusBadRequest {
|
||||||
|
msg400 := strings.ToLower(strings.TrimSpace(extractUpstreamErrorMessage(respBody)))
|
||||||
|
if isGoogleProjectConfigError(msg400) {
|
||||||
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
|
if upstreamReqID == "" {
|
||||||
|
upstreamReqID = resp.Header.Get("x-goog-request-id")
|
||||||
|
}
|
||||||
|
upstreamMsg := sanitizeUpstreamErrorMessage(strings.TrimSpace(extractUpstreamErrorMessage(respBody)))
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(respBody), maxBytes)
|
||||||
|
}
|
||||||
|
log.Printf("[Gemini] status=400 google_config_error failover=true upstream_message=%q account=%d", upstreamMsg, account.ID)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
AccountName: account.Name,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: upstreamReqID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, "[Gemini]")
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: respBody}
|
||||||
|
}
|
||||||
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
upstreamReqID := resp.Header.Get(requestIDHeader)
|
upstreamReqID := resp.Header.Get(requestIDHeader)
|
||||||
if upstreamReqID == "" {
|
if upstreamReqID == "" {
|
||||||
@@ -1330,6 +1362,35 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
|||||||
|
|
||||||
// ErrorPolicyNone → 原有逻辑
|
// ErrorPolicyNone → 原有逻辑
|
||||||
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody)
|
||||||
|
// 精确匹配服务端配置类 400 错误,触发 failover + 临时封禁
|
||||||
|
if resp.StatusCode == http.StatusBadRequest {
|
||||||
|
msg400 := strings.ToLower(strings.TrimSpace(extractUpstreamErrorMessage(respBody)))
|
||||||
|
if isGoogleProjectConfigError(msg400) {
|
||||||
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
|
upstreamMsg := sanitizeUpstreamErrorMessage(strings.TrimSpace(extractUpstreamErrorMessage(evBody)))
|
||||||
|
upstreamDetail := ""
|
||||||
|
if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
|
||||||
|
maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 2048
|
||||||
|
}
|
||||||
|
upstreamDetail = truncateString(string(evBody), maxBytes)
|
||||||
|
}
|
||||||
|
log.Printf("[Gemini] status=400 google_config_error failover=true upstream_message=%q account=%d", upstreamMsg, account.ID)
|
||||||
|
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||||
|
Platform: account.Platform,
|
||||||
|
AccountID: account.ID,
|
||||||
|
AccountName: account.Name,
|
||||||
|
UpstreamStatusCode: resp.StatusCode,
|
||||||
|
UpstreamRequestID: requestID,
|
||||||
|
Kind: "failover",
|
||||||
|
Message: upstreamMsg,
|
||||||
|
Detail: upstreamDetail,
|
||||||
|
})
|
||||||
|
tempUnscheduleGoogleConfigError(ctx, s.accountRepo, account.ID, "[Gemini]")
|
||||||
|
return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode, ResponseBody: evBody}
|
||||||
|
}
|
||||||
|
}
|
||||||
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) {
|
||||||
evBody := unwrapIfNeeded(isOAuth, respBody)
|
evBody := unwrapIfNeeded(isOAuth, respBody)
|
||||||
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody))
|
||||||
|
|||||||
Reference in New Issue
Block a user