feat(antigravity): progressive penalty for consecutive INTERNAL 500 errors
When an antigravity account returns 500 "Internal error encountered." on all 3 retry attempts, increment a Redis counter and apply escalating penalties: - 1st round: temp unschedulable 10 minutes - 2nd round: temp unschedulable 10 hours - 3rd round: permanently mark as error Counter resets on any successful response (< 400).
This commit is contained in:
@@ -71,6 +71,11 @@ const (
|
||||
|
||||
// MODEL_CAPACITY_EXHAUSTED 全局去重:重试全部失败后的 cooldown 时间
|
||||
antigravityModelCapacityCooldown = 10 * time.Second
|
||||
|
||||
// INTERNAL 500 渐进惩罚:连续多轮全部返回特定 500 错误时的惩罚时长
|
||||
internal500PenaltyTier1Duration = 10 * time.Minute
|
||||
internal500PenaltyTier2Duration = 10 * time.Hour
|
||||
internal500PenaltyTier3Threshold = 3 // 第 3+ 轮:永久禁用
|
||||
)
|
||||
|
||||
// antigravityPassthroughErrorMessages 透传给客户端的错误消息白名单(小写)
|
||||
@@ -614,6 +619,7 @@ func (s *AntigravityGatewayService) antigravityRetryLoop(p antigravityRetryLoopP
|
||||
urlFallbackLoop:
|
||||
for urlIdx, baseURL := range availableURLs {
|
||||
usedBaseURL = baseURL
|
||||
allAttemptsInternal500 := true // 追踪本轮所有 attempt 是否全部命中 INTERNAL 500
|
||||
for attempt := 1; attempt <= antigravityMaxRetries; attempt++ {
|
||||
select {
|
||||
case <-p.ctx.Done():
|
||||
@@ -766,10 +772,27 @@ urlFallbackLoop:
|
||||
logger.LegacyPrintf("service.antigravity_gateway", "%s status=context_canceled_during_backoff", p.prefix)
|
||||
return nil, p.ctx.Err()
|
||||
}
|
||||
// 追踪 INTERNAL 500:非匹配的 attempt 清除标记
|
||||
if !isAntigravityInternalServerError(resp.StatusCode, respBody) {
|
||||
allAttemptsInternal500 = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// INTERNAL 500 渐进惩罚:3 次重试全部命中特定 500 时递增计数器并惩罚
|
||||
if allAttemptsInternal500 &&
|
||||
isAntigravityInternalServerError(resp.StatusCode, respBody) &&
|
||||
s.internal500Cache != nil {
|
||||
count, incrErr := s.internal500Cache.IncrementInternal500Count(p.ctx, p.account.ID)
|
||||
if incrErr != nil {
|
||||
slog.Error("internal500_counter_increment_failed",
|
||||
"prefix", p.prefix, "account_id", p.account.ID, "error", incrErr)
|
||||
} else {
|
||||
s.applyInternal500Penalty(p.ctx, p.prefix, p.account, count)
|
||||
}
|
||||
}
|
||||
|
||||
// 其他 4xx 错误或重试用尽,直接返回
|
||||
resp = &http.Response{
|
||||
StatusCode: resp.StatusCode,
|
||||
@@ -779,7 +802,13 @@ urlFallbackLoop:
|
||||
break urlFallbackLoop
|
||||
}
|
||||
|
||||
// 成功响应(< 400)
|
||||
// 成功响应(< 400):清零 INTERNAL 500 连续失败计数器
|
||||
if s.internal500Cache != nil {
|
||||
if err := s.internal500Cache.ResetInternal500Count(p.ctx, p.account.ID); err != nil {
|
||||
slog.Error("internal500_counter_reset_failed",
|
||||
"prefix", p.prefix, "account_id", p.account.ID, "error", err)
|
||||
}
|
||||
}
|
||||
break urlFallbackLoop
|
||||
}
|
||||
}
|
||||
@@ -801,6 +830,56 @@ func shouldRetryAntigravityError(statusCode int) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// isAntigravityInternalServerError 检测特定的 INTERNAL 500 错误
|
||||
// 必须同时匹配 error.code==500, error.message=="Internal error encountered.", error.status=="INTERNAL"
|
||||
func isAntigravityInternalServerError(statusCode int, body []byte) bool {
|
||||
if statusCode != http.StatusInternalServerError {
|
||||
return false
|
||||
}
|
||||
return gjson.GetBytes(body, "error.code").Int() == 500 &&
|
||||
gjson.GetBytes(body, "error.message").String() == "Internal error encountered." &&
|
||||
gjson.GetBytes(body, "error.status").String() == "INTERNAL"
|
||||
}
|
||||
|
||||
// applyInternal500Penalty 根据连续 INTERNAL 500 轮次数应用渐进惩罚
|
||||
// count=1: temp_unschedulable 10 分钟
|
||||
// count=2: temp_unschedulable 10 小时
|
||||
// count>=3: SetError 永久禁用
|
||||
func (s *AntigravityGatewayService) applyInternal500Penalty(
|
||||
ctx context.Context, prefix string, account *Account, count int64,
|
||||
) {
|
||||
switch {
|
||||
case count >= int64(internal500PenaltyTier3Threshold):
|
||||
reason := fmt.Sprintf("INTERNAL 500 consecutive failures: %d rounds", count)
|
||||
if err := s.accountRepo.SetError(ctx, account.ID, reason); err != nil {
|
||||
slog.Error("internal500_set_error_failed", "account_id", account.ID, "error", err)
|
||||
return
|
||||
}
|
||||
slog.Warn("internal500_account_disabled",
|
||||
"account_id", account.ID, "account_name", account.Name, "consecutive_count", count)
|
||||
case count == 2:
|
||||
until := time.Now().Add(internal500PenaltyTier2Duration)
|
||||
reason := fmt.Sprintf("INTERNAL 500 x%d (temp unsched 10h)", count)
|
||||
if err := s.accountRepo.SetTempUnschedulable(ctx, account.ID, until, reason); err != nil {
|
||||
slog.Error("internal500_temp_unsched_failed", "account_id", account.ID, "error", err)
|
||||
return
|
||||
}
|
||||
slog.Warn("internal500_temp_unschedulable",
|
||||
"account_id", account.ID, "account_name", account.Name,
|
||||
"duration", internal500PenaltyTier2Duration, "consecutive_count", count)
|
||||
case count == 1:
|
||||
until := time.Now().Add(internal500PenaltyTier1Duration)
|
||||
reason := fmt.Sprintf("INTERNAL 500 x%d (temp unsched 10m)", count)
|
||||
if err := s.accountRepo.SetTempUnschedulable(ctx, account.ID, until, reason); err != nil {
|
||||
slog.Error("internal500_temp_unsched_failed", "account_id", account.ID, "error", err)
|
||||
return
|
||||
}
|
||||
slog.Info("internal500_temp_unschedulable",
|
||||
"account_id", account.ID, "account_name", account.Name,
|
||||
"duration", internal500PenaltyTier1Duration, "consecutive_count", count)
|
||||
}
|
||||
}
|
||||
|
||||
// isURLLevelRateLimit 判断是否为 URL 级别的限流(应切换 URL 重试)
|
||||
// "Resource has been exhausted" 是 URL/节点级别限流,切换 URL 可能成功
|
||||
// "exhausted your capacity on this model" 是账户/模型配额限流,切换 URL 无效
|
||||
@@ -862,6 +941,7 @@ type AntigravityGatewayService struct {
|
||||
settingService *SettingService
|
||||
cache GatewayCache // 用于模型级限流时清除粘性会话绑定
|
||||
schedulerSnapshot *SchedulerSnapshotService
|
||||
internal500Cache Internal500CounterCache // INTERNAL 500 渐进惩罚计数器
|
||||
}
|
||||
|
||||
func NewAntigravityGatewayService(
|
||||
@@ -872,6 +952,7 @@ func NewAntigravityGatewayService(
|
||||
rateLimitService *RateLimitService,
|
||||
httpUpstream HTTPUpstream,
|
||||
settingService *SettingService,
|
||||
internal500Cache Internal500CounterCache,
|
||||
) *AntigravityGatewayService {
|
||||
return &AntigravityGatewayService{
|
||||
accountRepo: accountRepo,
|
||||
@@ -881,6 +962,7 @@ func NewAntigravityGatewayService(
|
||||
settingService: settingService,
|
||||
cache: cache,
|
||||
schedulerSnapshot: schedulerSnapshot,
|
||||
internal500Cache: internal500Cache,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
11
backend/internal/service/internal500_counter.go
Normal file
11
backend/internal/service/internal500_counter.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package service
|
||||
|
||||
import "context"
|
||||
|
||||
// Internal500CounterCache 追踪 Antigravity 账号连续 INTERNAL 500 失败轮数
|
||||
type Internal500CounterCache interface {
|
||||
// IncrementInternal500Count 原子递增计数并返回当前值
|
||||
IncrementInternal500Count(ctx context.Context, accountID int64) (int64, error)
|
||||
// ResetInternal500Count 清零计数器(成功响应时调用)
|
||||
ResetInternal500Count(ctx context.Context, accountID int64) error
|
||||
}
|
||||
Reference in New Issue
Block a user