Key changes: - Upgrade model mapping: Opus 4.5 → Opus 4.6-thinking with precise matching - Unified rate limiting: scope-level → model-level with Redis snapshot sync - Load-balanced scheduling by call count with smart retry mechanism - Force cache billing support - Model identity injection in prompts with leak prevention - Thinking mode auto-handling (max_tokens/budget_tokens fix) - Frontend: whitelist mode toggle, model mapping validation, status indicators - Gemini session fallback with Redis Trie O(L) matching - Ops: enhanced concurrency monitoring, account availability, retry logic - Migration scripts: 049-051 for model mapping unification
726 lines
22 KiB
Go
726 lines
22 KiB
Go
package service
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
|
|
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
|
"github.com/gin-gonic/gin"
|
|
"github.com/lib/pq"
|
|
)
|
|
|
|
const (
|
|
OpsRetryModeClient = "client"
|
|
OpsRetryModeUpstream = "upstream"
|
|
)
|
|
|
|
const (
|
|
opsRetryStatusRunning = "running"
|
|
opsRetryStatusSucceeded = "succeeded"
|
|
opsRetryStatusFailed = "failed"
|
|
)
|
|
|
|
const (
|
|
opsRetryTimeout = 60 * time.Second
|
|
opsRetryCaptureBytesLimit = 64 * 1024
|
|
opsRetryResponsePreviewMax = 8 * 1024
|
|
opsRetryMinIntervalPerError = 10 * time.Second
|
|
opsRetryMaxAccountSwitches = 3
|
|
)
|
|
|
|
var opsRetryRequestHeaderAllowlist = map[string]bool{
|
|
"anthropic-beta": true,
|
|
"anthropic-version": true,
|
|
}
|
|
|
|
type opsRetryRequestType string
|
|
|
|
const (
|
|
opsRetryTypeMessages opsRetryRequestType = "messages"
|
|
opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
|
|
opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
|
|
)
|
|
|
|
type limitedResponseWriter struct {
|
|
header http.Header
|
|
wroteHeader bool
|
|
|
|
limit int
|
|
totalWritten int64
|
|
buf bytes.Buffer
|
|
}
|
|
|
|
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
|
|
if limit <= 0 {
|
|
limit = 1
|
|
}
|
|
return &limitedResponseWriter{
|
|
header: make(http.Header),
|
|
limit: limit,
|
|
}
|
|
}
|
|
|
|
func (w *limitedResponseWriter) Header() http.Header {
|
|
return w.header
|
|
}
|
|
|
|
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
|
|
if w.wroteHeader {
|
|
return
|
|
}
|
|
w.wroteHeader = true
|
|
}
|
|
|
|
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
|
|
if !w.wroteHeader {
|
|
w.WriteHeader(http.StatusOK)
|
|
}
|
|
w.totalWritten += int64(len(p))
|
|
|
|
if w.buf.Len() < w.limit {
|
|
remaining := w.limit - w.buf.Len()
|
|
if len(p) > remaining {
|
|
_, _ = w.buf.Write(p[:remaining])
|
|
} else {
|
|
_, _ = w.buf.Write(p)
|
|
}
|
|
}
|
|
|
|
// Pretend we wrote everything to avoid upstream/client code treating it as an error.
|
|
return len(p), nil
|
|
}
|
|
|
|
func (w *limitedResponseWriter) Flush() {}
|
|
|
|
func (w *limitedResponseWriter) bodyBytes() []byte {
|
|
return w.buf.Bytes()
|
|
}
|
|
|
|
func (w *limitedResponseWriter) truncated() bool {
|
|
return w.totalWritten > int64(w.limit)
|
|
}
|
|
|
|
const (
|
|
OpsRetryModeUpstreamEvent = "upstream_event"
|
|
)
|
|
|
|
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
|
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
if s.opsRepo == nil {
|
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
|
}
|
|
|
|
mode = strings.ToLower(strings.TrimSpace(mode))
|
|
switch mode {
|
|
case OpsRetryModeClient, OpsRetryModeUpstream:
|
|
default:
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
|
|
}
|
|
|
|
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if errorLog == nil {
|
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
|
}
|
|
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
|
}
|
|
|
|
var pinned *int64
|
|
if mode == OpsRetryModeUpstream {
|
|
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
|
pinned = pinnedAccountID
|
|
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
|
pinned = errorLog.AccountID
|
|
} else {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
|
}
|
|
}
|
|
|
|
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
|
|
}
|
|
|
|
// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors.
|
|
// idx is 0-based. It always pins the original event account_id.
|
|
func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
|
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
if s.opsRepo == nil {
|
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
|
}
|
|
if idx < 0 {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
|
|
}
|
|
|
|
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if errorLog == nil {
|
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
|
}
|
|
|
|
events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors)
|
|
if err != nil {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
|
|
}
|
|
if idx >= len(events) {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
|
|
}
|
|
ev := events[idx]
|
|
if ev == nil {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
|
|
}
|
|
if ev.AccountID <= 0 {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
|
}
|
|
|
|
upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody)
|
|
if upstreamBody == "" {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
|
|
}
|
|
|
|
override := *errorLog
|
|
override.RequestBody = upstreamBody
|
|
pinned := ev.AccountID
|
|
|
|
// Persist as upstream_event, execute as upstream pinned retry.
|
|
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override)
|
|
}
|
|
|
|
func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
|
|
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
|
|
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
|
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
|
|
}
|
|
if latest != nil {
|
|
if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
|
|
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
|
|
}
|
|
|
|
lastAttemptAt := latest.CreatedAt
|
|
if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
|
|
lastAttemptAt = *latest.FinishedAt
|
|
} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
|
|
lastAttemptAt = *latest.StartedAt
|
|
}
|
|
|
|
if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
|
|
return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
|
|
}
|
|
}
|
|
|
|
if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
|
}
|
|
|
|
var pinned *int64
|
|
if execMode == OpsRetryModeUpstream {
|
|
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
|
pinned = pinnedAccountID
|
|
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
|
pinned = errorLog.AccountID
|
|
} else {
|
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
|
}
|
|
}
|
|
|
|
startedAt := time.Now()
|
|
attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
|
|
RequestedByUserID: requestedByUserID,
|
|
SourceErrorID: errorID,
|
|
Mode: mode,
|
|
PinnedAccountID: pinned,
|
|
Status: opsRetryStatusRunning,
|
|
StartedAt: startedAt,
|
|
})
|
|
if err != nil {
|
|
var pqErr *pq.Error
|
|
if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
|
|
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
|
|
}
|
|
return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
|
|
}
|
|
|
|
result := &OpsRetryResult{
|
|
AttemptID: attemptID,
|
|
Mode: mode,
|
|
Status: opsRetryStatusFailed,
|
|
PinnedAccountID: pinned,
|
|
HTTPStatusCode: 0,
|
|
UpstreamRequestID: "",
|
|
ResponsePreview: "",
|
|
ResponseTruncated: false,
|
|
ErrorMessage: "",
|
|
StartedAt: startedAt,
|
|
}
|
|
|
|
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
|
|
defer cancel()
|
|
|
|
execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)
|
|
|
|
finishedAt := time.Now()
|
|
result.FinishedAt = finishedAt
|
|
result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
|
|
|
|
if execRes != nil {
|
|
result.Status = execRes.status
|
|
result.UsedAccountID = execRes.usedAccountID
|
|
result.HTTPStatusCode = execRes.httpStatusCode
|
|
result.UpstreamRequestID = execRes.upstreamRequestID
|
|
result.ResponsePreview = execRes.responsePreview
|
|
result.ResponseTruncated = execRes.responseTruncated
|
|
result.ErrorMessage = execRes.errorMessage
|
|
}
|
|
|
|
updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
|
|
defer updateCancel()
|
|
|
|
var updateErrMsg *string
|
|
if strings.TrimSpace(result.ErrorMessage) != "" {
|
|
msg := result.ErrorMessage
|
|
updateErrMsg = &msg
|
|
}
|
|
// Keep legacy result_request_id empty; use upstream_request_id instead.
|
|
var resultRequestID *string
|
|
|
|
finalStatus := result.Status
|
|
if strings.TrimSpace(finalStatus) == "" {
|
|
finalStatus = opsRetryStatusFailed
|
|
}
|
|
|
|
success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded)
|
|
httpStatus := result.HTTPStatusCode
|
|
upstreamReqID := result.UpstreamRequestID
|
|
usedAccountID := result.UsedAccountID
|
|
preview := result.ResponsePreview
|
|
truncated := result.ResponseTruncated
|
|
|
|
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
|
|
ID: attemptID,
|
|
Status: finalStatus,
|
|
FinishedAt: finishedAt,
|
|
DurationMs: result.DurationMs,
|
|
Success: &success,
|
|
HTTPStatusCode: &httpStatus,
|
|
UpstreamRequestID: &upstreamReqID,
|
|
UsedAccountID: usedAccountID,
|
|
ResponsePreview: &preview,
|
|
ResponseTruncated: &truncated,
|
|
ResultRequestID: resultRequestID,
|
|
ErrorMessage: updateErrMsg,
|
|
}); err != nil {
|
|
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
|
|
} else if success {
|
|
if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
|
|
log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
|
|
}
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
type opsRetryExecution struct {
|
|
status string
|
|
|
|
usedAccountID *int64
|
|
httpStatusCode int
|
|
upstreamRequestID string
|
|
|
|
responsePreview string
|
|
responseTruncated bool
|
|
|
|
errorMessage string
|
|
}
|
|
|
|
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
|
|
if errorLog == nil {
|
|
return &opsRetryExecution{
|
|
status: opsRetryStatusFailed,
|
|
errorMessage: "missing error log",
|
|
}
|
|
}
|
|
|
|
reqType := detectOpsRetryType(errorLog.RequestPath)
|
|
bodyBytes := []byte(errorLog.RequestBody)
|
|
|
|
switch reqType {
|
|
case opsRetryTypeMessages:
|
|
bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
|
|
case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
|
|
// No-op
|
|
}
|
|
|
|
switch strings.ToLower(strings.TrimSpace(mode)) {
|
|
case OpsRetryModeUpstream:
|
|
if pinnedAccountID == nil || *pinnedAccountID <= 0 {
|
|
return &opsRetryExecution{
|
|
status: opsRetryStatusFailed,
|
|
errorMessage: "pinned_account_id required for upstream retry",
|
|
}
|
|
}
|
|
return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
|
|
case OpsRetryModeClient:
|
|
return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
|
|
default:
|
|
return &opsRetryExecution{
|
|
status: opsRetryStatusFailed,
|
|
errorMessage: "invalid retry mode",
|
|
}
|
|
}
|
|
}
|
|
|
|
func detectOpsRetryType(path string) opsRetryRequestType {
|
|
p := strings.ToLower(strings.TrimSpace(path))
|
|
switch {
|
|
case strings.Contains(p, "/responses"):
|
|
return opsRetryTypeOpenAI
|
|
case strings.Contains(p, "/v1beta/"):
|
|
return opsRetryTypeGeminiV1B
|
|
default:
|
|
return opsRetryTypeMessages
|
|
}
|
|
}
|
|
|
|
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
|
|
if s.accountRepo == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
|
|
}
|
|
|
|
account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
|
|
if err != nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
|
|
}
|
|
if account == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
|
|
}
|
|
if !account.IsSchedulable() {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
|
|
}
|
|
if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
|
|
if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
|
|
}
|
|
}
|
|
|
|
var release func()
|
|
if s.concurrencyService != nil {
|
|
acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
|
|
if err != nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
|
|
}
|
|
if acq == nil || !acq.Acquired {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
|
|
}
|
|
release = acq.ReleaseFunc
|
|
}
|
|
if release != nil {
|
|
defer release()
|
|
}
|
|
|
|
usedID := account.ID
|
|
exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
|
|
exec.usedAccountID = &usedID
|
|
if exec.status == "" {
|
|
exec.status = opsRetryStatusFailed
|
|
}
|
|
return exec
|
|
}
|
|
|
|
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
|
|
groupID := errorLog.GroupID
|
|
if groupID == nil || *groupID <= 0 {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
|
|
}
|
|
|
|
model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
|
|
if parsedErr != nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
|
|
}
|
|
_ = stream
|
|
|
|
excluded := make(map[int64]struct{})
|
|
switches := 0
|
|
|
|
for {
|
|
if switches >= opsRetryMaxAccountSwitches {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
|
|
}
|
|
|
|
selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
|
|
if selErr != nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
|
|
}
|
|
if selection == nil || selection.Account == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
|
|
}
|
|
|
|
account := selection.Account
|
|
if !selection.Acquired || selection.ReleaseFunc == nil {
|
|
excluded[account.ID] = struct{}{}
|
|
switches++
|
|
continue
|
|
}
|
|
|
|
attemptCtx := ctx
|
|
if switches > 0 {
|
|
attemptCtx = context.WithValue(attemptCtx, ctxkey.AccountSwitchCount, switches)
|
|
}
|
|
exec := func() *opsRetryExecution {
|
|
defer selection.ReleaseFunc()
|
|
return s.executeWithAccount(attemptCtx, reqType, errorLog, body, account)
|
|
}()
|
|
|
|
if exec != nil {
|
|
if exec.status == opsRetryStatusSucceeded {
|
|
usedID := account.ID
|
|
exec.usedAccountID = &usedID
|
|
return exec
|
|
}
|
|
// If the gateway services ask for failover, try another account.
|
|
if s.isFailoverError(exec.errorMessage) {
|
|
excluded[account.ID] = struct{}{}
|
|
switches++
|
|
continue
|
|
}
|
|
usedID := account.ID
|
|
exec.usedAccountID = &usedID
|
|
return exec
|
|
}
|
|
|
|
excluded[account.ID] = struct{}{}
|
|
switches++
|
|
}
|
|
}
|
|
|
|
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
|
|
switch reqType {
|
|
case opsRetryTypeOpenAI:
|
|
if s.openAIGatewayService == nil {
|
|
return nil, fmt.Errorf("openai gateway service not available")
|
|
}
|
|
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
|
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
|
|
if s.gatewayService == nil {
|
|
return nil, fmt.Errorf("gateway service not available")
|
|
}
|
|
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs, "") // 重试不使用会话限制
|
|
default:
|
|
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
|
|
}
|
|
}
|
|
|
|
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
|
|
switch reqType {
|
|
case opsRetryTypeMessages:
|
|
parsed, parseErr := ParseGatewayRequest(body)
|
|
if parseErr != nil {
|
|
return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
|
|
}
|
|
return parsed.Model, parsed.Stream, nil
|
|
case opsRetryTypeOpenAI:
|
|
var v struct {
|
|
Model string `json:"model"`
|
|
Stream bool `json:"stream"`
|
|
}
|
|
if err := json.Unmarshal(body, &v); err != nil {
|
|
return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
|
|
}
|
|
return strings.TrimSpace(v.Model), v.Stream, nil
|
|
case opsRetryTypeGeminiV1B:
|
|
if strings.TrimSpace(errorLog.Model) == "" {
|
|
return "", false, fmt.Errorf("missing model for gemini v1beta retry")
|
|
}
|
|
return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
|
|
default:
|
|
return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
|
|
}
|
|
}
|
|
|
|
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
|
|
if account == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
|
|
}
|
|
|
|
c, w := newOpsRetryContext(ctx, errorLog)
|
|
|
|
var err error
|
|
switch reqType {
|
|
case opsRetryTypeOpenAI:
|
|
if s.openAIGatewayService == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
|
|
}
|
|
_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
|
|
case opsRetryTypeGeminiV1B:
|
|
if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
|
|
}
|
|
modelName := strings.TrimSpace(errorLog.Model)
|
|
action := "generateContent"
|
|
if errorLog.Stream {
|
|
action = "streamGenerateContent"
|
|
}
|
|
if account.Platform == PlatformAntigravity {
|
|
_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body, false)
|
|
} else {
|
|
_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
|
|
}
|
|
case opsRetryTypeMessages:
|
|
switch account.Platform {
|
|
case PlatformAntigravity:
|
|
if s.antigravityGatewayService == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
|
|
}
|
|
_, err = s.antigravityGatewayService.Forward(ctx, c, account, body, false)
|
|
case PlatformGemini:
|
|
if s.geminiCompatService == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
|
|
}
|
|
_, err = s.geminiCompatService.Forward(ctx, c, account, body)
|
|
default:
|
|
if s.gatewayService == nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
|
|
}
|
|
parsedReq, parseErr := ParseGatewayRequest(body)
|
|
if parseErr != nil {
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
|
|
}
|
|
_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
|
|
}
|
|
default:
|
|
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
|
|
}
|
|
|
|
statusCode := http.StatusOK
|
|
if c != nil && c.Writer != nil {
|
|
statusCode = c.Writer.Status()
|
|
}
|
|
|
|
upstreamReqID := extractUpstreamRequestID(c)
|
|
preview, truncated := extractResponsePreview(w)
|
|
|
|
exec := &opsRetryExecution{
|
|
status: opsRetryStatusFailed,
|
|
httpStatusCode: statusCode,
|
|
upstreamRequestID: upstreamReqID,
|
|
responsePreview: preview,
|
|
responseTruncated: truncated,
|
|
errorMessage: "",
|
|
}
|
|
|
|
if err == nil && statusCode < 400 {
|
|
exec.status = opsRetryStatusSucceeded
|
|
return exec
|
|
}
|
|
|
|
if err != nil {
|
|
exec.errorMessage = err.Error()
|
|
} else {
|
|
exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
|
|
}
|
|
|
|
return exec
|
|
}
|
|
|
|
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
|
|
w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
|
|
c, _ := gin.CreateTestContext(w)
|
|
|
|
path := "/"
|
|
if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
|
|
path = errorLog.RequestPath
|
|
}
|
|
|
|
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
|
|
req.Header.Set("content-type", "application/json")
|
|
if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
|
|
req.Header.Set("user-agent", errorLog.UserAgent)
|
|
}
|
|
// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
|
|
// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
|
|
if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
|
|
var stored map[string]string
|
|
if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
|
|
for k, v := range stored {
|
|
key := strings.TrimSpace(k)
|
|
if key == "" {
|
|
continue
|
|
}
|
|
if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
|
|
continue
|
|
}
|
|
val := strings.TrimSpace(v)
|
|
if val == "" {
|
|
continue
|
|
}
|
|
req.Header.Set(key, val)
|
|
}
|
|
}
|
|
}
|
|
|
|
c.Request = req
|
|
return c, w
|
|
}
|
|
|
|
func extractUpstreamRequestID(c *gin.Context) string {
|
|
if c == nil || c.Writer == nil {
|
|
return ""
|
|
}
|
|
h := c.Writer.Header()
|
|
if h == nil {
|
|
return ""
|
|
}
|
|
for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
|
|
if v := strings.TrimSpace(h.Get(key)); v != "" {
|
|
return v
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
|
|
if w == nil {
|
|
return "", false
|
|
}
|
|
b := bytes.TrimSpace(w.bodyBytes())
|
|
if len(b) == 0 {
|
|
return "", w.truncated()
|
|
}
|
|
if len(b) > opsRetryResponsePreviewMax {
|
|
return string(b[:opsRetryResponsePreviewMax]), true
|
|
}
|
|
return string(b), w.truncated()
|
|
}
|
|
|
|
func containsInt64(items []int64, needle int64) bool {
|
|
for _, v := range items {
|
|
if v == needle {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (s *OpsService) isFailoverError(message string) bool {
|
|
msg := strings.ToLower(strings.TrimSpace(message))
|
|
if msg == "" {
|
|
return false
|
|
}
|
|
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
|
|
}
|