feat(service): 实现运维监控业务逻辑层
- 新增 ops 主服务(ops_service.go)和端口定义(ops_port.go)
- 实现账号可用性检查服务(ops_account_availability.go)
- 实现数据聚合服务(ops_aggregation_service.go)
- 实现告警评估服务(ops_alert_evaluator_service.go)
- 实现告警管理服务(ops_alerts.go)
- 实现数据清理服务(ops_cleanup_service.go)
- 实现并发控制服务(ops_concurrency.go)
- 实现仪表板服务(ops_dashboard.go)
- 实现错误处理服务(ops_errors.go)
- 实现直方图服务(ops_histograms.go)
- 实现指标采集服务(ops_metrics_collector.go)
- 实现查询模式服务(ops_query_mode.go)
- 实现实时监控服务(ops_realtime.go)
- 实现请求详情服务(ops_request_details.go)
- 实现重试机制服务(ops_retry.go)
- 实现配置管理服务(ops_settings.go)
- 实现趋势分析服务(ops_trends.go)
- 实现窗口统计服务(ops_window_stats.go)
- 添加 ops 相关领域常量
- 注册 service 依赖注入
This commit is contained in:
635
backend/internal/service/ops_retry.go
Normal file
635
backend/internal/service/ops_retry.go
Normal file
@@ -0,0 +1,635 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// Retry modes accepted by OpsService.RetryError.
const (
	// OpsRetryModeClient replays the request through account selection, so a
	// different account than the original one may serve the retry.
	OpsRetryModeClient = "client"
	// OpsRetryModeUpstream replays the request against a single pinned account.
	OpsRetryModeUpstream = "upstream"
)
|
||||
|
||||
// Lifecycle states stored on a retry attempt.
const (
	// opsRetryStatusRunning marks an attempt that has been created but has not
	// finished yet.
	opsRetryStatusRunning = "running"
	// opsRetryStatusSucceeded marks an attempt whose replay completed without
	// an error and with an HTTP status below 400.
	opsRetryStatusSucceeded = "succeeded"
	// opsRetryStatusFailed marks every other finished attempt.
	opsRetryStatusFailed = "failed"
)
|
||||
|
||||
// Limits applied while replaying a failed request.
const (
	// opsRetryTimeout bounds the execution of a single retry.
	opsRetryTimeout = time.Minute
	// opsRetryCaptureBytesLimit is the number of response bytes buffered by the
	// capturing response writer (64 KiB).
	opsRetryCaptureBytesLimit = 64 << 10
	// opsRetryResponsePreviewMax is the maximum size of the response preview
	// returned to the caller (8 KiB).
	opsRetryResponsePreviewMax = 8 << 10
	// opsRetryMinIntervalPerError is the minimum delay between two retries of
	// the same error.
	opsRetryMinIntervalPerError = 10 * time.Second
	// opsRetryMaxAccountSwitches caps how many accounts a client-mode retry
	// goes through before giving up.
	opsRetryMaxAccountSwitches = 3
)
|
||||
|
||||
// opsRetryRequestHeaderAllowlist holds the lower-cased names of the stored
// request headers that may be copied onto a replayed request. Any header not
// listed here is dropped.
var opsRetryRequestHeaderAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}
|
||||
|
||||
// opsRetryRequestType identifies which gateway protocol a stored request
// belongs to; it decides how the request is parsed and forwarded on retry.
type opsRetryRequestType string

const (
	// opsRetryTypeMessages is the fallback type for any path that is neither a
	// "/responses" nor a "/v1beta/" path.
	opsRetryTypeMessages opsRetryRequestType = "messages"
	// opsRetryTypeOpenAI is used for paths containing "/responses".
	opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
	// opsRetryTypeGeminiV1B is used for paths containing "/v1beta/".
	opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
|
||||
|
||||
// limitedResponseWriter is an in-memory response writer that keeps only the
// first `limit` bytes of the body while still counting every byte written.
type limitedResponseWriter struct {
	header      http.Header
	status      int
	wroteHeader bool

	limit        int          // maximum number of body bytes retained in buf
	totalWritten int64        // total number of body bytes passed to Write
	buf          bytes.Buffer // retained prefix of the body
}

// newLimitedResponseWriter returns a writer that retains at most limit body
// bytes. A non-positive limit is raised to 1. The initial status is 200.
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
	if limit < 1 {
		limit = 1
	}
	w := &limitedResponseWriter{}
	w.header = make(http.Header)
	w.status = http.StatusOK
	w.limit = limit
	return w
}

// Header returns the response header map.
func (w *limitedResponseWriter) Header() http.Header {
	return w.header
}

// WriteHeader records statusCode. Only the first call has an effect.
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
	if !w.wroteHeader {
		w.wroteHeader = true
		w.status = statusCode
	}
}

// Write buffers p up to the configured limit and always reports len(p) bytes
// written with a nil error, so callers never see the truncation as a failure.
// An implicit 200 header is recorded if WriteHeader has not been called yet.
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.WriteHeader(http.StatusOK)
	}

	size := len(p)
	w.totalWritten += int64(size)

	if room := w.limit - w.buf.Len(); room > 0 {
		chunk := p
		if size > room {
			chunk = p[:room]
		}
		// The result is ignored on purpose, exactly as in the original code.
		_, _ = w.buf.Write(chunk)
	}

	return size, nil
}

// Flush is a no-op; it exists so the writer can be used where flushing is expected.
func (w *limitedResponseWriter) Flush() {}

// bodyBytes returns the retained prefix of the body.
func (w *limitedResponseWriter) bodyBytes() []byte {
	return w.buf.Bytes()
}

// truncated reports whether more bytes were written than the limit allows.
func (w *limitedResponseWriter) truncated() bool {
	return int64(w.limit) < w.totalWritten
}
|
||||
|
||||
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
|
||||
mode = strings.ToLower(strings.TrimSpace(mode))
|
||||
switch mode {
|
||||
case OpsRetryModeClient, OpsRetryModeUpstream:
|
||||
default:
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
|
||||
}
|
||||
|
||||
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
|
||||
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
|
||||
}
|
||||
if latest != nil {
|
||||
if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
|
||||
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
|
||||
}
|
||||
|
||||
lastAttemptAt := latest.CreatedAt
|
||||
if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
|
||||
lastAttemptAt = *latest.FinishedAt
|
||||
} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
|
||||
lastAttemptAt = *latest.StartedAt
|
||||
}
|
||||
|
||||
if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
|
||||
return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
|
||||
}
|
||||
}
|
||||
|
||||
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
||||
}
|
||||
|
||||
var pinned *int64
|
||||
if mode == OpsRetryModeUpstream {
|
||||
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
||||
pinned = pinnedAccountID
|
||||
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
||||
pinned = errorLog.AccountID
|
||||
} else {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
||||
}
|
||||
}
|
||||
|
||||
startedAt := time.Now()
|
||||
attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
|
||||
RequestedByUserID: requestedByUserID,
|
||||
SourceErrorID: errorID,
|
||||
Mode: mode,
|
||||
PinnedAccountID: pinned,
|
||||
Status: opsRetryStatusRunning,
|
||||
StartedAt: startedAt,
|
||||
})
|
||||
if err != nil {
|
||||
var pqErr *pq.Error
|
||||
if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
|
||||
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
|
||||
}
|
||||
return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
|
||||
}
|
||||
|
||||
result := &OpsRetryResult{
|
||||
AttemptID: attemptID,
|
||||
Mode: mode,
|
||||
Status: opsRetryStatusFailed,
|
||||
PinnedAccountID: pinned,
|
||||
HTTPStatusCode: 0,
|
||||
UpstreamRequestID: "",
|
||||
ResponsePreview: "",
|
||||
ResponseTruncated: false,
|
||||
ErrorMessage: "",
|
||||
StartedAt: startedAt,
|
||||
}
|
||||
|
||||
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
|
||||
defer cancel()
|
||||
|
||||
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
|
||||
|
||||
finishedAt := time.Now()
|
||||
result.FinishedAt = finishedAt
|
||||
result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
|
||||
|
||||
if execRes != nil {
|
||||
result.Status = execRes.status
|
||||
result.UsedAccountID = execRes.usedAccountID
|
||||
result.HTTPStatusCode = execRes.httpStatusCode
|
||||
result.UpstreamRequestID = execRes.upstreamRequestID
|
||||
result.ResponsePreview = execRes.responsePreview
|
||||
result.ResponseTruncated = execRes.responseTruncated
|
||||
result.ErrorMessage = execRes.errorMessage
|
||||
}
|
||||
|
||||
updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer updateCancel()
|
||||
|
||||
var updateErrMsg *string
|
||||
if strings.TrimSpace(result.ErrorMessage) != "" {
|
||||
msg := result.ErrorMessage
|
||||
updateErrMsg = &msg
|
||||
}
|
||||
var resultRequestID *string
|
||||
if strings.TrimSpace(result.UpstreamRequestID) != "" {
|
||||
v := result.UpstreamRequestID
|
||||
resultRequestID = &v
|
||||
}
|
||||
|
||||
finalStatus := result.Status
|
||||
if strings.TrimSpace(finalStatus) == "" {
|
||||
finalStatus = opsRetryStatusFailed
|
||||
}
|
||||
|
||||
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
|
||||
ID: attemptID,
|
||||
Status: finalStatus,
|
||||
FinishedAt: finishedAt,
|
||||
DurationMs: result.DurationMs,
|
||||
ResultRequestID: resultRequestID,
|
||||
ErrorMessage: updateErrMsg,
|
||||
}); err != nil {
|
||||
// Best-effort: retry itself already executed; do not fail the API response.
|
||||
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// opsRetryExecution is the internal outcome of one replayed request, before it
// is copied into the public OpsRetryResult.
type opsRetryExecution struct {
	// status is one of the opsRetryStatus* values.
	status string

	// usedAccountID is the account the replay was executed with, if any.
	usedAccountID *int64
	// httpStatusCode is the status recorded by the capturing response writer.
	httpStatusCode int
	// upstreamRequestID is the request id read from the response headers.
	upstreamRequestID string

	// responsePreview is the size-limited response body.
	responsePreview string
	// responseTruncated reports whether responsePreview is incomplete.
	responseTruncated bool

	// errorMessage describes the failure; empty on success.
	errorMessage string
}
|
||||
|
||||
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
|
||||
if errorLog == nil {
|
||||
return &opsRetryExecution{
|
||||
status: opsRetryStatusFailed,
|
||||
errorMessage: "missing error log",
|
||||
}
|
||||
}
|
||||
|
||||
reqType := detectOpsRetryType(errorLog.RequestPath)
|
||||
bodyBytes := []byte(errorLog.RequestBody)
|
||||
|
||||
switch reqType {
|
||||
case opsRetryTypeMessages:
|
||||
bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
|
||||
case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
|
||||
// No-op
|
||||
}
|
||||
|
||||
switch strings.ToLower(strings.TrimSpace(mode)) {
|
||||
case OpsRetryModeUpstream:
|
||||
if pinnedAccountID == nil || *pinnedAccountID <= 0 {
|
||||
return &opsRetryExecution{
|
||||
status: opsRetryStatusFailed,
|
||||
errorMessage: "pinned_account_id required for upstream retry",
|
||||
}
|
||||
}
|
||||
return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
|
||||
case OpsRetryModeClient:
|
||||
return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
|
||||
default:
|
||||
return &opsRetryExecution{
|
||||
status: opsRetryStatusFailed,
|
||||
errorMessage: "invalid retry mode",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func detectOpsRetryType(path string) opsRetryRequestType {
|
||||
p := strings.ToLower(strings.TrimSpace(path))
|
||||
switch {
|
||||
case strings.Contains(p, "/responses"):
|
||||
return opsRetryTypeOpenAI
|
||||
case strings.Contains(p, "/v1beta/"):
|
||||
return opsRetryTypeGeminiV1B
|
||||
default:
|
||||
return opsRetryTypeMessages
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
|
||||
if s.accountRepo == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
|
||||
}
|
||||
|
||||
account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
|
||||
if err != nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
|
||||
}
|
||||
if account == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
|
||||
}
|
||||
if !account.IsSchedulable() {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
|
||||
}
|
||||
if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
|
||||
if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
|
||||
}
|
||||
}
|
||||
|
||||
var release func()
|
||||
if s.concurrencyService != nil {
|
||||
acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
|
||||
if err != nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
|
||||
}
|
||||
if acq == nil || !acq.Acquired {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
|
||||
}
|
||||
release = acq.ReleaseFunc
|
||||
}
|
||||
if release != nil {
|
||||
defer release()
|
||||
}
|
||||
|
||||
usedID := account.ID
|
||||
exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
|
||||
exec.usedAccountID = &usedID
|
||||
if exec.status == "" {
|
||||
exec.status = opsRetryStatusFailed
|
||||
}
|
||||
return exec
|
||||
}
|
||||
|
||||
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
|
||||
groupID := errorLog.GroupID
|
||||
if groupID == nil || *groupID <= 0 {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
|
||||
}
|
||||
|
||||
model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
|
||||
if parsedErr != nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
|
||||
}
|
||||
_ = stream
|
||||
|
||||
excluded := make(map[int64]struct{})
|
||||
switches := 0
|
||||
|
||||
for {
|
||||
if switches >= opsRetryMaxAccountSwitches {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
|
||||
}
|
||||
|
||||
selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
|
||||
if selErr != nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
|
||||
}
|
||||
if selection == nil || selection.Account == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
|
||||
}
|
||||
|
||||
account := selection.Account
|
||||
if !selection.Acquired || selection.ReleaseFunc == nil {
|
||||
excluded[account.ID] = struct{}{}
|
||||
switches++
|
||||
continue
|
||||
}
|
||||
|
||||
exec := func() *opsRetryExecution {
|
||||
defer selection.ReleaseFunc()
|
||||
return s.executeWithAccount(ctx, reqType, errorLog, body, account)
|
||||
}()
|
||||
|
||||
if exec != nil {
|
||||
if exec.status == opsRetryStatusSucceeded {
|
||||
usedID := account.ID
|
||||
exec.usedAccountID = &usedID
|
||||
return exec
|
||||
}
|
||||
// If the gateway services ask for failover, try another account.
|
||||
if s.isFailoverError(exec.errorMessage) {
|
||||
excluded[account.ID] = struct{}{}
|
||||
switches++
|
||||
continue
|
||||
}
|
||||
usedID := account.ID
|
||||
exec.usedAccountID = &usedID
|
||||
return exec
|
||||
}
|
||||
|
||||
excluded[account.ID] = struct{}{}
|
||||
switches++
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
|
||||
switch reqType {
|
||||
case opsRetryTypeOpenAI:
|
||||
if s.openAIGatewayService == nil {
|
||||
return nil, fmt.Errorf("openai gateway service not available")
|
||||
}
|
||||
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
|
||||
if s.gatewayService == nil {
|
||||
return nil, fmt.Errorf("gateway service not available")
|
||||
}
|
||||
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
|
||||
}
|
||||
}
|
||||
|
||||
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
|
||||
switch reqType {
|
||||
case opsRetryTypeMessages:
|
||||
parsed, parseErr := ParseGatewayRequest(body)
|
||||
if parseErr != nil {
|
||||
return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
|
||||
}
|
||||
return parsed.Model, parsed.Stream, nil
|
||||
case opsRetryTypeOpenAI:
|
||||
var v struct {
|
||||
Model string `json:"model"`
|
||||
Stream bool `json:"stream"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &v); err != nil {
|
||||
return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
|
||||
}
|
||||
return strings.TrimSpace(v.Model), v.Stream, nil
|
||||
case opsRetryTypeGeminiV1B:
|
||||
if strings.TrimSpace(errorLog.Model) == "" {
|
||||
return "", false, fmt.Errorf("missing model for gemini v1beta retry")
|
||||
}
|
||||
return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
|
||||
default:
|
||||
return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
|
||||
if account == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
|
||||
}
|
||||
|
||||
c, w := newOpsRetryContext(ctx, errorLog)
|
||||
|
||||
var err error
|
||||
switch reqType {
|
||||
case opsRetryTypeOpenAI:
|
||||
if s.openAIGatewayService == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
|
||||
}
|
||||
_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
|
||||
case opsRetryTypeGeminiV1B:
|
||||
if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
|
||||
}
|
||||
modelName := strings.TrimSpace(errorLog.Model)
|
||||
action := "generateContent"
|
||||
if errorLog.Stream {
|
||||
action = "streamGenerateContent"
|
||||
}
|
||||
if account.Platform == PlatformAntigravity {
|
||||
_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
|
||||
} else {
|
||||
_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
|
||||
}
|
||||
case opsRetryTypeMessages:
|
||||
switch account.Platform {
|
||||
case PlatformAntigravity:
|
||||
if s.antigravityGatewayService == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
|
||||
}
|
||||
_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
|
||||
case PlatformGemini:
|
||||
if s.geminiCompatService == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
|
||||
}
|
||||
_, err = s.geminiCompatService.Forward(ctx, c, account, body)
|
||||
default:
|
||||
if s.gatewayService == nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
|
||||
}
|
||||
parsedReq, parseErr := ParseGatewayRequest(body)
|
||||
if parseErr != nil {
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
|
||||
}
|
||||
_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
|
||||
}
|
||||
default:
|
||||
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
|
||||
}
|
||||
|
||||
statusCode := http.StatusOK
|
||||
if c != nil && c.Writer != nil {
|
||||
statusCode = c.Writer.Status()
|
||||
}
|
||||
|
||||
upstreamReqID := extractUpstreamRequestID(c)
|
||||
preview, truncated := extractResponsePreview(w)
|
||||
|
||||
exec := &opsRetryExecution{
|
||||
status: opsRetryStatusFailed,
|
||||
httpStatusCode: statusCode,
|
||||
upstreamRequestID: upstreamReqID,
|
||||
responsePreview: preview,
|
||||
responseTruncated: truncated,
|
||||
errorMessage: "",
|
||||
}
|
||||
|
||||
if err == nil && statusCode < 400 {
|
||||
exec.status = opsRetryStatusSucceeded
|
||||
return exec
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
exec.errorMessage = err.Error()
|
||||
} else {
|
||||
exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
|
||||
}
|
||||
|
||||
return exec
|
||||
}
|
||||
|
||||
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
|
||||
w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
path := "/"
|
||||
if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
|
||||
path = errorLog.RequestPath
|
||||
}
|
||||
|
||||
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
|
||||
req.Header.Set("content-type", "application/json")
|
||||
if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
|
||||
req.Header.Set("user-agent", errorLog.UserAgent)
|
||||
}
|
||||
// Restore a minimal, whitelisted subset of request headers to improve retry fidelity
|
||||
// (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
|
||||
if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
|
||||
var stored map[string]string
|
||||
if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
|
||||
for k, v := range stored {
|
||||
key := strings.TrimSpace(k)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
|
||||
continue
|
||||
}
|
||||
val := strings.TrimSpace(v)
|
||||
if val == "" {
|
||||
continue
|
||||
}
|
||||
req.Header.Set(key, val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.Request = req
|
||||
return c, w
|
||||
}
|
||||
|
||||
func extractUpstreamRequestID(c *gin.Context) string {
|
||||
if c == nil || c.Writer == nil {
|
||||
return ""
|
||||
}
|
||||
h := c.Writer.Header()
|
||||
if h == nil {
|
||||
return ""
|
||||
}
|
||||
for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} {
|
||||
if v := strings.TrimSpace(h.Get(key)); v != "" {
|
||||
return v
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
|
||||
if w == nil {
|
||||
return "", false
|
||||
}
|
||||
b := bytes.TrimSpace(w.bodyBytes())
|
||||
if len(b) == 0 {
|
||||
return "", w.truncated()
|
||||
}
|
||||
if len(b) > opsRetryResponsePreviewMax {
|
||||
return string(b[:opsRetryResponsePreviewMax]), true
|
||||
}
|
||||
return string(b), w.truncated()
|
||||
}
|
||||
|
||||
// containsInt64 reports whether needle occurs in items. A nil or empty slice
// never contains anything.
func containsInt64(items []int64, needle int64) bool {
	for i := 0; i < len(items); i++ {
		if items[i] == needle {
			return true
		}
	}
	return false
}
|
||||
|
||||
func (s *OpsService) isFailoverError(message string) bool {
|
||||
msg := strings.ToLower(strings.TrimSpace(message))
|
||||
if msg == "" {
|
||||
return false
|
||||
}
|
||||
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
|
||||
}
|
||||
Reference in New Issue
Block a user