diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index 98be759f..c48c9b56 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -25,12 +25,12 @@ type OpsErrorLog struct { IsRetryable bool `json:"is_retryable"` RetryCount int `json:"retry_count"` - Resolved bool `json:"resolved"` - ResolvedAt *time.Time `json:"resolved_at"` - ResolvedByUserID *int64 `json:"resolved_by_user_id"` + Resolved bool `json:"resolved"` + ResolvedAt *time.Time `json:"resolved_at"` + ResolvedByUserID *int64 `json:"resolved_by_user_id"` ResolvedByUserName string `json:"resolved_by_user_name"` - ResolvedRetryID *int64 `json:"resolved_retry_id"` - ResolvedStatusRaw string `json:"-"` + ResolvedRetryID *int64 `json:"resolved_retry_id"` + ResolvedStatusRaw string `json:"-"` ClientRequestID string `json:"client_request_id"` RequestID string `json:"request_id"` @@ -93,6 +93,12 @@ type OpsErrorLogFilter struct { Resolved *bool Query string + // View controls error categorization for list endpoints. + // - errors: show actionable errors (exclude business-limited / 429 / 529) + // - excluded: only show excluded errors + // - all: show everything + View string + Page int PageSize int } diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go index f52e2b77..25c10af6 100644 --- a/backend/internal/service/ops_retry.go +++ b/backend/internal/service/ops_retry.go @@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool { return w.totalWritten > int64(w.limit) } +const ( + OpsRetryModeUpstreamEvent = "upstream_event" +) + func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") } + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog) +} + +// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors. +// idx is 0-based. It always pins the original event account_id. +func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if idx < 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx") + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + + events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors) + if err != nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors") + } + if idx >= len(events) { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range") + } + ev := events[idx] + if ev == nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing") + } + if ev.AccountID <= 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") + } + + upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody) + if upstreamBody == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry") + } + + override := *errorLog + override.RequestBody = upstreamBody + pinned := ev.AccountID + + // Persist as upstream_event, execute as upstream pinned retry. + return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override) +} + +func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) { latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) if err != nil && !errors.Is(err, sql.ErrNoRows) { return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) @@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er } } - errorLog, err := s.GetErrorLogByID(ctx, errorID) - if err != nil { - return nil, err - } - if strings.TrimSpace(errorLog.RequestBody) == "" { + if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" { return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") } var pinned *int64 - if mode == OpsRetryModeUpstream { + if execMode == OpsRetryModeUpstream { if pinnedAccountID != nil && *pinnedAccountID > 0 { pinned = pinnedAccountID } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { pinned = errorLog.AccountID } else { - return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") } } @@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) defer cancel() - execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + execRes := s.executeRetry(execCtx, errorLog, execMode, pinned) finishedAt := time.Now() result.FinishedAt = finishedAt @@ -249,14 +324,10 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er ResultRequestID: resultRequestID, ErrorMessage: updateErrMsg, }); err != nil { - // Best-effort: retry itself already executed; do not fail the API response. log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) - } else { - // Auto-resolve the source error when a manual retry succeeds. - if success { - if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { - log.Printf("[Ops] UpdateErrorResolution failed: %v", err) - } + } else if success { + if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { + log.Printf("[Ops] UpdateErrorResolution failed: %v", err) } } diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index f1240aaf..d606ba09 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn out.Detail = "" } + out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody) + if out.UpstreamRequestBody != "" { + // Reuse the same sanitization/trimming strategy as request body storage. + // Keep it small so it is safe to persist in ops_error_logs JSON. + sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024) + if sanitized != "" { + out.UpstreamRequestBody = sanitized + if truncated { + out.Kind = strings.TrimSpace(out.Kind) + if out.Kind == "" { + out.Kind = "upstream" + } + out.Kind = out.Kind + ":request_body_truncated" + } + } else { + out.UpstreamRequestBody = "" + } + } + // Drop fully-empty events (can happen if only status code was known). if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { continue diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go index 20c0ea11..20ca6469 100644 --- a/backend/internal/service/ops_upstream_context.go +++ b/backend/internal/service/ops_upstream_context.go @@ -15,6 +15,11 @@ const ( OpsUpstreamErrorMessageKey = "ops_upstream_error_message" OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" OpsUpstreamErrorsKey = "ops_upstream_errors" + + // Best-effort capture of the current upstream request body so ops can + // retry the specific upstream attempt (not just the client request). + // This value is sanitized+trimmed before being persisted. + OpsUpstreamRequestBodyKey = "ops_upstream_request_body" ) func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { @@ -46,6 +51,10 @@ type OpsUpstreamErrorEvent struct { UpstreamStatusCode int `json:"upstream_status_code,omitempty"` UpstreamRequestID string `json:"upstream_request_id,omitempty"` + // Best-effort upstream request capture (sanitized+trimmed). + // Required for retrying a specific upstream attempt. + UpstreamRequestBody string `json:"upstream_request_body,omitempty"` + // Kind: http_error | request_error | retry_exhausted | failover Kind string `json:"kind,omitempty"` @@ -62,6 +71,7 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { } ev.Platform = strings.TrimSpace(ev.Platform) ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody) ev.Kind = strings.TrimSpace(ev.Kind) ev.Message = strings.TrimSpace(ev.Message) ev.Detail = strings.TrimSpace(ev.Detail) @@ -69,6 +79,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { ev.Message = sanitizeUpstreamErrorMessage(ev.Message) } + // If the caller didn't explicitly pass upstream request body but the gateway + // stored it on the context, attach it so ops can retry this specific attempt. + if ev.UpstreamRequestBody == "" { + if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok { + if s, ok := v.(string); ok { + ev.UpstreamRequestBody = strings.TrimSpace(s) + } + } + } + var existing []*OpsUpstreamErrorEvent if v, ok := c.Get(OpsUpstreamErrorsKey); ok { if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { @@ -93,3 +113,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { s := string(raw) return &s } + +func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return []*OpsUpstreamErrorEvent{}, nil + } + var out []*OpsUpstreamErrorEvent + if err := json.Unmarshal([]byte(raw), &out); err != nil { + return nil, err + } + return out, nil +}