feat(ops): 增强ops核心服务功能和重试机制
This commit is contained in:
@@ -25,12 +25,12 @@ type OpsErrorLog struct {
|
|||||||
IsRetryable bool `json:"is_retryable"`
|
IsRetryable bool `json:"is_retryable"`
|
||||||
RetryCount int `json:"retry_count"`
|
RetryCount int `json:"retry_count"`
|
||||||
|
|
||||||
Resolved bool `json:"resolved"`
|
Resolved bool `json:"resolved"`
|
||||||
ResolvedAt *time.Time `json:"resolved_at"`
|
ResolvedAt *time.Time `json:"resolved_at"`
|
||||||
ResolvedByUserID *int64 `json:"resolved_by_user_id"`
|
ResolvedByUserID *int64 `json:"resolved_by_user_id"`
|
||||||
ResolvedByUserName string `json:"resolved_by_user_name"`
|
ResolvedByUserName string `json:"resolved_by_user_name"`
|
||||||
ResolvedRetryID *int64 `json:"resolved_retry_id"`
|
ResolvedRetryID *int64 `json:"resolved_retry_id"`
|
||||||
ResolvedStatusRaw string `json:"-"`
|
ResolvedStatusRaw string `json:"-"`
|
||||||
|
|
||||||
ClientRequestID string `json:"client_request_id"`
|
ClientRequestID string `json:"client_request_id"`
|
||||||
RequestID string `json:"request_id"`
|
RequestID string `json:"request_id"`
|
||||||
@@ -93,6 +93,12 @@ type OpsErrorLogFilter struct {
|
|||||||
Resolved *bool
|
Resolved *bool
|
||||||
Query string
|
Query string
|
||||||
|
|
||||||
|
// View controls error categorization for list endpoints.
|
||||||
|
// - errors: show actionable errors (exclude business-limited / 429 / 529)
|
||||||
|
// - excluded: only show excluded errors
|
||||||
|
// - all: show everything
|
||||||
|
View string
|
||||||
|
|
||||||
Page int
|
Page int
|
||||||
PageSize int
|
PageSize int
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool {
|
|||||||
return w.totalWritten > int64(w.limit)
|
return w.totalWritten > int64(w.limit)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
OpsRetryModeUpstreamEvent = "upstream_event"
|
||||||
|
)
|
||||||
|
|
||||||
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
|
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
|
||||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
|||||||
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
|
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if errorLog == nil {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
||||||
|
}
|
||||||
|
|
||||||
|
var pinned *int64
|
||||||
|
if mode == OpsRetryModeUpstream {
|
||||||
|
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
||||||
|
pinned = pinnedAccountID
|
||||||
|
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
||||||
|
pinned = errorLog.AccountID
|
||||||
|
} else {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors.
|
||||||
|
// idx is 0-based. It always pins the original event account_id.
|
||||||
|
func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
|
||||||
|
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if s.opsRepo == nil {
|
||||||
|
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||||
|
}
|
||||||
|
if idx < 0 {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
|
||||||
|
}
|
||||||
|
|
||||||
|
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if errorLog == nil {
|
||||||
|
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors)
|
||||||
|
if err != nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
|
||||||
|
}
|
||||||
|
if idx >= len(events) {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
|
||||||
|
}
|
||||||
|
ev := events[idx]
|
||||||
|
if ev == nil {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
|
||||||
|
}
|
||||||
|
if ev.AccountID <= 0 {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
||||||
|
}
|
||||||
|
|
||||||
|
upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody)
|
||||||
|
if upstreamBody == "" {
|
||||||
|
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
|
||||||
|
}
|
||||||
|
|
||||||
|
override := *errorLog
|
||||||
|
override.RequestBody = upstreamBody
|
||||||
|
pinned := ev.AccountID
|
||||||
|
|
||||||
|
// Persist as upstream_event, execute as upstream pinned retry.
|
||||||
|
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
|
||||||
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
|
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
|
||||||
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||||
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
|
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
|
||||||
@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
|
||||||
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
||||||
}
|
}
|
||||||
|
|
||||||
var pinned *int64
|
var pinned *int64
|
||||||
if mode == OpsRetryModeUpstream {
|
if execMode == OpsRetryModeUpstream {
|
||||||
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
||||||
pinned = pinnedAccountID
|
pinned = pinnedAccountID
|
||||||
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
||||||
pinned = errorLog.AccountID
|
pinned = errorLog.AccountID
|
||||||
} else {
|
} else {
|
||||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
|||||||
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
|
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
|
execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)
|
||||||
|
|
||||||
finishedAt := time.Now()
|
finishedAt := time.Now()
|
||||||
result.FinishedAt = finishedAt
|
result.FinishedAt = finishedAt
|
||||||
@@ -249,14 +324,10 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
|||||||
ResultRequestID: resultRequestID,
|
ResultRequestID: resultRequestID,
|
||||||
ErrorMessage: updateErrMsg,
|
ErrorMessage: updateErrMsg,
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
// Best-effort: retry itself already executed; do not fail the API response.
|
|
||||||
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
|
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
|
||||||
} else {
|
} else if success {
|
||||||
// Auto-resolve the source error when a manual retry succeeds.
|
if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
|
||||||
if success {
|
log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
|
||||||
if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
|
|
||||||
log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn
|
|||||||
out.Detail = ""
|
out.Detail = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody)
|
||||||
|
if out.UpstreamRequestBody != "" {
|
||||||
|
// Reuse the same sanitization/trimming strategy as request body storage.
|
||||||
|
// Keep it small so it is safe to persist in ops_error_logs JSON.
|
||||||
|
sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024)
|
||||||
|
if sanitized != "" {
|
||||||
|
out.UpstreamRequestBody = sanitized
|
||||||
|
if truncated {
|
||||||
|
out.Kind = strings.TrimSpace(out.Kind)
|
||||||
|
if out.Kind == "" {
|
||||||
|
out.Kind = "upstream"
|
||||||
|
}
|
||||||
|
out.Kind = out.Kind + ":request_body_truncated"
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
out.UpstreamRequestBody = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Drop fully-empty events (can happen if only status code was known).
|
// Drop fully-empty events (can happen if only status code was known).
|
||||||
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
|
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -15,6 +15,11 @@ const (
|
|||||||
OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
|
OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
|
||||||
OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
|
OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
|
||||||
OpsUpstreamErrorsKey = "ops_upstream_errors"
|
OpsUpstreamErrorsKey = "ops_upstream_errors"
|
||||||
|
|
||||||
|
// Best-effort capture of the current upstream request body so ops can
|
||||||
|
// retry the specific upstream attempt (not just the client request).
|
||||||
|
// This value is sanitized+trimmed before being persisted.
|
||||||
|
OpsUpstreamRequestBodyKey = "ops_upstream_request_body"
|
||||||
)
|
)
|
||||||
|
|
||||||
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
|
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
|
||||||
@@ -46,6 +51,10 @@ type OpsUpstreamErrorEvent struct {
|
|||||||
UpstreamStatusCode int `json:"upstream_status_code,omitempty"`
|
UpstreamStatusCode int `json:"upstream_status_code,omitempty"`
|
||||||
UpstreamRequestID string `json:"upstream_request_id,omitempty"`
|
UpstreamRequestID string `json:"upstream_request_id,omitempty"`
|
||||||
|
|
||||||
|
// Best-effort upstream request capture (sanitized+trimmed).
|
||||||
|
// Required for retrying a specific upstream attempt.
|
||||||
|
UpstreamRequestBody string `json:"upstream_request_body,omitempty"`
|
||||||
|
|
||||||
// Kind: http_error | request_error | retry_exhausted | failover
|
// Kind: http_error | request_error | retry_exhausted | failover
|
||||||
Kind string `json:"kind,omitempty"`
|
Kind string `json:"kind,omitempty"`
|
||||||
|
|
||||||
@@ -62,6 +71,7 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
|||||||
}
|
}
|
||||||
ev.Platform = strings.TrimSpace(ev.Platform)
|
ev.Platform = strings.TrimSpace(ev.Platform)
|
||||||
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
|
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
|
||||||
|
ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody)
|
||||||
ev.Kind = strings.TrimSpace(ev.Kind)
|
ev.Kind = strings.TrimSpace(ev.Kind)
|
||||||
ev.Message = strings.TrimSpace(ev.Message)
|
ev.Message = strings.TrimSpace(ev.Message)
|
||||||
ev.Detail = strings.TrimSpace(ev.Detail)
|
ev.Detail = strings.TrimSpace(ev.Detail)
|
||||||
@@ -69,6 +79,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
|||||||
ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
|
ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the caller didn't explicitly pass upstream request body but the gateway
|
||||||
|
// stored it on the context, attach it so ops can retry this specific attempt.
|
||||||
|
if ev.UpstreamRequestBody == "" {
|
||||||
|
if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok {
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
ev.UpstreamRequestBody = strings.TrimSpace(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var existing []*OpsUpstreamErrorEvent
|
var existing []*OpsUpstreamErrorEvent
|
||||||
if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
|
if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
|
||||||
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
|
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
|
||||||
@@ -93,3 +113,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
|
|||||||
s := string(raw)
|
s := string(raw)
|
||||||
return &s
|
return &s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return []*OpsUpstreamErrorEvent{}, nil
|
||||||
|
}
|
||||||
|
var out []*OpsUpstreamErrorEvent
|
||||||
|
if err := json.Unmarshal([]byte(raw), &out); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user