feat(ops): 增强上游错误追踪和新增定时报告服务
- 优化错误日志中间件，即使请求成功也记录上游重试/故障转移事件
- 新增 OpsScheduledReportService 支持定时报告功能
- 使用 Redis 分布式锁确保定时任务单实例执行
- 完善依赖注入配置
- 优化前端错误趋势图表展示
This commit is contained in:
@@ -309,10 +309,6 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
||||
c.Writer = w
|
||||
c.Next()
|
||||
|
||||
status := c.Writer.Status()
|
||||
if status < 400 {
|
||||
return
|
||||
}
|
||||
if ops == nil {
|
||||
return
|
||||
}
|
||||
@@ -320,6 +316,229 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
status := c.Writer.Status()
|
||||
if status < 400 {
|
||||
// Even when the client request succeeds, we still want to persist upstream error attempts
|
||||
// (retries/failover) so ops can observe upstream instability that gets "covered" by retries.
|
||||
var events []*service.OpsUpstreamErrorEvent
|
||||
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||
if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 {
|
||||
events = arr
|
||||
}
|
||||
}
|
||||
// Also accept single upstream fields set by gateway services (rare for successful requests).
|
||||
hasUpstreamContext := len(events) > 0
|
||||
if !hasUpstreamContext {
|
||||
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||
switch t := v.(type) {
|
||||
case int:
|
||||
hasUpstreamContext = t > 0
|
||||
case int64:
|
||||
hasUpstreamContext = t > 0
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hasUpstreamContext {
|
||||
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||
hasUpstreamContext = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hasUpstreamContext {
|
||||
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||
hasUpstreamContext = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hasUpstreamContext {
|
||||
return
|
||||
}
|
||||
|
||||
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||
|
||||
model, _ := c.Get(opsModelKey)
|
||||
streamV, _ := c.Get(opsStreamKey)
|
||||
accountIDV, _ := c.Get(opsAccountIDKey)
|
||||
|
||||
var modelName string
|
||||
if s, ok := model.(string); ok {
|
||||
modelName = s
|
||||
}
|
||||
stream := false
|
||||
if b, ok := streamV.(bool); ok {
|
||||
stream = b
|
||||
}
|
||||
|
||||
// Prefer showing the account that experienced the upstream error (if we have events),
|
||||
// otherwise fall back to the final selected account (best-effort).
|
||||
var accountID *int64
|
||||
if len(events) > 0 {
|
||||
if last := events[len(events)-1]; last != nil && last.AccountID > 0 {
|
||||
v := last.AccountID
|
||||
accountID = &v
|
||||
}
|
||||
}
|
||||
if accountID == nil {
|
||||
if v, ok := accountIDV.(int64); ok && v > 0 {
|
||||
accountID = &v
|
||||
}
|
||||
}
|
||||
|
||||
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
|
||||
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
|
||||
|
||||
requestID := c.Writer.Header().Get("X-Request-Id")
|
||||
if requestID == "" {
|
||||
requestID = c.Writer.Header().Get("x-request-id")
|
||||
}
|
||||
|
||||
// Best-effort backfill single upstream fields from the last event (if present).
|
||||
var upstreamStatusCode *int
|
||||
var upstreamErrorMessage *string
|
||||
var upstreamErrorDetail *string
|
||||
if len(events) > 0 {
|
||||
last := events[len(events)-1]
|
||||
if last != nil {
|
||||
if last.UpstreamStatusCode > 0 {
|
||||
code := last.UpstreamStatusCode
|
||||
upstreamStatusCode = &code
|
||||
}
|
||||
if msg := strings.TrimSpace(last.Message); msg != "" {
|
||||
upstreamErrorMessage = &msg
|
||||
}
|
||||
if detail := strings.TrimSpace(last.Detail); detail != "" {
|
||||
upstreamErrorDetail = &detail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upstreamStatusCode == nil {
|
||||
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||
switch t := v.(type) {
|
||||
case int:
|
||||
if t > 0 {
|
||||
code := t
|
||||
upstreamStatusCode = &code
|
||||
}
|
||||
case int64:
|
||||
if t > 0 {
|
||||
code := int(t)
|
||||
upstreamStatusCode = &code
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if upstreamErrorMessage == nil {
|
||||
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
|
||||
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||
msg := strings.TrimSpace(s)
|
||||
upstreamErrorMessage = &msg
|
||||
}
|
||||
}
|
||||
}
|
||||
if upstreamErrorDetail == nil {
|
||||
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
|
||||
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
|
||||
detail := strings.TrimSpace(s)
|
||||
upstreamErrorDetail = &detail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we still have nothing meaningful, skip.
|
||||
if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
effectiveUpstreamStatus := 0
|
||||
if upstreamStatusCode != nil {
|
||||
effectiveUpstreamStatus = *upstreamStatusCode
|
||||
}
|
||||
|
||||
recoveredMsg := "Recovered upstream error"
|
||||
if effectiveUpstreamStatus > 0 {
|
||||
recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus)
|
||||
}
|
||||
if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" {
|
||||
recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage)
|
||||
}
|
||||
recoveredMsg = truncateString(recoveredMsg, 2048)
|
||||
|
||||
entry := &service.OpsInsertErrorLogInput{
|
||||
RequestID: requestID,
|
||||
ClientRequestID: clientRequestID,
|
||||
|
||||
AccountID: accountID,
|
||||
Platform: platform,
|
||||
Model: modelName,
|
||||
RequestPath: func() string {
|
||||
if c.Request != nil && c.Request.URL != nil {
|
||||
return c.Request.URL.Path
|
||||
}
|
||||
return ""
|
||||
}(),
|
||||
Stream: stream,
|
||||
UserAgent: c.GetHeader("User-Agent"),
|
||||
|
||||
ErrorPhase: "upstream",
|
||||
ErrorType: "upstream_error",
|
||||
// Severity/retryability should reflect the upstream failure, not the final client status (200).
|
||||
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
|
||||
StatusCode: status,
|
||||
IsBusinessLimited: false,
|
||||
|
||||
ErrorMessage: recoveredMsg,
|
||||
ErrorBody: "",
|
||||
|
||||
ErrorSource: "upstream_http",
|
||||
ErrorOwner: "provider",
|
||||
|
||||
UpstreamStatusCode: upstreamStatusCode,
|
||||
UpstreamErrorMessage: upstreamErrorMessage,
|
||||
UpstreamErrorDetail: upstreamErrorDetail,
|
||||
UpstreamErrors: events,
|
||||
|
||||
IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus),
|
||||
RetryCount: 0,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
|
||||
if apiKey != nil {
|
||||
entry.APIKeyID = &apiKey.ID
|
||||
if apiKey.User != nil {
|
||||
entry.UserID = &apiKey.User.ID
|
||||
}
|
||||
if apiKey.GroupID != nil {
|
||||
entry.GroupID = apiKey.GroupID
|
||||
}
|
||||
// Prefer group platform if present (more stable than inferring from path).
|
||||
if apiKey.Group != nil && apiKey.Group.Platform != "" {
|
||||
entry.Platform = apiKey.Group.Platform
|
||||
}
|
||||
}
|
||||
|
||||
var clientIP string
|
||||
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
|
||||
clientIP = ip
|
||||
entry.ClientIP = &clientIP
|
||||
}
|
||||
|
||||
var requestBody []byte
|
||||
if v, ok := c.Get(opsRequestBodyKey); ok {
|
||||
if b, ok := v.([]byte); ok && len(b) > 0 {
|
||||
requestBody = b
|
||||
}
|
||||
}
|
||||
// Store request headers/body only when an upstream error occurred to keep overhead minimal.
|
||||
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
|
||||
|
||||
enqueueOpsErrorLog(ops, entry, requestBody)
|
||||
return
|
||||
}
|
||||
|
||||
body := w.buf.Bytes()
|
||||
parsed := parseOpsErrorResponse(body)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user