feat(ops): 增强上游错误追踪和新增定时报告服务

- 优化错误日志中间件，即使请求成功也记录上游重试/故障转移事件
- 新增OpsScheduledReportService支持定时报告功能
- 使用Redis分布式锁确保定时任务单实例执行
- 完善依赖注入配置
- 优化前端错误趋势图表展示
This commit is contained in:
IanShaw027
2026-01-11 23:00:31 +08:00
parent 8fffcd8091
commit 73b62bb15c
13 changed files with 1021 additions and 30 deletions

View File

@@ -309,10 +309,6 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
c.Writer = w
c.Next()
status := c.Writer.Status()
if status < 400 {
return
}
if ops == nil {
return
}
@@ -320,6 +316,229 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
return
}
status := c.Writer.Status()
if status < 400 {
// Even when the client request succeeds, we still want to persist upstream error attempts
// (retries/failover) so ops can observe upstream instability that gets "covered" by retries.
var events []*service.OpsUpstreamErrorEvent
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
if arr, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(arr) > 0 {
events = arr
}
}
// Also accept single upstream fields set by gateway services (rare for successful requests).
hasUpstreamContext := len(events) > 0
if !hasUpstreamContext {
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
switch t := v.(type) {
case int:
hasUpstreamContext = t > 0
case int64:
hasUpstreamContext = t > 0
}
}
}
if !hasUpstreamContext {
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
hasUpstreamContext = true
}
}
}
if !hasUpstreamContext {
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
hasUpstreamContext = true
}
}
}
if !hasUpstreamContext {
return
}
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
model, _ := c.Get(opsModelKey)
streamV, _ := c.Get(opsStreamKey)
accountIDV, _ := c.Get(opsAccountIDKey)
var modelName string
if s, ok := model.(string); ok {
modelName = s
}
stream := false
if b, ok := streamV.(bool); ok {
stream = b
}
// Prefer showing the account that experienced the upstream error (if we have events),
// otherwise fall back to the final selected account (best-effort).
var accountID *int64
if len(events) > 0 {
if last := events[len(events)-1]; last != nil && last.AccountID > 0 {
v := last.AccountID
accountID = &v
}
}
if accountID == nil {
if v, ok := accountIDV.(int64); ok && v > 0 {
accountID = &v
}
}
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
requestID := c.Writer.Header().Get("X-Request-Id")
if requestID == "" {
requestID = c.Writer.Header().Get("x-request-id")
}
// Best-effort backfill single upstream fields from the last event (if present).
var upstreamStatusCode *int
var upstreamErrorMessage *string
var upstreamErrorDetail *string
if len(events) > 0 {
last := events[len(events)-1]
if last != nil {
if last.UpstreamStatusCode > 0 {
code := last.UpstreamStatusCode
upstreamStatusCode = &code
}
if msg := strings.TrimSpace(last.Message); msg != "" {
upstreamErrorMessage = &msg
}
if detail := strings.TrimSpace(last.Detail); detail != "" {
upstreamErrorDetail = &detail
}
}
}
if upstreamStatusCode == nil {
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
switch t := v.(type) {
case int:
if t > 0 {
code := t
upstreamStatusCode = &code
}
case int64:
if t > 0 {
code := int(t)
upstreamStatusCode = &code
}
}
}
}
if upstreamErrorMessage == nil {
if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok {
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
msg := strings.TrimSpace(s)
upstreamErrorMessage = &msg
}
}
}
if upstreamErrorDetail == nil {
if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok {
if s, ok := v.(string); ok && strings.TrimSpace(s) != "" {
detail := strings.TrimSpace(s)
upstreamErrorDetail = &detail
}
}
}
// If we still have nothing meaningful, skip.
if upstreamStatusCode == nil && upstreamErrorMessage == nil && upstreamErrorDetail == nil && len(events) == 0 {
return
}
effectiveUpstreamStatus := 0
if upstreamStatusCode != nil {
effectiveUpstreamStatus = *upstreamStatusCode
}
recoveredMsg := "Recovered upstream error"
if effectiveUpstreamStatus > 0 {
recoveredMsg += " " + strconvItoa(effectiveUpstreamStatus)
}
if upstreamErrorMessage != nil && strings.TrimSpace(*upstreamErrorMessage) != "" {
recoveredMsg += ": " + strings.TrimSpace(*upstreamErrorMessage)
}
recoveredMsg = truncateString(recoveredMsg, 2048)
entry := &service.OpsInsertErrorLogInput{
RequestID: requestID,
ClientRequestID: clientRequestID,
AccountID: accountID,
Platform: platform,
Model: modelName,
RequestPath: func() string {
if c.Request != nil && c.Request.URL != nil {
return c.Request.URL.Path
}
return ""
}(),
Stream: stream,
UserAgent: c.GetHeader("User-Agent"),
ErrorPhase: "upstream",
ErrorType: "upstream_error",
// Severity/retryability should reflect the upstream failure, not the final client status (any status < 400, e.g. 200).
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
StatusCode: status,
IsBusinessLimited: false,
ErrorMessage: recoveredMsg,
ErrorBody: "",
ErrorSource: "upstream_http",
ErrorOwner: "provider",
UpstreamStatusCode: upstreamStatusCode,
UpstreamErrorMessage: upstreamErrorMessage,
UpstreamErrorDetail: upstreamErrorDetail,
UpstreamErrors: events,
IsRetryable: classifyOpsIsRetryable("upstream_error", effectiveUpstreamStatus),
RetryCount: 0,
CreatedAt: time.Now(),
}
if apiKey != nil {
entry.APIKeyID = &apiKey.ID
if apiKey.User != nil {
entry.UserID = &apiKey.User.ID
}
if apiKey.GroupID != nil {
entry.GroupID = apiKey.GroupID
}
// Prefer group platform if present (more stable than inferring from path).
if apiKey.Group != nil && apiKey.Group.Platform != "" {
entry.Platform = apiKey.Group.Platform
}
}
var clientIP string
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
clientIP = ip
entry.ClientIP = &clientIP
}
var requestBody []byte
if v, ok := c.Get(opsRequestBodyKey); ok {
if b, ok := v.([]byte); ok && len(b) > 0 {
requestBody = b
}
}
// Store request headers/body only when an upstream error occurred to keep overhead minimal.
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
enqueueOpsErrorLog(ops, entry, requestBody)
return
}
body := w.buf.Bytes()
parsed := parseOpsErrorResponse(body)