From 659df6e220a5499959bbd6980de702534266094c Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:35 +0800 Subject: [PATCH] =?UTF-8?q?feat(handler):=20=E6=96=B0=E5=A2=9Eops=E7=AE=A1?= =?UTF-8?q?=E7=90=86=E6=8E=A5=E5=8F=A3=E5=92=8C=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加告警静默管理接口 - 扩展错误日志查询和操作接口 - 新增重试和解决状态相关端点 - 完善错误日志记录功能 --- .../handler/admin/ops_alerts_handler.go | 168 +++++++++++++++++- backend/internal/handler/admin/ops_handler.go | 105 +++++++++++ backend/internal/handler/ops_error_logger.go | 32 ++-- backend/internal/server/routes/admin.go | 5 + 4 files changed, 296 insertions(+), 14 deletions(-) diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go index 1e33ddd5..e7ad693b 100644 --- a/backend/internal/handler/admin/ops_alerts_handler.go +++ b/backend/internal/handler/admin/ops_alerts_handler.go @@ -7,8 +7,10 @@ import ( "net/http" "strconv" "strings" + "time" "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" "github.com/gin-gonic/gin" "github.com/gin-gonic/gin/binding" @@ -372,8 +374,135 @@ func (h *OpsHandler) DeleteAlertRule(c *gin.Context) { response.Success(c, gin.H{"deleted": true}) } +// GetAlertEvent returns a single ops alert event. +// GET /api/v1/admin/ops/alert-events/:id +func (h *OpsHandler) GetAlertEvent(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid event ID") + return + } + + ev, err := h.opsService.GetAlertEventByID(c.Request.Context(), id) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, ev) +} + +// UpdateAlertEventStatus updates an ops alert event status. +// PUT /api/v1/admin/ops/alert-events/:id/status +func (h *OpsHandler) UpdateAlertEventStatus(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid event ID") + return + } + + var payload struct { + Status string `json:"status"` + } + if err := c.ShouldBindJSON(&payload); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + payload.Status = strings.TrimSpace(payload.Status) + if payload.Status == "" { + response.BadRequest(c, "Invalid status") + return + } + if payload.Status != service.OpsAlertStatusResolved && payload.Status != service.OpsAlertStatusManualResolved { + response.BadRequest(c, "Invalid status") + return + } + + var resolvedAt *time.Time + if payload.Status == service.OpsAlertStatusResolved || payload.Status == service.OpsAlertStatusManualResolved { + now := time.Now().UTC() + resolvedAt = &now + } + if err := h.opsService.UpdateAlertEventStatus(c.Request.Context(), id, payload.Status, resolvedAt); err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"updated": true}) +} + // ListAlertEvents lists recent ops alert events. // GET /api/v1/admin/ops/alert-events +// CreateAlertSilence creates a scoped silence for ops alerts. +// POST /api/v1/admin/ops/alert-silences +func (h *OpsHandler) CreateAlertSilence(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var payload struct { + RuleID int64 `json:"rule_id"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + Region *string `json:"region"` + Until string `json:"until"` + Reason string `json:"reason"` + } + if err := c.ShouldBindJSON(&payload); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + until, err := time.Parse(time.RFC3339, strings.TrimSpace(payload.Until)) + if err != nil { + response.BadRequest(c, "Invalid until") + return + } + + createdBy := (*int64)(nil) + if subject, ok := middleware.GetAuthSubjectFromContext(c); ok { + uid := subject.UserID + createdBy = &uid + } + + silence := &service.OpsAlertSilence{ + RuleID: payload.RuleID, + Platform: strings.TrimSpace(payload.Platform), + GroupID: payload.GroupID, + Region: payload.Region, + Until: until, + Reason: strings.TrimSpace(payload.Reason), + CreatedBy: createdBy, + } + + created, err := h.opsService.CreateAlertSilence(c.Request.Context(), silence) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, created) +} + func (h *OpsHandler) ListAlertEvents(c *gin.Context) { if h.opsService == nil { response.Error(c, http.StatusServiceUnavailable, "Ops service not available") @@ -384,7 +513,7 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) { return } - limit := 100 + limit := 20 if raw := strings.TrimSpace(c.Query("limit")); raw != "" { n, err := strconv.Atoi(raw) if err != nil || n <= 0 { @@ -400,6 +529,43 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) { Severity: strings.TrimSpace(c.Query("severity")), } + if v := strings.TrimSpace(c.Query("email_sent")); v != "" { + vv := strings.ToLower(v) + switch vv { + case "true", "1": + b := true + filter.EmailSent = &b + case "false", "0": + b := false + filter.EmailSent = &b + default: + response.BadRequest(c, "Invalid email_sent") + return + } + } + + // Cursor pagination + if rawTS := strings.TrimSpace(c.Query("before_fired_at")); rawTS != "" { + ts, err := time.Parse(time.RFC3339Nano, rawTS) + if err != nil { + if t2, err2 := time.Parse(time.RFC3339, rawTS); err2 == nil { + ts = t2 + } else { + response.BadRequest(c, "Invalid before_fired_at") + return + } + } + filter.BeforeFiredAt = &ts + } + if rawID := strings.TrimSpace(c.Query("before_id")); rawID != "" { + id, err := strconv.ParseInt(rawID, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid before_id") + return + } + filter.BeforeID = &id + } + // Optional global filter support (platform/group/time range). if platform := strings.TrimSpace(c.Query("platform")); platform != "" { filter.Platform = platform diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go index bff7426a..ec7a8b75 100644 --- a/backend/internal/handler/admin/ops_handler.go +++ b/backend/internal/handler/admin/ops_handler.go @@ -80,6 +80,25 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) { if phase := strings.TrimSpace(c.Query("phase")); phase != "" { filter.Phase = phase } + if owner := strings.TrimSpace(c.Query("error_owner")); owner != "" { + filter.Owner = owner + } + if source := strings.TrimSpace(c.Query("error_source")); source != "" { + filter.Source = source + } + if v := strings.TrimSpace(c.Query("resolved")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "yes": + b := true + filter.Resolved = &b + case "0", "false", "no": + b := false + filter.Resolved = &b + default: + response.BadRequest(c, "Invalid resolved") + return + } + } if q := strings.TrimSpace(c.Query("q")); q != "" { filter.Query = q } @@ -242,6 +261,11 @@ func (h *OpsHandler) ListRequestDetails(c *gin.Context) { type opsRetryRequest struct { Mode string `json:"mode"` PinnedAccountID *int64 `json:"pinned_account_id"` + Force bool `json:"force"` +} + +type opsResolveRequest struct { + Resolved bool `json:"resolved"` } // RetryErrorRequest retries a failed request using stored request_body. @@ -278,6 +302,8 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { req.Mode = service.OpsRetryModeClient } + // Force flag is currently a UI-level acknowledgement. Server may still enforce safety constraints. + _ = req.Force result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) if err != nil { response.ErrorFrom(c, err) @@ -287,6 +313,81 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { response.Success(c, result) } +// ListRetryAttempts lists retry attempts for an error log. +// GET /api/v1/admin/ops/errors/:id/retries +func (h *OpsHandler) ListRetryAttempts(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + limit := 50 + if v := strings.TrimSpace(c.Query("limit")); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + items, err := h.opsService.ListRetryAttemptsByErrorID(c.Request.Context(), id, limit) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, items) +} + +// UpdateErrorResolution allows manual resolve/unresolve. +// PUT /api/v1/admin/ops/errors/:id/resolve +func (h *OpsHandler) UpdateErrorResolution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + var req opsResolveRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + uid := subject.UserID + if err := h.opsService.UpdateErrorResolution(c.Request.Context(), id, req.Resolved, &uid, nil); err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"ok": true}) +} + func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { startStr := strings.TrimSpace(c.Query("start_time")) endStr := strings.TrimSpace(c.Query("end_time")) @@ -358,6 +459,10 @@ func parseOpsDuration(v string) (time.Duration, bool) { return 6 * time.Hour, true case "24h": return 24 * time.Hour, true + case "7d": + return 7 * 24 * time.Hour, true + case "30d": + return 30 * 24 * time.Hour, true default: return 0, false } diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 13bd9d94..f101bf92 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -832,28 +832,30 @@ func normalizeOpsErrorType(errType string, code string) string { func classifyOpsPhase(errType, message, code string) string { msg := strings.ToLower(message) + // Standardized phases: request|auth|routing|upstream|network|internal + // Map billing/concurrency/response => request; scheduling => routing. switch strings.TrimSpace(code) { case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": - return "billing" + return "request" } switch errType { case "authentication_error": return "auth" case "billing_error", "subscription_error": - return "billing" + return "request" case "rate_limit_error": if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { - return "concurrency" + return "request" } return "upstream" case "invalid_request_error": - return "response" + return "request" case "upstream_error", "overloaded_error": return "upstream" case "api_error": if strings.Contains(msg, "no available accounts") { - return "scheduling" + return "routing" } return "internal" default: @@ -914,34 +916,38 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa } func classifyOpsErrorOwner(phase string, message string) string { + // Standardized owners: client|provider|platform switch phase { case "upstream", "network": return "provider" - case "billing", "concurrency", "auth", "response": + case "request", "auth": return "client" + case "routing", "internal": + return "platform" default: if strings.Contains(strings.ToLower(message), "upstream") { return "provider" } - return "sub2api" + return "platform" } } func classifyOpsErrorSource(phase string, message string) string { + // Standardized sources: client_request|upstream_http|gateway switch phase { case "upstream": return "upstream_http" case "network": - return "upstream_network" - case "billing": - return "billing" - case "concurrency": - return "concurrency" + return "gateway" + case "request", "auth": + return "client_request" + case "routing", "internal": + return "gateway" default: if strings.Contains(strings.ToLower(message), "upstream") { return "upstream_http" } - return "internal" + return "gateway" } } diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index 9bb019bb..adae7cdd 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -81,6 +81,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + ops.GET("/alert-events/:id", h.Admin.Ops.GetAlertEvent) + ops.PUT("/alert-events/:id/status", h.Admin.Ops.UpdateAlertEventStatus) + ops.POST("/alert-silences", h.Admin.Ops.CreateAlertSilence) // Email notification config (DB-backed) ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) @@ -113,7 +116,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { // Error logs (MVP-1) ops.GET("/errors", h.Admin.Ops.GetErrorLogs) ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.GET("/errors/:id/retries", h.Admin.Ops.ListRetryAttempts) ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + ops.PUT("/errors/:id/resolve", h.Admin.Ops.UpdateErrorResolution) // Request drilldown (success + error) ops.GET("/requests", h.Admin.Ops.ListRequestDetails)