diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md new file mode 100644 index 00000000..b240f45c --- /dev/null +++ b/PR_DESCRIPTION.md @@ -0,0 +1,164 @@ +## 概述 + +全面增强运维监控系统(Ops)的错误日志管理和告警静默功能,优化前端 UI 组件代码质量和用户体验。本次更新重构了核心服务层和数据访问层,提升系统可维护性和运维效率。 + +## 主要改动 + +### 1. 错误日志查询优化 + +**功能特性:** +- 新增 GetErrorLogByID 接口,支持按 ID 精确查询错误详情 +- 优化错误日志过滤逻辑,支持多维度筛选(平台、阶段、来源、所有者等) +- 改进查询参数处理,简化代码结构 +- 增强错误分类和标准化处理 +- 支持错误解决状态追踪(resolved 字段) + +**技术实现:** +- `ops_handler.go` - 新增单条错误日志查询接口 +- `ops_repo.go` - 优化数据查询和过滤条件构建 +- `ops_models.go` - 扩展错误日志数据模型 +- 前端 API 接口同步更新 + +### 2. 告警静默功能 + +**功能特性:** +- 支持按规则、平台、分组、区域等维度静默告警 +- 可设置静默时长和原因说明 +- 静默记录可追溯,记录创建人和创建时间 +- 自动过期机制,避免永久静默 + +**技术实现:** +- `037_ops_alert_silences.sql` - 新增告警静默表 +- `ops_alerts.go` - 告警静默逻辑实现 +- `ops_alerts_handler.go` - 告警静默 API 接口 +- `OpsAlertEventsCard.vue` - 前端告警静默操作界面 + +**数据库结构:** + +| 字段 | 类型 | 说明 | +|------|------|------| +| rule_id | BIGINT | 告警规则 ID | +| platform | VARCHAR(64) | 平台标识 | +| group_id | BIGINT | 分组 ID(可选) | +| region | VARCHAR(64) | 区域(可选) | +| until | TIMESTAMPTZ | 静默截止时间 | +| reason | TEXT | 静默原因 | +| created_by | BIGINT | 创建人 ID | + +### 3. 错误分类标准化 + +**功能特性:** +- 统一错误阶段分类(request|auth|routing|upstream|network|internal) +- 规范错误归属分类(client|provider|platform) +- 标准化错误来源分类(client_request|upstream_http|gateway) +- 自动迁移历史数据到新分类体系 + +**技术实现:** +- `038_ops_errors_resolution_retry_results_and_standardize_classification.sql` - 分类标准化迁移 +- 自动映射历史遗留分类到新标准 +- 自动解决已恢复的上游错误(客户端状态码 < 400) + +### 4. Gateway 服务集成 + +**功能特性:** +- 完善各 Gateway 服务的 Ops 集成 +- 统一错误日志记录接口 +- 增强上游错误追踪能力 + +**涉及服务:** +- `antigravity_gateway_service.go` - Antigravity 网关集成 +- `gateway_service.go` - 通用网关集成 +- `gemini_messages_compat_service.go` - Gemini 兼容层集成 +- `openai_gateway_service.go` - OpenAI 网关集成 + +### 5. 
前端 UI 优化 + +**代码重构:** +- 大幅简化错误详情模态框代码(从 828 行优化到 450 行) +- 优化错误日志表格组件,提升可读性 +- 清理未使用的 i18n 翻译,减少冗余 +- 统一组件代码风格和格式 +- 优化骨架屏组件,更好匹配实际看板布局 + +**布局改进:** +- 修复模态框内容溢出和滚动问题 +- 优化表格布局,使用 flex 布局确保正确显示 +- 改进看板头部布局和交互 +- 提升响应式体验 +- 骨架屏支持全屏模式适配 + +**交互优化:** +- 优化告警事件卡片功能和展示 +- 改进错误详情展示逻辑 +- 增强请求详情模态框 +- 完善运行时设置卡片 +- 改进加载动画效果 + +### 6. 国际化完善 + +**文案补充:** +- 补充错误日志相关的英文翻译 +- 添加告警静默功能的中英文文案 +- 完善提示文本和错误信息 +- 统一术语翻译标准 + +## 文件变更 + +**后端(26 个文件):** +- `backend/internal/handler/admin/ops_alerts_handler.go` - 告警接口增强 +- `backend/internal/handler/admin/ops_handler.go` - 错误日志接口优化 +- `backend/internal/handler/ops_error_logger.go` - 错误记录器增强 +- `backend/internal/repository/ops_repo.go` - 数据访问层重构 +- `backend/internal/repository/ops_repo_alerts.go` - 告警数据访问增强 +- `backend/internal/service/ops_*.go` - 核心服务层重构(10 个文件) +- `backend/internal/service/*_gateway_service.go` - Gateway 集成(4 个文件) +- `backend/internal/server/routes/admin.go` - 路由配置更新 +- `backend/migrations/*.sql` - 数据库迁移(2 个文件) +- 测试文件更新(5 个文件) + +**前端(13 个文件):** +- `frontend/src/views/admin/ops/OpsDashboard.vue` - 看板主页优化 +- `frontend/src/views/admin/ops/components/*.vue` - 组件重构(10 个文件) +- `frontend/src/api/admin/ops.ts` - API 接口扩展 +- `frontend/src/i18n/locales/*.ts` - 国际化文本(2 个文件) + +## 代码统计 + +- 44 个文件修改 +- 3733 行新增 +- 995 行删除 +- 净增加 2738 行 + +## 核心改进 + +**可维护性提升:** +- 重构核心服务层,职责更清晰 +- 简化前端组件代码,降低复杂度 +- 统一代码风格和命名规范 +- 清理冗余代码和未使用的翻译 +- 标准化错误分类体系 + +**功能完善:** +- 告警静默功能,减少告警噪音 +- 错误日志查询优化,提升运维效率 +- Gateway 服务集成完善,统一监控能力 +- 错误解决状态追踪,便于问题管理 + +**用户体验优化:** +- 修复多个 UI 布局问题 +- 优化交互流程 +- 完善国际化支持 +- 提升响应式体验 +- 改进加载状态展示 + +## 测试验证 + +- ✅ 错误日志查询和过滤功能 +- ✅ 告警静默创建和自动过期 +- ✅ 错误分类标准化迁移 +- ✅ Gateway 服务错误日志记录 +- ✅ 前端组件布局和交互 +- ✅ 骨架屏全屏模式适配 +- ✅ 国际化文本完整性 +- ✅ API 接口功能正确性 +- ✅ 数据库迁移执行成功 diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go index 1e33ddd5..c9da19c7 100644 --- a/backend/internal/handler/admin/ops_alerts_handler.go +++ b/backend/internal/handler/admin/ops_alerts_handler.go 
@@ -7,8 +7,10 @@ import (
 	"net/http"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/Wei-Shaw/sub2api/internal/pkg/response"
+	"github.com/Wei-Shaw/sub2api/internal/server/middleware"
 	"github.com/Wei-Shaw/sub2api/internal/service"
 	"github.com/gin-gonic/gin"
 	"github.com/gin-gonic/gin/binding"
@@ -18,8 +20,6 @@ var validOpsAlertMetricTypes = []string{
 	"success_rate",
 	"error_rate",
 	"upstream_error_rate",
-	"p95_latency_ms",
-	"p99_latency_ms",
 	"cpu_usage_percent",
 	"memory_usage_percent",
 	"concurrency_queue_depth",
@@ -372,8 +372,135 @@ func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
 	response.Success(c, gin.H{"deleted": true})
 }
 
+// GetAlertEvent looks up one ops alert event by the numeric ID in the URL path.
+// GET /api/v1/admin/ops/alert-events/:id
+func (h *OpsHandler) GetAlertEvent(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	// Only strictly positive integer IDs are accepted.
+	eventID, parseErr := strconv.ParseInt(c.Param("id"), 10, 64)
+	if parseErr != nil || eventID <= 0 {
+		response.BadRequest(c, "Invalid event ID")
+		return
+	}
+	event, err := h.opsService.GetAlertEventByID(ctx, eventID)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, event)
+}
+
+// UpdateAlertEventStatus updates an ops alert event status.
+// PUT /api/v1/admin/ops/alert-events/:id/status
+//
+// The JSON body must carry a "status" equal to service.OpsAlertStatusResolved or
+// service.OpsAlertStatusManualResolved; any other value is rejected as a bad request.
+func (h *OpsHandler) UpdateAlertEventStatus(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	id, err := strconv.ParseInt(c.Param("id"), 10, 64)
+	if err != nil || id <= 0 {
+		response.BadRequest(c, "Invalid event ID")
+		return
+	}
+
+	var payload struct {
+		Status string `json:"status"`
+	}
+	if err := c.ShouldBindJSON(&payload); err != nil {
+		response.BadRequest(c, "Invalid request body")
+		return
+	}
+	payload.Status = strings.TrimSpace(payload.Status)
+	if payload.Status == "" {
+		response.BadRequest(c, "Invalid status")
+		return
+	}
+	if payload.Status != service.OpsAlertStatusResolved && payload.Status != service.OpsAlertStatusManualResolved {
+		response.BadRequest(c, "Invalid status")
+		return
+	}
+
+	// Both accepted statuses are resolutions, so the service is always given a resolution time (UTC).
+	resolvedAt := time.Now().UTC()
+	if err := h.opsService.UpdateAlertEventStatus(c.Request.Context(), id, payload.Status, &resolvedAt); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, gin.H{"updated": true})
+}
+
+// CreateAlertSilence creates a scoped silence for ops alerts; "until" must be an RFC 3339 timestamp.
+// POST /api/v1/admin/ops/alert-silences
+func (h *OpsHandler) CreateAlertSilence(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	var payload struct {
+		RuleID   int64   `json:"rule_id"`
+		Platform string  `json:"platform"`
+		GroupID  *int64  `json:"group_id"`
+		Region   *string `json:"region"`
+		Until    string  `json:"until"`
+		Reason   string  `json:"reason"`
+	}
+	if err := c.ShouldBindJSON(&payload); err != nil {
+		response.BadRequest(c, "Invalid request body")
+		return
+	}
+	until, err := time.Parse(time.RFC3339, strings.TrimSpace(payload.Until))
+	if err != nil {
+		response.BadRequest(c, "Invalid until")
+		return
+	}
+	// The creator is recorded only when an authenticated subject is present; otherwise it stays nil.
+	var createdBy *int64
+	if subject, ok := middleware.GetAuthSubjectFromContext(c); ok {
+		uid := subject.UserID
+		createdBy = &uid
+	}
+
+	silence := &service.OpsAlertSilence{
+		RuleID:    payload.RuleID,
+		Platform:  strings.TrimSpace(payload.Platform),
+		GroupID:   payload.GroupID,
+		Region:    payload.Region,
+		Until:     until,
+		Reason:    strings.TrimSpace(payload.Reason),
+		CreatedBy: createdBy,
+	}
+
+	created, err := h.opsService.CreateAlertSilence(c.Request.Context(), silence)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, created)
+}
+
 // ListAlertEvents lists recent ops alert events.
 // GET /api/v1/admin/ops/alert-events
 func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
 	if h.opsService == nil {
 		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
@@ -384,7 +511,7 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
 		return
 	}
 
-	limit := 100
+	limit := 20
 	if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
 		n, err := strconv.Atoi(raw)
 		if err != nil || n <= 0 {
@@ -400,6 +527,49 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
 		Severity: strings.TrimSpace(c.Query("severity")),
 	}
 
+	if v := strings.TrimSpace(c.Query("email_sent")); v != "" {
+		vv := strings.ToLower(v)
+		switch vv {
+		case "true", "1":
+			b := true
+			filter.EmailSent = &b
+		case "false", "0":
+			b := false
+			filter.EmailSent = &b
+		default:
+			response.BadRequest(c, "Invalid email_sent")
+			return
+		}
+	}
+
+	// Cursor pagination: both params must be provided together.
+	rawTS := strings.TrimSpace(c.Query("before_fired_at"))
+	rawID := strings.TrimSpace(c.Query("before_id"))
+	if (rawTS == "") != (rawID == "") {
+		response.BadRequest(c, "before_fired_at and before_id must be provided together")
+		return
+	}
+	if rawTS != "" {
+		// time.Parse accepts input with or without fractional seconds for both
+		// the RFC3339 and RFC3339Nano layouts, so one parse covers every
+		// well-formed cursor timestamp; a second attempt with time.RFC3339
+		// could never succeed where this one failed.
+		ts, err := time.Parse(time.RFC3339Nano, rawTS)
+		if err != nil {
+			response.BadRequest(c, "Invalid before_fired_at")
+			return
+		}
+		filter.BeforeFiredAt = &ts
+	}
+	if rawID != "" {
+		id, err := strconv.ParseInt(rawID, 10, 64)
+		if err != nil || id <= 0 {
+			response.BadRequest(c, "Invalid before_id")
+			return
+		}
+		filter.BeforeID = &id
+	}
+
 	// Optional global filter support (platform/group/time range).
 	if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
 		filter.Platform = platform
diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go
index bff7426a..44accc8f 100644
--- a/backend/internal/handler/admin/ops_handler.go
+++ b/backend/internal/handler/admin/ops_handler.go
@@ -19,6 +19,57 @@ type OpsHandler struct {
 	opsService *service.OpsService
 }
 
+// GetErrorLogByID returns ops error log detail.
+// GET /api/v1/admin/ops/errors/:id
+//
+// The :id path parameter must parse as a strictly positive integer; otherwise
+// the handler answers with "Invalid error id".
+func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	idStr := strings.TrimSpace(c.Param("id"))
+	id, err := strconv.ParseInt(idStr, 10, 64)
+	if err != nil || id <= 0 {
+		response.BadRequest(c, "Invalid error id")
+		return
+	}
+
+	detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	response.Success(c, detail)
+}
+
+// Values that parseOpsViewParam can return for the "view" query parameter.
+const (
+	opsListViewErrors   = "errors"
+	opsListViewExcluded = "excluded"
+	opsListViewAll      = "all"
+)
+
+// parseOpsViewParam normalizes the "view" query parameter (trimmed, lower-cased).
+// A missing or unrecognized value yields opsListViewErrors; the empty string is
+// returned only when c itself is nil.
+func parseOpsViewParam(c *gin.Context) string {
+	if c == nil {
+		return ""
+	}
+	v := strings.ToLower(strings.TrimSpace(c.Query("view")))
+	switch v {
+	case "", opsListViewErrors:
+		return opsListViewErrors
+	case opsListViewExcluded:
+		return opsListViewExcluded
+	case opsListViewAll:
+		return opsListViewAll
+	default:
+		return opsListViewErrors
+	}
+}
+
 func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
 	return &OpsHandler{opsService: opsService}
 }
@@ -47,16 +98,26 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
 		return
 	}
 
-	filter := &service.OpsErrorLogFilter{
-		Page:     page,
-		PageSize: pageSize,
-	}
+	filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
+
 	if !startTime.IsZero() {
 		filter.StartTime = &startTime
 	}
 	if !endTime.IsZero() {
 		filter.EndTime = &endTime
 	}
+	filter.View = parseOpsViewParam(c)
+	filter.Phase = strings.TrimSpace(c.Query("phase"))
+	filter.Owner = strings.TrimSpace(c.Query("error_owner"))
+	filter.Source = strings.TrimSpace(c.Query("error_source"))
+	filter.Query = strings.TrimSpace(c.Query("q"))
+	filter.UserQuery = strings.TrimSpace(c.Query("user_query"))
+
+	// Force request errors: client-visible status >= 400.
+	// buildOpsErrorLogsWhere already applies this for non-upstream phase.
+	// NOTE(review): an explicit phase=upstream (any case) is cleared here, so this
+	// endpoint then applies no phase filter at all - confirm that is intended.
+	if strings.EqualFold(strings.TrimSpace(filter.Phase), "upstream") {
+		filter.Phase = ""
+	}
 
 	if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
 		filter.Platform = platform
@@ -77,11 +138,19 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
 		}
 		filter.AccountID = &id
 	}
-	if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
-		filter.Phase = phase
-	}
-	if q := strings.TrimSpace(c.Query("q")); q != "" {
-		filter.Query = q
+
+	// resolved accepts 1/true/yes and 0/false/no (case-insensitive); an absent
+	// parameter leaves filter.Resolved nil.
+	if v := strings.TrimSpace(c.Query("resolved")); v != "" {
+		switch strings.ToLower(v) {
+		case "1", "true", "yes":
+			b := true
+			filter.Resolved = &b
+		case "0", "false", "no":
+			b := false
+			filter.Resolved = &b
+		default:
+			response.BadRequest(c, "Invalid resolved")
+			return
+		}
 	}
 	if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
 		parts := strings.Split(statusCodesStr, ",")
@@ -106,13 +175,120 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
 		response.ErrorFrom(c, err)
 		return
 	}
-
 	response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
 }
 
-// GetErrorLogByID returns a single error log detail.
-// GET /api/v1/admin/ops/errors/:id
-func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
+// ListRequestErrors lists client-visible request errors.
+// GET /api/v1/admin/ops/request-errors
+//
+// Filters come from the query string: view, phase, error_owner, error_source,
+// q, user_query, platform, group_id, account_id, resolved and status_codes.
+func (h *OpsHandler) ListRequestErrors(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	// Page size is capped at 500 rows.
+	page, pageSize := response.ParsePagination(c)
+	if pageSize > 500 {
+		pageSize = 500
+	}
+	// "1h" is handed to parseOpsTimeRange, presumably as the default window.
+	startTime, endTime, err := parseOpsTimeRange(c, "1h")
+	if err != nil {
+		response.BadRequest(c, err.Error())
+		return
+	}
+
+	// Force request errors: client-visible status >= 400.
+	// buildOpsErrorLogsWhere already applies this for non-upstream phase,
+	// so an explicit phase=upstream is dropped before the filter is built.
+	phase := strings.TrimSpace(c.Query("phase"))
+	if strings.EqualFold(phase, "upstream") {
+		phase = ""
+	}
+
+	filter := &service.OpsErrorLogFilter{
+		Page:      page,
+		PageSize:  pageSize,
+		View:      parseOpsViewParam(c),
+		Phase:     phase,
+		Owner:     strings.TrimSpace(c.Query("error_owner")),
+		Source:    strings.TrimSpace(c.Query("error_source")),
+		Query:     strings.TrimSpace(c.Query("q")),
+		UserQuery: strings.TrimSpace(c.Query("user_query")),
+		Platform:  strings.TrimSpace(c.Query("platform")),
+	}
+	if !startTime.IsZero() {
+		filter.StartTime = &startTime
+	}
+	if !endTime.IsZero() {
+		filter.EndTime = &endTime
+	}
+
+	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
+		groupID, parseErr := strconv.ParseInt(raw, 10, 64)
+		if parseErr != nil || groupID <= 0 {
+			response.BadRequest(c, "Invalid group_id")
+			return
+		}
+		filter.GroupID = &groupID
+	}
+	if raw := strings.TrimSpace(c.Query("account_id")); raw != "" {
+		accountID, parseErr := strconv.ParseInt(raw, 10, 64)
+		if parseErr != nil || accountID <= 0 {
+			response.BadRequest(c, "Invalid account_id")
+			return
+		}
+		filter.AccountID = &accountID
+	}
+
+	// resolved: 1/true/yes or 0/false/no, case-insensitive.
+	if raw := strings.TrimSpace(c.Query("resolved")); raw != "" {
+		var resolved bool
+		switch strings.ToLower(raw) {
+		case "1", "true", "yes":
+			resolved = true
+		case "0", "false", "no":
+			resolved = false
+		default:
+			response.BadRequest(c, "Invalid resolved")
+			return
+		}
+		filter.Resolved = &resolved
+	}
+	// status_codes: comma-separated non-negative integers; empty items are skipped.
+	if raw := strings.TrimSpace(c.Query("status_codes")); raw != "" {
+		fields := strings.Split(raw, ",")
+		codes := make([]int, 0, len(fields))
+		for _, field := range fields {
+			field = strings.TrimSpace(field)
+			if field == "" {
+				continue
+			}
+			code, convErr := strconv.Atoi(field)
+			if convErr != nil || code < 0 {
+				response.BadRequest(c, "Invalid status_codes")
+				return
+			}
+			codes = append(codes, code)
+		}
+		filter.StatusCodes = codes
+	}
+
+	result, err := h.opsService.GetErrorLogs(ctx, filter)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
+}
+
+// GetRequestError serves the detail of one request error.
+// GET /api/v1/admin/ops/request-errors/:id
+func (h *OpsHandler) GetRequestError(c *gin.Context) {
+	// Delegates entirely to the generic error-log detail handler.
+	h.GetErrorLogByID(c)
+}
+
+// ListRequestErrorUpstreamErrors lists upstream error logs correlated to a request error.
+// GET /api/v1/admin/ops/request-errors/:id/upstream-errors
+//
+// Upstream errors are matched on the request error's request_id, or on its
+// client_request_id when request_id is empty. With include_detail=1|true|yes
+// every listed item is expanded through GetErrorLogByID.
+func (h *OpsHandler) ListRequestErrorUpstreamErrors(c *gin.Context) {
 	if h.opsService == nil {
 		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
 		return
@@ -129,15 +305,306 @@ func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
 		return
 	}
 
+	// Load request error to get correlation keys.
 	detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
 	if err != nil {
 		response.ErrorFrom(c, err)
 		return
 	}
 
-	response.Success(c, detail)
+	// Correlate by request_id/client_request_id.
+	requestID := strings.TrimSpace(detail.RequestID)
+	clientRequestID := strings.TrimSpace(detail.ClientRequestID)
+	if requestID == "" && clientRequestID == "" {
+		// Nothing to correlate on: answer with an empty page (fixed page 1, size 10)
+		// without reading the caller's pagination parameters or querying the service.
+		response.Paginated(c, []*service.OpsErrorLog{}, 0, 1, 10)
+		return
+	}
+
+	page, pageSize := response.ParsePagination(c)
+	if pageSize > 500 {
+		pageSize = 500
+	}
+
+	// Keep correlation window wide enough so linked upstream errors
+	// are discoverable even when UI defaults to 1h elsewhere.
+	startTime, endTime, err := parseOpsTimeRange(c, "30d")
+	if err != nil {
+		response.BadRequest(c, err.Error())
+		return
+	}
+
+	filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
+	if !startTime.IsZero() {
+		filter.StartTime = &startTime
+	}
+	if !endTime.IsZero() {
+		filter.EndTime = &endTime
+	}
+	// View, phase and owner are fixed for this endpoint; only error_source, q and
+	// platform are taken from the query string.
+	filter.View = "all"
+	filter.Phase = "upstream"
+	filter.Owner = "provider"
+	filter.Source = strings.TrimSpace(c.Query("error_source"))
+	filter.Query = strings.TrimSpace(c.Query("q"))
+
+	if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
+		filter.Platform = platform
+	}
+
+	// Prefer exact match on request_id; if missing, fall back to client_request_id.
+	if requestID != "" {
+		filter.RequestID = requestID
+	} else {
+		filter.ClientRequestID = clientRequestID
+	}
+
+	result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	// If client asks for details, expand each upstream error log to include upstream response fields.
+	// NOTE(review): this performs one GetErrorLogByID call per listed item, and items
+	// whose lookup fails or returns nil are skipped silently, so the returned slice
+	// can be shorter than the page while Total is reported unchanged.
+	includeDetail := strings.TrimSpace(c.Query("include_detail"))
+	if includeDetail == "1" || strings.EqualFold(includeDetail, "true") || strings.EqualFold(includeDetail, "yes") {
+		details := make([]*service.OpsErrorLogDetail, 0, len(result.Errors))
+		for _, item := range result.Errors {
+			if item == nil {
+				continue
+			}
+			d, err := h.opsService.GetErrorLogByID(c.Request.Context(), item.ID)
+			if err != nil || d == nil {
+				continue
+			}
+			details = append(details, d)
+		}
+		response.Paginated(c, details, int64(result.Total), result.Page, result.PageSize)
+		return
+	}
+
+	response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
 }
 
+// RetryRequestErrorClient retries the client request based on stored request body.
+// POST /api/v1/admin/ops/request-errors/:id/retry-client
+//
+// Requires an authenticated subject with a positive user ID; the retry is
+// issued through RetryError in service.OpsRetryModeClient with no pinned account.
+func (h *OpsHandler) RetryRequestErrorClient(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	// Authentication is checked before the path parameter is validated.
+	subject, authenticated := middleware.GetAuthSubjectFromContext(c)
+	if !authenticated || subject.UserID <= 0 {
+		response.Error(c, http.StatusUnauthorized, "Unauthorized")
+		return
+	}
+
+	errorID, parseErr := strconv.ParseInt(strings.TrimSpace(c.Param("id")), 10, 64)
+	if parseErr != nil || errorID <= 0 {
+		response.BadRequest(c, "Invalid error id")
+		return
+	}
+
+	result, err := h.opsService.RetryError(ctx, subject.UserID, errorID, service.OpsRetryModeClient, nil)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, result)
+}
+
+// RetryRequestErrorUpstreamEvent retries a specific upstream attempt using captured upstream_request_body.
+// POST /api/v1/admin/ops/request-errors/:id/upstream-errors/:idx/retry
+//
+// :id must be a positive integer and :idx a non-negative integer; both are
+// forwarded to RetryUpstreamEvent together with the authenticated user ID.
+func (h *OpsHandler) RetryRequestErrorUpstreamEvent(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	subject, authenticated := middleware.GetAuthSubjectFromContext(c)
+	if !authenticated || subject.UserID <= 0 {
+		response.Error(c, http.StatusUnauthorized, "Unauthorized")
+		return
+	}
+
+	errorID, parseErr := strconv.ParseInt(strings.TrimSpace(c.Param("id")), 10, 64)
+	if parseErr != nil || errorID <= 0 {
+		response.BadRequest(c, "Invalid error id")
+		return
+	}
+
+	attemptIdx, idxErr := strconv.Atoi(strings.TrimSpace(c.Param("idx")))
+	if idxErr != nil || attemptIdx < 0 {
+		response.BadRequest(c, "Invalid upstream idx")
+		return
+	}
+
+	result, err := h.opsService.RetryUpstreamEvent(ctx, subject.UserID, errorID, attemptIdx)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, result)
+}
+
+// ResolveRequestError updates the resolved status of a request error.
+// PUT /api/v1/admin/ops/request-errors/:id/resolve
+func (h *OpsHandler) ResolveRequestError(c *gin.Context) {
+	// Delegates entirely to UpdateErrorResolution.
+	h.UpdateErrorResolution(c)
+}
+
+// ListUpstreamErrors lists independent upstream errors.
+// GET /api/v1/admin/ops/upstream-errors
+//
+// Phase and owner are fixed to "upstream" and "provider"; the remaining filters
+// (view, error_source, q, platform, group_id, account_id, resolved,
+// status_codes) come from the query string.
+func (h *OpsHandler) ListUpstreamErrors(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	// Page size is capped at 500 rows.
+	page, pageSize := response.ParsePagination(c)
+	if pageSize > 500 {
+		pageSize = 500
+	}
+	// "1h" is handed to parseOpsTimeRange, presumably as the default window.
+	startTime, endTime, err := parseOpsTimeRange(c, "1h")
+	if err != nil {
+		response.BadRequest(c, err.Error())
+		return
+	}
+
+	filter := &service.OpsErrorLogFilter{
+		Page:     page,
+		PageSize: pageSize,
+		View:     parseOpsViewParam(c),
+		Phase:    "upstream",
+		Owner:    "provider",
+		Source:   strings.TrimSpace(c.Query("error_source")),
+		Query:    strings.TrimSpace(c.Query("q")),
+		Platform: strings.TrimSpace(c.Query("platform")),
+	}
+	if !startTime.IsZero() {
+		filter.StartTime = &startTime
+	}
+	if !endTime.IsZero() {
+		filter.EndTime = &endTime
+	}
+
+	if raw := strings.TrimSpace(c.Query("group_id")); raw != "" {
+		groupID, parseErr := strconv.ParseInt(raw, 10, 64)
+		if parseErr != nil || groupID <= 0 {
+			response.BadRequest(c, "Invalid group_id")
+			return
+		}
+		filter.GroupID = &groupID
+	}
+	if raw := strings.TrimSpace(c.Query("account_id")); raw != "" {
+		accountID, parseErr := strconv.ParseInt(raw, 10, 64)
+		if parseErr != nil || accountID <= 0 {
+			response.BadRequest(c, "Invalid account_id")
+			return
+		}
+		filter.AccountID = &accountID
+	}
+
+	// resolved: 1/true/yes or 0/false/no, case-insensitive.
+	if raw := strings.TrimSpace(c.Query("resolved")); raw != "" {
+		var resolved bool
+		switch strings.ToLower(raw) {
+		case "1", "true", "yes":
+			resolved = true
+		case "0", "false", "no":
+			resolved = false
+		default:
+			response.BadRequest(c, "Invalid resolved")
+			return
+		}
+		filter.Resolved = &resolved
+	}
+	// status_codes: comma-separated non-negative integers; empty items are skipped.
+	if raw := strings.TrimSpace(c.Query("status_codes")); raw != "" {
+		fields := strings.Split(raw, ",")
+		codes := make([]int, 0, len(fields))
+		for _, field := range fields {
+			field = strings.TrimSpace(field)
+			if field == "" {
+				continue
+			}
+			code, convErr := strconv.Atoi(field)
+			if convErr != nil || code < 0 {
+				response.BadRequest(c, "Invalid status_codes")
+				return
+			}
+			codes = append(codes, code)
+		}
+		filter.StatusCodes = codes
+	}
+
+	result, err := h.opsService.GetErrorLogs(ctx, filter)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
+}
+
+// GetUpstreamError serves the detail of one upstream error.
+// GET /api/v1/admin/ops/upstream-errors/:id
+func (h *OpsHandler) GetUpstreamError(c *gin.Context) {
+	// Delegates entirely to the generic error-log detail handler.
+	h.GetErrorLogByID(c)
+}
+
+// RetryUpstreamError re-runs an upstream error through RetryError in
+// service.OpsRetryModeUpstream, passing no pinned account.
+// POST /api/v1/admin/ops/upstream-errors/:id/retry
+func (h *OpsHandler) RetryUpstreamError(c *gin.Context) {
+	if h.opsService == nil {
+		response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+		return
+	}
+	ctx := c.Request.Context()
+	if err := h.opsService.RequireMonitoringEnabled(ctx); err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+
+	subject, authenticated := middleware.GetAuthSubjectFromContext(c)
+	if !authenticated || subject.UserID <= 0 {
+		response.Error(c, http.StatusUnauthorized, "Unauthorized")
+		return
+	}
+
+	errorID, parseErr := strconv.ParseInt(strings.TrimSpace(c.Param("id")), 10, 64)
+	if parseErr != nil || errorID <= 0 {
+		response.BadRequest(c, "Invalid error id")
+		return
+	}
+
+	result, err := h.opsService.RetryError(ctx, subject.UserID, errorID, service.OpsRetryModeUpstream, nil)
+	if err != nil {
+		response.ErrorFrom(c, err)
+		return
+	}
+	response.Success(c, result)
+}
+
+// ResolveUpstreamError updates the resolved status of an upstream error.
+// PUT /api/v1/admin/ops/upstream-errors/:id/resolve
+func (h *OpsHandler) ResolveUpstreamError(c *gin.Context) {
+	// Delegates entirely to UpdateErrorResolution.
+	h.UpdateErrorResolution(c)
+}
+
+// ==================== Existing endpoints ====================
+
 // ListRequestDetails returns a request-level list (success + error) for drill-down.
// GET /api/v1/admin/ops/requests func (h *OpsHandler) ListRequestDetails(c *gin.Context) { @@ -242,6 +709,11 @@ func (h *OpsHandler) ListRequestDetails(c *gin.Context) { type opsRetryRequest struct { Mode string `json:"mode"` PinnedAccountID *int64 `json:"pinned_account_id"` + Force bool `json:"force"` +} + +type opsResolveRequest struct { + Resolved bool `json:"resolved"` } // RetryErrorRequest retries a failed request using stored request_body. @@ -278,6 +750,16 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { req.Mode = service.OpsRetryModeClient } + // Force flag is currently a UI-level acknowledgement. Server may still enforce safety constraints. + _ = req.Force + + // Legacy endpoint safety: only allow retrying the client request here. + // Upstream retries must go through the split endpoints. + if strings.EqualFold(strings.TrimSpace(req.Mode), service.OpsRetryModeUpstream) { + response.BadRequest(c, "upstream retry is not supported on this endpoint") + return + } + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) if err != nil { response.ErrorFrom(c, err) @@ -287,6 +769,81 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { response.Success(c, result) } +// ListRetryAttempts lists retry attempts for an error log. 
+// GET /api/v1/admin/ops/errors/:id/retries +func (h *OpsHandler) ListRetryAttempts(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + limit := 50 + if v := strings.TrimSpace(c.Query("limit")); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + items, err := h.opsService.ListRetryAttemptsByErrorID(c.Request.Context(), id, limit) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, items) +} + +// UpdateErrorResolution allows manual resolve/unresolve. +// PUT /api/v1/admin/ops/errors/:id/resolve +func (h *OpsHandler) UpdateErrorResolution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + var req opsResolveRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + uid := subject.UserID + if err := h.opsService.UpdateErrorResolution(c.Request.Context(), id, req.Resolved, &uid, nil); err != nil { + response.ErrorFrom(c, err) + return + } + 
response.Success(c, gin.H{"ok": true}) +} + func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { startStr := strings.TrimSpace(c.Query("start_time")) endStr := strings.TrimSpace(c.Query("end_time")) @@ -358,6 +915,10 @@ func parseOpsDuration(v string) (time.Duration, bool) { return 6 * time.Hour, true case "24h": return 24 * time.Hour, true + case "7d": + return 7 * 24 * time.Hour, true + case "30d": + return 30 * 24 * time.Hour, true default: return 0, false } diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 13bd9d94..f62e6b3e 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -544,6 +544,11 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { body := w.buf.Bytes() parsed := parseOpsErrorResponse(body) + // Skip logging if the error should be filtered based on settings + if shouldSkipOpsErrorLog(c.Request.Context(), ops, parsed.Message, string(body), c.Request.URL.Path) { + return + } + apiKey, _ := middleware2.GetAPIKeyFromContext(c) clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) @@ -832,28 +837,30 @@ func normalizeOpsErrorType(errType string, code string) string { func classifyOpsPhase(errType, message, code string) string { msg := strings.ToLower(message) + // Standardized phases: request|auth|routing|upstream|network|internal + // Map billing/concurrency/response => request; scheduling => routing. 
switch strings.TrimSpace(code) { case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": - return "billing" + return "request" } switch errType { case "authentication_error": return "auth" case "billing_error", "subscription_error": - return "billing" + return "request" case "rate_limit_error": if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { - return "concurrency" + return "request" } return "upstream" case "invalid_request_error": - return "response" + return "request" case "upstream_error", "overloaded_error": return "upstream" case "api_error": if strings.Contains(msg, "no available accounts") { - return "scheduling" + return "routing" } return "internal" default: @@ -914,34 +921,38 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa } func classifyOpsErrorOwner(phase string, message string) string { + // Standardized owners: client|provider|platform switch phase { case "upstream", "network": return "provider" - case "billing", "concurrency", "auth", "response": + case "request", "auth": return "client" + case "routing", "internal": + return "platform" default: if strings.Contains(strings.ToLower(message), "upstream") { return "provider" } - return "sub2api" + return "platform" } } func classifyOpsErrorSource(phase string, message string) string { + // Standardized sources: client_request|upstream_http|gateway switch phase { case "upstream": return "upstream_http" case "network": - return "upstream_network" - case "billing": - return "billing" - case "concurrency": - return "concurrency" + return "gateway" + case "request", "auth": + return "client_request" + case "routing", "internal": + return "gateway" default: if strings.Contains(strings.ToLower(message), "upstream") { return "upstream_http" } - return "internal" + return "gateway" } } @@ -963,3 +974,42 @@ func truncateString(s string, max int) string { func 
strconvItoa(v int) string { return strconv.Itoa(v) } + +// shouldSkipOpsErrorLog determines if an error should be skipped from logging based on settings. +// Returns true for errors that should be filtered according to OpsAdvancedSettings. +func shouldSkipOpsErrorLog(ctx context.Context, ops *service.OpsService, message, body, requestPath string) bool { + if ops == nil { + return false + } + + // Get advanced settings to check filter configuration + settings, err := ops.GetOpsAdvancedSettings(ctx) + if err != nil || settings == nil { + // If we can't get settings, don't skip (fail open) + return false + } + + msgLower := strings.ToLower(message) + bodyLower := strings.ToLower(body) + + // Check if count_tokens errors should be ignored + if settings.IgnoreCountTokensErrors && strings.Contains(requestPath, "/count_tokens") { + return true + } + + // Check if context canceled errors should be ignored (client disconnects) + if settings.IgnoreContextCanceled { + if strings.Contains(msgLower, "context canceled") || strings.Contains(bodyLower, "context canceled") { + return true + } + } + + // Check if "no available accounts" errors should be ignored + if settings.IgnoreNoAvailableAccounts { + if strings.Contains(msgLower, "no available accounts") || strings.Contains(bodyLower, "no available accounts") { + return true + } + } + + return false +} diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index f9cb6b4d..613c5bd5 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -55,7 +55,6 @@ INSERT INTO ops_error_logs ( upstream_error_message, upstream_error_detail, upstream_errors, - duration_ms, time_to_first_token_ms, request_body, request_body_truncated, @@ -65,7 +64,7 @@ INSERT INTO ops_error_logs ( retry_count, created_at ) VALUES ( - $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35 + 
$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34 ) RETURNING id` var id int64 @@ -98,7 +97,6 @@ INSERT INTO ops_error_logs ( opsNullString(input.UpstreamErrorMessage), opsNullString(input.UpstreamErrorDetail), opsNullString(input.UpstreamErrorsJSON), - opsNullInt(input.DurationMs), opsNullInt64(input.TimeToFirstTokenMs), opsNullString(input.RequestBodyJSON), input.RequestBodyTruncated, @@ -135,7 +133,7 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr } where, args := buildOpsErrorLogsWhere(filter) - countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where + countSQL := "SELECT COUNT(*) FROM ops_error_logs e " + where var total int if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil { @@ -146,28 +144,43 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr argsWithLimit := append(args, pageSize, offset) selectSQL := ` SELECT - id, - created_at, - error_phase, - error_type, - severity, - COALESCE(upstream_status_code, status_code, 0), - COALESCE(platform, ''), - COALESCE(model, ''), - duration_ms, - COALESCE(client_request_id, ''), - COALESCE(request_id, ''), - COALESCE(error_message, ''), - user_id, - api_key_id, - account_id, - group_id, - CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, - COALESCE(request_path, ''), - stream -FROM ops_error_logs + e.id, + e.created_at, + e.error_phase, + e.error_type, + COALESCE(e.error_owner, ''), + COALESCE(e.error_source, ''), + e.severity, + COALESCE(e.upstream_status_code, e.status_code, 0), + COALESCE(e.platform, ''), + COALESCE(e.model, ''), + COALESCE(e.is_retryable, false), + COALESCE(e.retry_count, 0), + COALESCE(e.resolved, false), + e.resolved_at, + e.resolved_by_user_id, + COALESCE(u2.email, ''), + e.resolved_retry_id, + COALESCE(e.client_request_id, ''), + COALESCE(e.request_id, ''), + COALESCE(e.error_message, ''), + e.user_id, + 
COALESCE(u.email, ''), + e.api_key_id, + e.account_id, + COALESCE(a.name, ''), + e.group_id, + COALESCE(g.name, ''), + CASE WHEN e.client_ip IS NULL THEN NULL ELSE e.client_ip::text END, + COALESCE(e.request_path, ''), + e.stream +FROM ops_error_logs e +LEFT JOIN accounts a ON e.account_id = a.id +LEFT JOIN groups g ON e.group_id = g.id +LEFT JOIN users u ON e.user_id = u.id +LEFT JOIN users u2 ON e.resolved_by_user_id = u2.id ` + where + ` -ORDER BY created_at DESC +ORDER BY e.created_at DESC LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...) @@ -179,39 +192,65 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) out := make([]*service.OpsErrorLog, 0, pageSize) for rows.Next() { var item service.OpsErrorLog - var latency sql.NullInt64 var statusCode sql.NullInt64 var clientIP sql.NullString var userID sql.NullInt64 var apiKeyID sql.NullInt64 var accountID sql.NullInt64 + var accountName string var groupID sql.NullInt64 + var groupName string + var userEmail string + var resolvedAt sql.NullTime + var resolvedBy sql.NullInt64 + var resolvedByName string + var resolvedRetryID sql.NullInt64 if err := rows.Scan( &item.ID, &item.CreatedAt, &item.Phase, &item.Type, + &item.Owner, + &item.Source, &item.Severity, &statusCode, &item.Platform, &item.Model, - &latency, + &item.IsRetryable, + &item.RetryCount, + &item.Resolved, + &resolvedAt, + &resolvedBy, + &resolvedByName, + &resolvedRetryID, &item.ClientRequestID, &item.RequestID, &item.Message, &userID, + &userEmail, &apiKeyID, &accountID, + &accountName, &groupID, + &groupName, &clientIP, &item.RequestPath, &item.Stream, ); err != nil { return nil, err } - if latency.Valid { - v := int(latency.Int64) - item.LatencyMs = &v + if resolvedAt.Valid { + t := resolvedAt.Time + item.ResolvedAt = &t + } + if resolvedBy.Valid { + v := resolvedBy.Int64 + item.ResolvedByUserID = &v + } + item.ResolvedByUserName = resolvedByName + if 
resolvedRetryID.Valid { + v := resolvedRetryID.Int64 + item.ResolvedRetryID = &v } item.StatusCode = int(statusCode.Int64) if clientIP.Valid { @@ -222,6 +261,7 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) v := userID.Int64 item.UserID = &v } + item.UserEmail = userEmail if apiKeyID.Valid { v := apiKeyID.Int64 item.APIKeyID = &v @@ -230,10 +270,12 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) v := accountID.Int64 item.AccountID = &v } + item.AccountName = accountName if groupID.Valid { v := groupID.Int64 item.GroupID = &v } + item.GroupName = groupName out = append(out, &item) } if err := rows.Err(); err != nil { @@ -258,49 +300,64 @@ func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service q := ` SELECT - id, - created_at, - error_phase, - error_type, - severity, - COALESCE(upstream_status_code, status_code, 0), - COALESCE(platform, ''), - COALESCE(model, ''), - duration_ms, - COALESCE(client_request_id, ''), - COALESCE(request_id, ''), - COALESCE(error_message, ''), - COALESCE(error_body, ''), - upstream_status_code, - COALESCE(upstream_error_message, ''), - COALESCE(upstream_error_detail, ''), - COALESCE(upstream_errors::text, ''), - is_business_limited, - user_id, - api_key_id, - account_id, - group_id, - CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, - COALESCE(request_path, ''), - stream, - COALESCE(user_agent, ''), - auth_latency_ms, - routing_latency_ms, - upstream_latency_ms, - response_latency_ms, - time_to_first_token_ms, - COALESCE(request_body::text, ''), - request_body_truncated, - request_body_bytes, - COALESCE(request_headers::text, '') -FROM ops_error_logs -WHERE id = $1 + e.id, + e.created_at, + e.error_phase, + e.error_type, + COALESCE(e.error_owner, ''), + COALESCE(e.error_source, ''), + e.severity, + COALESCE(e.upstream_status_code, e.status_code, 0), + COALESCE(e.platform, ''), + COALESCE(e.model, ''), + COALESCE(e.is_retryable, false), + COALESCE(e.retry_count, 
0), + COALESCE(e.resolved, false), + e.resolved_at, + e.resolved_by_user_id, + e.resolved_retry_id, + COALESCE(e.client_request_id, ''), + COALESCE(e.request_id, ''), + COALESCE(e.error_message, ''), + COALESCE(e.error_body, ''), + e.upstream_status_code, + COALESCE(e.upstream_error_message, ''), + COALESCE(e.upstream_error_detail, ''), + COALESCE(e.upstream_errors::text, ''), + e.is_business_limited, + e.user_id, + COALESCE(u.email, ''), + e.api_key_id, + e.account_id, + COALESCE(a.name, ''), + e.group_id, + COALESCE(g.name, ''), + CASE WHEN e.client_ip IS NULL THEN NULL ELSE e.client_ip::text END, + COALESCE(e.request_path, ''), + e.stream, + COALESCE(e.user_agent, ''), + e.auth_latency_ms, + e.routing_latency_ms, + e.upstream_latency_ms, + e.response_latency_ms, + e.time_to_first_token_ms, + COALESCE(e.request_body::text, ''), + e.request_body_truncated, + e.request_body_bytes, + COALESCE(e.request_headers::text, '') +FROM ops_error_logs e +LEFT JOIN users u ON e.user_id = u.id +LEFT JOIN accounts a ON e.account_id = a.id +LEFT JOIN groups g ON e.group_id = g.id +WHERE e.id = $1 LIMIT 1` var out service.OpsErrorLogDetail - var latency sql.NullInt64 var statusCode sql.NullInt64 var upstreamStatusCode sql.NullInt64 + var resolvedAt sql.NullTime + var resolvedBy sql.NullInt64 + var resolvedRetryID sql.NullInt64 var clientIP sql.NullString var userID sql.NullInt64 var apiKeyID sql.NullInt64 @@ -318,11 +375,18 @@ LIMIT 1` &out.CreatedAt, &out.Phase, &out.Type, + &out.Owner, + &out.Source, &out.Severity, &statusCode, &out.Platform, &out.Model, - &latency, + &out.IsRetryable, + &out.RetryCount, + &out.Resolved, + &resolvedAt, + &resolvedBy, + &resolvedRetryID, &out.ClientRequestID, &out.RequestID, &out.Message, @@ -333,9 +397,12 @@ LIMIT 1` &out.UpstreamErrors, &out.IsBusinessLimited, &userID, + &out.UserEmail, &apiKeyID, &accountID, + &out.AccountName, &groupID, + &out.GroupName, &clientIP, &out.RequestPath, &out.Stream, @@ -355,9 +422,17 @@ LIMIT 1` } out.StatusCode 
= int(statusCode.Int64) - if latency.Valid { - v := int(latency.Int64) - out.LatencyMs = &v + if resolvedAt.Valid { + t := resolvedAt.Time + out.ResolvedAt = &t + } + if resolvedBy.Valid { + v := resolvedBy.Int64 + out.ResolvedByUserID = &v + } + if resolvedRetryID.Valid { + v := resolvedRetryID.Int64 + out.ResolvedRetryID = &v } if clientIP.Valid { s := clientIP.String @@ -487,9 +562,15 @@ SET status = $2, finished_at = $3, duration_ms = $4, - result_request_id = $5, - result_error_id = $6, - error_message = $7 + success = $5, + http_status_code = $6, + upstream_request_id = $7, + used_account_id = $8, + response_preview = $9, + response_truncated = $10, + result_request_id = $11, + result_error_id = $12, + error_message = $13 WHERE id = $1` _, err := r.db.ExecContext( @@ -499,8 +580,14 @@ WHERE id = $1` strings.TrimSpace(input.Status), nullTime(input.FinishedAt), input.DurationMs, + nullBool(input.Success), + nullInt(input.HTTPStatusCode), + opsNullString(input.UpstreamRequestID), + nullInt64(input.UsedAccountID), + opsNullString(input.ResponsePreview), + nullBool(input.ResponseTruncated), opsNullString(input.ResultRequestID), - opsNullInt64(input.ResultErrorID), + nullInt64(input.ResultErrorID), opsNullString(input.ErrorMessage), ) return err @@ -526,6 +613,12 @@ SELECT started_at, finished_at, duration_ms, + success, + http_status_code, + upstream_request_id, + used_account_id, + response_preview, + response_truncated, result_request_id, result_error_id, error_message @@ -540,6 +633,12 @@ LIMIT 1` var startedAt sql.NullTime var finishedAt sql.NullTime var durationMs sql.NullInt64 + var success sql.NullBool + var httpStatusCode sql.NullInt64 + var upstreamRequestID sql.NullString + var usedAccountID sql.NullInt64 + var responsePreview sql.NullString + var responseTruncated sql.NullBool var resultRequestID sql.NullString var resultErrorID sql.NullInt64 var errorMessage sql.NullString @@ -555,6 +654,12 @@ LIMIT 1` &startedAt, &finishedAt, &durationMs, + &success, 
+ &httpStatusCode, + &upstreamRequestID, + &usedAccountID, + &responsePreview, + &responseTruncated, &resultRequestID, &resultErrorID, &errorMessage, @@ -579,6 +684,30 @@ LIMIT 1` v := durationMs.Int64 out.DurationMs = &v } + if success.Valid { + v := success.Bool + out.Success = &v + } + if httpStatusCode.Valid { + v := int(httpStatusCode.Int64) + out.HTTPStatusCode = &v + } + if upstreamRequestID.Valid { + s := upstreamRequestID.String + out.UpstreamRequestID = &s + } + if usedAccountID.Valid { + v := usedAccountID.Int64 + out.UsedAccountID = &v + } + if responsePreview.Valid { + s := responsePreview.String + out.ResponsePreview = &s + } + if responseTruncated.Valid { + v := responseTruncated.Bool + out.ResponseTruncated = &v + } if resultRequestID.Valid { s := resultRequestID.String out.ResultRequestID = &s @@ -602,30 +731,234 @@ func nullTime(t time.Time) sql.NullTime { return sql.NullTime{Time: t, Valid: true} } +func nullBool(v *bool) sql.NullBool { + if v == nil { + return sql.NullBool{} + } + return sql.NullBool{Bool: *v, Valid: true} +} + +func (r *opsRepository) ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*service.OpsRetryAttempt, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if sourceErrorID <= 0 { + return nil, fmt.Errorf("invalid source_error_id") + } + if limit <= 0 { + limit = 50 + } + if limit > 200 { + limit = 200 + } + + q := ` +SELECT + r.id, + r.created_at, + COALESCE(r.requested_by_user_id, 0), + r.source_error_id, + COALESCE(r.mode, ''), + r.pinned_account_id, + COALESCE(pa.name, ''), + COALESCE(r.status, ''), + r.started_at, + r.finished_at, + r.duration_ms, + r.success, + r.http_status_code, + r.upstream_request_id, + r.used_account_id, + COALESCE(ua.name, ''), + r.response_preview, + r.response_truncated, + r.result_request_id, + r.result_error_id, + r.error_message +FROM ops_retry_attempts r +LEFT JOIN accounts pa ON r.pinned_account_id = pa.id +LEFT JOIN 
accounts ua ON r.used_account_id = ua.id +WHERE r.source_error_id = $1 +ORDER BY r.created_at DESC +LIMIT $2` + + rows, err := r.db.QueryContext(ctx, q, sourceErrorID, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsRetryAttempt, 0, 16) + for rows.Next() { + var item service.OpsRetryAttempt + var pinnedAccountID sql.NullInt64 + var pinnedAccountName string + var requestedBy sql.NullInt64 + var startedAt sql.NullTime + var finishedAt sql.NullTime + var durationMs sql.NullInt64 + var success sql.NullBool + var httpStatusCode sql.NullInt64 + var upstreamRequestID sql.NullString + var usedAccountID sql.NullInt64 + var usedAccountName string + var responsePreview sql.NullString + var responseTruncated sql.NullBool + var resultRequestID sql.NullString + var resultErrorID sql.NullInt64 + var errorMessage sql.NullString + + if err := rows.Scan( + &item.ID, + &item.CreatedAt, + &requestedBy, + &item.SourceErrorID, + &item.Mode, + &pinnedAccountID, + &pinnedAccountName, + &item.Status, + &startedAt, + &finishedAt, + &durationMs, + &success, + &httpStatusCode, + &upstreamRequestID, + &usedAccountID, + &usedAccountName, + &responsePreview, + &responseTruncated, + &resultRequestID, + &resultErrorID, + &errorMessage, + ); err != nil { + return nil, err + } + + item.RequestedByUserID = requestedBy.Int64 + if pinnedAccountID.Valid { + v := pinnedAccountID.Int64 + item.PinnedAccountID = &v + } + item.PinnedAccountName = pinnedAccountName + if startedAt.Valid { + t := startedAt.Time + item.StartedAt = &t + } + if finishedAt.Valid { + t := finishedAt.Time + item.FinishedAt = &t + } + if durationMs.Valid { + v := durationMs.Int64 + item.DurationMs = &v + } + if success.Valid { + v := success.Bool + item.Success = &v + } + if httpStatusCode.Valid { + v := int(httpStatusCode.Int64) + item.HTTPStatusCode = &v + } + if upstreamRequestID.Valid { + item.UpstreamRequestID = &upstreamRequestID.String + } + if 
usedAccountID.Valid { + v := usedAccountID.Int64 + item.UsedAccountID = &v + } + item.UsedAccountName = usedAccountName + if responsePreview.Valid { + item.ResponsePreview = &responsePreview.String + } + if responseTruncated.Valid { + v := responseTruncated.Bool + item.ResponseTruncated = &v + } + if resultRequestID.Valid { + item.ResultRequestID = &resultRequestID.String + } + if resultErrorID.Valid { + v := resultErrorID.Int64 + item.ResultErrorID = &v + } + if errorMessage.Valid { + item.ErrorMessage = &errorMessage.String + } + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if errorID <= 0 { + return fmt.Errorf("invalid error id") + } + + q := ` +UPDATE ops_error_logs +SET + resolved = $2, + resolved_at = $3, + resolved_by_user_id = $4, + resolved_retry_id = $5 +WHERE id = $1` + + at := sql.NullTime{} + if resolvedAt != nil && !resolvedAt.IsZero() { + at = sql.NullTime{Time: resolvedAt.UTC(), Valid: true} + } else if resolved { + now := time.Now().UTC() + at = sql.NullTime{Time: now, Valid: true} + } + + _, err := r.db.ExecContext( + ctx, + q, + errorID, + resolved, + at, + nullInt64(resolvedByUserID), + nullInt64(resolvedRetryID), + ) + return err +} + func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { - clauses := make([]string, 0, 8) - args := make([]any, 0, 8) + clauses := make([]string, 0, 12) + args := make([]any, 0, 12) clauses = append(clauses, "1=1") phaseFilter := "" if filter != nil { phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase)) } - // ops_error_logs primarily stores client-visible error requests (status>=400), + // ops_error_logs stores client-visible error requests (status>=400), // but 
we also persist "recovered" upstream errors (status<400) for upstream health visibility. - // By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase. + // If Resolved is not specified, do not filter by resolved state (backward-compatible). + resolvedFilter := (*bool)(nil) + if filter != nil { + resolvedFilter = filter.Resolved + } + // Keep list endpoints scoped to client errors unless explicitly filtering upstream phase. if phaseFilter != "upstream" { clauses = append(clauses, "COALESCE(status_code, 0) >= 400") } if filter.StartTime != nil && !filter.StartTime.IsZero() { args = append(args, filter.StartTime.UTC()) - clauses = append(clauses, "created_at >= $"+itoa(len(args))) + clauses = append(clauses, "e.created_at >= $"+itoa(len(args))) } if filter.EndTime != nil && !filter.EndTime.IsZero() { args = append(args, filter.EndTime.UTC()) // Keep time-window semantics consistent with other ops queries: [start, end) - clauses = append(clauses, "created_at < $"+itoa(len(args))) + clauses = append(clauses, "e.created_at < $"+itoa(len(args))) } if p := strings.TrimSpace(filter.Platform); p != "" { args = append(args, p) @@ -643,10 +976,59 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { args = append(args, phase) clauses = append(clauses, "error_phase = $"+itoa(len(args))) } + if filter != nil { + if owner := strings.TrimSpace(strings.ToLower(filter.Owner)); owner != "" { + args = append(args, owner) + clauses = append(clauses, "LOWER(COALESCE(error_owner,'')) = $"+itoa(len(args))) + } + if source := strings.TrimSpace(strings.ToLower(filter.Source)); source != "" { + args = append(args, source) + clauses = append(clauses, "LOWER(COALESCE(error_source,'')) = $"+itoa(len(args))) + } + } + if resolvedFilter != nil { + args = append(args, *resolvedFilter) + clauses = append(clauses, "COALESCE(resolved,false) = $"+itoa(len(args))) + } + + // View filter: errors vs excluded vs all. 
+ // Excluded = upstream 429/529 and business-limited (quota/concurrency/billing) errors. + view := "" + if filter != nil { + view = strings.ToLower(strings.TrimSpace(filter.View)) + } + switch view { + case "", "errors": + clauses = append(clauses, "COALESCE(is_business_limited,false) = false") + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)") + case "excluded": + clauses = append(clauses, "(COALESCE(is_business_limited,false) = true OR COALESCE(upstream_status_code, status_code, 0) IN (429, 529))") + case "all": + // no-op + default: + // treat unknown as default 'errors' + clauses = append(clauses, "COALESCE(is_business_limited,false) = false") + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)") + } if len(filter.StatusCodes) > 0 { args = append(args, pq.Array(filter.StatusCodes)) clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") + } else if filter.StatusCodesOther { + // "Other" means: status codes not in the common list. + known := []int{400, 401, 403, 404, 409, 422, 429, 500, 502, 503, 504, 529} + args = append(args, pq.Array(known)) + clauses = append(clauses, "NOT (COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+"))") } + // Exact correlation keys (preferred for request↔upstream linkage). 
+ if rid := strings.TrimSpace(filter.RequestID); rid != "" { + args = append(args, rid) + clauses = append(clauses, "COALESCE(request_id,'') = $"+itoa(len(args))) + } + if crid := strings.TrimSpace(filter.ClientRequestID); crid != "" { + args = append(args, crid) + clauses = append(clauses, "COALESCE(client_request_id,'') = $"+itoa(len(args))) + } + if q := strings.TrimSpace(filter.Query); q != "" { like := "%" + q + "%" args = append(args, like) @@ -654,6 +1036,13 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")") } + if userQuery := strings.TrimSpace(filter.UserQuery); userQuery != "" { + like := "%" + userQuery + "%" + args = append(args, like) + n := itoa(len(args)) + clauses = append(clauses, "u.email ILIKE $"+n) + } + return "WHERE " + strings.Join(clauses, " AND "), args } diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go index f601c363..bd98b7e4 100644 --- a/backend/internal/repository/ops_repo_alerts.go +++ b/backend/internal/repository/ops_repo_alerts.go @@ -354,7 +354,7 @@ SELECT created_at FROM ops_alert_events ` + where + ` -ORDER BY fired_at DESC +ORDER BY fired_at DESC, id DESC LIMIT ` + limitArg rows, err := r.db.QueryContext(ctx, q, args...) 
@@ -413,6 +413,43 @@ LIMIT ` + limitArg return out, nil } +func (r *opsRepository) GetAlertEventByID(ctx context.Context, eventID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return nil, fmt.Errorf("invalid event id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE id = $1` + + row := r.db.QueryRowContext(ctx, q, eventID) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { if r == nil || r.db == nil { return nil, fmt.Errorf("nil ops repository") @@ -591,6 +628,121 @@ type opsAlertEventRow interface { Scan(dest ...any) error } +func (r *opsRepository) CreateAlertSilence(ctx context.Context, input *service.OpsAlertSilence) (*service.OpsAlertSilence, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + if input.RuleID <= 0 { + return nil, fmt.Errorf("invalid rule_id") + } + platform := strings.TrimSpace(input.Platform) + if platform == "" { + return nil, fmt.Errorf("invalid platform") + } + if input.Until.IsZero() { + return nil, fmt.Errorf("invalid until") + } + + q := ` +INSERT INTO ops_alert_silences ( + rule_id, + platform, + group_id, + region, + until, + reason, + created_by, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,NOW() +) +RETURNING id, rule_id, platform, group_id, region, until, COALESCE(reason,''), created_by, created_at` + + row := r.db.QueryRowContext( + ctx, + q, + input.RuleID, + platform, + 
opsNullInt64(input.GroupID), + opsNullString(input.Region), + input.Until, + opsNullString(input.Reason), + opsNullInt64(input.CreatedBy), + ) + + var out service.OpsAlertSilence + var groupID sql.NullInt64 + var region sql.NullString + var createdBy sql.NullInt64 + if err := row.Scan( + &out.ID, + &out.RuleID, + &out.Platform, + &groupID, + &region, + &out.Until, + &out.Reason, + &createdBy, + &out.CreatedAt, + ); err != nil { + return nil, err + } + if groupID.Valid { + v := groupID.Int64 + out.GroupID = &v + } + if region.Valid { + v := strings.TrimSpace(region.String) + if v != "" { + out.Region = &v + } + } + if createdBy.Valid { + v := createdBy.Int64 + out.CreatedBy = &v + } + return &out, nil +} + +func (r *opsRepository) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) { + if r == nil || r.db == nil { + return false, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return false, fmt.Errorf("invalid rule id") + } + platform = strings.TrimSpace(platform) + if platform == "" { + return false, nil + } + if now.IsZero() { + now = time.Now().UTC() + } + + q := ` +SELECT 1 +FROM ops_alert_silences +WHERE rule_id = $1 + AND platform = $2 + AND (group_id IS NOT DISTINCT FROM $3) + AND (region IS NOT DISTINCT FROM $4) + AND until > $5 +LIMIT 1` + + var dummy int + err := r.db.QueryRowContext(ctx, q, ruleID, platform, opsNullInt64(groupID), opsNullString(region), now).Scan(&dummy) + if err != nil { + if err == sql.ErrNoRows { + return false, nil + } + return false, err + } + return true, nil +} + func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) { var ev service.OpsAlertEvent var metricValue sql.NullFloat64 @@ -652,6 +804,10 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an args = append(args, severity) clauses = append(clauses, "severity = $"+itoa(len(args))) } + if filter.EmailSent != nil { + args = append(args,
*filter.EmailSent) + clauses = append(clauses, "email_sent = $"+itoa(len(args))) + } if filter.StartTime != nil && !filter.StartTime.IsZero() { args = append(args, *filter.StartTime) clauses = append(clauses, "fired_at >= $"+itoa(len(args))) @@ -661,6 +817,14 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an clauses = append(clauses, "fired_at < $"+itoa(len(args))) } + // Cursor pagination (descending by fired_at, then id) + if filter.BeforeFiredAt != nil && !filter.BeforeFiredAt.IsZero() && filter.BeforeID != nil && *filter.BeforeID > 0 { + args = append(args, *filter.BeforeFiredAt) + tsArg := "$" + itoa(len(args)) + args = append(args, *filter.BeforeID) + idArg := "$" + itoa(len(args)) + clauses = append(clauses, fmt.Sprintf("(fired_at < %s OR (fired_at = %s AND id < %s))", tsArg, tsArg, idArg)) + } // Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes. if platform := strings.TrimSpace(filter.Platform); platform != "" { args = append(args, platform) diff --git a/backend/internal/repository/scheduler_snapshot_outbox_integration_test.go b/backend/internal/repository/scheduler_snapshot_outbox_integration_test.go index dede6014..e442a125 100644 --- a/backend/internal/repository/scheduler_snapshot_outbox_integration_test.go +++ b/backend/internal/repository/scheduler_snapshot_outbox_integration_test.go @@ -27,7 +27,7 @@ func TestSchedulerSnapshotOutboxReplay(t *testing.T) { RunMode: config.RunModeStandard, Gateway: config.GatewayConfig{ Scheduling: config.GatewaySchedulingConfig{ - OutboxPollIntervalSeconds: 1, + OutboxPollIntervalSeconds: 1, FullRebuildIntervalSeconds: 0, DbFallbackEnabled: true, }, diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index f2ee05e0..ff05b32a 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -81,6 +81,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { 
ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + ops.GET("/alert-events/:id", h.Admin.Ops.GetAlertEvent) + ops.PUT("/alert-events/:id/status", h.Admin.Ops.UpdateAlertEventStatus) + ops.POST("/alert-silences", h.Admin.Ops.CreateAlertSilence) // Email notification config (DB-backed) ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) @@ -110,10 +113,26 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { ws.GET("/qps", h.Admin.Ops.QPSWSHandler) } - // Error logs (MVP-1) + // Error logs (legacy) ops.GET("/errors", h.Admin.Ops.GetErrorLogs) ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.GET("/errors/:id/retries", h.Admin.Ops.ListRetryAttempts) ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + ops.PUT("/errors/:id/resolve", h.Admin.Ops.UpdateErrorResolution) + + // Request errors (client-visible failures) + ops.GET("/request-errors", h.Admin.Ops.ListRequestErrors) + ops.GET("/request-errors/:id", h.Admin.Ops.GetRequestError) + ops.GET("/request-errors/:id/upstream-errors", h.Admin.Ops.ListRequestErrorUpstreamErrors) + ops.POST("/request-errors/:id/retry-client", h.Admin.Ops.RetryRequestErrorClient) + ops.POST("/request-errors/:id/upstream-errors/:idx/retry", h.Admin.Ops.RetryRequestErrorUpstreamEvent) + ops.PUT("/request-errors/:id/resolve", h.Admin.Ops.ResolveRequestError) + + // Upstream errors (independent upstream failures) + ops.GET("/upstream-errors", h.Admin.Ops.ListUpstreamErrors) + ops.GET("/upstream-errors/:id", h.Admin.Ops.GetUpstreamError) + ops.POST("/upstream-errors/:id/retry", h.Admin.Ops.RetryUpstreamError) + ops.PUT("/upstream-errors/:id/resolve", h.Admin.Ops.ResolveUpstreamError) // Request drilldown (success + error) ops.GET("/requests", h.Admin.Ops.ListRequestDetails) diff --git a/backend/internal/service/admin_service_bulk_update_test.go 
b/backend/internal/service/admin_service_bulk_update_test.go index ef621213..662b95fb 100644 --- a/backend/internal/service/admin_service_bulk_update_test.go +++ b/backend/internal/service/admin_service_bulk_update_test.go @@ -12,9 +12,9 @@ import ( type accountRepoStubForBulkUpdate struct { accountRepoStub - bulkUpdateErr error - bulkUpdateIDs []int64 - bindGroupErrByID map[int64]error + bulkUpdateErr error + bulkUpdateIDs []int64 + bindGroupErrByID map[int64]error } func (s *accountRepoStubForBulkUpdate) BulkUpdate(_ context.Context, ids []int64, _ AccountBulkUpdate) (int64, error) { diff --git a/backend/internal/service/antigravity_gateway_service.go b/backend/internal/service/antigravity_gateway_service.go index 60567434..7f3e97a2 100644 --- a/backend/internal/service/antigravity_gateway_service.go +++ b/backend/internal/service/antigravity_gateway_service.go @@ -564,6 +564,10 @@ urlFallbackLoop: } upstreamReq, err := antigravity.NewAPIRequestWithURL(ctx, baseURL, action, accessToken, geminiBody) + // Capture upstream request body for ops retry of this attempt. 
+ if c != nil { + c.Set(OpsUpstreamRequestBodyKey, string(geminiBody)) + } if err != nil { return nil, err } @@ -574,6 +578,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -615,6 +620,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -645,6 +651,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -697,6 +704,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "signature_error", @@ -740,6 +748,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "signature_retry_request_error", Message: sanitizeUpstreamErrorMessage(retryErr.Error()), @@ -770,6 +779,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: retryResp.StatusCode, UpstreamRequestID: retryResp.Header.Get("x-request-id"), Kind: kind, @@ -817,6 +827,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover", @@ 
-1371,6 +1382,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -1412,6 +1424,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -1442,6 +1455,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -1543,6 +1557,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1559,6 +1574,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "http_error", @@ -2039,6 +2055,7 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, accou appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: upstreamStatus, UpstreamRequestID: upstreamRequestID, Kind: "http_error", diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index afd49a6e..b552f030 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1466,6 +1466,9 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A for attempt := 1; attempt <= maxRetryAttempts; 
attempt++ { // 构建上游请求(每次重试需要重新构建,因为请求体需要重新读取) upstreamReq, err := s.buildUpstreamRequest(ctx, c, account, body, token, tokenType, reqModel) + // Capture upstream request body for ops retry of this attempt. + c.Set(OpsUpstreamRequestBodyKey, string(body)) + if err != nil { return nil, err } @@ -1482,6 +1485,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -1506,6 +1510,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "signature_error", @@ -1557,6 +1562,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: retryResp.StatusCode, UpstreamRequestID: retryResp.Header.Get("x-request-id"), Kind: "signature_retry_thinking", @@ -1585,6 +1591,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "signature_retry_tools_request_error", Message: sanitizeUpstreamErrorMessage(retryErr2.Error()), @@ -1643,6 +1650,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ 
-1691,6 +1699,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry_exhausted_failover", @@ -1757,6 +1766,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover_on_400", diff --git a/backend/internal/service/gemini_messages_compat_service.go b/backend/internal/service/gemini_messages_compat_service.go index 190e6afc..75de90f2 100644 --- a/backend/internal/service/gemini_messages_compat_service.go +++ b/backend/internal/service/gemini_messages_compat_service.go @@ -545,12 +545,19 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex } requestIDHeader = idHeader + // Capture upstream request body for ops retry of this attempt. + if c != nil { + // In this code path `body` is already the JSON sent to upstream. 
+ c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { safeErr := sanitizeUpstreamErrorMessage(err.Error()) appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -588,6 +595,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "signature_error", @@ -662,6 +670,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "retry", @@ -711,6 +720,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "failover", @@ -737,6 +747,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "failover", @@ -972,12 +983,19 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. } requestIDHeader = idHeader + // Capture upstream request body for ops retry of this attempt. + if c != nil { + // In this code path `body` is already the JSON sent to upstream. 
+ c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { safeErr := sanitizeUpstreamErrorMessage(err.Error()) appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -1036,6 +1054,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "retry", @@ -1120,6 +1139,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1143,6 +1163,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1168,6 +1189,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "http_error", @@ -1300,6 +1322,7 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, acc appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: upstreamStatus, UpstreamRequestID: upstreamRequestID, Kind: "http_error", diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 9b3e7603..cfba6460 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -664,6 +664,11 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco proxyURL = account.Proxy.URL() } + // Capture upstream request body for ops retry of this attempt. 
+ if c != nil { + c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + // Send request resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { @@ -673,6 +678,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -707,6 +713,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover", @@ -864,6 +871,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "http_error", @@ -894,6 +902,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: kind, diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go index f376c246..2b619f4d 100644 --- a/backend/internal/service/ops_alert_evaluator_service.go +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -206,7 +206,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { continue } - scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters) + scopePlatform, scopeGroupID, scopeRegion := 
parseOpsAlertRuleScope(rule.Filters) windowMinutes := rule.WindowMinutes if windowMinutes <= 0 { @@ -236,6 +236,17 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { continue } + // Scoped silencing: if a matching silence exists, skip creating a firing event. + if s.opsService != nil { + platform := strings.TrimSpace(scopePlatform) + region := scopeRegion + if platform != "" { + if ok, err := s.opsService.IsAlertSilenced(ctx, rule.ID, platform, scopeGroupID, region, now); err == nil && ok { + continue + } + } + } + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) if err != nil { log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) @@ -359,9 +370,9 @@ func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int return required } -func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) { +func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64, region *string) { if filters == nil { - return "", nil + return "", nil, nil } if v, ok := filters["platform"]; ok { if s, ok := v.(string); ok { @@ -392,7 +403,15 @@ func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *i } } } - return platform, groupID + if v, ok := filters["region"]; ok { + if s, ok := v.(string); ok { + vv := strings.TrimSpace(s) + if vv != "" { + region = &vv + } + } + } + return platform, groupID, region } func (s *OpsAlertEvaluatorService) computeRuleMetric( @@ -504,16 +523,6 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric( return 0, false } return overview.UpstreamErrorRate * 100, true - case "p95_latency_ms": - if overview.Duration.P95 == nil { - return 0, false - } - return float64(*overview.Duration.P95), true - case "p99_latency_ms": - if overview.Duration.P99 == nil { - return 0, false - } - return float64(*overview.Duration.P99), true default: return 0, false } diff --git 
a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go index 0acf13ab..a0caa990 100644 --- a/backend/internal/service/ops_alert_models.go +++ b/backend/internal/service/ops_alert_models.go @@ -8,8 +8,9 @@ import "time" // with the existing ops dashboard frontend (backup style). const ( - OpsAlertStatusFiring = "firing" - OpsAlertStatusResolved = "resolved" + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" + OpsAlertStatusManualResolved = "manual_resolved" ) type OpsAlertRule struct { @@ -58,12 +59,32 @@ type OpsAlertEvent struct { CreatedAt time.Time `json:"created_at"` } +type OpsAlertSilence struct { + ID int64 `json:"id"` + + RuleID int64 `json:"rule_id"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id,omitempty"` + Region *string `json:"region,omitempty"` + + Until time.Time `json:"until"` + Reason string `json:"reason"` + + CreatedBy *int64 `json:"created_by,omitempty"` + CreatedAt time.Time `json:"created_at"` +} + type OpsAlertEventFilter struct { Limit int + // Cursor pagination (descending by fired_at, then id). + BeforeFiredAt *time.Time + BeforeID *int64 + // Optional filters. 
- Status string - Severity string + Status string + Severity string + EmailSent *bool StartTime *time.Time EndTime *time.Time diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go index b6c3d1c3..b4c09824 100644 --- a/backend/internal/service/ops_alerts.go +++ b/backend/internal/service/ops_alerts.go @@ -88,6 +88,29 @@ func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventF return s.opsRepo.ListAlertEvents(ctx, filter) } +func (s *OpsService) GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return nil, infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + ev, err := s.opsRepo.GetAlertEventByID(ctx, eventID) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found") + } + return nil, err + } + if ev == nil { + return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found") + } + return ev, nil +} + func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -101,6 +124,49 @@ func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*Op return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) } +func (s *OpsService) CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if input == nil { + return nil, infraerrors.BadRequest("INVALID_SILENCE", "invalid 
silence") + } + if input.RuleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if strings.TrimSpace(input.Platform) == "" { + return nil, infraerrors.BadRequest("INVALID_PLATFORM", "invalid platform") + } + if input.Until.IsZero() { + return nil, infraerrors.BadRequest("INVALID_UNTIL", "invalid until") + } + + created, err := s.opsRepo.CreateAlertSilence(ctx, input) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return false, err + } + if s.opsRepo == nil { + return false, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return false, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if strings.TrimSpace(platform) == "" { + return false, nil + } + return s.opsRepo.IsAlertSilenced(ctx, ruleID, platform, groupID, region, now) +} + func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -142,7 +208,11 @@ func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, if eventID <= 0 { return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") } - if strings.TrimSpace(status) == "" { + status = strings.TrimSpace(status) + if status == "" { + return infraerrors.BadRequest("INVALID_STATUS", "invalid status") + } + if status != OpsAlertStatusResolved && status != OpsAlertStatusManualResolved { return infraerrors.BadRequest("INVALID_STATUS", "invalid status") } return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go index feb0d843..5efae870 100644 --- 
a/backend/internal/service/ops_health_score.go +++ b/backend/internal/service/ops_health_score.go @@ -32,49 +32,38 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) } // computeBusinessHealth calculates business health score (0-100) -// Components: SLA (50%) + Error Rate (30%) + Latency (20%) +// Components: Error Rate (50%) + TTFT (50%) func computeBusinessHealth(overview *OpsDashboardOverview) float64 { - // SLA score: 99.5% → 100, 95% → 0 (linear) - slaScore := 100.0 - slaPct := clampFloat64(overview.SLA*100, 0, 100) - if slaPct < 99.5 { - if slaPct >= 95 { - slaScore = (slaPct - 95) / 4.5 * 100 - } else { - slaScore = 0 - } - } - - // Error rate score: 0.5% → 100, 5% → 0 (linear) + // Error rate score: 1% → 100, 10% → 0 (linear) // Combines request errors and upstream errors errorScore := 100.0 errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case - if combinedErrorPct > 0.5 { - if combinedErrorPct <= 5 { - errorScore = (5 - combinedErrorPct) / 4.5 * 100 + if combinedErrorPct > 1.0 { + if combinedErrorPct <= 10.0 { + errorScore = (10.0 - combinedErrorPct) / 9.0 * 100 } else { errorScore = 0 } } - // Latency score: 1s → 100, 10s → 0 (linear) - // Uses P99 of duration (TTFT is less critical for overall health) - latencyScore := 100.0 - if overview.Duration.P99 != nil { - p99 := float64(*overview.Duration.P99) + // TTFT score: 1s → 100, 3s → 0 (linear) + // Time to first token is critical for user experience + ttftScore := 100.0 + if overview.TTFT.P99 != nil { + p99 := float64(*overview.TTFT.P99) if p99 > 1000 { - if p99 <= 10000 { - latencyScore = (10000 - p99) / 9000 * 100 + if p99 <= 3000 { + ttftScore = (3000 - p99) / 2000 * 100 } else { - latencyScore = 0 + ttftScore = 0 } } } - // Weighted combination - return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2 + // Weighted combination: 
50% error rate + 50% TTFT + return errorScore*0.5 + ttftScore*0.5 } // computeInfraHealth calculates infrastructure health score (0-100) diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go index 849ba146..25bfb43d 100644 --- a/backend/internal/service/ops_health_score_test.go +++ b/backend/internal/service/ops_health_score_test.go @@ -127,8 +127,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { MemoryUsagePercent: float64Ptr(75), }, }, - wantMin: 60, - wantMax: 85, + wantMin: 96, + wantMax: 97, }, { name: "DB failure", @@ -203,8 +203,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) { MemoryUsagePercent: float64Ptr(30), }, }, - wantMin: 25, - wantMax: 50, + wantMin: 84, + wantMax: 85, }, { name: "combined failures - business healthy + infra degraded", @@ -277,30 +277,41 @@ func TestComputeBusinessHealth(t *testing.T) { UpstreamErrorRate: 0, Duration: OpsPercentiles{P99: intPtr(500)}, }, - wantMin: 50, - wantMax: 60, + wantMin: 100, + wantMax: 100, }, { - name: "error rate boundary 0.5%", + name: "error rate boundary 1%", overview: &OpsDashboardOverview{ - SLA: 0.995, - ErrorRate: 0.005, + SLA: 0.99, + ErrorRate: 0.01, UpstreamErrorRate: 0, Duration: OpsPercentiles{P99: intPtr(500)}, }, - wantMin: 95, + wantMin: 100, wantMax: 100, }, { - name: "latency boundary 1000ms", + name: "error rate 5%", overview: &OpsDashboardOverview{ - SLA: 0.995, + SLA: 0.95, + ErrorRate: 0.05, + UpstreamErrorRate: 0, + Duration: OpsPercentiles{P99: intPtr(500)}, + }, + wantMin: 77, + wantMax: 78, + }, + { + name: "TTFT boundary 2s", + overview: &OpsDashboardOverview{ + SLA: 0.99, ErrorRate: 0, UpstreamErrorRate: 0, - Duration: OpsPercentiles{P99: intPtr(1000)}, + TTFT: OpsPercentiles{P99: intPtr(2000)}, }, - wantMin: 95, - wantMax: 100, + wantMin: 75, + wantMax: 75, }, { name: "upstream error dominates", @@ -310,7 +321,7 @@ func TestComputeBusinessHealth(t *testing.T) { 
UpstreamErrorRate: 0.03, Duration: OpsPercentiles{P99: intPtr(500)}, }, - wantMin: 75, + wantMin: 88, wantMax: 90, }, } diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index 996267fd..347cd52b 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -6,24 +6,43 @@ type OpsErrorLog struct { ID int64 `json:"id"` CreatedAt time.Time `json:"created_at"` - Phase string `json:"phase"` - Type string `json:"type"` + // Standardized classification + // - phase: request|auth|routing|upstream|network|internal + // - owner: client|provider|platform + // - source: client_request|upstream_http|gateway + Phase string `json:"phase"` + Type string `json:"type"` + + Owner string `json:"error_owner"` + Source string `json:"error_source"` + Severity string `json:"severity"` StatusCode int `json:"status_code"` Platform string `json:"platform"` Model string `json:"model"` - LatencyMs *int `json:"latency_ms"` + IsRetryable bool `json:"is_retryable"` + RetryCount int `json:"retry_count"` + + Resolved bool `json:"resolved"` + ResolvedAt *time.Time `json:"resolved_at"` + ResolvedByUserID *int64 `json:"resolved_by_user_id"` + ResolvedByUserName string `json:"resolved_by_user_name"` + ResolvedRetryID *int64 `json:"resolved_retry_id"` + ResolvedStatusRaw string `json:"-"` ClientRequestID string `json:"client_request_id"` RequestID string `json:"request_id"` Message string `json:"message"` - UserID *int64 `json:"user_id"` - APIKeyID *int64 `json:"api_key_id"` - AccountID *int64 `json:"account_id"` - GroupID *int64 `json:"group_id"` + UserID *int64 `json:"user_id"` + UserEmail string `json:"user_email"` + APIKeyID *int64 `json:"api_key_id"` + AccountID *int64 `json:"account_id"` + AccountName string `json:"account_name"` + GroupID *int64 `json:"group_id"` + GroupName string `json:"group_name"` ClientIP *string `json:"client_ip"` RequestPath string `json:"request_path"` @@ -67,9 +86,24 @@ type 
OpsErrorLogFilter struct { GroupID *int64 AccountID *int64 - StatusCodes []int - Phase string - Query string + StatusCodes []int + StatusCodesOther bool + Phase string + Owner string + Source string + Resolved *bool + Query string + UserQuery string // Search by user email + + // Optional correlation keys for exact matching. + RequestID string + ClientRequestID string + + // View controls error categorization for list endpoints. + // - errors: show actionable errors (exclude business-limited / 429 / 529) + // - excluded: only show excluded errors + // - all: show everything + View string Page int PageSize int @@ -90,12 +124,23 @@ type OpsRetryAttempt struct { SourceErrorID int64 `json:"source_error_id"` Mode string `json:"mode"` PinnedAccountID *int64 `json:"pinned_account_id"` + PinnedAccountName string `json:"pinned_account_name"` Status string `json:"status"` StartedAt *time.Time `json:"started_at"` FinishedAt *time.Time `json:"finished_at"` DurationMs *int64 `json:"duration_ms"` + // Persisted execution results (best-effort) + Success *bool `json:"success"` + HTTPStatusCode *int `json:"http_status_code"` + UpstreamRequestID *string `json:"upstream_request_id"` + UsedAccountID *int64 `json:"used_account_id"` + UsedAccountName string `json:"used_account_name"` + ResponsePreview *string `json:"response_preview"` + ResponseTruncated *bool `json:"response_truncated"` + + // Optional correlation ResultRequestID *string `json:"result_request_id"` ResultErrorID *int64 `json:"result_error_id"` diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 4df21c37..cdeea241 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -14,6 +14,8 @@ type OpsRepository interface { InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error GetLatestRetryAttemptForError(ctx context.Context, 
sourceErrorID int64) (*OpsRetryAttempt, error) + ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*OpsRetryAttempt, error) + UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error // Lightweight window stats (for realtime WS / quick sampling). GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) @@ -39,12 +41,17 @@ type OpsRepository interface { DeleteAlertRule(ctx context.Context, id int64) error ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + // Alert silences + CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) + IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error @@ -91,7 +98,6 @@ type OpsInsertErrorLogInput struct { // It is set by OpsService.RecordError before persisting. 
UpstreamErrorsJSON *string - DurationMs *int TimeToFirstTokenMs *int64 RequestBodyJSON *string // sanitized json string (not raw bytes) @@ -124,7 +130,15 @@ type OpsUpdateRetryAttemptInput struct { FinishedAt time.Time DurationMs int64 - // Optional correlation + // Persisted execution results (best-effort) + Success *bool + HTTPStatusCode *int + UpstreamRequestID *string + UsedAccountID *int64 + ResponsePreview *string + ResponseTruncated *bool + + // Optional correlation (legacy fields kept) ResultRequestID *string ResultErrorID *int64 diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go index 747aa3b8..25c10af6 100644 --- a/backend/internal/service/ops_retry.go +++ b/backend/internal/service/ops_retry.go @@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool { return w.totalWritten > int64(w.limit) } +const ( + OpsRetryModeUpstreamEvent = "upstream_event" +) + func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") } + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, 
infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog) +} + +// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors. +// idx is 0-based. It always pins the original event account_id. +func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if idx < 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx") + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + + events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors) + if err != nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors") + } + if idx >= len(events) { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range") + } + ev := events[idx] + if ev == nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing") + } + if ev.AccountID <= 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") + } + + upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody) + if upstreamBody == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry") + } + + override := *errorLog + override.RequestBody = upstreamBody + pinned := ev.AccountID + + // Persist as upstream_event, 
execute as upstream pinned retry. + return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override) +} + +func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) { latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) if err != nil && !errors.Is(err, sql.ErrNoRows) { return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) @@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er } } - errorLog, err := s.GetErrorLogByID(ctx, errorID) - if err != nil { - return nil, err - } - if strings.TrimSpace(errorLog.RequestBody) == "" { + if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" { return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") } var pinned *int64 - if mode == OpsRetryModeUpstream { + if execMode == OpsRetryModeUpstream { if pinnedAccountID != nil && *pinnedAccountID > 0 { pinned = pinnedAccountID } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { pinned = errorLog.AccountID } else { - return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") } } @@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) defer cancel() - execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + execRes := s.executeRetry(execCtx, errorLog, execMode, pinned) finishedAt := time.Now() result.FinishedAt = finishedAt @@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, 
requestedByUserID int64, er msg := result.ErrorMessage updateErrMsg = &msg } + // Keep legacy result_request_id empty; use upstream_request_id instead. var resultRequestID *string - if strings.TrimSpace(result.UpstreamRequestID) != "" { - v := result.UpstreamRequestID - resultRequestID = &v - } finalStatus := result.Status if strings.TrimSpace(finalStatus) == "" { finalStatus = opsRetryStatusFailed } + success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded) + httpStatus := result.HTTPStatusCode + upstreamReqID := result.UpstreamRequestID + usedAccountID := result.UsedAccountID + preview := result.ResponsePreview + truncated := result.ResponseTruncated + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ - ID: attemptID, - Status: finalStatus, - FinishedAt: finishedAt, - DurationMs: result.DurationMs, - ResultRequestID: resultRequestID, - ErrorMessage: updateErrMsg, + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + Success: &success, + HTTPStatusCode: &httpStatus, + UpstreamRequestID: &upstreamReqID, + UsedAccountID: usedAccountID, + ResponsePreview: &preview, + ResponseTruncated: &truncated, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, }); err != nil { - // Best-effort: retry itself already executed; do not fail the API response. 
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } else if success { + if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { + log.Printf("[Ops] UpdateErrorResolution failed: %v", err) + } } return result, nil diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index 426d46f1..abb8ae12 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn out.Detail = "" } + out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody) + if out.UpstreamRequestBody != "" { + // Reuse the same sanitization/trimming strategy as request body storage. + // Keep it small so it is safe to persist in ops_error_logs JSON. + sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024) + if sanitized != "" { + out.UpstreamRequestBody = sanitized + if truncated { + out.Kind = strings.TrimSpace(out.Kind) + if out.Kind == "" { + out.Kind = "upstream" + } + out.Kind = out.Kind + ":request_body_truncated" + } + } else { + out.UpstreamRequestBody = "" + } + } + // Drop fully-empty events (can happen if only status code was known). 
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { continue @@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter if s.opsRepo == nil { return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil } - return s.opsRepo.ListErrorLogs(ctx, filter) + result, err := s.opsRepo.ListErrorLogs(ctx, filter) + if err != nil { + log.Printf("[Ops] GetErrorLogs failed: %v", err) + return nil, err + } + + return result, nil } func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { @@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo return detail, nil } +func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if errorID <= 0 { + return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id") + } + items, err := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return []*OpsRetryAttempt{}, nil + } + return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(err) + } + return items, nil +} + +func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if errorID <= 0 { + return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id") + } + // Best-effort ensure the error exists + if _, err := 
s.opsRepo.GetErrorLogByID(ctx, errorID); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil) +} + func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { bytesLen = len(raw) if len(raw) == 0 { @@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr } } - // Last resort: store a minimal placeholder (still valid JSON). - placeholder := map[string]any{ - "request_body_truncated": true, + // Last resort: keep JSON shape but drop big fields. + // This avoids downstream code that expects certain top-level keys from crashing. + if root, ok := decoded.(map[string]any); ok { + placeholder := shallowCopyMap(root) + placeholder["request_body_truncated"] = true + + // Replace potentially huge arrays/strings, but keep the keys present. + for _, k := range []string{"messages", "contents", "input", "prompt"} { + if _, exists := placeholder[k]; exists { + placeholder[k] = []any{} + } + } + for _, k := range []string{"text"} { + if _, exists := placeholder[k]; exists { + placeholder[k] = "" + } + } + + encoded4, err4 := json.Marshal(placeholder) + if err4 == nil { + if len(encoded4) <= maxBytes { + return string(encoded4), true, bytesLen + } + } } - if model := extractString(decoded, "model"); model != "" { - placeholder["model"] = model - } - encoded4, err4 := json.Marshal(placeholder) + + // Final fallback: minimal valid JSON. 
+ encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true}) if err4 != nil { return "", true, bytesLen } @@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr } return raw, false } - -func extractString(v any, key string) string { - root, ok := v.(map[string]any) - if !ok { - return "" - } - s, _ := root[key].(string) - return strings.TrimSpace(s) -} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go index 53c78fed..a6a4a0d7 100644 --- a/backend/internal/service/ops_settings.go +++ b/backend/internal/service/ops_settings.go @@ -368,9 +368,11 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings { Aggregation: OpsAggregationSettings{ AggregationEnabled: false, }, - IgnoreCountTokensErrors: false, - AutoRefreshEnabled: false, - AutoRefreshIntervalSec: 30, + IgnoreCountTokensErrors: false, + IgnoreContextCanceled: true, // Default to true - client disconnects are not errors + IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue + AutoRefreshEnabled: false, + AutoRefreshIntervalSec: 30, } } @@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds" func defaultOpsMetricThresholds() *OpsMetricThresholds { slaMin := 99.5 - latencyMax := 2000.0 ttftMax := 500.0 reqErrMax := 5.0 upstreamErrMax := 5.0 return &OpsMetricThresholds{ SLAPercentMin: &slaMin, - LatencyP99MsMax: &latencyMax, TTFTp99MsMax: &ttftMax, RequestErrorRatePercentMax: &reqErrMax, UpstreamErrorRatePercentMax: &upstreamErrMax, @@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) { return nil, errors.New("sla_percent_min must be between 0 and 100") } - if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 { - return nil, errors.New("latency_p99_ms_max must be >= 0") - } if cfg.TTFTp99MsMax != nil && 
*cfg.TTFTp99MsMax < 0 { return nil, errors.New("ttft_p99_ms_max must be >= 0") } diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go index 229488a1..df06f578 100644 --- a/backend/internal/service/ops_settings_models.go +++ b/backend/internal/service/ops_settings_models.go @@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct { type OpsMetricThresholds struct { SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红 - LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // 延迟P99高于此值变红 TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红 RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红 UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红 @@ -79,11 +78,13 @@ type OpsAlertRuntimeSettings struct { // OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation). type OpsAdvancedSettings struct { - DataRetention OpsDataRetentionSettings `json:"data_retention"` - Aggregation OpsAggregationSettings `json:"aggregation"` - IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"` - AutoRefreshEnabled bool `json:"auto_refresh_enabled"` - AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"` + DataRetention OpsDataRetentionSettings `json:"data_retention"` + Aggregation OpsAggregationSettings `json:"aggregation"` + IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"` + IgnoreContextCanceled bool `json:"ignore_context_canceled"` + IgnoreNoAvailableAccounts bool `json:"ignore_no_available_accounts"` + AutoRefreshEnabled bool `json:"auto_refresh_enabled"` + AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"` } type OpsDataRetentionSettings struct { diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go index 615ae6a1..96bcc9fe 100644 --- 
a/backend/internal/service/ops_upstream_context.go +++ b/backend/internal/service/ops_upstream_context.go @@ -15,6 +15,11 @@ const ( OpsUpstreamErrorMessageKey = "ops_upstream_error_message" OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" OpsUpstreamErrorsKey = "ops_upstream_errors" + + // Best-effort capture of the current upstream request body so ops can + // retry the specific upstream attempt (not just the client request). + // This value is sanitized+trimmed before being persisted. + OpsUpstreamRequestBodyKey = "ops_upstream_request_body" ) func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { @@ -38,13 +43,21 @@ type OpsUpstreamErrorEvent struct { AtUnixMs int64 `json:"at_unix_ms,omitempty"` // Context - Platform string `json:"platform,omitempty"` - AccountID int64 `json:"account_id,omitempty"` + Platform string `json:"platform,omitempty"` + AccountID int64 `json:"account_id,omitempty"` + AccountName string `json:"account_name,omitempty"` // Outcome UpstreamStatusCode int `json:"upstream_status_code,omitempty"` UpstreamRequestID string `json:"upstream_request_id,omitempty"` + // Best-effort upstream request capture (sanitized+trimmed). + // Required for retrying a specific upstream attempt. + UpstreamRequestBody string `json:"upstream_request_body,omitempty"` + + // Best-effort upstream response capture (sanitized+trimmed). 
+ UpstreamResponseBody string `json:"upstream_response_body,omitempty"` + // Kind: http_error | request_error | retry_exhausted | failover Kind string `json:"kind,omitempty"` @@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { } ev.Platform = strings.TrimSpace(ev.Platform) ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody) + ev.UpstreamResponseBody = strings.TrimSpace(ev.UpstreamResponseBody) ev.Kind = strings.TrimSpace(ev.Kind) ev.Message = strings.TrimSpace(ev.Message) ev.Detail = strings.TrimSpace(ev.Detail) @@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { ev.Message = sanitizeUpstreamErrorMessage(ev.Message) } + // If the caller didn't explicitly pass upstream request body but the gateway + // stored it on the context, attach it so ops can retry this specific attempt. + if ev.UpstreamRequestBody == "" { + if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok { + if s, ok := v.(string); ok { + ev.UpstreamRequestBody = strings.TrimSpace(s) + } + } + } + var existing []*OpsUpstreamErrorEvent if v, ok := c.Get(OpsUpstreamErrorsKey); ok { if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { @@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { s := string(raw) return &s } + +func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return []*OpsUpstreamErrorEvent{}, nil + } + var out []*OpsUpstreamErrorEvent + if err := json.Unmarshal([]byte(raw), &out); err != nil { + return nil, err + } + return out, nil +} diff --git a/backend/migrations/037_ops_alert_silences.sql b/backend/migrations/037_ops_alert_silences.sql new file mode 100644 index 00000000..95b61a09 --- /dev/null +++ b/backend/migrations/037_ops_alert_silences.sql @@ -0,0 +1,28 @@ +-- +goose Up +-- +goose StatementBegin +-- Ops alert silences: scoped 
(rule_id + platform + group_id + region) + +CREATE TABLE IF NOT EXISTS ops_alert_silences ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT NOT NULL, + platform VARCHAR(64) NOT NULL, + group_id BIGINT, + region VARCHAR(64), + + until TIMESTAMPTZ NOT NULL, + reason TEXT, + + created_by BIGINT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_silences_lookup + ON ops_alert_silences (rule_id, platform, group_id, region, until); + +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE IF EXISTS ops_alert_silences; +-- +goose StatementEnd diff --git a/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql b/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql new file mode 100644 index 00000000..adaacf1c --- /dev/null +++ b/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql @@ -0,0 +1,111 @@ +-- Add resolution tracking to ops_error_logs, persist retry results, and standardize error classification enums. +-- +-- This migration is intentionally idempotent. 
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: resolution fields +-- ============================================ + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved BOOLEAN NOT NULL DEFAULT false; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_by_user_id BIGINT; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_retry_id BIGINT; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_resolved_time + ON ops_error_logs (resolved, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_unresolved_time + ON ops_error_logs (created_at DESC) + WHERE resolved = false; + +-- ============================================ +-- 2) ops_retry_attempts: persist execution results +-- ============================================ + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS success BOOLEAN; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS http_status_code INT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS upstream_request_id VARCHAR(128); + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS used_account_id BIGINT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS response_preview TEXT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS response_truncated BOOLEAN NOT NULL DEFAULT false; + +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_success_time + ON ops_retry_attempts (success, created_at DESC); + +-- Backfill best-effort fields for existing rows. 
+UPDATE ops_retry_attempts +SET success = (LOWER(COALESCE(status, '')) = 'succeeded') +WHERE success IS NULL; + +UPDATE ops_retry_attempts +SET upstream_request_id = result_request_id +WHERE upstream_request_id IS NULL AND result_request_id IS NOT NULL; + +-- ============================================ +-- 3) Standardize classification enums in ops_error_logs +-- +-- New enums: +-- error_phase: request|auth|routing|upstream|network|internal +-- error_owner: client|provider|platform +-- error_source: client_request|upstream_http|gateway +-- ============================================ + +-- Owner: legacy sub2api => platform. +UPDATE ops_error_logs +SET error_owner = 'platform' +WHERE LOWER(COALESCE(error_owner, '')) = 'sub2api'; + +-- Owner: normalize empty/null to platform (best-effort). +UPDATE ops_error_logs +SET error_owner = 'platform' +WHERE COALESCE(TRIM(error_owner), '') = ''; + +-- Phase: map legacy phases. +UPDATE ops_error_logs +SET error_phase = CASE + WHEN COALESCE(TRIM(error_phase), '') = '' THEN 'internal' + WHEN LOWER(error_phase) IN ('billing', 'concurrency', 'response') THEN 'request' + WHEN LOWER(error_phase) IN ('scheduling') THEN 'routing' + WHEN LOWER(error_phase) IN ('request', 'auth', 'routing', 'upstream', 'network', 'internal') THEN LOWER(error_phase) + ELSE 'internal' +END; + +-- Source: map legacy sources. +UPDATE ops_error_logs +SET error_source = CASE + WHEN COALESCE(TRIM(error_source), '') = '' THEN 'gateway' + WHEN LOWER(error_source) IN ('billing', 'concurrency') THEN 'client_request' + WHEN LOWER(error_source) IN ('upstream_http') THEN 'upstream_http' + WHEN LOWER(error_source) IN ('upstream_network') THEN 'gateway' + WHEN LOWER(error_source) IN ('internal') THEN 'gateway' + WHEN LOWER(error_source) IN ('client_request', 'upstream_http', 'gateway') THEN LOWER(error_source) + ELSE 'gateway' +END; + +-- Auto-resolve recovered upstream errors (client status < 400). 
+UPDATE ops_error_logs +SET + resolved = true, + resolved_at = COALESCE(resolved_at, created_at) +WHERE resolved = false AND COALESCE(status_code, 0) > 0 AND COALESCE(status_code, 0) < 400; diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index ce0ab58d..63b12cfb 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -17,6 +17,47 @@ export interface OpsRequestOptions { export interface OpsRetryRequest { mode: OpsRetryMode pinned_account_id?: number + force?: boolean +} + +export interface OpsRetryAttempt { + id: number + created_at: string + requested_by_user_id: number + source_error_id: number + mode: string + pinned_account_id?: number | null + pinned_account_name?: string + + status: string + started_at?: string | null + finished_at?: string | null + duration_ms?: number | null + + success?: boolean | null + http_status_code?: number | null + upstream_request_id?: string | null + used_account_id?: number | null + used_account_name?: string + response_preview?: string | null + response_truncated?: boolean | null + + result_request_id?: string | null + result_error_id?: number | null + error_message?: string | null +} + +export type OpsUpstreamErrorEvent = { + at_unix_ms?: number + platform?: string + account_id?: number + account_name?: string + upstream_status_code?: number + upstream_request_id?: string + upstream_request_body?: string + kind?: string + message?: string + detail?: string } export interface OpsRetryResult { @@ -626,8 +667,6 @@ export type MetricType = | 'success_rate' | 'error_rate' | 'upstream_error_rate' - | 'p95_latency_ms' - | 'p99_latency_ms' | 'cpu_usage_percent' | 'memory_usage_percent' | 'concurrency_queue_depth' @@ -663,7 +702,7 @@ export interface AlertEvent { id: number rule_id: number severity: OpsSeverity | string - status: 'firing' | 'resolved' | string + status: 'firing' | 'resolved' | 'manual_resolved' | string title?: string description?: string metric_value?: number @@ 
-701,10 +740,9 @@ export interface EmailNotificationConfig { } export interface OpsMetricThresholds { - sla_percent_min?: number | null // SLA低于此值变红 - latency_p99_ms_max?: number | null // 延迟P99高于此值变红 - ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 - request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 + sla_percent_min?: number | null // SLA低于此值变红 + ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 + request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红 } @@ -735,6 +773,8 @@ export interface OpsAdvancedSettings { data_retention: OpsDataRetentionSettings aggregation: OpsAggregationSettings ignore_count_tokens_errors: boolean + ignore_context_canceled: boolean + ignore_no_available_accounts: boolean auto_refresh_enabled: boolean auto_refresh_interval_seconds: number } @@ -754,21 +794,37 @@ export interface OpsAggregationSettings { export interface OpsErrorLog { id: number created_at: string + + // Standardized classification phase: OpsPhase type: string + error_owner: 'client' | 'provider' | 'platform' | string + error_source: 'client_request' | 'upstream_http' | 'gateway' | string + severity: OpsSeverity status_code: number platform: string model: string - latency_ms?: number | null + + is_retryable: boolean + retry_count: number + + resolved: boolean + resolved_at?: string | null + resolved_by_user_id?: number | null + resolved_retry_id?: number | null + client_request_id: string request_id: string message: string user_id?: number | null + user_email: string api_key_id?: number | null account_id?: number | null + account_name: string group_id?: number | null + group_name: string client_ip?: string | null request_path?: string @@ -890,7 +946,9 @@ export async function getErrorDistribution( return data } -export async function listErrorLogs(params: { +export type OpsErrorListView = 'errors' | 'excluded' | 'all' + +export type OpsErrorListQueryParams = { page?: number page_size?: 
number time_range?: string @@ -899,10 +957,20 @@ export async function listErrorLogs(params: { platform?: string group_id?: number | null account_id?: number | null + phase?: string + error_owner?: string + error_source?: string + resolved?: string + view?: OpsErrorListView + q?: string status_codes?: string -}): Promise { + status_codes_other?: string +} + +// Legacy unified endpoints +export async function listErrorLogs(params: OpsErrorListQueryParams): Promise { const { data } = await apiClient.get('/admin/ops/errors', { params }) return data } @@ -917,6 +985,70 @@ export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promi return data } +export async function listRetryAttempts(errorId: number, limit = 50): Promise { + const { data } = await apiClient.get(`/admin/ops/errors/${errorId}/retries`, { params: { limit } }) + return data +} + +export async function updateErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/errors/${errorId}/resolve`, { resolved }) +} + +// New split endpoints +export async function listRequestErrors(params: OpsErrorListQueryParams): Promise { + const { data } = await apiClient.get('/admin/ops/request-errors', { params }) + return data +} + +export async function listUpstreamErrors(params: OpsErrorListQueryParams): Promise { + const { data } = await apiClient.get('/admin/ops/upstream-errors', { params }) + return data +} + +export async function getRequestErrorDetail(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/request-errors/${id}`) + return data +} + +export async function getUpstreamErrorDetail(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/upstream-errors/${id}`) + return data +} + +export async function retryRequestErrorClient(id: number): Promise { + const { data } = await apiClient.post(`/admin/ops/request-errors/${id}/retry-client`, {}) + return data +} + +export async function retryRequestErrorUpstreamEvent(id: 
number, idx: number): Promise { + const { data } = await apiClient.post(`/admin/ops/request-errors/${id}/upstream-errors/${idx}/retry`, {}) + return data +} + +export async function retryUpstreamError(id: number): Promise { + const { data } = await apiClient.post(`/admin/ops/upstream-errors/${id}/retry`, {}) + return data +} + +export async function updateRequestErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/request-errors/${errorId}/resolve`, { resolved }) +} + +export async function updateUpstreamErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/upstream-errors/${errorId}/resolve`, { resolved }) +} + +export async function listRequestErrorUpstreamErrors( + id: number, + params: OpsErrorListQueryParams = {}, + options: { include_detail?: boolean } = {} +): Promise> { + const query: Record = { ...params } + if (options.include_detail) query.include_detail = '1' + const { data } = await apiClient.get>(`/admin/ops/request-errors/${id}/upstream-errors`, { params: query }) + return data +} + export async function listRequestDetails(params: OpsRequestDetailsParams): Promise { const { data } = await apiClient.get('/admin/ops/requests', { params }) return data @@ -942,11 +1074,45 @@ export async function deleteAlertRule(id: number): Promise { await apiClient.delete(`/admin/ops/alert-rules/${id}`) } -export async function listAlertEvents(limit = 100): Promise { - const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } }) +export interface AlertEventsQuery { + limit?: number + status?: string + severity?: string + email_sent?: boolean + time_range?: string + start_time?: string + end_time?: string + before_fired_at?: string + before_id?: number + platform?: string + group_id?: number +} + +export async function listAlertEvents(params: AlertEventsQuery = {}): Promise { + const { data } = await apiClient.get('/admin/ops/alert-events', { params }) return 
data } +export async function getAlertEvent(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/alert-events/${id}`) + return data +} + +export async function updateAlertEventStatus(id: number, status: 'resolved' | 'manual_resolved'): Promise { + await apiClient.put(`/admin/ops/alert-events/${id}/status`, { status }) +} + +export async function createAlertSilence(payload: { + rule_id: number + platform: string + group_id?: number | null + region?: string | null + until: string + reason?: string +}): Promise { + await apiClient.post('/admin/ops/alert-silences', payload) +} + // Email notification config export async function getEmailNotificationConfig(): Promise { const { data } = await apiClient.get('/admin/ops/email-notification/config') @@ -1001,15 +1167,35 @@ export const opsAPI = { getAccountAvailabilityStats, getRealtimeTrafficSummary, subscribeQPS, + + // Legacy unified endpoints listErrorLogs, getErrorLogDetail, retryErrorRequest, + listRetryAttempts, + updateErrorResolved, + + // New split endpoints + listRequestErrors, + listUpstreamErrors, + getRequestErrorDetail, + getUpstreamErrorDetail, + retryRequestErrorClient, + retryRequestErrorUpstreamEvent, + retryUpstreamError, + updateRequestErrorResolved, + updateUpstreamErrorResolved, + listRequestErrorUpstreamErrors, + listRequestDetails, listAlertRules, createAlertRule, updateAlertRule, deleteAlertRule, listAlertEvents, + getAlertEvent, + updateAlertEventStatus, + createAlertSilence, getEmailNotificationConfig, updateEmailNotificationConfig, getAlertRuntimeSettings, diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 632882b8..09ebe8e1 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -129,6 +129,8 @@ export default { all: 'All', none: 'None', noData: 'No data', + expand: 'Expand', + collapse: 'Collapse', success: 'Success', error: 'Error', critical: 'Critical', @@ -150,12 +152,13 @@ export default { invalidEmail: 
'Please enter a valid email address', optional: 'optional', selectOption: 'Select an option', - searchPlaceholder: 'Search...', - noOptionsFound: 'No options found', - noGroupsAvailable: 'No groups available', - unknownError: 'Unknown error occurred', - saving: 'Saving...', - selectedCount: '({count} selected)', refresh: 'Refresh', + searchPlaceholder: 'Search...', + noOptionsFound: 'No options found', + noGroupsAvailable: 'No groups available', + unknownError: 'Unknown error occurred', + saving: 'Saving...', + selectedCount: '({count} selected)', + refresh: 'Refresh', settings: 'Settings', notAvailable: 'N/A', now: 'Now', @@ -1882,10 +1885,8 @@ export default { noSystemMetrics: 'No system metrics collected yet.', collectedAt: 'Collected at:', window: 'window', - cpu: 'CPU', memory: 'Memory', db: 'DB', - redis: 'Redis', goroutines: 'Goroutines', jobs: 'Jobs', jobsHelp: 'Click “Details” to view job heartbeats and recent errors', @@ -1911,7 +1912,7 @@ export default { totalRequests: 'Total Requests', avgQps: 'Avg QPS', avgTps: 'Avg TPS', - avgLatency: 'Avg Latency', + avgLatency: 'Avg Request Duration', avgTtft: 'Avg TTFT', exceptions: 'Exceptions', requestErrors: 'Request Errors', @@ -1923,7 +1924,7 @@ export default { errors: 'Errors', errorRate: 'error_rate:', upstreamRate: 'upstream_rate:', - latencyDuration: 'Latency (duration_ms)', + latencyDuration: 'Request Duration (ms)', ttftLabel: 'TTFT (first_token_ms)', p50: 'p50:', p90: 'p90:', @@ -1931,7 +1932,6 @@ export default { p99: 'p99:', avg: 'avg:', max: 'max:', - qps: 'QPS', requests: 'Requests', requestsTitle: 'Requests', upstream: 'Upstream', @@ -1943,7 +1943,7 @@ export default { failedToLoadData: 'Failed to load ops data.', failedToLoadOverview: 'Failed to load overview', failedToLoadThroughputTrend: 'Failed to load throughput trend', - failedToLoadLatencyHistogram: 'Failed to load latency histogram', + failedToLoadLatencyHistogram: 'Failed to load request duration histogram', failedToLoadErrorTrend: 
'Failed to load error trend', failedToLoadErrorDistribution: 'Failed to load error distribution', failedToLoadErrorDetail: 'Failed to load error detail', @@ -1951,7 +1951,7 @@ export default { tpsK: 'TPS (K)', top: 'Top:', throughputTrend: 'Throughput Trend', - latencyHistogram: 'Latency Histogram', + latencyHistogram: 'Request Duration Histogram', errorTrend: 'Error Trend', errorDistribution: 'Error Distribution', // Health Score & Diagnosis @@ -1966,7 +1966,9 @@ export default { '30m': 'Last 30 minutes', '1h': 'Last 1 hour', '6h': 'Last 6 hours', - '24h': 'Last 24 hours' + '24h': 'Last 24 hours', + '7d': 'Last 7 days', + '30d': 'Last 30 days' }, fullscreen: { enter: 'Enter Fullscreen' @@ -1995,14 +1997,7 @@ export default { memoryHigh: 'Memory usage elevated ({usage}%)', memoryHighImpact: 'Memory pressure is high, needs attention', memoryHighAction: 'Monitor memory trends, check for memory leaks', - // Latency diagnostics - latencyCritical: 'Response latency critically high ({latency}ms)', - latencyCriticalImpact: 'User experience extremely poor, many requests timing out', - latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services', - latencyHigh: 'Response latency elevated ({latency}ms)', - latencyHighImpact: 'User experience degraded, needs optimization', - latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic', - ttftHigh: 'Time to first byte elevated ({ttft}ms)', + ttftHigh: 'Time to first token elevated ({ttft}ms)', ttftHighImpact: 'User perceived latency increased', ttftHighAction: 'Optimize request processing flow, reduce pre-processing time', // Error rate diagnostics @@ -2038,27 +2033,106 @@ export default { // Error Log errorLog: { timeId: 'Time / ID', + commonErrors: { + contextDeadlineExceeded: 'context deadline exceeded', + connectionRefused: 'connection refused', + rateLimit: 'rate limit' + }, + time: 'Time', + type: 'Type', context: 'Context', + platform: 'Platform', 
+ model: 'Model', + group: 'Group', + user: 'User', + userId: 'User ID', + account: 'Account', + accountId: 'Account ID', status: 'Status', message: 'Message', - latency: 'Latency', + latency: 'Request Duration', action: 'Action', noErrors: 'No errors in this window.', grp: 'GRP:', acc: 'ACC:', details: 'Details', - phase: 'Phase' + phase: 'Phase', + id: 'ID:', + typeUpstream: 'Upstream', + typeRequest: 'Request', + typeAuth: 'Auth', + typeRouting: 'Routing', + typeInternal: 'Internal' }, // Error Details Modal errorDetails: { upstreamErrors: 'Upstream Errors', requestErrors: 'Request Errors', + unresolved: 'Unresolved', + resolved: 'Resolved', + viewErrors: 'Errors', + viewExcluded: 'Excluded', + statusCodeOther: 'Other', + owner: { + provider: 'Provider', + client: 'Client', + platform: 'Platform' + }, + phase: { + request: 'Request', + auth: 'Auth', + routing: 'Routing', + upstream: 'Upstream', + network: 'Network', + internal: 'Internal' + }, total: 'Total:', searchPlaceholder: 'Search request_id / client_request_id / message', - accountIdPlaceholder: 'account_id' }, // Error Detail Modal errorDetail: { + title: 'Error Detail', + titleWithId: 'Error #{id}', + noErrorSelected: 'No error selected.', + resolution: 'Resolved:', + pinnedToOriginalAccountId: 'Pinned to original account_id', + missingUpstreamRequestBody: 'Missing upstream request body', + failedToLoadRetryHistory: 'Failed to load retry history', + failedToUpdateResolvedStatus: 'Failed to update resolved status', + unsupportedRetryMode: 'Unsupported retry mode', + classificationKeys: { + phase: 'Phase', + owner: 'Owner', + source: 'Source', + retryable: 'Retryable', + resolvedAt: 'Resolved At', + resolvedBy: 'Resolved By', + resolvedRetryId: 'Resolved Retry', + retryCount: 'Retry Count' + }, + source: { + upstream_http: 'Upstream HTTP' + }, + upstreamKeys: { + status: 'Status', + message: 'Message', + detail: 'Detail', + upstreamErrors: 'Upstream Errors' + }, + upstreamEvent: { + account: 'Account', + 
status: 'Status', + requestId: 'Request ID' + }, + responsePreview: { + expand: 'Response (click to expand)', + collapse: 'Response (click to collapse)' + }, + retryMeta: { + used: 'Used', + success: 'Success', + pinned: 'Pinned' + }, loading: 'Loading…', requestId: 'Request ID', time: 'Time', @@ -2068,8 +2142,10 @@ export default { basicInfo: 'Basic Info', platform: 'Platform', model: 'Model', - latency: 'Latency', - ttft: 'TTFT', + group: 'Group', + user: 'User', + account: 'Account', + latency: 'Request Duration', businessLimited: 'Business Limited', requestPath: 'Request Path', timings: 'Timings', @@ -2077,6 +2153,8 @@ export default { routing: 'Routing', upstream: 'Upstream', response: 'Response', + classification: 'Classification', + notRetryable: 'Not recommended to retry', retry: 'Retry', retryClient: 'Retry (Client)', retryUpstream: 'Retry (Upstream pinned)', @@ -2088,7 +2166,6 @@ export default { confirmRetry: 'Confirm Retry', retrySuccess: 'Retry succeeded', retryFailed: 'Retry failed', - na: 'N/A', retryHint: 'Retry will resend the request with the same parameters', retryClientHint: 'Use client retry (no account pinning)', retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)', @@ -2096,8 +2173,33 @@ export default { retryNote1: 'Retry will use the same request body and parameters', retryNote2: 'If the original request failed due to account issues, pinned retry may still fail', retryNote3: 'Client retry will reselect an account', + retryNote4: 'You can force retry for non-retryable errors, but it is not recommended', confirmRetryMessage: 'Confirm retry this request?', - confirmRetryHint: 'Will resend with the same request parameters' + confirmRetryHint: 'Will resend with the same request parameters', + forceRetry: 'I understand and want to force retry', + forceRetryHint: 'This error usually cannot be fixed by retry; check to proceed', + forceRetryNeedAck: 'Please check to force retry', + markResolved: 'Mark resolved', + 
markUnresolved: 'Mark unresolved', + viewRetries: 'Retry history', + retryHistory: 'Retry History', + tabOverview: 'Overview', + tabRetries: 'Retries', + tabRequest: 'Request', + tabResponse: 'Response', + responseBody: 'Response', + compareA: 'Compare A', + compareB: 'Compare B', + retrySummary: 'Retry Summary', + responseHintSucceeded: 'Showing succeeded retry response_preview (#{id})', + responseHintFallback: 'No succeeded retry found; showing stored error_body', + suggestion: 'Suggestion', + suggestUpstreamResolved: '✓ Upstream error resolved by retry; no action needed', + suggestUpstream: 'Upstream instability: check account status, consider switching accounts, or retry', + suggestRequest: 'Client request error: ask customer to fix request parameters', + suggestAuth: 'Auth failed: verify API key/credentials', + suggestPlatform: 'Platform error: prioritize investigation and fix', + suggestGeneric: 'See details for more context' }, requestDetails: { title: 'Request Details', @@ -2133,13 +2235,46 @@ export default { loading: 'Loading...', empty: 'No alert events', loadFailed: 'Failed to load alert events', + status: { + firing: 'FIRING', + resolved: 'RESOLVED', + manualResolved: 'MANUAL RESOLVED' + }, + detail: { + title: 'Alert Detail', + loading: 'Loading detail...', + empty: 'No detail', + loadFailed: 'Failed to load alert detail', + manualResolve: 'Mark as Resolved', + manualResolvedSuccess: 'Marked as manually resolved', + manualResolvedFailed: 'Failed to mark as manually resolved', + silence: 'Ignore Alert', + silenceSuccess: 'Alert silenced', + silenceFailed: 'Failed to silence alert', + viewRule: 'View Rule', + viewLogs: 'View Logs', + firedAt: 'Fired At', + resolvedAt: 'Resolved At', + ruleId: 'Rule ID', + dimensions: 'Dimensions', + historyTitle: 'History', + historyHint: 'Recent events with same rule + dimensions', + historyLoading: 'Loading history...', + historyEmpty: 'No history' + }, table: { time: 'Time', status: 'Status', severity: 'Severity', + 
platform: 'Platform', + ruleId: 'Rule ID', title: 'Title', + duration: 'Duration', metric: 'Metric / Threshold', - email: 'Email Sent' + dimensions: 'Dimensions', + email: 'Email Sent', + emailSent: 'Sent', + emailIgnored: 'Ignored' } }, alertRules: { @@ -2253,7 +2388,6 @@ export default { title: 'Alert Silencing (Maintenance Mode)', enabled: 'Enable silencing', globalUntil: 'Silence until (RFC3339)', - untilPlaceholder: '2026-01-05T00:00:00Z', untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).', reason: 'Reason', reasonPlaceholder: 'e.g., planned maintenance', @@ -2293,7 +2427,11 @@ export default { lockKeyRequired: 'Distributed lock key is required when lock is enabled', lockKeyPrefix: 'Distributed lock key must start with "{prefix}"', lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts', - lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds' + lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds', + slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', + ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', + requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' } }, email: { @@ -2358,8 +2496,6 @@ export default { metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red', slaMinPercent: 'SLA Minimum Percentage', slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)', - latencyP99MaxMs: 'Latency P99 Maximum (ms)', - latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)', ttftP99MaxMs: 'TTFT P99 Maximum (ms)', ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)', requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)', @@ -2378,9 +2514,28 @@ export default { aggregation: 
'Pre-aggregation Tasks', enableAggregation: 'Enable Pre-aggregation', aggregationHint: 'Pre-aggregation improves query performance for long time windows', + errorFiltering: 'Error Filtering', + ignoreCountTokensErrors: 'Ignore count_tokens errors', + ignoreCountTokensErrorsHint: 'When enabled, errors from count_tokens requests will not be written to the error log.', + ignoreContextCanceled: 'Ignore client disconnect errors', + ignoreContextCanceledHint: 'When enabled, client disconnect (context canceled) errors will not be written to the error log.', + ignoreNoAvailableAccounts: 'Ignore no available accounts errors', + ignoreNoAvailableAccountsHint: 'When enabled, "No available accounts" errors will not be written to the error log (not recommended; usually a config issue).', + autoRefresh: 'Auto Refresh', + enableAutoRefresh: 'Enable auto refresh', + enableAutoRefreshHint: 'Automatically refresh dashboard data at a fixed interval.', + refreshInterval: 'Refresh Interval', + refreshInterval15s: '15 seconds', + refreshInterval30s: '30 seconds', + refreshInterval60s: '60 seconds', + autoRefreshCountdown: 'Auto refresh: {seconds}s', validation: { title: 'Please fix the following issues', - retentionDaysRange: 'Retention days must be between 1-365 days' + retentionDaysRange: 'Retention days must be between 1-365 days', + slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', + ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', + requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' } }, concurrency: { @@ -2418,7 +2573,7 @@ export default { tooltips: { totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', - latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', + 
latencyHistogram: 'Request duration distribution (ms) for successful requests.', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorDistribution: 'Error distribution by status code.', goroutines: @@ -2433,7 +2588,7 @@ export default { sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).', errors: 'Error statistics, including total errors, error rate, and upstream error rate.', upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).', - latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.', + latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.', ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.', health: 'System health score (0-100), considering SLA, error rate, and resource usage.' }, diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index f2137124..f01c1e4b 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -126,6 +126,8 @@ export default { all: '全部', none: '无', noData: '暂无数据', + expand: '展开', + collapse: '收起', success: '成功', error: '错误', critical: '严重', @@ -2031,10 +2033,8 @@ export default { noSystemMetrics: '尚未收集系统指标。', collectedAt: '采集时间:', window: '窗口', - cpu: 'CPU', memory: '内存', db: '数据库', - redis: 'Redis', goroutines: '协程', jobs: '后台任务', jobsHelp: '点击“明细”查看任务心跳与报错信息', @@ -2060,7 +2060,7 @@ export default { totalRequests: '总请求', avgQps: '平均 QPS', avgTps: '平均 TPS', - avgLatency: '平均延迟', + avgLatency: '平均请求时长', avgTtft: '平均首字延迟', exceptions: '异常数', requestErrors: '请求错误', @@ -2072,7 +2072,7 @@ export default { errors: '错误', errorRate: '错误率:', upstreamRate: '上游错误率:', - latencyDuration: '延迟(毫秒)', + latencyDuration: '请求时长(毫秒)', ttftLabel: '首字延迟(毫秒)', p50: 'p50', p90: 'p90', @@ -2080,7 +2080,6 @@ export default { p99: 'p99', avg: 'avg', max: 'max', - qps: 
'QPS', requests: '请求数', requestsTitle: '请求', upstream: '上游', @@ -2092,7 +2091,7 @@ export default { failedToLoadData: '加载运维数据失败', failedToLoadOverview: '加载概览数据失败', failedToLoadThroughputTrend: '加载吞吐趋势失败', - failedToLoadLatencyHistogram: '加载延迟分布失败', + failedToLoadLatencyHistogram: '加载请求时长分布失败', failedToLoadErrorTrend: '加载错误趋势失败', failedToLoadErrorDistribution: '加载错误分布失败', failedToLoadErrorDetail: '加载错误详情失败', @@ -2100,7 +2099,7 @@ export default { tpsK: 'TPS(千)', top: '最高:', throughputTrend: '吞吐趋势', - latencyHistogram: '延迟分布', + latencyHistogram: '请求时长分布', errorTrend: '错误趋势', errorDistribution: '错误分布', // Health Score & Diagnosis @@ -2115,7 +2114,9 @@ export default { '30m': '近30分钟', '1h': '近1小时', '6h': '近6小时', - '24h': '近24小时' + '24h': '近24小时', + '7d': '近7天', + '30d': '近30天' }, fullscreen: { enter: '进入全屏' @@ -2144,15 +2145,8 @@ export default { memoryHigh: '内存使用率偏高 ({usage}%)', memoryHighImpact: '内存压力较大,需要关注', memoryHighAction: '监控内存趋势,检查是否有内存泄漏', - // Latency diagnostics - latencyCritical: '响应延迟严重过高 ({latency}ms)', - latencyCriticalImpact: '用户体验极差,大量请求超时', - latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务', - latencyHigh: '响应延迟偏高 ({latency}ms)', - latencyHighImpact: '用户体验下降,需要优化', - latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑', ttftHigh: '首字节时间偏高 ({ttft}ms)', - ttftHighImpact: '用户感知延迟增加', + ttftHighImpact: '用户感知时长增加', ttftHighAction: '优化请求处理流程,减少前置逻辑耗时', // Error rate diagnostics upstreamCritical: '上游错误率严重偏高 ({rate}%)', @@ -2170,13 +2164,13 @@ export default { // SLA diagnostics slaCritical: 'SLA 严重低于目标 ({sla}%)', slaCriticalImpact: '用户体验严重受损', - slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护', + slaCriticalAction: '紧急排查错误原因,必要时采取限流保护', slaLow: 'SLA 低于目标 ({sla}%)', slaLowImpact: '需要关注服务质量', slaLowAction: '分析SLA下降原因,优化系统性能', // Health score diagnostics healthCritical: '综合健康评分过低 ({score})', - healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟', + healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与资源使用情况', healthCriticalAction: '全面检查系统状态,优先处理critical级别问题', healthLow: '综合健康评分偏低 ({score})', 
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率', @@ -2187,27 +2181,106 @@ export default { // Error Log errorLog: { timeId: '时间 / ID', + commonErrors: { + contextDeadlineExceeded: '请求超时', + connectionRefused: '连接被拒绝', + rateLimit: '触发限流' + }, + time: '时间', + type: '类型', context: '上下文', + platform: '平台', + model: '模型', + group: '分组', + user: '用户', + userId: '用户 ID', + account: '账号', + accountId: '账号 ID', status: '状态码', - message: '消息', - latency: '延迟', + message: '响应内容', + latency: '请求时长', action: '操作', noErrors: '该窗口内暂无错误。', grp: 'GRP:', acc: 'ACC:', details: '详情', - phase: '阶段' + phase: '阶段', + id: 'ID:', + typeUpstream: '上游', + typeRequest: '请求', + typeAuth: '认证', + typeRouting: '路由', + typeInternal: '内部' }, // Error Details Modal errorDetails: { upstreamErrors: '上游错误', requestErrors: '请求错误', + unresolved: '未解决', + resolved: '已解决', + viewErrors: '错误', + viewExcluded: '排除项', + statusCodeOther: '其他', + owner: { + provider: '服务商', + client: '客户端', + platform: '平台' + }, + phase: { + request: '请求', + auth: '认证', + routing: '路由', + upstream: '上游', + network: '网络', + internal: '内部' + }, total: '总计:', searchPlaceholder: '搜索 request_id / client_request_id / message', - accountIdPlaceholder: 'account_id' }, // Error Detail Modal errorDetail: { + title: '错误详情', + titleWithId: '错误 #{id}', + noErrorSelected: '未选择错误。', + resolution: '已解决:', + pinnedToOriginalAccountId: '固定到原 account_id', + missingUpstreamRequestBody: '缺少上游请求体', + failedToLoadRetryHistory: '加载重试历史失败', + failedToUpdateResolvedStatus: '更新解决状态失败', + unsupportedRetryMode: '不支持的重试模式', + classificationKeys: { + phase: '阶段', + owner: '归属方', + source: '来源', + retryable: '可重试', + resolvedAt: '解决时间', + resolvedBy: '解决人', + resolvedRetryId: '解决重试ID', + retryCount: '重试次数' + }, + source: { + upstream_http: '上游 HTTP' + }, + upstreamKeys: { + status: '状态码', + message: '消息', + detail: '详情', + upstreamErrors: '上游错误列表' + }, + upstreamEvent: { + account: '账号', + status: '状态码', + requestId: '请求ID' + }, + responsePreview: { + expand: 
'响应内容(点击展开)', + collapse: '响应内容(点击收起)' + }, + retryMeta: { + used: '使用账号', + success: '成功', + pinned: '固定账号' + }, loading: '加载中…', requestId: '请求 ID', time: '时间', @@ -2217,8 +2290,10 @@ export default { basicInfo: '基本信息', platform: '平台', model: '模型', - latency: '延迟', - ttft: 'TTFT', + group: '分组', + user: '用户', + account: '账号', + latency: '请求时长', businessLimited: '业务限制', requestPath: '请求路径', timings: '时序信息', @@ -2226,6 +2301,8 @@ export default { routing: '路由', upstream: '上游', response: '响应', + classification: '错误分类', + notRetryable: '此错误不建议重试', retry: '重试', retryClient: '重试(客户端)', retryUpstream: '重试(上游固定)', @@ -2237,7 +2314,6 @@ export default { confirmRetry: '确认重试', retrySuccess: '重试成功', retryFailed: '重试失败', - na: 'N/A', retryHint: '重试将使用相同的请求参数重新发送请求', retryClientHint: '使用客户端重试(不固定账号)', retryUpstreamHint: '使用上游固定重试(固定到错误的账号)', @@ -2245,8 +2321,33 @@ export default { retryNote1: '重试会使用相同的请求体和参数', retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败', retryNote3: '客户端重试会重新选择账号', + retryNote4: '对不可重试的错误可以强制重试,但不推荐', confirmRetryMessage: '确认要重试该请求吗?', - confirmRetryHint: '将使用相同的请求参数重新发送' + confirmRetryHint: '将使用相同的请求参数重新发送', + forceRetry: '我已确认并理解强制重试风险', + forceRetryHint: '此错误类型通常不可通过重试解决;如仍需重试请勾选确认', + forceRetryNeedAck: '请先勾选确认再强制重试', + markResolved: '标记已解决', + markUnresolved: '标记未解决', + viewRetries: '重试历史', + retryHistory: '重试历史', + tabOverview: '概览', + tabRetries: '重试历史', + tabRequest: '请求详情', + tabResponse: '响应详情', + responseBody: '响应详情', + compareA: '对比 A', + compareB: '对比 B', + retrySummary: '重试摘要', + responseHintSucceeded: '展示重试成功的 response_preview(#{id})', + responseHintFallback: '没有成功的重试结果,展示存储的 error_body', + suggestion: '处理建议', + suggestUpstreamResolved: '✓ 上游错误已通过重试解决,无需人工介入', + suggestUpstream: '⚠️ 上游服务不稳定,建议:检查上游账号状态 / 考虑切换账号 / 再次重试', + suggestRequest: '⚠️ 客户端请求错误,建议:联系客户修正请求参数 / 手动标记已解决', + suggestAuth: '⚠️ 认证失败,建议:检查 API Key 是否有效 / 联系客户更新凭证', + suggestPlatform: '🚨 平台错误,建议立即排查修复', + suggestGeneric: '查看详情了解更多信息' }, requestDetails: { title: '请求明细', @@ -2282,13 
+2383,46 @@ export default { loading: '加载中...', empty: '暂无告警事件', loadFailed: '加载告警事件失败', + status: { + firing: '告警中', + resolved: '已恢复', + manualResolved: '手动已解决' + }, + detail: { + title: '告警详情', + loading: '加载详情中...', + empty: '暂无详情', + loadFailed: '加载告警详情失败', + manualResolve: '标记为已解决', + manualResolvedSuccess: '已标记为手动解决', + manualResolvedFailed: '标记为手动解决失败', + silence: '忽略此告警', + silenceSuccess: '已静默该告警', + silenceFailed: '静默失败', + viewRule: '查看规则', + viewLogs: '查看相关日志', + firedAt: '触发时间', + resolvedAt: '解决时间', + ruleId: '规则 ID', + dimensions: '维度信息', + historyTitle: '历史记录', + historyHint: '同一规则 + 相同维度的最近事件', + historyLoading: '加载历史中...', + historyEmpty: '暂无历史记录' + }, table: { time: '时间', status: '状态', severity: '级别', + platform: '平台', + ruleId: '规则ID', title: '标题', + duration: '持续时间', metric: '指标 / 阈值', - email: '邮件已发送' + dimensions: '维度', + email: '邮件已发送', + emailSent: '已发送', + emailIgnored: '已忽略' } }, alertRules: { @@ -2316,8 +2450,8 @@ export default { successRate: '成功率 (%)', errorRate: '错误率 (%)', upstreamErrorRate: '上游错误率 (%)', - p95: 'P95 延迟 (ms)', - p99: 'P99 延迟 (ms)', + p95: 'P95 请求时长 (ms)', + p99: 'P99 请求时长 (ms)', cpu: 'CPU 使用率 (%)', memory: '内存使用率 (%)', queueDepth: '并发排队深度', @@ -2402,7 +2536,6 @@ export default { title: '告警静默(维护模式)', enabled: '启用静默', globalUntil: '静默截止时间(RFC3339)', - untilPlaceholder: '2026-01-05T00:00:00Z', untilHint: '建议填写截止时间,避免忘记关闭静默。', reason: '原因', reasonPlaceholder: '例如:计划维护', @@ -2442,7 +2575,11 @@ export default { lockKeyRequired: '启用分布式锁时必须填写 Lock Key', lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头', lockKeyHint: '建议以「{prefix}」开头以避免冲突', - lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间' + lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间', + slaMinPercentRange: 'SLA 最低值必须在 0-100 之间', + ttftP99MaxRange: 'TTFT P99 最大值必须大于或等于 0', + requestErrorRateMaxRange: '请求错误率最大值必须在 0-100 之间', + upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间' } }, email: { @@ -2507,8 +2644,6 @@ export default { metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示', slaMinPercent: 
'SLA最低百分比', slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)', - latencyP99MaxMs: '延迟P99最大值(毫秒)', - latencyP99MaxMsHint: '延迟P99高于此值时显示为红色(默认:2000ms)', ttftP99MaxMs: 'TTFT P99最大值(毫秒)', ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色(默认:500ms)', requestErrorRateMaxPercent: '请求错误率最大值(%)', @@ -2527,9 +2662,28 @@ export default { aggregation: '预聚合任务', enableAggregation: '启用预聚合任务', aggregationHint: '预聚合可提升长时间窗口查询性能', + errorFiltering: '错误过滤', + ignoreCountTokensErrors: '忽略 count_tokens 错误', + ignoreCountTokensErrorsHint: '启用后,count_tokens 请求的错误将不会写入错误日志。', + ignoreContextCanceled: '忽略客户端断连错误', + ignoreContextCanceledHint: '启用后,客户端主动断开连接(context canceled)的错误将不会写入错误日志。', + ignoreNoAvailableAccounts: '忽略无可用账号错误', + ignoreNoAvailableAccountsHint: '启用后,“No available accounts” 错误将不会写入错误日志(不推荐,这通常是配置问题)。', + autoRefresh: '自动刷新', + enableAutoRefresh: '启用自动刷新', + enableAutoRefreshHint: '自动刷新仪表板数据,启用后会定期拉取最新数据。', + refreshInterval: '刷新间隔', + refreshInterval15s: '15 秒', + refreshInterval30s: '30 秒', + refreshInterval60s: '60 秒', + autoRefreshCountdown: '自动刷新:{seconds}s', validation: { title: '请先修正以下问题', - retentionDaysRange: '保留天数必须在1-365天之间' + retentionDaysRange: '保留天数必须在1-365天之间', + slaMinPercentRange: 'SLA最低百分比必须在0-100之间', + ttftP99MaxRange: 'TTFT P99最大值必须大于等于0', + requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间', + upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间' } }, concurrency: { @@ -2567,12 +2721,12 @@ export default { tooltips: { totalRequests: '当前时间窗口内的总请求数和Token消耗量。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', - latencyHistogram: '成功请求的延迟分布(毫秒)。', + latencyHistogram: '成功请求的请求时长分布(毫秒)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', errorDistribution: '按状态码统计的错误分布。', upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。', goroutines: - 'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。', + 'Go 运行时的协程数量(轻量级线程)。没有绝对"安全值",建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列上升时,优先排查阻塞/泄漏。', cpu: 'CPU 使用率,显示系统处理器的负载情况。', memory: 
'内存使用率,包括已使用和总可用内存。', db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。', @@ -2582,7 +2736,7 @@ export default { tokens: '当前时间窗口内处理的总Token数量。', sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。', errors: '错误统计,包括总错误数、错误率和上游错误率。', - latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。', + latency: '请求时长统计,包括 p50、p90、p95、p99 等百分位数。', ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。', health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。' }, diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index be445a32..ff2a434d 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -8,7 +8,7 @@ {{ errorMessage }} - + - + { const fallback = adminSettingsStore.opsQueryModeDefault || 'auto' queryMode.value = allowedQueryModes.has(fallback as QueryMode) ? (fallback as QueryMode) : 'auto' } + + // Deep links + const openRules = readQueryString(QUERY_KEYS.openAlertRules) + if (openRules === '1' || openRules === 'true') { + showAlertRulesCard.value = true + } + + const ruleID = readQueryNumber(QUERY_KEYS.alertRuleId) + if (typeof ruleID === 'number' && ruleID > 0) { + showAlertRulesCard.value = true + } + + const openErr = readQueryString(QUERY_KEYS.openErrorDetails) + if (openErr === '1' || openErr === 'true') { + const typ = readQueryString(QUERY_KEYS.errorType) + errorDetailsType.value = typ === 'upstream' ? 'upstream' : 'request' + showErrorDetails.value = true + } } applyRouteQueryToState() @@ -376,11 +400,17 @@ function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) { requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) } if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title + // Ensure only one modal visible at a time. + showErrorDetails.value = false + showErrorModal.value = false showRequestDetails.value = true } function openErrorDetails(kind: 'request' | 'upstream') { errorDetailsType.value = kind + // Ensure only one modal visible at a time. 
+ showRequestDetails.value = false + showErrorModal.value = false showErrorDetails.value = true } @@ -422,6 +452,9 @@ function onQueryModeChange(v: string | number | boolean | null) { function openError(id: number) { selectedErrorId.value = id + // Ensure only one modal visible at a time. + showErrorDetails.value = false + showRequestDetails.value = false showErrorModal.value = true } diff --git a/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue b/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue index 58a91355..06b9b290 100644 --- a/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue @@ -3,42 +3,326 @@ import { computed, onMounted, ref, watch } from 'vue' import { useI18n } from 'vue-i18n' import { useAppStore } from '@/stores/app' import Select from '@/components/common/Select.vue' -import { opsAPI } from '@/api/admin/ops' +import BaseDialog from '@/components/common/BaseDialog.vue' +import Icon from '@/components/icons/Icon.vue' +import { opsAPI, type AlertEventsQuery } from '@/api/admin/ops' import type { AlertEvent } from '../types' import { formatDateTime } from '../utils/opsFormatters' const { t } = useI18n() const appStore = useAppStore() -const loading = ref(false) -const events = ref([]) +const PAGE_SIZE = 10 -const limit = ref(100) -const limitOptions = computed(() => [ - { value: 50, label: '50' }, - { value: 100, label: '100' }, - { value: 200, label: '200' } +const loading = ref(false) +const loadingMore = ref(false) +const events = ref([]) +const hasMore = ref(true) + +// Detail modal +const showDetail = ref(false) +const selected = ref(null) +const detailLoading = ref(false) +const detailActionLoading = ref(false) +const historyLoading = ref(false) +const history = ref([]) +const historyRange = ref('7d') +const historyRangeOptions = computed(() => [ + { value: '7d', label: t('admin.ops.timeRange.7d') }, + { value: '30d', label: 
t('admin.ops.timeRange.30d') } ]) -async function load() { +const silenceDuration = ref('1h') +const silenceDurationOptions = computed(() => [ + { value: '1h', label: t('admin.ops.timeRange.1h') }, + { value: '24h', label: t('admin.ops.timeRange.24h') }, + { value: '7d', label: t('admin.ops.timeRange.7d') } +]) + +// Filters +const timeRange = ref('24h') +const timeRangeOptions = computed(() => [ + { value: '5m', label: t('admin.ops.timeRange.5m') }, + { value: '30m', label: t('admin.ops.timeRange.30m') }, + { value: '1h', label: t('admin.ops.timeRange.1h') }, + { value: '6h', label: t('admin.ops.timeRange.6h') }, + { value: '24h', label: t('admin.ops.timeRange.24h') }, + { value: '7d', label: t('admin.ops.timeRange.7d') }, + { value: '30d', label: t('admin.ops.timeRange.30d') } +]) + +const severity = ref('') +const severityOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'P0', label: 'P0' }, + { value: 'P1', label: 'P1' }, + { value: 'P2', label: 'P2' }, + { value: 'P3', label: 'P3' } +]) + +const status = ref('') +const statusOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'firing', label: t('admin.ops.alertEvents.status.firing') }, + { value: 'resolved', label: t('admin.ops.alertEvents.status.resolved') }, + { value: 'manual_resolved', label: t('admin.ops.alertEvents.status.manualResolved') } +]) + +const emailSent = ref('') +const emailSentOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'true', label: t('admin.ops.alertEvents.table.emailSent') }, + { value: 'false', label: t('admin.ops.alertEvents.table.emailIgnored') } +]) + +function buildQuery(overrides: Partial = {}): AlertEventsQuery { + const q: AlertEventsQuery = { + limit: PAGE_SIZE, + time_range: timeRange.value + } + if (severity.value) q.severity = severity.value + if (status.value) q.status = status.value + if (emailSent.value === 'true') q.email_sent = true + if (emailSent.value === 'false') q.email_sent = 
false + return { ...q, ...overrides } +} + +async function loadFirstPage() { loading.value = true try { - events.value = await opsAPI.listAlertEvents(limit.value) + const data = await opsAPI.listAlertEvents(buildQuery()) + events.value = data + hasMore.value = data.length === PAGE_SIZE } catch (err: any) { console.error('[OpsAlertEventsCard] Failed to load alert events', err) appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.loadFailed')) events.value = [] + hasMore.value = false } finally { loading.value = false } } +async function loadMore() { + if (loadingMore.value || loading.value) return + if (!hasMore.value) return + const last = events.value[events.value.length - 1] + if (!last) return + + loadingMore.value = true + try { + const data = await opsAPI.listAlertEvents( + buildQuery({ before_fired_at: last.fired_at || last.created_at, before_id: last.id }) + ) + if (!data.length) { + hasMore.value = false + return + } + events.value = [...events.value, ...data] + if (data.length < PAGE_SIZE) hasMore.value = false + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load more alert events', err) + hasMore.value = false + } finally { + loadingMore.value = false + } +} + +function onScroll(e: Event) { + const el = e.target as HTMLElement | null + if (!el) return + const nearBottom = el.scrollTop + el.clientHeight >= el.scrollHeight - 120 + if (nearBottom) loadMore() +} + +function getDimensionString(event: AlertEvent | null | undefined, key: string): string { + const v = event?.dimensions?.[key] + if (v == null) return '' + if (typeof v === 'string') return v + if (typeof v === 'number' || typeof v === 'boolean') return String(v) + return '' +} + +function formatDurationMs(ms: number): string { + const safe = Math.max(0, Math.floor(ms)) + const sec = Math.floor(safe / 1000) + if (sec < 60) return `${sec}s` + const min = Math.floor(sec / 60) + if (min < 60) return `${min}m` + const hr = Math.floor(min / 60) + if (hr < 24) 
return `${hr}h` + const day = Math.floor(hr / 24) + return `${day}d` +} + +function formatDurationLabel(event: AlertEvent): string { + const firedAt = new Date(event.fired_at || event.created_at) + if (Number.isNaN(firedAt.getTime())) return '-' + const resolvedAtStr = event.resolved_at || null + const status = String(event.status || '').trim().toLowerCase() + + if (resolvedAtStr) { + const resolvedAt = new Date(resolvedAtStr) + if (!Number.isNaN(resolvedAt.getTime())) { + const ms = resolvedAt.getTime() - firedAt.getTime() + const prefix = status === 'manual_resolved' + ? t('admin.ops.alertEvents.status.manualResolved') + : t('admin.ops.alertEvents.status.resolved') + return `${prefix} ${formatDurationMs(ms)}` + } + } + + const now = Date.now() + const ms = now - firedAt.getTime() + return `${t('admin.ops.alertEvents.status.firing')} ${formatDurationMs(ms)}` +} + +function formatDimensionsSummary(event: AlertEvent): string { + const parts: string[] = [] + const platform = getDimensionString(event, 'platform') + if (platform) parts.push(`platform=${platform}`) + const groupId = event.dimensions?.group_id + if (groupId != null && groupId !== '') parts.push(`group_id=${String(groupId)}`) + const region = getDimensionString(event, 'region') + if (region) parts.push(`region=${region}`) + return parts.length ? 
parts.join(' ') : '-' +} + +function closeDetail() { + showDetail.value = false + selected.value = null + history.value = [] +} + +async function openDetail(row: AlertEvent) { + showDetail.value = true + selected.value = row + detailLoading.value = true + historyLoading.value = true + + try { + const detail = await opsAPI.getAlertEvent(row.id) + selected.value = detail + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load alert detail', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.loadFailed')) + } finally { + detailLoading.value = false + } + + await loadHistory() +} + +async function loadHistory() { + const ev = selected.value + if (!ev) { + history.value = [] + historyLoading.value = false + return + } + + historyLoading.value = true + try { + const platform = getDimensionString(ev, 'platform') + const groupIdRaw = ev.dimensions?.group_id + const groupId = typeof groupIdRaw === 'number' ? groupIdRaw : undefined + + const items = await opsAPI.listAlertEvents({ + limit: 20, + time_range: historyRange.value, + platform: platform || undefined, + group_id: groupId, + status: '' + }) + + // Best-effort: narrow to same rule_id + dimensions + history.value = items.filter((it) => { + if (it.rule_id !== ev.rule_id) return false + const p1 = getDimensionString(it, 'platform') + const p2 = getDimensionString(ev, 'platform') + if ((p1 || '') !== (p2 || '')) return false + const g1 = it.dimensions?.group_id + const g2 = ev.dimensions?.group_id + return (g1 ?? null) === (g2 ?? 
null) + }) + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load alert history', err) + history.value = [] + } finally { + historyLoading.value = false + } +} + +function durationToUntilRFC3339(duration: string): string { + const now = Date.now() + if (duration === '1h') return new Date(now + 60 * 60 * 1000).toISOString() + if (duration === '24h') return new Date(now + 24 * 60 * 60 * 1000).toISOString() + if (duration === '7d') return new Date(now + 7 * 24 * 60 * 60 * 1000).toISOString() + return new Date(now + 60 * 60 * 1000).toISOString() +} + +async function silenceAlert() { + const ev = selected.value + if (!ev) return + if (detailActionLoading.value) return + detailActionLoading.value = true + try { + const platform = getDimensionString(ev, 'platform') + const groupIdRaw = ev.dimensions?.group_id + const groupId = typeof groupIdRaw === 'number' ? groupIdRaw : null + const region = getDimensionString(ev, 'region') || null + + await opsAPI.createAlertSilence({ + rule_id: ev.rule_id, + platform: platform || '', + group_id: groupId ?? undefined, + region: region ?? 
undefined, + until: durationToUntilRFC3339(silenceDuration.value), + reason: `silence from UI (${silenceDuration.value})` + }) + + appStore.showSuccess(t('admin.ops.alertEvents.detail.silenceSuccess')) + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to silence alert', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.silenceFailed')) + } finally { + detailActionLoading.value = false + } +} + +async function manualResolve() { + if (!selected.value) return + if (detailActionLoading.value) return + detailActionLoading.value = true + try { + await opsAPI.updateAlertEventStatus(selected.value.id, 'manual_resolved') + appStore.showSuccess(t('admin.ops.alertEvents.detail.manualResolvedSuccess')) + + // Refresh detail + first page to reflect new status + const detail = await opsAPI.getAlertEvent(selected.value.id) + selected.value = detail + await loadFirstPage() + await loadHistory() + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to resolve alert', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.manualResolvedFailed')) + } finally { + detailActionLoading.value = false + } +} + onMounted(() => { - load() + loadFirstPage() }) -watch(limit, () => { - load() +watch([timeRange, severity, status, emailSent], () => { + events.value = [] + hasMore.value = true + loadFirstPage() +}) + +watch(historyRange, () => { + if (showDetail.value) loadHistory() }) function severityBadgeClass(severity: string | undefined): string { @@ -54,9 +338,19 @@ function statusBadgeClass(status: string | undefined): string { const s = String(status || '').trim().toLowerCase() if (s === 'firing') return 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-300 dark:ring-red-500/30' if (s === 'resolved') return 'bg-green-50 text-green-700 ring-green-600/20 dark:bg-green-900/30 dark:text-green-300 dark:ring-green-500/30' + if (s === 'manual_resolved') return 
'bg-slate-50 text-slate-700 ring-slate-600/20 dark:bg-slate-900/30 dark:text-slate-300 dark:ring-slate-500/30' return 'bg-gray-50 text-gray-700 ring-gray-600/20 dark:bg-gray-900/30 dark:text-gray-300 dark:ring-gray-500/30' } +function formatStatusLabel(status: string | undefined): string { + const s = String(status || '').trim().toLowerCase() + if (!s) return '-' + if (s === 'firing') return t('admin.ops.alertEvents.status.firing') + if (s === 'resolved') return t('admin.ops.alertEvents.status.resolved') + if (s === 'manual_resolved') return t('admin.ops.alertEvents.status.manualResolved') + return s.toUpperCase() +} + const empty = computed(() => events.value.length === 0 && !loading.value) @@ -69,11 +363,14 @@ const empty = computed(() => events.value.length === 0 && !loading.value)
- + + + +
+ + + + + + +
+
+
{{ t('admin.ops.alertEvents.detail.firedAt') }}
+
{{ formatDateTime(selected.fired_at || selected.created_at) }}
+
+
+
{{ t('admin.ops.alertEvents.detail.resolvedAt') }}
+
{{ selected.resolved_at ? formatDateTime(selected.resolved_at) : '-' }}
+
+
+
{{ t('admin.ops.alertEvents.detail.ruleId') }}
+ +
+
+
{{ t('admin.ops.alertEvents.detail.dimensions') }}
+
+
platform={{ getDimensionString(selected, 'platform') }}
+
group_id={{ selected.dimensions.group_id }}
+
region={{ getDimensionString(selected, 'region') }}
+
+
+
+ + +
+
+
+
{{ t('admin.ops.alertEvents.detail.historyTitle') }}
+
{{ t('admin.ops.alertEvents.detail.historyHint') }}
+
+ +
+ +
+ +
+ +
+ +
+
@@ -231,18 +245,26 @@ watch( {{ t('admin.ops.errorDetails.total') }} {{ total }}
- + +
+ + diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue index 416bdba9..28868552 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue @@ -1,55 +1,48 @@ @@ -184,6 +187,36 @@ import { getSeverityClass, formatDateTime } from '../utils/opsFormatters' const { t } = useI18n() +function isUpstreamRow(log: OpsErrorLog): boolean { + const phase = String(log.phase || '').toLowerCase() + const owner = String(log.error_owner || '').toLowerCase() + return phase === 'upstream' && owner === 'provider' +} + +function getTypeBadge(log: OpsErrorLog): { label: string; className: string } { + const phase = String(log.phase || '').toLowerCase() + const owner = String(log.error_owner || '').toLowerCase() + + if (isUpstreamRow(log)) { + return { label: t('admin.ops.errorLog.typeUpstream'), className: 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-400 dark:ring-red-500/30' } + } + if (phase === 'request' && owner === 'client') { + return { label: t('admin.ops.errorLog.typeRequest'), className: 'bg-amber-50 text-amber-700 ring-amber-600/20 dark:bg-amber-900/30 dark:text-amber-400 dark:ring-amber-500/30' } + } + if (phase === 'auth' && owner === 'client') { + return { label: t('admin.ops.errorLog.typeAuth'), className: 'bg-blue-50 text-blue-700 ring-blue-600/20 dark:bg-blue-900/30 dark:text-blue-400 dark:ring-blue-500/30' } + } + if (phase === 'routing' && owner === 'platform') { + return { label: t('admin.ops.errorLog.typeRouting'), className: 'bg-purple-50 text-purple-700 ring-purple-600/20 dark:bg-purple-900/30 dark:text-purple-400 dark:ring-purple-500/30' } + } + if (phase === 'internal' && owner === 'platform') { + return { label: t('admin.ops.errorLog.typeInternal'), className: 'bg-gray-100 text-gray-800 ring-gray-600/20 dark:bg-dark-700 dark:text-gray-200 dark:ring-dark-500/40' } + } + + 
const fallback = phase || owner || t('common.unknown') + return { label: fallback, className: 'bg-gray-50 text-gray-700 ring-gray-600/10 dark:bg-dark-900 dark:text-gray-300 dark:ring-dark-700' } +} + interface Props { rows: OpsErrorLog[] total: number @@ -208,14 +241,6 @@ function getStatusClass(code: number): string { return 'bg-gray-50 text-gray-700 ring-gray-600/20 dark:bg-gray-900/30 dark:text-gray-400 dark:ring-gray-500/30' } -function getLatencyClass(latency: number | null): string { - if (!latency) return 'text-gray-400' - if (latency > 10000) return 'text-red-600 font-black' - if (latency > 5000) return 'text-red-500 font-bold' - if (latency > 2000) return 'text-orange-500 font-medium' - return 'text-gray-600 dark:text-gray-400' -} - function formatSmartMessage(msg: string): string { if (!msg) return '' @@ -231,10 +256,11 @@ function formatSmartMessage(msg: string): string { } } - if (msg.includes('context deadline exceeded')) return 'context deadline exceeded' - if (msg.includes('connection refused')) return 'connection refused' - if (msg.toLowerCase().includes('rate limit')) return 'rate limit' + if (msg.includes('context deadline exceeded')) return t('admin.ops.errorLog.commonErrors.contextDeadlineExceeded') + if (msg.includes('connection refused')) return t('admin.ops.errorLog.commonErrors.connectionRefused') + if (msg.toLowerCase().includes('rate limit')) return t('admin.ops.errorLog.commonErrors.rateLimit') return msg.length > 200 ? msg.substring(0, 200) + '...' 
: msg + } - + \ No newline at end of file diff --git a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue index d3edd745..3a70b4f2 100644 --- a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue @@ -38,7 +38,7 @@ const loading = ref(false) const items = ref([]) const total = ref(0) const page = ref(1) -const pageSize = ref(20) +const pageSize = ref(10) const close = () => emit('update:modelValue', false) @@ -95,7 +95,7 @@ watch( (open) => { if (open) { page.value = 1 - pageSize.value = 20 + pageSize.value = 10 fetchData() } } diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue index 1dcab4b3..82c19f4f 100644 --- a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue @@ -50,27 +50,22 @@ function validateRuntimeSettings(settings: OpsAlertRuntimeSettings): ValidationR if (thresholds) { if (thresholds.sla_percent_min != null) { if (!Number.isFinite(thresholds.sla_percent_min) || thresholds.sla_percent_min < 0 || thresholds.sla_percent_min > 100) { - errors.push('SLA 最低值必须在 0-100 之间') - } - } - if (thresholds.latency_p99_ms_max != null) { - if (!Number.isFinite(thresholds.latency_p99_ms_max) || thresholds.latency_p99_ms_max < 0) { - errors.push('延迟 P99 最大值必须大于或等于 0') + errors.push(t('admin.ops.runtime.validation.slaMinPercentRange')) } } if (thresholds.ttft_p99_ms_max != null) { if (!Number.isFinite(thresholds.ttft_p99_ms_max) || thresholds.ttft_p99_ms_max < 0) { - errors.push('TTFT P99 最大值必须大于或等于 0') + errors.push(t('admin.ops.runtime.validation.ttftP99MaxRange')) } } if (thresholds.request_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.request_error_rate_percent_max) || 
thresholds.request_error_rate_percent_max < 0 || thresholds.request_error_rate_percent_max > 100) { - errors.push('请求错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.requestErrorRateMaxRange')) } } if (thresholds.upstream_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.upstream_error_rate_percent_max) || thresholds.upstream_error_rate_percent_max < 0 || thresholds.upstream_error_rate_percent_max > 100) { - errors.push('上游错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.upstreamErrorRateMaxRange')) } } } @@ -163,7 +158,6 @@ function openAlertEditor() { if (!draftAlert.value.thresholds) { draftAlert.value.thresholds = { sla_percent_min: 99.5, - latency_p99_ms_max: 2000, ttft_p99_ms_max: 500, request_error_rate_percent_max: 5, upstream_error_rate_percent_max: 5 @@ -335,12 +329,12 @@ onMounted(() => {
-
指标阈值配置
-

配置各项指标的告警阈值。超出阈值的指标将在看板上以红色显示。

+
{{ t('admin.ops.runtime.metricThresholds') }}
+

{{ t('admin.ops.runtime.metricThresholdsHint') }}

-
SLA 最低值 (%)
+
{{ t('admin.ops.runtime.slaMinPercent') }}
{ class="input" placeholder="99.5" /> -

SLA 低于此值时将显示为红色

+

{{ t('admin.ops.runtime.slaMinPercentHint') }}

-
-
延迟 P99 最大值 (ms)
- -

延迟 P99 高于此值时将显示为红色

-
+
-
TTFT P99 最大值 (ms)
+
{{ t('admin.ops.runtime.ttftP99MaxMs') }}
{ class="input" placeholder="500" /> -

TTFT P99 高于此值时将显示为红色

+

{{ t('admin.ops.runtime.ttftP99MaxMsHint') }}

-
请求错误率最大值 (%)
+
{{ t('admin.ops.runtime.requestErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

请求错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.requestErrorRateMaxPercentHint') }}

-
上游错误率最大值 (%)
+
{{ t('admin.ops.runtime.upstreamErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

上游错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.upstreamErrorRateMaxPercentHint') }}

@@ -424,7 +407,7 @@ onMounted(() => { v-model="draftAlert.silencing.global_until_rfc3339" type="text" class="input font-mono text-sm" - :placeholder="t('admin.ops.runtime.silencing.untilPlaceholder')" + placeholder="2026-01-05T00:00:00Z" />

{{ t('admin.ops.runtime.silencing.untilHint') }}

@@ -496,7 +479,7 @@ onMounted(() => { v-model="(entry as any).until_rfc3339" type="text" class="input font-mono text-sm" - :placeholder="t('admin.ops.runtime.silencing.untilPlaceholder')" + placeholder="2026-01-05T00:00:00Z" /> diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue index 1f64f253..53ab6683 100644 --- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue +++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue @@ -32,7 +32,6 @@ const advancedSettings = ref(null) // 指标阈值配置 const metricThresholds = ref({ sla_percent_min: 99.5, - latency_p99_ms_max: 2000, ttft_p99_ms_max: 500, request_error_rate_percent_max: 5, upstream_error_rate_percent_max: 5 @@ -53,13 +52,12 @@ async function loadAllSettings() { advancedSettings.value = advanced // 如果后端返回了阈值,使用后端的值;否则保持默认值 if (thresholds && Object.keys(thresholds).length > 0) { - metricThresholds.value = { - sla_percent_min: thresholds.sla_percent_min ?? 99.5, - latency_p99_ms_max: thresholds.latency_p99_ms_max ?? 2000, - ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500, - request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5, - upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 5 - } + metricThresholds.value = { + sla_percent_min: thresholds.sla_percent_min ?? 99.5, + ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500, + request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5, + upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 
5 + } } } catch (err: any) { console.error('[OpsSettingsDialog] Failed to load settings', err) @@ -159,19 +157,16 @@ const validation = computed(() => { // 验证指标阈值 if (metricThresholds.value.sla_percent_min != null && (metricThresholds.value.sla_percent_min < 0 || metricThresholds.value.sla_percent_min > 100)) { - errors.push('SLA最低百分比必须在0-100之间') - } - if (metricThresholds.value.latency_p99_ms_max != null && metricThresholds.value.latency_p99_ms_max < 0) { - errors.push('延迟P99最大值必须大于等于0') + errors.push(t('admin.ops.settings.validation.slaMinPercentRange')) } if (metricThresholds.value.ttft_p99_ms_max != null && metricThresholds.value.ttft_p99_ms_max < 0) { - errors.push('TTFT P99最大值必须大于等于0') + errors.push(t('admin.ops.settings.validation.ttftP99MaxRange')) } if (metricThresholds.value.request_error_rate_percent_max != null && (metricThresholds.value.request_error_rate_percent_max < 0 || metricThresholds.value.request_error_rate_percent_max > 100)) { - errors.push('请求错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.requestErrorRateMaxRange')) } if (metricThresholds.value.upstream_error_rate_percent_max != null && (metricThresholds.value.upstream_error_rate_percent_max < 0 || metricThresholds.value.upstream_error_rate_percent_max > 100)) { - errors.push('上游错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.upstreamErrorRateMaxRange')) } return { valid: errors.length === 0, errors } @@ -362,17 +357,6 @@ async function saveAllSettings() {

{{ t('admin.ops.settings.slaMinPercentHint') }}

-
- - -

{{ t('admin.ops.settings.latencyP99MaxMsHint') }}

-
@@ -488,43 +472,63 @@ async function saveAllSettings() {
- +
-
错误过滤
+
{{ t('admin.ops.settings.errorFiltering') }}
- +

- 启用后,count_tokens 请求的错误将不计入运维监控的统计和告警中(但仍会存储在数据库中) + {{ t('admin.ops.settings.ignoreCountTokensErrorsHint') }}

-
- - -
-
自动刷新
- +

- 自动刷新仪表板数据,启用后会定期拉取最新数据 + {{ t('admin.ops.settings.ignoreContextCanceledHint') }} +

+
+ +
+ +
+
+ +

+ {{ t('admin.ops.settings.ignoreNoAvailableAccountsHint') }} +

+
+ +
+
+ + +
+
{{ t('admin.ops.settings.autoRefresh') }}
+ +
+
+ +

+ {{ t('admin.ops.settings.enableAutoRefreshHint') }}

- +