Merge pull request #285 from IanShaw027/fix/ops-bug
feat(ops): 增强错误日志管理、告警静默和前端 UI 优化
This commit is contained in:
@@ -7,8 +7,10 @@ import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
|
||||
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
|
||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/gin-gonic/gin/binding"
|
||||
@@ -18,8 +20,6 @@ var validOpsAlertMetricTypes = []string{
|
||||
"success_rate",
|
||||
"error_rate",
|
||||
"upstream_error_rate",
|
||||
"p95_latency_ms",
|
||||
"p99_latency_ms",
|
||||
"cpu_usage_percent",
|
||||
"memory_usage_percent",
|
||||
"concurrency_queue_depth",
|
||||
@@ -372,8 +372,135 @@ func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
|
||||
response.Success(c, gin.H{"deleted": true})
|
||||
}
|
||||
|
||||
// GetAlertEvent returns a single ops alert event.
|
||||
// GET /api/v1/admin/ops/alert-events/:id
|
||||
func (h *OpsHandler) GetAlertEvent(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid event ID")
|
||||
return
|
||||
}
|
||||
|
||||
ev, err := h.opsService.GetAlertEventByID(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, ev)
|
||||
}
|
||||
|
||||
// UpdateAlertEventStatus updates an ops alert event status.
|
||||
// PUT /api/v1/admin/ops/alert-events/:id/status
|
||||
func (h *OpsHandler) UpdateAlertEventStatus(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid event ID")
|
||||
return
|
||||
}
|
||||
|
||||
var payload struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&payload); err != nil {
|
||||
response.BadRequest(c, "Invalid request body")
|
||||
return
|
||||
}
|
||||
payload.Status = strings.TrimSpace(payload.Status)
|
||||
if payload.Status == "" {
|
||||
response.BadRequest(c, "Invalid status")
|
||||
return
|
||||
}
|
||||
if payload.Status != service.OpsAlertStatusResolved && payload.Status != service.OpsAlertStatusManualResolved {
|
||||
response.BadRequest(c, "Invalid status")
|
||||
return
|
||||
}
|
||||
|
||||
var resolvedAt *time.Time
|
||||
if payload.Status == service.OpsAlertStatusResolved || payload.Status == service.OpsAlertStatusManualResolved {
|
||||
now := time.Now().UTC()
|
||||
resolvedAt = &now
|
||||
}
|
||||
if err := h.opsService.UpdateAlertEventStatus(c.Request.Context(), id, payload.Status, resolvedAt); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, gin.H{"updated": true})
|
||||
}
|
||||
|
||||
// ListAlertEvents lists recent ops alert events.
|
||||
// GET /api/v1/admin/ops/alert-events
|
||||
// CreateAlertSilence creates a scoped silence for ops alerts.
|
||||
// POST /api/v1/admin/ops/alert-silences
|
||||
func (h *OpsHandler) CreateAlertSilence(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
var payload struct {
|
||||
RuleID int64 `json:"rule_id"`
|
||||
Platform string `json:"platform"`
|
||||
GroupID *int64 `json:"group_id"`
|
||||
Region *string `json:"region"`
|
||||
Until string `json:"until"`
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&payload); err != nil {
|
||||
response.BadRequest(c, "Invalid request body")
|
||||
return
|
||||
}
|
||||
until, err := time.Parse(time.RFC3339, strings.TrimSpace(payload.Until))
|
||||
if err != nil {
|
||||
response.BadRequest(c, "Invalid until")
|
||||
return
|
||||
}
|
||||
|
||||
createdBy := (*int64)(nil)
|
||||
if subject, ok := middleware.GetAuthSubjectFromContext(c); ok {
|
||||
uid := subject.UserID
|
||||
createdBy = &uid
|
||||
}
|
||||
|
||||
silence := &service.OpsAlertSilence{
|
||||
RuleID: payload.RuleID,
|
||||
Platform: strings.TrimSpace(payload.Platform),
|
||||
GroupID: payload.GroupID,
|
||||
Region: payload.Region,
|
||||
Until: until,
|
||||
Reason: strings.TrimSpace(payload.Reason),
|
||||
CreatedBy: createdBy,
|
||||
}
|
||||
|
||||
created, err := h.opsService.CreateAlertSilence(c.Request.Context(), silence)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, created)
|
||||
}
|
||||
|
||||
func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
@@ -384,7 +511,7 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
limit := 100
|
||||
limit := 20
|
||||
if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
|
||||
n, err := strconv.Atoi(raw)
|
||||
if err != nil || n <= 0 {
|
||||
@@ -400,6 +527,49 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
|
||||
Severity: strings.TrimSpace(c.Query("severity")),
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(c.Query("email_sent")); v != "" {
|
||||
vv := strings.ToLower(v)
|
||||
switch vv {
|
||||
case "true", "1":
|
||||
b := true
|
||||
filter.EmailSent = &b
|
||||
case "false", "0":
|
||||
b := false
|
||||
filter.EmailSent = &b
|
||||
default:
|
||||
response.BadRequest(c, "Invalid email_sent")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Cursor pagination: both params must be provided together.
|
||||
rawTS := strings.TrimSpace(c.Query("before_fired_at"))
|
||||
rawID := strings.TrimSpace(c.Query("before_id"))
|
||||
if (rawTS == "") != (rawID == "") {
|
||||
response.BadRequest(c, "before_fired_at and before_id must be provided together")
|
||||
return
|
||||
}
|
||||
if rawTS != "" {
|
||||
ts, err := time.Parse(time.RFC3339Nano, rawTS)
|
||||
if err != nil {
|
||||
if t2, err2 := time.Parse(time.RFC3339, rawTS); err2 == nil {
|
||||
ts = t2
|
||||
} else {
|
||||
response.BadRequest(c, "Invalid before_fired_at")
|
||||
return
|
||||
}
|
||||
}
|
||||
filter.BeforeFiredAt = &ts
|
||||
}
|
||||
if rawID != "" {
|
||||
id, err := strconv.ParseInt(rawID, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid before_id")
|
||||
return
|
||||
}
|
||||
filter.BeforeID = &id
|
||||
}
|
||||
|
||||
// Optional global filter support (platform/group/time range).
|
||||
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||
filter.Platform = platform
|
||||
|
||||
@@ -19,6 +19,57 @@ type OpsHandler struct {
|
||||
opsService *service.OpsService
|
||||
}
|
||||
|
||||
// GetErrorLogByID returns ops error log detail.
|
||||
// GET /api/v1/admin/ops/errors/:id
|
||||
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
response.Success(c, detail)
|
||||
}
|
||||
|
||||
const (
|
||||
opsListViewErrors = "errors"
|
||||
opsListViewExcluded = "excluded"
|
||||
opsListViewAll = "all"
|
||||
)
|
||||
|
||||
func parseOpsViewParam(c *gin.Context) string {
|
||||
if c == nil {
|
||||
return ""
|
||||
}
|
||||
v := strings.ToLower(strings.TrimSpace(c.Query("view")))
|
||||
switch v {
|
||||
case "", opsListViewErrors:
|
||||
return opsListViewErrors
|
||||
case opsListViewExcluded:
|
||||
return opsListViewExcluded
|
||||
case opsListViewAll:
|
||||
return opsListViewAll
|
||||
default:
|
||||
return opsListViewErrors
|
||||
}
|
||||
}
|
||||
|
||||
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
|
||||
return &OpsHandler{opsService: opsService}
|
||||
}
|
||||
@@ -47,16 +98,26 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
filter := &service.OpsErrorLogFilter{
|
||||
Page: page,
|
||||
PageSize: pageSize,
|
||||
}
|
||||
filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
|
||||
|
||||
if !startTime.IsZero() {
|
||||
filter.StartTime = &startTime
|
||||
}
|
||||
if !endTime.IsZero() {
|
||||
filter.EndTime = &endTime
|
||||
}
|
||||
filter.View = parseOpsViewParam(c)
|
||||
filter.Phase = strings.TrimSpace(c.Query("phase"))
|
||||
filter.Owner = strings.TrimSpace(c.Query("error_owner"))
|
||||
filter.Source = strings.TrimSpace(c.Query("error_source"))
|
||||
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||
filter.UserQuery = strings.TrimSpace(c.Query("user_query"))
|
||||
|
||||
// Force request errors: client-visible status >= 400.
|
||||
// buildOpsErrorLogsWhere already applies this for non-upstream phase.
|
||||
if strings.EqualFold(strings.TrimSpace(filter.Phase), "upstream") {
|
||||
filter.Phase = ""
|
||||
}
|
||||
|
||||
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||
filter.Platform = platform
|
||||
@@ -77,11 +138,19 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
|
||||
}
|
||||
filter.AccountID = &id
|
||||
}
|
||||
if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
|
||||
filter.Phase = phase
|
||||
}
|
||||
if q := strings.TrimSpace(c.Query("q")); q != "" {
|
||||
filter.Query = q
|
||||
|
||||
if v := strings.TrimSpace(c.Query("resolved")); v != "" {
|
||||
switch strings.ToLower(v) {
|
||||
case "1", "true", "yes":
|
||||
b := true
|
||||
filter.Resolved = &b
|
||||
case "0", "false", "no":
|
||||
b := false
|
||||
filter.Resolved = &b
|
||||
default:
|
||||
response.BadRequest(c, "Invalid resolved")
|
||||
return
|
||||
}
|
||||
}
|
||||
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
|
||||
parts := strings.Split(statusCodesStr, ",")
|
||||
@@ -106,13 +175,120 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||
}
|
||||
|
||||
// GetErrorLogByID returns a single error log detail.
|
||||
// GET /api/v1/admin/ops/errors/:id
|
||||
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
|
||||
// ListRequestErrors lists client-visible request errors.
|
||||
// GET /api/v1/admin/ops/request-errors
|
||||
func (h *OpsHandler) ListRequestErrors(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
page, pageSize := response.ParsePagination(c)
|
||||
if pageSize > 500 {
|
||||
pageSize = 500
|
||||
}
|
||||
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||
if err != nil {
|
||||
response.BadRequest(c, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
|
||||
if !startTime.IsZero() {
|
||||
filter.StartTime = &startTime
|
||||
}
|
||||
if !endTime.IsZero() {
|
||||
filter.EndTime = &endTime
|
||||
}
|
||||
filter.View = parseOpsViewParam(c)
|
||||
filter.Phase = strings.TrimSpace(c.Query("phase"))
|
||||
filter.Owner = strings.TrimSpace(c.Query("error_owner"))
|
||||
filter.Source = strings.TrimSpace(c.Query("error_source"))
|
||||
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||
filter.UserQuery = strings.TrimSpace(c.Query("user_query"))
|
||||
|
||||
// Force request errors: client-visible status >= 400.
|
||||
// buildOpsErrorLogsWhere already applies this for non-upstream phase.
|
||||
if strings.EqualFold(strings.TrimSpace(filter.Phase), "upstream") {
|
||||
filter.Phase = ""
|
||||
}
|
||||
|
||||
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||
filter.Platform = platform
|
||||
}
|
||||
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||
id, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid group_id")
|
||||
return
|
||||
}
|
||||
filter.GroupID = &id
|
||||
}
|
||||
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||
id, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid account_id")
|
||||
return
|
||||
}
|
||||
filter.AccountID = &id
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(c.Query("resolved")); v != "" {
|
||||
switch strings.ToLower(v) {
|
||||
case "1", "true", "yes":
|
||||
b := true
|
||||
filter.Resolved = &b
|
||||
case "0", "false", "no":
|
||||
b := false
|
||||
filter.Resolved = &b
|
||||
default:
|
||||
response.BadRequest(c, "Invalid resolved")
|
||||
return
|
||||
}
|
||||
}
|
||||
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
|
||||
parts := strings.Split(statusCodesStr, ",")
|
||||
out := make([]int, 0, len(parts))
|
||||
for _, part := range parts {
|
||||
p := strings.TrimSpace(part)
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(p)
|
||||
if err != nil || n < 0 {
|
||||
response.BadRequest(c, "Invalid status_codes")
|
||||
return
|
||||
}
|
||||
out = append(out, n)
|
||||
}
|
||||
filter.StatusCodes = out
|
||||
}
|
||||
|
||||
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||
}
|
||||
|
||||
// GetRequestError returns request error detail.
|
||||
// GET /api/v1/admin/ops/request-errors/:id
|
||||
func (h *OpsHandler) GetRequestError(c *gin.Context) {
|
||||
// same storage; just proxy to existing detail
|
||||
h.GetErrorLogByID(c)
|
||||
}
|
||||
|
||||
// ListRequestErrorUpstreamErrors lists upstream error logs correlated to a request error.
|
||||
// GET /api/v1/admin/ops/request-errors/:id/upstream-errors
|
||||
func (h *OpsHandler) ListRequestErrorUpstreamErrors(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
@@ -129,15 +305,306 @@ func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Load request error to get correlation keys.
|
||||
detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
response.Success(c, detail)
|
||||
// Correlate by request_id/client_request_id.
|
||||
requestID := strings.TrimSpace(detail.RequestID)
|
||||
clientRequestID := strings.TrimSpace(detail.ClientRequestID)
|
||||
if requestID == "" && clientRequestID == "" {
|
||||
response.Paginated(c, []*service.OpsErrorLog{}, 0, 1, 10)
|
||||
return
|
||||
}
|
||||
|
||||
page, pageSize := response.ParsePagination(c)
|
||||
if pageSize > 500 {
|
||||
pageSize = 500
|
||||
}
|
||||
|
||||
// Keep correlation window wide enough so linked upstream errors
|
||||
// are discoverable even when UI defaults to 1h elsewhere.
|
||||
startTime, endTime, err := parseOpsTimeRange(c, "30d")
|
||||
if err != nil {
|
||||
response.BadRequest(c, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
|
||||
if !startTime.IsZero() {
|
||||
filter.StartTime = &startTime
|
||||
}
|
||||
if !endTime.IsZero() {
|
||||
filter.EndTime = &endTime
|
||||
}
|
||||
filter.View = "all"
|
||||
filter.Phase = "upstream"
|
||||
filter.Owner = "provider"
|
||||
filter.Source = strings.TrimSpace(c.Query("error_source"))
|
||||
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||
|
||||
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||
filter.Platform = platform
|
||||
}
|
||||
|
||||
// Prefer exact match on request_id; if missing, fall back to client_request_id.
|
||||
if requestID != "" {
|
||||
filter.RequestID = requestID
|
||||
} else {
|
||||
filter.ClientRequestID = clientRequestID
|
||||
}
|
||||
|
||||
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
// If client asks for details, expand each upstream error log to include upstream response fields.
|
||||
includeDetail := strings.TrimSpace(c.Query("include_detail"))
|
||||
if includeDetail == "1" || strings.EqualFold(includeDetail, "true") || strings.EqualFold(includeDetail, "yes") {
|
||||
details := make([]*service.OpsErrorLogDetail, 0, len(result.Errors))
|
||||
for _, item := range result.Errors {
|
||||
if item == nil {
|
||||
continue
|
||||
}
|
||||
d, err := h.opsService.GetErrorLogByID(c.Request.Context(), item.ID)
|
||||
if err != nil || d == nil {
|
||||
continue
|
||||
}
|
||||
details = append(details, d)
|
||||
}
|
||||
response.Paginated(c, details, int64(result.Total), result.Page, result.PageSize)
|
||||
return
|
||||
}
|
||||
|
||||
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||
}
|
||||
|
||||
// RetryRequestErrorClient retries the client request based on stored request body.
|
||||
// POST /api/v1/admin/ops/request-errors/:id/retry-client
|
||||
func (h *OpsHandler) RetryRequestErrorClient(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||
if !ok || subject.UserID <= 0 {
|
||||
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, service.OpsRetryModeClient, nil)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, result)
|
||||
}
|
||||
|
||||
// RetryRequestErrorUpstreamEvent retries a specific upstream attempt using captured upstream_request_body.
|
||||
// POST /api/v1/admin/ops/request-errors/:id/upstream-errors/:idx/retry
|
||||
func (h *OpsHandler) RetryRequestErrorUpstreamEvent(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||
if !ok || subject.UserID <= 0 {
|
||||
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
idxStr := strings.TrimSpace(c.Param("idx"))
|
||||
idx, err := strconv.Atoi(idxStr)
|
||||
if err != nil || idx < 0 {
|
||||
response.BadRequest(c, "Invalid upstream idx")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := h.opsService.RetryUpstreamEvent(c.Request.Context(), subject.UserID, id, idx)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, result)
|
||||
}
|
||||
|
||||
// ResolveRequestError toggles resolved status.
|
||||
// PUT /api/v1/admin/ops/request-errors/:id/resolve
|
||||
func (h *OpsHandler) ResolveRequestError(c *gin.Context) {
|
||||
h.UpdateErrorResolution(c)
|
||||
}
|
||||
|
||||
// ListUpstreamErrors lists independent upstream errors.
|
||||
// GET /api/v1/admin/ops/upstream-errors
|
||||
func (h *OpsHandler) ListUpstreamErrors(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
page, pageSize := response.ParsePagination(c)
|
||||
if pageSize > 500 {
|
||||
pageSize = 500
|
||||
}
|
||||
startTime, endTime, err := parseOpsTimeRange(c, "1h")
|
||||
if err != nil {
|
||||
response.BadRequest(c, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize}
|
||||
if !startTime.IsZero() {
|
||||
filter.StartTime = &startTime
|
||||
}
|
||||
if !endTime.IsZero() {
|
||||
filter.EndTime = &endTime
|
||||
}
|
||||
|
||||
filter.View = parseOpsViewParam(c)
|
||||
filter.Phase = "upstream"
|
||||
filter.Owner = "provider"
|
||||
filter.Source = strings.TrimSpace(c.Query("error_source"))
|
||||
filter.Query = strings.TrimSpace(c.Query("q"))
|
||||
|
||||
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
|
||||
filter.Platform = platform
|
||||
}
|
||||
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
|
||||
id, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid group_id")
|
||||
return
|
||||
}
|
||||
filter.GroupID = &id
|
||||
}
|
||||
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
|
||||
id, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid account_id")
|
||||
return
|
||||
}
|
||||
filter.AccountID = &id
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(c.Query("resolved")); v != "" {
|
||||
switch strings.ToLower(v) {
|
||||
case "1", "true", "yes":
|
||||
b := true
|
||||
filter.Resolved = &b
|
||||
case "0", "false", "no":
|
||||
b := false
|
||||
filter.Resolved = &b
|
||||
default:
|
||||
response.BadRequest(c, "Invalid resolved")
|
||||
return
|
||||
}
|
||||
}
|
||||
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
|
||||
parts := strings.Split(statusCodesStr, ",")
|
||||
out := make([]int, 0, len(parts))
|
||||
for _, part := range parts {
|
||||
p := strings.TrimSpace(part)
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(p)
|
||||
if err != nil || n < 0 {
|
||||
response.BadRequest(c, "Invalid status_codes")
|
||||
return
|
||||
}
|
||||
out = append(out, n)
|
||||
}
|
||||
filter.StatusCodes = out
|
||||
}
|
||||
|
||||
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
|
||||
}
|
||||
|
||||
// GetUpstreamError returns upstream error detail.
|
||||
// GET /api/v1/admin/ops/upstream-errors/:id
|
||||
func (h *OpsHandler) GetUpstreamError(c *gin.Context) {
|
||||
h.GetErrorLogByID(c)
|
||||
}
|
||||
|
||||
// RetryUpstreamError retries upstream error using the original account_id.
|
||||
// POST /api/v1/admin/ops/upstream-errors/:id/retry
|
||||
func (h *OpsHandler) RetryUpstreamError(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||
if !ok || subject.UserID <= 0 {
|
||||
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, service.OpsRetryModeUpstream, nil)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, result)
|
||||
}
|
||||
|
||||
// ResolveUpstreamError toggles resolved status.
|
||||
// PUT /api/v1/admin/ops/upstream-errors/:id/resolve
|
||||
func (h *OpsHandler) ResolveUpstreamError(c *gin.Context) {
|
||||
h.UpdateErrorResolution(c)
|
||||
}
|
||||
|
||||
// ==================== Existing endpoints ====================
|
||||
|
||||
// ListRequestDetails returns a request-level list (success + error) for drill-down.
|
||||
// GET /api/v1/admin/ops/requests
|
||||
func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
|
||||
@@ -242,6 +709,11 @@ func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
|
||||
type opsRetryRequest struct {
|
||||
Mode string `json:"mode"`
|
||||
PinnedAccountID *int64 `json:"pinned_account_id"`
|
||||
Force bool `json:"force"`
|
||||
}
|
||||
|
||||
type opsResolveRequest struct {
|
||||
Resolved bool `json:"resolved"`
|
||||
}
|
||||
|
||||
// RetryErrorRequest retries a failed request using stored request_body.
|
||||
@@ -278,6 +750,16 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
|
||||
req.Mode = service.OpsRetryModeClient
|
||||
}
|
||||
|
||||
// Force flag is currently a UI-level acknowledgement. Server may still enforce safety constraints.
|
||||
_ = req.Force
|
||||
|
||||
// Legacy endpoint safety: only allow retrying the client request here.
|
||||
// Upstream retries must go through the split endpoints.
|
||||
if strings.EqualFold(strings.TrimSpace(req.Mode), service.OpsRetryModeUpstream) {
|
||||
response.BadRequest(c, "upstream retry is not supported on this endpoint")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
@@ -287,6 +769,81 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
|
||||
response.Success(c, result)
|
||||
}
|
||||
|
||||
// ListRetryAttempts lists retry attempts for an error log.
|
||||
// GET /api/v1/admin/ops/errors/:id/retries
|
||||
func (h *OpsHandler) ListRetryAttempts(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
limit := 50
|
||||
if v := strings.TrimSpace(c.Query("limit")); v != "" {
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil || n <= 0 {
|
||||
response.BadRequest(c, "Invalid limit")
|
||||
return
|
||||
}
|
||||
limit = n
|
||||
}
|
||||
|
||||
items, err := h.opsService.ListRetryAttemptsByErrorID(c.Request.Context(), id, limit)
|
||||
if err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, items)
|
||||
}
|
||||
|
||||
// UpdateErrorResolution allows manual resolve/unresolve.
|
||||
// PUT /api/v1/admin/ops/errors/:id/resolve
|
||||
func (h *OpsHandler) UpdateErrorResolution(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
subject, ok := middleware.GetAuthSubjectFromContext(c)
|
||||
if !ok || subject.UserID <= 0 {
|
||||
response.Error(c, http.StatusUnauthorized, "Unauthorized")
|
||||
return
|
||||
}
|
||||
|
||||
idStr := strings.TrimSpace(c.Param("id"))
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
response.BadRequest(c, "Invalid error id")
|
||||
return
|
||||
}
|
||||
|
||||
var req opsResolveRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
response.BadRequest(c, "Invalid request: "+err.Error())
|
||||
return
|
||||
}
|
||||
uid := subject.UserID
|
||||
if err := h.opsService.UpdateErrorResolution(c.Request.Context(), id, req.Resolved, &uid, nil); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
response.Success(c, gin.H{"ok": true})
|
||||
}
|
||||
|
||||
func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
|
||||
startStr := strings.TrimSpace(c.Query("start_time"))
|
||||
endStr := strings.TrimSpace(c.Query("end_time"))
|
||||
@@ -358,6 +915,10 @@ func parseOpsDuration(v string) (time.Duration, bool) {
|
||||
return 6 * time.Hour, true
|
||||
case "24h":
|
||||
return 24 * time.Hour, true
|
||||
case "7d":
|
||||
return 7 * 24 * time.Hour, true
|
||||
case "30d":
|
||||
return 30 * 24 * time.Hour, true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
|
||||
@@ -544,6 +544,11 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
||||
body := w.buf.Bytes()
|
||||
parsed := parseOpsErrorResponse(body)
|
||||
|
||||
// Skip logging if the error should be filtered based on settings
|
||||
if shouldSkipOpsErrorLog(c.Request.Context(), ops, parsed.Message, string(body), c.Request.URL.Path) {
|
||||
return
|
||||
}
|
||||
|
||||
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
|
||||
|
||||
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
|
||||
@@ -832,28 +837,30 @@ func normalizeOpsErrorType(errType string, code string) string {
|
||||
|
||||
func classifyOpsPhase(errType, message, code string) string {
|
||||
msg := strings.ToLower(message)
|
||||
// Standardized phases: request|auth|routing|upstream|network|internal
|
||||
// Map billing/concurrency/response => request; scheduling => routing.
|
||||
switch strings.TrimSpace(code) {
|
||||
case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
|
||||
return "billing"
|
||||
return "request"
|
||||
}
|
||||
|
||||
switch errType {
|
||||
case "authentication_error":
|
||||
return "auth"
|
||||
case "billing_error", "subscription_error":
|
||||
return "billing"
|
||||
return "request"
|
||||
case "rate_limit_error":
|
||||
if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") {
|
||||
return "concurrency"
|
||||
return "request"
|
||||
}
|
||||
return "upstream"
|
||||
case "invalid_request_error":
|
||||
return "response"
|
||||
return "request"
|
||||
case "upstream_error", "overloaded_error":
|
||||
return "upstream"
|
||||
case "api_error":
|
||||
if strings.Contains(msg, "no available accounts") {
|
||||
return "scheduling"
|
||||
return "routing"
|
||||
}
|
||||
return "internal"
|
||||
default:
|
||||
@@ -914,34 +921,38 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa
|
||||
}
|
||||
|
||||
func classifyOpsErrorOwner(phase string, message string) string {
|
||||
// Standardized owners: client|provider|platform
|
||||
switch phase {
|
||||
case "upstream", "network":
|
||||
return "provider"
|
||||
case "billing", "concurrency", "auth", "response":
|
||||
case "request", "auth":
|
||||
return "client"
|
||||
case "routing", "internal":
|
||||
return "platform"
|
||||
default:
|
||||
if strings.Contains(strings.ToLower(message), "upstream") {
|
||||
return "provider"
|
||||
}
|
||||
return "sub2api"
|
||||
return "platform"
|
||||
}
|
||||
}
|
||||
|
||||
func classifyOpsErrorSource(phase string, message string) string {
|
||||
// Standardized sources: client_request|upstream_http|gateway
|
||||
switch phase {
|
||||
case "upstream":
|
||||
return "upstream_http"
|
||||
case "network":
|
||||
return "upstream_network"
|
||||
case "billing":
|
||||
return "billing"
|
||||
case "concurrency":
|
||||
return "concurrency"
|
||||
return "gateway"
|
||||
case "request", "auth":
|
||||
return "client_request"
|
||||
case "routing", "internal":
|
||||
return "gateway"
|
||||
default:
|
||||
if strings.Contains(strings.ToLower(message), "upstream") {
|
||||
return "upstream_http"
|
||||
}
|
||||
return "internal"
|
||||
return "gateway"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -963,3 +974,42 @@ func truncateString(s string, max int) string {
|
||||
func strconvItoa(v int) string {
|
||||
return strconv.Itoa(v)
|
||||
}
|
||||
|
||||
// shouldSkipOpsErrorLog determines if an error should be skipped from logging based on settings.
|
||||
// Returns true for errors that should be filtered according to OpsAdvancedSettings.
|
||||
func shouldSkipOpsErrorLog(ctx context.Context, ops *service.OpsService, message, body, requestPath string) bool {
|
||||
if ops == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// Get advanced settings to check filter configuration
|
||||
settings, err := ops.GetOpsAdvancedSettings(ctx)
|
||||
if err != nil || settings == nil {
|
||||
// If we can't get settings, don't skip (fail open)
|
||||
return false
|
||||
}
|
||||
|
||||
msgLower := strings.ToLower(message)
|
||||
bodyLower := strings.ToLower(body)
|
||||
|
||||
// Check if count_tokens errors should be ignored
|
||||
if settings.IgnoreCountTokensErrors && strings.Contains(requestPath, "/count_tokens") {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check if context canceled errors should be ignored (client disconnects)
|
||||
if settings.IgnoreContextCanceled {
|
||||
if strings.Contains(msgLower, "context canceled") || strings.Contains(bodyLower, "context canceled") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Check if "no available accounts" errors should be ignored
|
||||
if settings.IgnoreNoAvailableAccounts {
|
||||
if strings.Contains(msgLower, "no available accounts") || strings.Contains(bodyLower, "no available accounts") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -55,7 +55,6 @@ INSERT INTO ops_error_logs (
|
||||
upstream_error_message,
|
||||
upstream_error_detail,
|
||||
upstream_errors,
|
||||
duration_ms,
|
||||
time_to_first_token_ms,
|
||||
request_body,
|
||||
request_body_truncated,
|
||||
@@ -65,7 +64,7 @@ INSERT INTO ops_error_logs (
|
||||
retry_count,
|
||||
created_at
|
||||
) VALUES (
|
||||
$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35
|
||||
$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
|
||||
) RETURNING id`
|
||||
|
||||
var id int64
|
||||
@@ -98,7 +97,6 @@ INSERT INTO ops_error_logs (
|
||||
opsNullString(input.UpstreamErrorMessage),
|
||||
opsNullString(input.UpstreamErrorDetail),
|
||||
opsNullString(input.UpstreamErrorsJSON),
|
||||
opsNullInt(input.DurationMs),
|
||||
opsNullInt64(input.TimeToFirstTokenMs),
|
||||
opsNullString(input.RequestBodyJSON),
|
||||
input.RequestBodyTruncated,
|
||||
@@ -135,7 +133,7 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr
|
||||
}
|
||||
|
||||
where, args := buildOpsErrorLogsWhere(filter)
|
||||
countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where
|
||||
countSQL := "SELECT COUNT(*) FROM ops_error_logs e " + where
|
||||
|
||||
var total int
|
||||
if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil {
|
||||
@@ -146,28 +144,43 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr
|
||||
argsWithLimit := append(args, pageSize, offset)
|
||||
selectSQL := `
|
||||
SELECT
|
||||
id,
|
||||
created_at,
|
||||
error_phase,
|
||||
error_type,
|
||||
severity,
|
||||
COALESCE(upstream_status_code, status_code, 0),
|
||||
COALESCE(platform, ''),
|
||||
COALESCE(model, ''),
|
||||
duration_ms,
|
||||
COALESCE(client_request_id, ''),
|
||||
COALESCE(request_id, ''),
|
||||
COALESCE(error_message, ''),
|
||||
user_id,
|
||||
api_key_id,
|
||||
account_id,
|
||||
group_id,
|
||||
CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
|
||||
COALESCE(request_path, ''),
|
||||
stream
|
||||
FROM ops_error_logs
|
||||
e.id,
|
||||
e.created_at,
|
||||
e.error_phase,
|
||||
e.error_type,
|
||||
COALESCE(e.error_owner, ''),
|
||||
COALESCE(e.error_source, ''),
|
||||
e.severity,
|
||||
COALESCE(e.upstream_status_code, e.status_code, 0),
|
||||
COALESCE(e.platform, ''),
|
||||
COALESCE(e.model, ''),
|
||||
COALESCE(e.is_retryable, false),
|
||||
COALESCE(e.retry_count, 0),
|
||||
COALESCE(e.resolved, false),
|
||||
e.resolved_at,
|
||||
e.resolved_by_user_id,
|
||||
COALESCE(u2.email, ''),
|
||||
e.resolved_retry_id,
|
||||
COALESCE(e.client_request_id, ''),
|
||||
COALESCE(e.request_id, ''),
|
||||
COALESCE(e.error_message, ''),
|
||||
e.user_id,
|
||||
COALESCE(u.email, ''),
|
||||
e.api_key_id,
|
||||
e.account_id,
|
||||
COALESCE(a.name, ''),
|
||||
e.group_id,
|
||||
COALESCE(g.name, ''),
|
||||
CASE WHEN e.client_ip IS NULL THEN NULL ELSE e.client_ip::text END,
|
||||
COALESCE(e.request_path, ''),
|
||||
e.stream
|
||||
FROM ops_error_logs e
|
||||
LEFT JOIN accounts a ON e.account_id = a.id
|
||||
LEFT JOIN groups g ON e.group_id = g.id
|
||||
LEFT JOIN users u ON e.user_id = u.id
|
||||
LEFT JOIN users u2 ON e.resolved_by_user_id = u2.id
|
||||
` + where + `
|
||||
ORDER BY created_at DESC
|
||||
ORDER BY e.created_at DESC
|
||||
LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
|
||||
|
||||
rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...)
|
||||
@@ -179,39 +192,65 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
|
||||
out := make([]*service.OpsErrorLog, 0, pageSize)
|
||||
for rows.Next() {
|
||||
var item service.OpsErrorLog
|
||||
var latency sql.NullInt64
|
||||
var statusCode sql.NullInt64
|
||||
var clientIP sql.NullString
|
||||
var userID sql.NullInt64
|
||||
var apiKeyID sql.NullInt64
|
||||
var accountID sql.NullInt64
|
||||
var accountName string
|
||||
var groupID sql.NullInt64
|
||||
var groupName string
|
||||
var userEmail string
|
||||
var resolvedAt sql.NullTime
|
||||
var resolvedBy sql.NullInt64
|
||||
var resolvedByName string
|
||||
var resolvedRetryID sql.NullInt64
|
||||
if err := rows.Scan(
|
||||
&item.ID,
|
||||
&item.CreatedAt,
|
||||
&item.Phase,
|
||||
&item.Type,
|
||||
&item.Owner,
|
||||
&item.Source,
|
||||
&item.Severity,
|
||||
&statusCode,
|
||||
&item.Platform,
|
||||
&item.Model,
|
||||
&latency,
|
||||
&item.IsRetryable,
|
||||
&item.RetryCount,
|
||||
&item.Resolved,
|
||||
&resolvedAt,
|
||||
&resolvedBy,
|
||||
&resolvedByName,
|
||||
&resolvedRetryID,
|
||||
&item.ClientRequestID,
|
||||
&item.RequestID,
|
||||
&item.Message,
|
||||
&userID,
|
||||
&userEmail,
|
||||
&apiKeyID,
|
||||
&accountID,
|
||||
&accountName,
|
||||
&groupID,
|
||||
&groupName,
|
||||
&clientIP,
|
||||
&item.RequestPath,
|
||||
&item.Stream,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if latency.Valid {
|
||||
v := int(latency.Int64)
|
||||
item.LatencyMs = &v
|
||||
if resolvedAt.Valid {
|
||||
t := resolvedAt.Time
|
||||
item.ResolvedAt = &t
|
||||
}
|
||||
if resolvedBy.Valid {
|
||||
v := resolvedBy.Int64
|
||||
item.ResolvedByUserID = &v
|
||||
}
|
||||
item.ResolvedByUserName = resolvedByName
|
||||
if resolvedRetryID.Valid {
|
||||
v := resolvedRetryID.Int64
|
||||
item.ResolvedRetryID = &v
|
||||
}
|
||||
item.StatusCode = int(statusCode.Int64)
|
||||
if clientIP.Valid {
|
||||
@@ -222,6 +261,7 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
|
||||
v := userID.Int64
|
||||
item.UserID = &v
|
||||
}
|
||||
item.UserEmail = userEmail
|
||||
if apiKeyID.Valid {
|
||||
v := apiKeyID.Int64
|
||||
item.APIKeyID = &v
|
||||
@@ -230,10 +270,12 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
|
||||
v := accountID.Int64
|
||||
item.AccountID = &v
|
||||
}
|
||||
item.AccountName = accountName
|
||||
if groupID.Valid {
|
||||
v := groupID.Int64
|
||||
item.GroupID = &v
|
||||
}
|
||||
item.GroupName = groupName
|
||||
out = append(out, &item)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
@@ -258,49 +300,64 @@ func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
id,
|
||||
created_at,
|
||||
error_phase,
|
||||
error_type,
|
||||
severity,
|
||||
COALESCE(upstream_status_code, status_code, 0),
|
||||
COALESCE(platform, ''),
|
||||
COALESCE(model, ''),
|
||||
duration_ms,
|
||||
COALESCE(client_request_id, ''),
|
||||
COALESCE(request_id, ''),
|
||||
COALESCE(error_message, ''),
|
||||
COALESCE(error_body, ''),
|
||||
upstream_status_code,
|
||||
COALESCE(upstream_error_message, ''),
|
||||
COALESCE(upstream_error_detail, ''),
|
||||
COALESCE(upstream_errors::text, ''),
|
||||
is_business_limited,
|
||||
user_id,
|
||||
api_key_id,
|
||||
account_id,
|
||||
group_id,
|
||||
CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
|
||||
COALESCE(request_path, ''),
|
||||
stream,
|
||||
COALESCE(user_agent, ''),
|
||||
auth_latency_ms,
|
||||
routing_latency_ms,
|
||||
upstream_latency_ms,
|
||||
response_latency_ms,
|
||||
time_to_first_token_ms,
|
||||
COALESCE(request_body::text, ''),
|
||||
request_body_truncated,
|
||||
request_body_bytes,
|
||||
COALESCE(request_headers::text, '')
|
||||
FROM ops_error_logs
|
||||
WHERE id = $1
|
||||
e.id,
|
||||
e.created_at,
|
||||
e.error_phase,
|
||||
e.error_type,
|
||||
COALESCE(e.error_owner, ''),
|
||||
COALESCE(e.error_source, ''),
|
||||
e.severity,
|
||||
COALESCE(e.upstream_status_code, e.status_code, 0),
|
||||
COALESCE(e.platform, ''),
|
||||
COALESCE(e.model, ''),
|
||||
COALESCE(e.is_retryable, false),
|
||||
COALESCE(e.retry_count, 0),
|
||||
COALESCE(e.resolved, false),
|
||||
e.resolved_at,
|
||||
e.resolved_by_user_id,
|
||||
e.resolved_retry_id,
|
||||
COALESCE(e.client_request_id, ''),
|
||||
COALESCE(e.request_id, ''),
|
||||
COALESCE(e.error_message, ''),
|
||||
COALESCE(e.error_body, ''),
|
||||
e.upstream_status_code,
|
||||
COALESCE(e.upstream_error_message, ''),
|
||||
COALESCE(e.upstream_error_detail, ''),
|
||||
COALESCE(e.upstream_errors::text, ''),
|
||||
e.is_business_limited,
|
||||
e.user_id,
|
||||
COALESCE(u.email, ''),
|
||||
e.api_key_id,
|
||||
e.account_id,
|
||||
COALESCE(a.name, ''),
|
||||
e.group_id,
|
||||
COALESCE(g.name, ''),
|
||||
CASE WHEN e.client_ip IS NULL THEN NULL ELSE e.client_ip::text END,
|
||||
COALESCE(e.request_path, ''),
|
||||
e.stream,
|
||||
COALESCE(e.user_agent, ''),
|
||||
e.auth_latency_ms,
|
||||
e.routing_latency_ms,
|
||||
e.upstream_latency_ms,
|
||||
e.response_latency_ms,
|
||||
e.time_to_first_token_ms,
|
||||
COALESCE(e.request_body::text, ''),
|
||||
e.request_body_truncated,
|
||||
e.request_body_bytes,
|
||||
COALESCE(e.request_headers::text, '')
|
||||
FROM ops_error_logs e
|
||||
LEFT JOIN users u ON e.user_id = u.id
|
||||
LEFT JOIN accounts a ON e.account_id = a.id
|
||||
LEFT JOIN groups g ON e.group_id = g.id
|
||||
WHERE e.id = $1
|
||||
LIMIT 1`
|
||||
|
||||
var out service.OpsErrorLogDetail
|
||||
var latency sql.NullInt64
|
||||
var statusCode sql.NullInt64
|
||||
var upstreamStatusCode sql.NullInt64
|
||||
var resolvedAt sql.NullTime
|
||||
var resolvedBy sql.NullInt64
|
||||
var resolvedRetryID sql.NullInt64
|
||||
var clientIP sql.NullString
|
||||
var userID sql.NullInt64
|
||||
var apiKeyID sql.NullInt64
|
||||
@@ -318,11 +375,18 @@ LIMIT 1`
|
||||
&out.CreatedAt,
|
||||
&out.Phase,
|
||||
&out.Type,
|
||||
&out.Owner,
|
||||
&out.Source,
|
||||
&out.Severity,
|
||||
&statusCode,
|
||||
&out.Platform,
|
||||
&out.Model,
|
||||
&latency,
|
||||
&out.IsRetryable,
|
||||
&out.RetryCount,
|
||||
&out.Resolved,
|
||||
&resolvedAt,
|
||||
&resolvedBy,
|
||||
&resolvedRetryID,
|
||||
&out.ClientRequestID,
|
||||
&out.RequestID,
|
||||
&out.Message,
|
||||
@@ -333,9 +397,12 @@ LIMIT 1`
|
||||
&out.UpstreamErrors,
|
||||
&out.IsBusinessLimited,
|
||||
&userID,
|
||||
&out.UserEmail,
|
||||
&apiKeyID,
|
||||
&accountID,
|
||||
&out.AccountName,
|
||||
&groupID,
|
||||
&out.GroupName,
|
||||
&clientIP,
|
||||
&out.RequestPath,
|
||||
&out.Stream,
|
||||
@@ -355,9 +422,17 @@ LIMIT 1`
|
||||
}
|
||||
|
||||
out.StatusCode = int(statusCode.Int64)
|
||||
if latency.Valid {
|
||||
v := int(latency.Int64)
|
||||
out.LatencyMs = &v
|
||||
if resolvedAt.Valid {
|
||||
t := resolvedAt.Time
|
||||
out.ResolvedAt = &t
|
||||
}
|
||||
if resolvedBy.Valid {
|
||||
v := resolvedBy.Int64
|
||||
out.ResolvedByUserID = &v
|
||||
}
|
||||
if resolvedRetryID.Valid {
|
||||
v := resolvedRetryID.Int64
|
||||
out.ResolvedRetryID = &v
|
||||
}
|
||||
if clientIP.Valid {
|
||||
s := clientIP.String
|
||||
@@ -487,9 +562,15 @@ SET
|
||||
status = $2,
|
||||
finished_at = $3,
|
||||
duration_ms = $4,
|
||||
result_request_id = $5,
|
||||
result_error_id = $6,
|
||||
error_message = $7
|
||||
success = $5,
|
||||
http_status_code = $6,
|
||||
upstream_request_id = $7,
|
||||
used_account_id = $8,
|
||||
response_preview = $9,
|
||||
response_truncated = $10,
|
||||
result_request_id = $11,
|
||||
result_error_id = $12,
|
||||
error_message = $13
|
||||
WHERE id = $1`
|
||||
|
||||
_, err := r.db.ExecContext(
|
||||
@@ -499,8 +580,14 @@ WHERE id = $1`
|
||||
strings.TrimSpace(input.Status),
|
||||
nullTime(input.FinishedAt),
|
||||
input.DurationMs,
|
||||
nullBool(input.Success),
|
||||
nullInt(input.HTTPStatusCode),
|
||||
opsNullString(input.UpstreamRequestID),
|
||||
nullInt64(input.UsedAccountID),
|
||||
opsNullString(input.ResponsePreview),
|
||||
nullBool(input.ResponseTruncated),
|
||||
opsNullString(input.ResultRequestID),
|
||||
opsNullInt64(input.ResultErrorID),
|
||||
nullInt64(input.ResultErrorID),
|
||||
opsNullString(input.ErrorMessage),
|
||||
)
|
||||
return err
|
||||
@@ -526,6 +613,12 @@ SELECT
|
||||
started_at,
|
||||
finished_at,
|
||||
duration_ms,
|
||||
success,
|
||||
http_status_code,
|
||||
upstream_request_id,
|
||||
used_account_id,
|
||||
response_preview,
|
||||
response_truncated,
|
||||
result_request_id,
|
||||
result_error_id,
|
||||
error_message
|
||||
@@ -540,6 +633,12 @@ LIMIT 1`
|
||||
var startedAt sql.NullTime
|
||||
var finishedAt sql.NullTime
|
||||
var durationMs sql.NullInt64
|
||||
var success sql.NullBool
|
||||
var httpStatusCode sql.NullInt64
|
||||
var upstreamRequestID sql.NullString
|
||||
var usedAccountID sql.NullInt64
|
||||
var responsePreview sql.NullString
|
||||
var responseTruncated sql.NullBool
|
||||
var resultRequestID sql.NullString
|
||||
var resultErrorID sql.NullInt64
|
||||
var errorMessage sql.NullString
|
||||
@@ -555,6 +654,12 @@ LIMIT 1`
|
||||
&startedAt,
|
||||
&finishedAt,
|
||||
&durationMs,
|
||||
&success,
|
||||
&httpStatusCode,
|
||||
&upstreamRequestID,
|
||||
&usedAccountID,
|
||||
&responsePreview,
|
||||
&responseTruncated,
|
||||
&resultRequestID,
|
||||
&resultErrorID,
|
||||
&errorMessage,
|
||||
@@ -579,6 +684,30 @@ LIMIT 1`
|
||||
v := durationMs.Int64
|
||||
out.DurationMs = &v
|
||||
}
|
||||
if success.Valid {
|
||||
v := success.Bool
|
||||
out.Success = &v
|
||||
}
|
||||
if httpStatusCode.Valid {
|
||||
v := int(httpStatusCode.Int64)
|
||||
out.HTTPStatusCode = &v
|
||||
}
|
||||
if upstreamRequestID.Valid {
|
||||
s := upstreamRequestID.String
|
||||
out.UpstreamRequestID = &s
|
||||
}
|
||||
if usedAccountID.Valid {
|
||||
v := usedAccountID.Int64
|
||||
out.UsedAccountID = &v
|
||||
}
|
||||
if responsePreview.Valid {
|
||||
s := responsePreview.String
|
||||
out.ResponsePreview = &s
|
||||
}
|
||||
if responseTruncated.Valid {
|
||||
v := responseTruncated.Bool
|
||||
out.ResponseTruncated = &v
|
||||
}
|
||||
if resultRequestID.Valid {
|
||||
s := resultRequestID.String
|
||||
out.ResultRequestID = &s
|
||||
@@ -602,30 +731,234 @@ func nullTime(t time.Time) sql.NullTime {
|
||||
return sql.NullTime{Time: t, Valid: true}
|
||||
}
|
||||
|
||||
func nullBool(v *bool) sql.NullBool {
|
||||
if v == nil {
|
||||
return sql.NullBool{}
|
||||
}
|
||||
return sql.NullBool{Bool: *v, Valid: true}
|
||||
}
|
||||
|
||||
func (r *opsRepository) ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*service.OpsRetryAttempt, error) {
|
||||
if r == nil || r.db == nil {
|
||||
return nil, fmt.Errorf("nil ops repository")
|
||||
}
|
||||
if sourceErrorID <= 0 {
|
||||
return nil, fmt.Errorf("invalid source_error_id")
|
||||
}
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
if limit > 200 {
|
||||
limit = 200
|
||||
}
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
r.id,
|
||||
r.created_at,
|
||||
COALESCE(r.requested_by_user_id, 0),
|
||||
r.source_error_id,
|
||||
COALESCE(r.mode, ''),
|
||||
r.pinned_account_id,
|
||||
COALESCE(pa.name, ''),
|
||||
COALESCE(r.status, ''),
|
||||
r.started_at,
|
||||
r.finished_at,
|
||||
r.duration_ms,
|
||||
r.success,
|
||||
r.http_status_code,
|
||||
r.upstream_request_id,
|
||||
r.used_account_id,
|
||||
COALESCE(ua.name, ''),
|
||||
r.response_preview,
|
||||
r.response_truncated,
|
||||
r.result_request_id,
|
||||
r.result_error_id,
|
||||
r.error_message
|
||||
FROM ops_retry_attempts r
|
||||
LEFT JOIN accounts pa ON r.pinned_account_id = pa.id
|
||||
LEFT JOIN accounts ua ON r.used_account_id = ua.id
|
||||
WHERE r.source_error_id = $1
|
||||
ORDER BY r.created_at DESC
|
||||
LIMIT $2`
|
||||
|
||||
rows, err := r.db.QueryContext(ctx, q, sourceErrorID, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() { _ = rows.Close() }()
|
||||
|
||||
out := make([]*service.OpsRetryAttempt, 0, 16)
|
||||
for rows.Next() {
|
||||
var item service.OpsRetryAttempt
|
||||
var pinnedAccountID sql.NullInt64
|
||||
var pinnedAccountName string
|
||||
var requestedBy sql.NullInt64
|
||||
var startedAt sql.NullTime
|
||||
var finishedAt sql.NullTime
|
||||
var durationMs sql.NullInt64
|
||||
var success sql.NullBool
|
||||
var httpStatusCode sql.NullInt64
|
||||
var upstreamRequestID sql.NullString
|
||||
var usedAccountID sql.NullInt64
|
||||
var usedAccountName string
|
||||
var responsePreview sql.NullString
|
||||
var responseTruncated sql.NullBool
|
||||
var resultRequestID sql.NullString
|
||||
var resultErrorID sql.NullInt64
|
||||
var errorMessage sql.NullString
|
||||
|
||||
if err := rows.Scan(
|
||||
&item.ID,
|
||||
&item.CreatedAt,
|
||||
&requestedBy,
|
||||
&item.SourceErrorID,
|
||||
&item.Mode,
|
||||
&pinnedAccountID,
|
||||
&pinnedAccountName,
|
||||
&item.Status,
|
||||
&startedAt,
|
||||
&finishedAt,
|
||||
&durationMs,
|
||||
&success,
|
||||
&httpStatusCode,
|
||||
&upstreamRequestID,
|
||||
&usedAccountID,
|
||||
&usedAccountName,
|
||||
&responsePreview,
|
||||
&responseTruncated,
|
||||
&resultRequestID,
|
||||
&resultErrorID,
|
||||
&errorMessage,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
item.RequestedByUserID = requestedBy.Int64
|
||||
if pinnedAccountID.Valid {
|
||||
v := pinnedAccountID.Int64
|
||||
item.PinnedAccountID = &v
|
||||
}
|
||||
item.PinnedAccountName = pinnedAccountName
|
||||
if startedAt.Valid {
|
||||
t := startedAt.Time
|
||||
item.StartedAt = &t
|
||||
}
|
||||
if finishedAt.Valid {
|
||||
t := finishedAt.Time
|
||||
item.FinishedAt = &t
|
||||
}
|
||||
if durationMs.Valid {
|
||||
v := durationMs.Int64
|
||||
item.DurationMs = &v
|
||||
}
|
||||
if success.Valid {
|
||||
v := success.Bool
|
||||
item.Success = &v
|
||||
}
|
||||
if httpStatusCode.Valid {
|
||||
v := int(httpStatusCode.Int64)
|
||||
item.HTTPStatusCode = &v
|
||||
}
|
||||
if upstreamRequestID.Valid {
|
||||
item.UpstreamRequestID = &upstreamRequestID.String
|
||||
}
|
||||
if usedAccountID.Valid {
|
||||
v := usedAccountID.Int64
|
||||
item.UsedAccountID = &v
|
||||
}
|
||||
item.UsedAccountName = usedAccountName
|
||||
if responsePreview.Valid {
|
||||
item.ResponsePreview = &responsePreview.String
|
||||
}
|
||||
if responseTruncated.Valid {
|
||||
v := responseTruncated.Bool
|
||||
item.ResponseTruncated = &v
|
||||
}
|
||||
if resultRequestID.Valid {
|
||||
item.ResultRequestID = &resultRequestID.String
|
||||
}
|
||||
if resultErrorID.Valid {
|
||||
v := resultErrorID.Int64
|
||||
item.ResultErrorID = &v
|
||||
}
|
||||
if errorMessage.Valid {
|
||||
item.ErrorMessage = &errorMessage.String
|
||||
}
|
||||
out = append(out, &item)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (r *opsRepository) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error {
|
||||
if r == nil || r.db == nil {
|
||||
return fmt.Errorf("nil ops repository")
|
||||
}
|
||||
if errorID <= 0 {
|
||||
return fmt.Errorf("invalid error id")
|
||||
}
|
||||
|
||||
q := `
|
||||
UPDATE ops_error_logs
|
||||
SET
|
||||
resolved = $2,
|
||||
resolved_at = $3,
|
||||
resolved_by_user_id = $4,
|
||||
resolved_retry_id = $5
|
||||
WHERE id = $1`
|
||||
|
||||
at := sql.NullTime{}
|
||||
if resolvedAt != nil && !resolvedAt.IsZero() {
|
||||
at = sql.NullTime{Time: resolvedAt.UTC(), Valid: true}
|
||||
} else if resolved {
|
||||
now := time.Now().UTC()
|
||||
at = sql.NullTime{Time: now, Valid: true}
|
||||
}
|
||||
|
||||
_, err := r.db.ExecContext(
|
||||
ctx,
|
||||
q,
|
||||
errorID,
|
||||
resolved,
|
||||
at,
|
||||
nullInt64(resolvedByUserID),
|
||||
nullInt64(resolvedRetryID),
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||
clauses := make([]string, 0, 8)
|
||||
args := make([]any, 0, 8)
|
||||
clauses := make([]string, 0, 12)
|
||||
args := make([]any, 0, 12)
|
||||
clauses = append(clauses, "1=1")
|
||||
|
||||
phaseFilter := ""
|
||||
if filter != nil {
|
||||
phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase))
|
||||
}
|
||||
// ops_error_logs primarily stores client-visible error requests (status>=400),
|
||||
// ops_error_logs stores client-visible error requests (status>=400),
|
||||
// but we also persist "recovered" upstream errors (status<400) for upstream health visibility.
|
||||
// By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
|
||||
// If Resolved is not specified, do not filter by resolved state (backward-compatible).
|
||||
resolvedFilter := (*bool)(nil)
|
||||
if filter != nil {
|
||||
resolvedFilter = filter.Resolved
|
||||
}
|
||||
// Keep list endpoints scoped to client errors unless explicitly filtering upstream phase.
|
||||
if phaseFilter != "upstream" {
|
||||
clauses = append(clauses, "COALESCE(status_code, 0) >= 400")
|
||||
}
|
||||
|
||||
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||
args = append(args, filter.StartTime.UTC())
|
||||
clauses = append(clauses, "created_at >= $"+itoa(len(args)))
|
||||
clauses = append(clauses, "e.created_at >= $"+itoa(len(args)))
|
||||
}
|
||||
if filter.EndTime != nil && !filter.EndTime.IsZero() {
|
||||
args = append(args, filter.EndTime.UTC())
|
||||
// Keep time-window semantics consistent with other ops queries: [start, end)
|
||||
clauses = append(clauses, "created_at < $"+itoa(len(args)))
|
||||
clauses = append(clauses, "e.created_at < $"+itoa(len(args)))
|
||||
}
|
||||
if p := strings.TrimSpace(filter.Platform); p != "" {
|
||||
args = append(args, p)
|
||||
@@ -643,10 +976,59 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||
args = append(args, phase)
|
||||
clauses = append(clauses, "error_phase = $"+itoa(len(args)))
|
||||
}
|
||||
if filter != nil {
|
||||
if owner := strings.TrimSpace(strings.ToLower(filter.Owner)); owner != "" {
|
||||
args = append(args, owner)
|
||||
clauses = append(clauses, "LOWER(COALESCE(error_owner,'')) = $"+itoa(len(args)))
|
||||
}
|
||||
if source := strings.TrimSpace(strings.ToLower(filter.Source)); source != "" {
|
||||
args = append(args, source)
|
||||
clauses = append(clauses, "LOWER(COALESCE(error_source,'')) = $"+itoa(len(args)))
|
||||
}
|
||||
}
|
||||
if resolvedFilter != nil {
|
||||
args = append(args, *resolvedFilter)
|
||||
clauses = append(clauses, "COALESCE(resolved,false) = $"+itoa(len(args)))
|
||||
}
|
||||
|
||||
// View filter: errors vs excluded vs all.
|
||||
// Excluded = upstream 429/529 and business-limited (quota/concurrency/billing) errors.
|
||||
view := ""
|
||||
if filter != nil {
|
||||
view = strings.ToLower(strings.TrimSpace(filter.View))
|
||||
}
|
||||
switch view {
|
||||
case "", "errors":
|
||||
clauses = append(clauses, "COALESCE(is_business_limited,false) = false")
|
||||
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)")
|
||||
case "excluded":
|
||||
clauses = append(clauses, "(COALESCE(is_business_limited,false) = true OR COALESCE(upstream_status_code, status_code, 0) IN (429, 529))")
|
||||
case "all":
|
||||
// no-op
|
||||
default:
|
||||
// treat unknown as default 'errors'
|
||||
clauses = append(clauses, "COALESCE(is_business_limited,false) = false")
|
||||
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)")
|
||||
}
|
||||
if len(filter.StatusCodes) > 0 {
|
||||
args = append(args, pq.Array(filter.StatusCodes))
|
||||
clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")")
|
||||
} else if filter.StatusCodesOther {
|
||||
// "Other" means: status codes not in the common list.
|
||||
known := []int{400, 401, 403, 404, 409, 422, 429, 500, 502, 503, 504, 529}
|
||||
args = append(args, pq.Array(known))
|
||||
clauses = append(clauses, "NOT (COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+"))")
|
||||
}
|
||||
// Exact correlation keys (preferred for request↔upstream linkage).
|
||||
if rid := strings.TrimSpace(filter.RequestID); rid != "" {
|
||||
args = append(args, rid)
|
||||
clauses = append(clauses, "COALESCE(request_id,'') = $"+itoa(len(args)))
|
||||
}
|
||||
if crid := strings.TrimSpace(filter.ClientRequestID); crid != "" {
|
||||
args = append(args, crid)
|
||||
clauses = append(clauses, "COALESCE(client_request_id,'') = $"+itoa(len(args)))
|
||||
}
|
||||
|
||||
if q := strings.TrimSpace(filter.Query); q != "" {
|
||||
like := "%" + q + "%"
|
||||
args = append(args, like)
|
||||
@@ -654,6 +1036,13 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
|
||||
clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")")
|
||||
}
|
||||
|
||||
if userQuery := strings.TrimSpace(filter.UserQuery); userQuery != "" {
|
||||
like := "%" + userQuery + "%"
|
||||
args = append(args, like)
|
||||
n := itoa(len(args))
|
||||
clauses = append(clauses, "u.email ILIKE $"+n)
|
||||
}
|
||||
|
||||
return "WHERE " + strings.Join(clauses, " AND "), args
|
||||
}
|
||||
|
||||
|
||||
@@ -354,7 +354,7 @@ SELECT
|
||||
created_at
|
||||
FROM ops_alert_events
|
||||
` + where + `
|
||||
ORDER BY fired_at DESC
|
||||
ORDER BY fired_at DESC, id DESC
|
||||
LIMIT ` + limitArg
|
||||
|
||||
rows, err := r.db.QueryContext(ctx, q, args...)
|
||||
@@ -413,6 +413,43 @@ LIMIT ` + limitArg
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (r *opsRepository) GetAlertEventByID(ctx context.Context, eventID int64) (*service.OpsAlertEvent, error) {
|
||||
if r == nil || r.db == nil {
|
||||
return nil, fmt.Errorf("nil ops repository")
|
||||
}
|
||||
if eventID <= 0 {
|
||||
return nil, fmt.Errorf("invalid event id")
|
||||
}
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
id,
|
||||
COALESCE(rule_id, 0),
|
||||
COALESCE(severity, ''),
|
||||
COALESCE(status, ''),
|
||||
COALESCE(title, ''),
|
||||
COALESCE(description, ''),
|
||||
metric_value,
|
||||
threshold_value,
|
||||
dimensions,
|
||||
fired_at,
|
||||
resolved_at,
|
||||
email_sent,
|
||||
created_at
|
||||
FROM ops_alert_events
|
||||
WHERE id = $1`
|
||||
|
||||
row := r.db.QueryRowContext(ctx, q, eventID)
|
||||
ev, err := scanOpsAlertEvent(row)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return ev, nil
|
||||
}
|
||||
|
||||
func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
|
||||
if r == nil || r.db == nil {
|
||||
return nil, fmt.Errorf("nil ops repository")
|
||||
@@ -591,6 +628,121 @@ type opsAlertEventRow interface {
|
||||
Scan(dest ...any) error
|
||||
}
|
||||
|
||||
func (r *opsRepository) CreateAlertSilence(ctx context.Context, input *service.OpsAlertSilence) (*service.OpsAlertSilence, error) {
|
||||
if r == nil || r.db == nil {
|
||||
return nil, fmt.Errorf("nil ops repository")
|
||||
}
|
||||
if input == nil {
|
||||
return nil, fmt.Errorf("nil input")
|
||||
}
|
||||
if input.RuleID <= 0 {
|
||||
return nil, fmt.Errorf("invalid rule_id")
|
||||
}
|
||||
platform := strings.TrimSpace(input.Platform)
|
||||
if platform == "" {
|
||||
return nil, fmt.Errorf("invalid platform")
|
||||
}
|
||||
if input.Until.IsZero() {
|
||||
return nil, fmt.Errorf("invalid until")
|
||||
}
|
||||
|
||||
q := `
|
||||
INSERT INTO ops_alert_silences (
|
||||
rule_id,
|
||||
platform,
|
||||
group_id,
|
||||
region,
|
||||
until,
|
||||
reason,
|
||||
created_by,
|
||||
created_at
|
||||
) VALUES (
|
||||
$1,$2,$3,$4,$5,$6,$7,NOW()
|
||||
)
|
||||
RETURNING id, rule_id, platform, group_id, region, until, COALESCE(reason,''), created_by, created_at`
|
||||
|
||||
row := r.db.QueryRowContext(
|
||||
ctx,
|
||||
q,
|
||||
input.RuleID,
|
||||
platform,
|
||||
opsNullInt64(input.GroupID),
|
||||
opsNullString(input.Region),
|
||||
input.Until,
|
||||
opsNullString(input.Reason),
|
||||
opsNullInt64(input.CreatedBy),
|
||||
)
|
||||
|
||||
var out service.OpsAlertSilence
|
||||
var groupID sql.NullInt64
|
||||
var region sql.NullString
|
||||
var createdBy sql.NullInt64
|
||||
if err := row.Scan(
|
||||
&out.ID,
|
||||
&out.RuleID,
|
||||
&out.Platform,
|
||||
&groupID,
|
||||
®ion,
|
||||
&out.Until,
|
||||
&out.Reason,
|
||||
&createdBy,
|
||||
&out.CreatedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if groupID.Valid {
|
||||
v := groupID.Int64
|
||||
out.GroupID = &v
|
||||
}
|
||||
if region.Valid {
|
||||
v := strings.TrimSpace(region.String)
|
||||
if v != "" {
|
||||
out.Region = &v
|
||||
}
|
||||
}
|
||||
if createdBy.Valid {
|
||||
v := createdBy.Int64
|
||||
out.CreatedBy = &v
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
func (r *opsRepository) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) {
|
||||
if r == nil || r.db == nil {
|
||||
return false, fmt.Errorf("nil ops repository")
|
||||
}
|
||||
if ruleID <= 0 {
|
||||
return false, fmt.Errorf("invalid rule id")
|
||||
}
|
||||
platform = strings.TrimSpace(platform)
|
||||
if platform == "" {
|
||||
return false, nil
|
||||
}
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
|
||||
q := `
|
||||
SELECT 1
|
||||
FROM ops_alert_silences
|
||||
WHERE rule_id = $1
|
||||
AND platform = $2
|
||||
AND (group_id IS NOT DISTINCT FROM $3)
|
||||
AND (region IS NOT DISTINCT FROM $4)
|
||||
AND until > $5
|
||||
LIMIT 1`
|
||||
|
||||
var dummy int
|
||||
err := r.db.QueryRowContext(ctx, q, ruleID, platform, opsNullInt64(groupID), opsNullString(region), now).Scan(&dummy)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) {
|
||||
var ev service.OpsAlertEvent
|
||||
var metricValue sql.NullFloat64
|
||||
@@ -652,6 +804,10 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an
|
||||
args = append(args, severity)
|
||||
clauses = append(clauses, "severity = $"+itoa(len(args)))
|
||||
}
|
||||
if filter.EmailSent != nil {
|
||||
args = append(args, *filter.EmailSent)
|
||||
clauses = append(clauses, "email_sent = $"+itoa(len(args)))
|
||||
}
|
||||
if filter.StartTime != nil && !filter.StartTime.IsZero() {
|
||||
args = append(args, *filter.StartTime)
|
||||
clauses = append(clauses, "fired_at >= $"+itoa(len(args)))
|
||||
@@ -661,6 +817,14 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an
|
||||
clauses = append(clauses, "fired_at < $"+itoa(len(args)))
|
||||
}
|
||||
|
||||
// Cursor pagination (descending by fired_at, then id)
|
||||
if filter.BeforeFiredAt != nil && !filter.BeforeFiredAt.IsZero() && filter.BeforeID != nil && *filter.BeforeID > 0 {
|
||||
args = append(args, *filter.BeforeFiredAt)
|
||||
tsArg := "$" + itoa(len(args))
|
||||
args = append(args, *filter.BeforeID)
|
||||
idArg := "$" + itoa(len(args))
|
||||
clauses = append(clauses, fmt.Sprintf("(fired_at < %s OR (fired_at = %s AND id < %s))", tsArg, tsArg, idArg))
|
||||
}
|
||||
// Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes.
|
||||
if platform := strings.TrimSpace(filter.Platform); platform != "" {
|
||||
args = append(args, platform)
|
||||
|
||||
@@ -27,7 +27,7 @@ func TestSchedulerSnapshotOutboxReplay(t *testing.T) {
|
||||
RunMode: config.RunModeStandard,
|
||||
Gateway: config.GatewayConfig{
|
||||
Scheduling: config.GatewaySchedulingConfig{
|
||||
OutboxPollIntervalSeconds: 1,
|
||||
OutboxPollIntervalSeconds: 1,
|
||||
FullRebuildIntervalSeconds: 0,
|
||||
DbFallbackEnabled: true,
|
||||
},
|
||||
|
||||
@@ -81,6 +81,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||
ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule)
|
||||
ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule)
|
||||
ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents)
|
||||
ops.GET("/alert-events/:id", h.Admin.Ops.GetAlertEvent)
|
||||
ops.PUT("/alert-events/:id/status", h.Admin.Ops.UpdateAlertEventStatus)
|
||||
ops.POST("/alert-silences", h.Admin.Ops.CreateAlertSilence)
|
||||
|
||||
// Email notification config (DB-backed)
|
||||
ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig)
|
||||
@@ -110,10 +113,26 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||
ws.GET("/qps", h.Admin.Ops.QPSWSHandler)
|
||||
}
|
||||
|
||||
// Error logs (MVP-1)
|
||||
// Error logs (legacy)
|
||||
ops.GET("/errors", h.Admin.Ops.GetErrorLogs)
|
||||
ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID)
|
||||
ops.GET("/errors/:id/retries", h.Admin.Ops.ListRetryAttempts)
|
||||
ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest)
|
||||
ops.PUT("/errors/:id/resolve", h.Admin.Ops.UpdateErrorResolution)
|
||||
|
||||
// Request errors (client-visible failures)
|
||||
ops.GET("/request-errors", h.Admin.Ops.ListRequestErrors)
|
||||
ops.GET("/request-errors/:id", h.Admin.Ops.GetRequestError)
|
||||
ops.GET("/request-errors/:id/upstream-errors", h.Admin.Ops.ListRequestErrorUpstreamErrors)
|
||||
ops.POST("/request-errors/:id/retry-client", h.Admin.Ops.RetryRequestErrorClient)
|
||||
ops.POST("/request-errors/:id/upstream-errors/:idx/retry", h.Admin.Ops.RetryRequestErrorUpstreamEvent)
|
||||
ops.PUT("/request-errors/:id/resolve", h.Admin.Ops.ResolveRequestError)
|
||||
|
||||
// Upstream errors (independent upstream failures)
|
||||
ops.GET("/upstream-errors", h.Admin.Ops.ListUpstreamErrors)
|
||||
ops.GET("/upstream-errors/:id", h.Admin.Ops.GetUpstreamError)
|
||||
ops.POST("/upstream-errors/:id/retry", h.Admin.Ops.RetryUpstreamError)
|
||||
ops.PUT("/upstream-errors/:id/resolve", h.Admin.Ops.ResolveUpstreamError)
|
||||
|
||||
// Request drilldown (success + error)
|
||||
ops.GET("/requests", h.Admin.Ops.ListRequestDetails)
|
||||
|
||||
@@ -12,9 +12,9 @@ import (
|
||||
|
||||
type accountRepoStubForBulkUpdate struct {
|
||||
accountRepoStub
|
||||
bulkUpdateErr error
|
||||
bulkUpdateIDs []int64
|
||||
bindGroupErrByID map[int64]error
|
||||
bulkUpdateErr error
|
||||
bulkUpdateIDs []int64
|
||||
bindGroupErrByID map[int64]error
|
||||
}
|
||||
|
||||
func (s *accountRepoStubForBulkUpdate) BulkUpdate(_ context.Context, ids []int64, _ AccountBulkUpdate) (int64, error) {
|
||||
|
||||
@@ -564,6 +564,10 @@ urlFallbackLoop:
|
||||
}
|
||||
|
||||
upstreamReq, err := antigravity.NewAPIRequestWithURL(ctx, baseURL, action, accessToken, geminiBody)
|
||||
// Capture upstream request body for ops retry of this attempt.
|
||||
if c != nil {
|
||||
c.Set(OpsUpstreamRequestBodyKey, string(geminiBody))
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -574,6 +578,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -615,6 +620,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry",
|
||||
@@ -645,6 +651,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry",
|
||||
@@ -697,6 +704,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "signature_error",
|
||||
@@ -740,6 +748,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "signature_retry_request_error",
|
||||
Message: sanitizeUpstreamErrorMessage(retryErr.Error()),
|
||||
@@ -770,6 +779,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: retryResp.StatusCode,
|
||||
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||
Kind: kind,
|
||||
@@ -817,6 +827,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "failover",
|
||||
@@ -1371,6 +1382,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -1412,6 +1424,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry",
|
||||
@@ -1442,6 +1455,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry",
|
||||
@@ -1543,6 +1557,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: requestID,
|
||||
Kind: "failover",
|
||||
@@ -1559,6 +1574,7 @@ urlFallbackLoop:
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: requestID,
|
||||
Kind: "http_error",
|
||||
@@ -2039,6 +2055,7 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, accou
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: upstreamStatus,
|
||||
UpstreamRequestID: upstreamRequestID,
|
||||
Kind: "http_error",
|
||||
|
||||
@@ -1466,6 +1466,9 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
for attempt := 1; attempt <= maxRetryAttempts; attempt++ {
|
||||
// 构建上游请求(每次重试需要重新构建,因为请求体需要重新读取)
|
||||
upstreamReq, err := s.buildUpstreamRequest(ctx, c, account, body, token, tokenType, reqModel)
|
||||
// Capture upstream request body for ops retry of this attempt.
|
||||
c.Set(OpsUpstreamRequestBodyKey, string(body))
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1482,6 +1485,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -1506,6 +1510,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "signature_error",
|
||||
@@ -1557,6 +1562,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: retryResp.StatusCode,
|
||||
UpstreamRequestID: retryResp.Header.Get("x-request-id"),
|
||||
Kind: "signature_retry_thinking",
|
||||
@@ -1585,6 +1591,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "signature_retry_tools_request_error",
|
||||
Message: sanitizeUpstreamErrorMessage(retryErr2.Error()),
|
||||
@@ -1643,6 +1650,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry",
|
||||
@@ -1691,6 +1699,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "retry_exhausted_failover",
|
||||
@@ -1757,6 +1766,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "failover_on_400",
|
||||
|
||||
@@ -545,12 +545,19 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
||||
}
|
||||
requestIDHeader = idHeader
|
||||
|
||||
// Capture upstream request body for ops retry of this attempt.
|
||||
if c != nil {
|
||||
// In this code path `body` is already the JSON sent to upstream.
|
||||
c.Set(OpsUpstreamRequestBodyKey, string(body))
|
||||
}
|
||||
|
||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||
if err != nil {
|
||||
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -588,6 +595,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: upstreamReqID,
|
||||
Kind: "signature_error",
|
||||
@@ -662,6 +670,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: upstreamReqID,
|
||||
Kind: "retry",
|
||||
@@ -711,6 +720,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: upstreamReqID,
|
||||
Kind: "failover",
|
||||
@@ -737,6 +747,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: upstreamReqID,
|
||||
Kind: "failover",
|
||||
@@ -972,12 +983,19 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
||||
}
|
||||
requestIDHeader = idHeader
|
||||
|
||||
// Capture upstream request body for ops retry of this attempt.
|
||||
if c != nil {
|
||||
// In this code path `body` is already the JSON sent to upstream.
|
||||
c.Set(OpsUpstreamRequestBodyKey, string(body))
|
||||
}
|
||||
|
||||
resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||
if err != nil {
|
||||
safeErr := sanitizeUpstreamErrorMessage(err.Error())
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -1036,6 +1054,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: upstreamReqID,
|
||||
Kind: "retry",
|
||||
@@ -1120,6 +1139,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: requestID,
|
||||
Kind: "failover",
|
||||
@@ -1143,6 +1163,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: requestID,
|
||||
Kind: "failover",
|
||||
@@ -1168,6 +1189,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: requestID,
|
||||
Kind: "http_error",
|
||||
@@ -1300,6 +1322,7 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, acc
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: upstreamStatus,
|
||||
UpstreamRequestID: upstreamRequestID,
|
||||
Kind: "http_error",
|
||||
|
||||
@@ -664,6 +664,11 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
|
||||
proxyURL = account.Proxy.URL()
|
||||
}
|
||||
|
||||
// Capture upstream request body for ops retry of this attempt.
|
||||
if c != nil {
|
||||
c.Set(OpsUpstreamRequestBodyKey, string(body))
|
||||
}
|
||||
|
||||
// Send request
|
||||
resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
|
||||
if err != nil {
|
||||
@@ -673,6 +678,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: 0,
|
||||
Kind: "request_error",
|
||||
Message: safeErr,
|
||||
@@ -707,6 +713,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "failover",
|
||||
@@ -864,6 +871,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: "http_error",
|
||||
@@ -894,6 +902,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
|
||||
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
|
||||
Platform: account.Platform,
|
||||
AccountID: account.ID,
|
||||
AccountName: account.Name,
|
||||
UpstreamStatusCode: resp.StatusCode,
|
||||
UpstreamRequestID: resp.Header.Get("x-request-id"),
|
||||
Kind: kind,
|
||||
|
||||
@@ -206,7 +206,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
|
||||
continue
|
||||
}
|
||||
|
||||
scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
|
||||
scopePlatform, scopeGroupID, scopeRegion := parseOpsAlertRuleScope(rule.Filters)
|
||||
|
||||
windowMinutes := rule.WindowMinutes
|
||||
if windowMinutes <= 0 {
|
||||
@@ -236,6 +236,17 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Scoped silencing: if a matching silence exists, skip creating a firing event.
|
||||
if s.opsService != nil {
|
||||
platform := strings.TrimSpace(scopePlatform)
|
||||
region := scopeRegion
|
||||
if platform != "" {
|
||||
if ok, err := s.opsService.IsAlertSilenced(ctx, rule.ID, platform, scopeGroupID, region, now); err == nil && ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
|
||||
if err != nil {
|
||||
log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
|
||||
@@ -359,9 +370,9 @@ func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int
|
||||
return required
|
||||
}
|
||||
|
||||
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
|
||||
func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64, region *string) {
|
||||
if filters == nil {
|
||||
return "", nil
|
||||
return "", nil, nil
|
||||
}
|
||||
if v, ok := filters["platform"]; ok {
|
||||
if s, ok := v.(string); ok {
|
||||
@@ -392,7 +403,15 @@ func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *i
|
||||
}
|
||||
}
|
||||
}
|
||||
return platform, groupID
|
||||
if v, ok := filters["region"]; ok {
|
||||
if s, ok := v.(string); ok {
|
||||
vv := strings.TrimSpace(s)
|
||||
if vv != "" {
|
||||
region = &vv
|
||||
}
|
||||
}
|
||||
}
|
||||
return platform, groupID, region
|
||||
}
|
||||
|
||||
func (s *OpsAlertEvaluatorService) computeRuleMetric(
|
||||
@@ -504,16 +523,6 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric(
|
||||
return 0, false
|
||||
}
|
||||
return overview.UpstreamErrorRate * 100, true
|
||||
case "p95_latency_ms":
|
||||
if overview.Duration.P95 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P95), true
|
||||
case "p99_latency_ms":
|
||||
if overview.Duration.P99 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P99), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
|
||||
@@ -8,8 +8,9 @@ import "time"
|
||||
// with the existing ops dashboard frontend (backup style).
|
||||
|
||||
const (
|
||||
OpsAlertStatusFiring = "firing"
|
||||
OpsAlertStatusResolved = "resolved"
|
||||
OpsAlertStatusFiring = "firing"
|
||||
OpsAlertStatusResolved = "resolved"
|
||||
OpsAlertStatusManualResolved = "manual_resolved"
|
||||
)
|
||||
|
||||
type OpsAlertRule struct {
|
||||
@@ -58,12 +59,32 @@ type OpsAlertEvent struct {
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
}
|
||||
|
||||
type OpsAlertSilence struct {
|
||||
ID int64 `json:"id"`
|
||||
|
||||
RuleID int64 `json:"rule_id"`
|
||||
Platform string `json:"platform"`
|
||||
GroupID *int64 `json:"group_id,omitempty"`
|
||||
Region *string `json:"region,omitempty"`
|
||||
|
||||
Until time.Time `json:"until"`
|
||||
Reason string `json:"reason"`
|
||||
|
||||
CreatedBy *int64 `json:"created_by,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
}
|
||||
|
||||
type OpsAlertEventFilter struct {
|
||||
Limit int
|
||||
|
||||
// Cursor pagination (descending by fired_at, then id).
|
||||
BeforeFiredAt *time.Time
|
||||
BeforeID *int64
|
||||
|
||||
// Optional filters.
|
||||
Status string
|
||||
Severity string
|
||||
Status string
|
||||
Severity string
|
||||
EmailSent *bool
|
||||
|
||||
StartTime *time.Time
|
||||
EndTime *time.Time
|
||||
|
||||
@@ -88,6 +88,29 @@ func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventF
|
||||
return s.opsRepo.ListAlertEvents(ctx, filter)
|
||||
}
|
||||
|
||||
func (s *OpsService) GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if eventID <= 0 {
|
||||
return nil, infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||
}
|
||||
ev, err := s.opsRepo.GetAlertEventByID(ctx, eventID)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found")
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
if ev == nil {
|
||||
return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found")
|
||||
}
|
||||
return ev, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
@@ -101,6 +124,49 @@ func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*Op
|
||||
return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
|
||||
}
|
||||
|
||||
func (s *OpsService) CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if input == nil {
|
||||
return nil, infraerrors.BadRequest("INVALID_SILENCE", "invalid silence")
|
||||
}
|
||||
if input.RuleID <= 0 {
|
||||
return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||
}
|
||||
if strings.TrimSpace(input.Platform) == "" {
|
||||
return nil, infraerrors.BadRequest("INVALID_PLATFORM", "invalid platform")
|
||||
}
|
||||
if input.Until.IsZero() {
|
||||
return nil, infraerrors.BadRequest("INVALID_UNTIL", "invalid until")
|
||||
}
|
||||
|
||||
created, err := s.opsRepo.CreateAlertSilence(ctx, input)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return created, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return false, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return false, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if ruleID <= 0 {
|
||||
return false, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
|
||||
}
|
||||
if strings.TrimSpace(platform) == "" {
|
||||
return false, nil
|
||||
}
|
||||
return s.opsRepo.IsAlertSilenced(ctx, ruleID, platform, groupID, region, now)
|
||||
}
|
||||
|
||||
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
@@ -142,7 +208,11 @@ func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64,
|
||||
if eventID <= 0 {
|
||||
return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
|
||||
}
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = strings.TrimSpace(status)
|
||||
if status == "" {
|
||||
return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
|
||||
}
|
||||
if status != OpsAlertStatusResolved && status != OpsAlertStatusManualResolved {
|
||||
return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
|
||||
}
|
||||
return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
|
||||
|
||||
@@ -32,49 +32,38 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
|
||||
}
|
||||
|
||||
// computeBusinessHealth calculates business health score (0-100)
|
||||
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
|
||||
// Components: Error Rate (50%) + TTFT (50%)
|
||||
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
||||
slaScore := 100.0
|
||||
slaPct := clampFloat64(overview.SLA*100, 0, 100)
|
||||
if slaPct < 99.5 {
|
||||
if slaPct >= 95 {
|
||||
slaScore = (slaPct - 95) / 4.5 * 100
|
||||
} else {
|
||||
slaScore = 0
|
||||
}
|
||||
}
|
||||
|
||||
// Error rate score: 0.5% → 100, 5% → 0 (linear)
|
||||
// Error rate score: 1% → 100, 10% → 0 (linear)
|
||||
// Combines request errors and upstream errors
|
||||
errorScore := 100.0
|
||||
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
|
||||
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
|
||||
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
|
||||
if combinedErrorPct > 0.5 {
|
||||
if combinedErrorPct <= 5 {
|
||||
errorScore = (5 - combinedErrorPct) / 4.5 * 100
|
||||
if combinedErrorPct > 1.0 {
|
||||
if combinedErrorPct <= 10.0 {
|
||||
errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
|
||||
} else {
|
||||
errorScore = 0
|
||||
}
|
||||
}
|
||||
|
||||
// Latency score: 1s → 100, 10s → 0 (linear)
|
||||
// Uses P99 of duration (TTFT is less critical for overall health)
|
||||
latencyScore := 100.0
|
||||
if overview.Duration.P99 != nil {
|
||||
p99 := float64(*overview.Duration.P99)
|
||||
// TTFT score: 1s → 100, 3s → 0 (linear)
|
||||
// Time to first token is critical for user experience
|
||||
ttftScore := 100.0
|
||||
if overview.TTFT.P99 != nil {
|
||||
p99 := float64(*overview.TTFT.P99)
|
||||
if p99 > 1000 {
|
||||
if p99 <= 10000 {
|
||||
latencyScore = (10000 - p99) / 9000 * 100
|
||||
if p99 <= 3000 {
|
||||
ttftScore = (3000 - p99) / 2000 * 100
|
||||
} else {
|
||||
latencyScore = 0
|
||||
ttftScore = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Weighted combination
|
||||
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
|
||||
// Weighted combination: 50% error rate + 50% TTFT
|
||||
return errorScore*0.5 + ttftScore*0.5
|
||||
}
|
||||
|
||||
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||
|
||||
@@ -127,8 +127,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
|
||||
MemoryUsagePercent: float64Ptr(75),
|
||||
},
|
||||
},
|
||||
wantMin: 60,
|
||||
wantMax: 85,
|
||||
wantMin: 96,
|
||||
wantMax: 97,
|
||||
},
|
||||
{
|
||||
name: "DB failure",
|
||||
@@ -203,8 +203,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
|
||||
MemoryUsagePercent: float64Ptr(30),
|
||||
},
|
||||
},
|
||||
wantMin: 25,
|
||||
wantMax: 50,
|
||||
wantMin: 84,
|
||||
wantMax: 85,
|
||||
},
|
||||
{
|
||||
name: "combined failures - business healthy + infra degraded",
|
||||
@@ -277,30 +277,41 @@ func TestComputeBusinessHealth(t *testing.T) {
|
||||
UpstreamErrorRate: 0,
|
||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||
},
|
||||
wantMin: 50,
|
||||
wantMax: 60,
|
||||
wantMin: 100,
|
||||
wantMax: 100,
|
||||
},
|
||||
{
|
||||
name: "error rate boundary 0.5%",
|
||||
name: "error rate boundary 1%",
|
||||
overview: &OpsDashboardOverview{
|
||||
SLA: 0.995,
|
||||
ErrorRate: 0.005,
|
||||
SLA: 0.99,
|
||||
ErrorRate: 0.01,
|
||||
UpstreamErrorRate: 0,
|
||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||
},
|
||||
wantMin: 95,
|
||||
wantMin: 100,
|
||||
wantMax: 100,
|
||||
},
|
||||
{
|
||||
name: "latency boundary 1000ms",
|
||||
name: "error rate 5%",
|
||||
overview: &OpsDashboardOverview{
|
||||
SLA: 0.995,
|
||||
SLA: 0.95,
|
||||
ErrorRate: 0.05,
|
||||
UpstreamErrorRate: 0,
|
||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||
},
|
||||
wantMin: 77,
|
||||
wantMax: 78,
|
||||
},
|
||||
{
|
||||
name: "TTFT boundary 2s",
|
||||
overview: &OpsDashboardOverview{
|
||||
SLA: 0.99,
|
||||
ErrorRate: 0,
|
||||
UpstreamErrorRate: 0,
|
||||
Duration: OpsPercentiles{P99: intPtr(1000)},
|
||||
TTFT: OpsPercentiles{P99: intPtr(2000)},
|
||||
},
|
||||
wantMin: 95,
|
||||
wantMax: 100,
|
||||
wantMin: 75,
|
||||
wantMax: 75,
|
||||
},
|
||||
{
|
||||
name: "upstream error dominates",
|
||||
@@ -310,7 +321,7 @@ func TestComputeBusinessHealth(t *testing.T) {
|
||||
UpstreamErrorRate: 0.03,
|
||||
Duration: OpsPercentiles{P99: intPtr(500)},
|
||||
},
|
||||
wantMin: 75,
|
||||
wantMin: 88,
|
||||
wantMax: 90,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -6,24 +6,43 @@ type OpsErrorLog struct {
|
||||
ID int64 `json:"id"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
|
||||
Phase string `json:"phase"`
|
||||
Type string `json:"type"`
|
||||
// Standardized classification
|
||||
// - phase: request|auth|routing|upstream|network|internal
|
||||
// - owner: client|provider|platform
|
||||
// - source: client_request|upstream_http|gateway
|
||||
Phase string `json:"phase"`
|
||||
Type string `json:"type"`
|
||||
|
||||
Owner string `json:"error_owner"`
|
||||
Source string `json:"error_source"`
|
||||
|
||||
Severity string `json:"severity"`
|
||||
|
||||
StatusCode int `json:"status_code"`
|
||||
Platform string `json:"platform"`
|
||||
Model string `json:"model"`
|
||||
|
||||
LatencyMs *int `json:"latency_ms"`
|
||||
IsRetryable bool `json:"is_retryable"`
|
||||
RetryCount int `json:"retry_count"`
|
||||
|
||||
Resolved bool `json:"resolved"`
|
||||
ResolvedAt *time.Time `json:"resolved_at"`
|
||||
ResolvedByUserID *int64 `json:"resolved_by_user_id"`
|
||||
ResolvedByUserName string `json:"resolved_by_user_name"`
|
||||
ResolvedRetryID *int64 `json:"resolved_retry_id"`
|
||||
ResolvedStatusRaw string `json:"-"`
|
||||
|
||||
ClientRequestID string `json:"client_request_id"`
|
||||
RequestID string `json:"request_id"`
|
||||
Message string `json:"message"`
|
||||
|
||||
UserID *int64 `json:"user_id"`
|
||||
APIKeyID *int64 `json:"api_key_id"`
|
||||
AccountID *int64 `json:"account_id"`
|
||||
GroupID *int64 `json:"group_id"`
|
||||
UserID *int64 `json:"user_id"`
|
||||
UserEmail string `json:"user_email"`
|
||||
APIKeyID *int64 `json:"api_key_id"`
|
||||
AccountID *int64 `json:"account_id"`
|
||||
AccountName string `json:"account_name"`
|
||||
GroupID *int64 `json:"group_id"`
|
||||
GroupName string `json:"group_name"`
|
||||
|
||||
ClientIP *string `json:"client_ip"`
|
||||
RequestPath string `json:"request_path"`
|
||||
@@ -67,9 +86,24 @@ type OpsErrorLogFilter struct {
|
||||
GroupID *int64
|
||||
AccountID *int64
|
||||
|
||||
StatusCodes []int
|
||||
Phase string
|
||||
Query string
|
||||
StatusCodes []int
|
||||
StatusCodesOther bool
|
||||
Phase string
|
||||
Owner string
|
||||
Source string
|
||||
Resolved *bool
|
||||
Query string
|
||||
UserQuery string // Search by user email
|
||||
|
||||
// Optional correlation keys for exact matching.
|
||||
RequestID string
|
||||
ClientRequestID string
|
||||
|
||||
// View controls error categorization for list endpoints.
|
||||
// - errors: show actionable errors (exclude business-limited / 429 / 529)
|
||||
// - excluded: only show excluded errors
|
||||
// - all: show everything
|
||||
View string
|
||||
|
||||
Page int
|
||||
PageSize int
|
||||
@@ -90,12 +124,23 @@ type OpsRetryAttempt struct {
|
||||
SourceErrorID int64 `json:"source_error_id"`
|
||||
Mode string `json:"mode"`
|
||||
PinnedAccountID *int64 `json:"pinned_account_id"`
|
||||
PinnedAccountName string `json:"pinned_account_name"`
|
||||
|
||||
Status string `json:"status"`
|
||||
StartedAt *time.Time `json:"started_at"`
|
||||
FinishedAt *time.Time `json:"finished_at"`
|
||||
DurationMs *int64 `json:"duration_ms"`
|
||||
|
||||
// Persisted execution results (best-effort)
|
||||
Success *bool `json:"success"`
|
||||
HTTPStatusCode *int `json:"http_status_code"`
|
||||
UpstreamRequestID *string `json:"upstream_request_id"`
|
||||
UsedAccountID *int64 `json:"used_account_id"`
|
||||
UsedAccountName string `json:"used_account_name"`
|
||||
ResponsePreview *string `json:"response_preview"`
|
||||
ResponseTruncated *bool `json:"response_truncated"`
|
||||
|
||||
// Optional correlation
|
||||
ResultRequestID *string `json:"result_request_id"`
|
||||
ResultErrorID *int64 `json:"result_error_id"`
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ type OpsRepository interface {
|
||||
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
|
||||
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
|
||||
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
|
||||
ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*OpsRetryAttempt, error)
|
||||
UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error
|
||||
|
||||
// Lightweight window stats (for realtime WS / quick sampling).
|
||||
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
|
||||
@@ -39,12 +41,17 @@ type OpsRepository interface {
|
||||
DeleteAlertRule(ctx context.Context, id int64) error
|
||||
|
||||
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
|
||||
GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error)
|
||||
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
|
||||
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
|
||||
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
|
||||
|
||||
// Alert silences
|
||||
CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error)
|
||||
IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error)
|
||||
|
||||
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
|
||||
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
@@ -91,7 +98,6 @@ type OpsInsertErrorLogInput struct {
|
||||
// It is set by OpsService.RecordError before persisting.
|
||||
UpstreamErrorsJSON *string
|
||||
|
||||
DurationMs *int
|
||||
TimeToFirstTokenMs *int64
|
||||
|
||||
RequestBodyJSON *string // sanitized json string (not raw bytes)
|
||||
@@ -124,7 +130,15 @@ type OpsUpdateRetryAttemptInput struct {
|
||||
FinishedAt time.Time
|
||||
DurationMs int64
|
||||
|
||||
// Optional correlation
|
||||
// Persisted execution results (best-effort)
|
||||
Success *bool
|
||||
HTTPStatusCode *int
|
||||
UpstreamRequestID *string
|
||||
UsedAccountID *int64
|
||||
ResponsePreview *string
|
||||
ResponseTruncated *bool
|
||||
|
||||
// Optional correlation (legacy fields kept)
|
||||
ResultRequestID *string
|
||||
ResultErrorID *int64
|
||||
|
||||
|
||||
@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool {
|
||||
return w.totalWritten > int64(w.limit)
|
||||
}
|
||||
|
||||
const (
|
||||
OpsRetryModeUpstreamEvent = "upstream_event"
|
||||
)
|
||||
|
||||
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
|
||||
}
|
||||
|
||||
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if errorLog == nil {
|
||||
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||
}
|
||||
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
||||
}
|
||||
|
||||
var pinned *int64
|
||||
if mode == OpsRetryModeUpstream {
|
||||
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
||||
pinned = pinnedAccountID
|
||||
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
||||
pinned = errorLog.AccountID
|
||||
} else {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
||||
}
|
||||
}
|
||||
|
||||
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
|
||||
}
|
||||
|
||||
// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors.
|
||||
// idx is 0-based. It always pins the original event account_id.
|
||||
func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if idx < 0 {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
|
||||
}
|
||||
|
||||
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if errorLog == nil {
|
||||
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||
}
|
||||
|
||||
events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors)
|
||||
if err != nil {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
|
||||
}
|
||||
if idx >= len(events) {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
|
||||
}
|
||||
ev := events[idx]
|
||||
if ev == nil {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
|
||||
}
|
||||
if ev.AccountID <= 0 {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
||||
}
|
||||
|
||||
upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody)
|
||||
if upstreamBody == "" {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
|
||||
}
|
||||
|
||||
override := *errorLog
|
||||
override.RequestBody = upstreamBody
|
||||
pinned := ev.AccountID
|
||||
|
||||
// Persist as upstream_event, execute as upstream pinned retry.
|
||||
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override)
|
||||
}
|
||||
|
||||
func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
|
||||
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
|
||||
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
|
||||
@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
||||
}
|
||||
}
|
||||
|
||||
errorLog, err := s.GetErrorLogByID(ctx, errorID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||
if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
|
||||
}
|
||||
|
||||
var pinned *int64
|
||||
if mode == OpsRetryModeUpstream {
|
||||
if execMode == OpsRetryModeUpstream {
|
||||
if pinnedAccountID != nil && *pinnedAccountID > 0 {
|
||||
pinned = pinnedAccountID
|
||||
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
|
||||
pinned = errorLog.AccountID
|
||||
} else {
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
|
||||
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
||||
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
|
||||
defer cancel()
|
||||
|
||||
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
|
||||
execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)
|
||||
|
||||
finishedAt := time.Now()
|
||||
result.FinishedAt = finishedAt
|
||||
@@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
|
||||
msg := result.ErrorMessage
|
||||
updateErrMsg = &msg
|
||||
}
|
||||
// Keep legacy result_request_id empty; use upstream_request_id instead.
|
||||
var resultRequestID *string
|
||||
if strings.TrimSpace(result.UpstreamRequestID) != "" {
|
||||
v := result.UpstreamRequestID
|
||||
resultRequestID = &v
|
||||
}
|
||||
|
||||
finalStatus := result.Status
|
||||
if strings.TrimSpace(finalStatus) == "" {
|
||||
finalStatus = opsRetryStatusFailed
|
||||
}
|
||||
|
||||
success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded)
|
||||
httpStatus := result.HTTPStatusCode
|
||||
upstreamReqID := result.UpstreamRequestID
|
||||
usedAccountID := result.UsedAccountID
|
||||
preview := result.ResponsePreview
|
||||
truncated := result.ResponseTruncated
|
||||
|
||||
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
|
||||
ID: attemptID,
|
||||
Status: finalStatus,
|
||||
FinishedAt: finishedAt,
|
||||
DurationMs: result.DurationMs,
|
||||
ResultRequestID: resultRequestID,
|
||||
ErrorMessage: updateErrMsg,
|
||||
ID: attemptID,
|
||||
Status: finalStatus,
|
||||
FinishedAt: finishedAt,
|
||||
DurationMs: result.DurationMs,
|
||||
Success: &success,
|
||||
HTTPStatusCode: &httpStatus,
|
||||
UpstreamRequestID: &upstreamReqID,
|
||||
UsedAccountID: usedAccountID,
|
||||
ResponsePreview: &preview,
|
||||
ResponseTruncated: &truncated,
|
||||
ResultRequestID: resultRequestID,
|
||||
ErrorMessage: updateErrMsg,
|
||||
}); err != nil {
|
||||
// Best-effort: retry itself already executed; do not fail the API response.
|
||||
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
|
||||
} else if success {
|
||||
if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
|
||||
log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
|
||||
@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn
|
||||
out.Detail = ""
|
||||
}
|
||||
|
||||
out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody)
|
||||
if out.UpstreamRequestBody != "" {
|
||||
// Reuse the same sanitization/trimming strategy as request body storage.
|
||||
// Keep it small so it is safe to persist in ops_error_logs JSON.
|
||||
sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024)
|
||||
if sanitized != "" {
|
||||
out.UpstreamRequestBody = sanitized
|
||||
if truncated {
|
||||
out.Kind = strings.TrimSpace(out.Kind)
|
||||
if out.Kind == "" {
|
||||
out.Kind = "upstream"
|
||||
}
|
||||
out.Kind = out.Kind + ":request_body_truncated"
|
||||
}
|
||||
} else {
|
||||
out.UpstreamRequestBody = ""
|
||||
}
|
||||
}
|
||||
|
||||
// Drop fully-empty events (can happen if only status code was known).
|
||||
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
|
||||
continue
|
||||
@@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter
|
||||
if s.opsRepo == nil {
|
||||
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
|
||||
}
|
||||
return s.opsRepo.ListErrorLogs(ctx, filter)
|
||||
result, err := s.opsRepo.ListErrorLogs(ctx, filter)
|
||||
if err != nil {
|
||||
log.Printf("[Ops] GetErrorLogs failed: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
|
||||
@@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo
|
||||
return detail, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if errorID <= 0 {
|
||||
return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
|
||||
}
|
||||
items, err := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return []*OpsRetryAttempt{}, nil
|
||||
}
|
||||
return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(err)
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error {
|
||||
if err := s.RequireMonitoringEnabled(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
if s.opsRepo == nil {
|
||||
return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
|
||||
}
|
||||
if errorID <= 0 {
|
||||
return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
|
||||
}
|
||||
// Best-effort ensure the error exists
|
||||
if _, err := s.opsRepo.GetErrorLogByID(ctx, errorID); err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
|
||||
}
|
||||
return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
|
||||
}
|
||||
return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil)
|
||||
}
|
||||
|
||||
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
|
||||
bytesLen = len(raw)
|
||||
if len(raw) == 0 {
|
||||
@@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr
|
||||
}
|
||||
}
|
||||
|
||||
// Last resort: store a minimal placeholder (still valid JSON).
|
||||
placeholder := map[string]any{
|
||||
"request_body_truncated": true,
|
||||
// Last resort: keep JSON shape but drop big fields.
|
||||
// This avoids downstream code that expects certain top-level keys from crashing.
|
||||
if root, ok := decoded.(map[string]any); ok {
|
||||
placeholder := shallowCopyMap(root)
|
||||
placeholder["request_body_truncated"] = true
|
||||
|
||||
// Replace potentially huge arrays/strings, but keep the keys present.
|
||||
for _, k := range []string{"messages", "contents", "input", "prompt"} {
|
||||
if _, exists := placeholder[k]; exists {
|
||||
placeholder[k] = []any{}
|
||||
}
|
||||
}
|
||||
for _, k := range []string{"text"} {
|
||||
if _, exists := placeholder[k]; exists {
|
||||
placeholder[k] = ""
|
||||
}
|
||||
}
|
||||
|
||||
encoded4, err4 := json.Marshal(placeholder)
|
||||
if err4 == nil {
|
||||
if len(encoded4) <= maxBytes {
|
||||
return string(encoded4), true, bytesLen
|
||||
}
|
||||
}
|
||||
}
|
||||
if model := extractString(decoded, "model"); model != "" {
|
||||
placeholder["model"] = model
|
||||
}
|
||||
encoded4, err4 := json.Marshal(placeholder)
|
||||
|
||||
// Final fallback: minimal valid JSON.
|
||||
encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true})
|
||||
if err4 != nil {
|
||||
return "", true, bytesLen
|
||||
}
|
||||
@@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr
|
||||
}
|
||||
return raw, false
|
||||
}
|
||||
|
||||
func extractString(v any, key string) string {
|
||||
root, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
s, _ := root[key].(string)
|
||||
return strings.TrimSpace(s)
|
||||
}
|
||||
|
||||
@@ -368,9 +368,11 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
|
||||
Aggregation: OpsAggregationSettings{
|
||||
AggregationEnabled: false,
|
||||
},
|
||||
IgnoreCountTokensErrors: false,
|
||||
AutoRefreshEnabled: false,
|
||||
AutoRefreshIntervalSec: 30,
|
||||
IgnoreCountTokensErrors: false,
|
||||
IgnoreContextCanceled: true, // Default to true - client disconnects are not errors
|
||||
IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue
|
||||
AutoRefreshEnabled: false,
|
||||
AutoRefreshIntervalSec: 30,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
|
||||
|
||||
func defaultOpsMetricThresholds() *OpsMetricThresholds {
|
||||
slaMin := 99.5
|
||||
latencyMax := 2000.0
|
||||
ttftMax := 500.0
|
||||
reqErrMax := 5.0
|
||||
upstreamErrMax := 5.0
|
||||
return &OpsMetricThresholds{
|
||||
SLAPercentMin: &slaMin,
|
||||
LatencyP99MsMax: &latencyMax,
|
||||
TTFTp99MsMax: &ttftMax,
|
||||
RequestErrorRatePercentMax: &reqErrMax,
|
||||
UpstreamErrorRatePercentMax: &upstreamErrMax,
|
||||
@@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
|
||||
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
|
||||
return nil, errors.New("sla_percent_min must be between 0 and 100")
|
||||
}
|
||||
if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
|
||||
return nil, errors.New("latency_p99_ms_max must be >= 0")
|
||||
}
|
||||
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
|
||||
return nil, errors.New("ttft_p99_ms_max must be >= 0")
|
||||
}
|
||||
|
||||
@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {
|
||||
|
||||
type OpsMetricThresholds struct {
|
||||
SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红
|
||||
LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // 延迟P99高于此值变红
|
||||
TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红
|
||||
RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红
|
||||
UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红
|
||||
@@ -79,11 +78,13 @@ type OpsAlertRuntimeSettings struct {
|
||||
|
||||
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
|
||||
type OpsAdvancedSettings struct {
|
||||
DataRetention OpsDataRetentionSettings `json:"data_retention"`
|
||||
Aggregation OpsAggregationSettings `json:"aggregation"`
|
||||
IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"`
|
||||
AutoRefreshEnabled bool `json:"auto_refresh_enabled"`
|
||||
AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"`
|
||||
DataRetention OpsDataRetentionSettings `json:"data_retention"`
|
||||
Aggregation OpsAggregationSettings `json:"aggregation"`
|
||||
IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"`
|
||||
IgnoreContextCanceled bool `json:"ignore_context_canceled"`
|
||||
IgnoreNoAvailableAccounts bool `json:"ignore_no_available_accounts"`
|
||||
AutoRefreshEnabled bool `json:"auto_refresh_enabled"`
|
||||
AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"`
|
||||
}
|
||||
|
||||
type OpsDataRetentionSettings struct {
|
||||
|
||||
@@ -15,6 +15,11 @@ const (
|
||||
OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
|
||||
OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
|
||||
OpsUpstreamErrorsKey = "ops_upstream_errors"
|
||||
|
||||
// Best-effort capture of the current upstream request body so ops can
|
||||
// retry the specific upstream attempt (not just the client request).
|
||||
// This value is sanitized+trimmed before being persisted.
|
||||
OpsUpstreamRequestBodyKey = "ops_upstream_request_body"
|
||||
)
|
||||
|
||||
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
|
||||
@@ -38,13 +43,21 @@ type OpsUpstreamErrorEvent struct {
|
||||
AtUnixMs int64 `json:"at_unix_ms,omitempty"`
|
||||
|
||||
// Context
|
||||
Platform string `json:"platform,omitempty"`
|
||||
AccountID int64 `json:"account_id,omitempty"`
|
||||
Platform string `json:"platform,omitempty"`
|
||||
AccountID int64 `json:"account_id,omitempty"`
|
||||
AccountName string `json:"account_name,omitempty"`
|
||||
|
||||
// Outcome
|
||||
UpstreamStatusCode int `json:"upstream_status_code,omitempty"`
|
||||
UpstreamRequestID string `json:"upstream_request_id,omitempty"`
|
||||
|
||||
// Best-effort upstream request capture (sanitized+trimmed).
|
||||
// Required for retrying a specific upstream attempt.
|
||||
UpstreamRequestBody string `json:"upstream_request_body,omitempty"`
|
||||
|
||||
// Best-effort upstream response capture (sanitized+trimmed).
|
||||
UpstreamResponseBody string `json:"upstream_response_body,omitempty"`
|
||||
|
||||
// Kind: http_error | request_error | retry_exhausted | failover
|
||||
Kind string `json:"kind,omitempty"`
|
||||
|
||||
@@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
||||
}
|
||||
ev.Platform = strings.TrimSpace(ev.Platform)
|
||||
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
|
||||
ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody)
|
||||
ev.UpstreamResponseBody = strings.TrimSpace(ev.UpstreamResponseBody)
|
||||
ev.Kind = strings.TrimSpace(ev.Kind)
|
||||
ev.Message = strings.TrimSpace(ev.Message)
|
||||
ev.Detail = strings.TrimSpace(ev.Detail)
|
||||
@@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
|
||||
ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
|
||||
}
|
||||
|
||||
// If the caller didn't explicitly pass upstream request body but the gateway
|
||||
// stored it on the context, attach it so ops can retry this specific attempt.
|
||||
if ev.UpstreamRequestBody == "" {
|
||||
if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok {
|
||||
if s, ok := v.(string); ok {
|
||||
ev.UpstreamRequestBody = strings.TrimSpace(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var existing []*OpsUpstreamErrorEvent
|
||||
if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
|
||||
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
|
||||
@@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
|
||||
s := string(raw)
|
||||
return &s
|
||||
}
|
||||
|
||||
func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return []*OpsUpstreamErrorEvent{}, nil
|
||||
}
|
||||
var out []*OpsUpstreamErrorEvent
|
||||
if err := json.Unmarshal([]byte(raw), &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
28
backend/migrations/037_ops_alert_silences.sql
Normal file
28
backend/migrations/037_ops_alert_silences.sql
Normal file
@@ -0,0 +1,28 @@
|
||||
-- +goose Up
|
||||
-- +goose StatementBegin
|
||||
-- Ops alert silences: scoped (rule_id + platform + group_id + region)
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ops_alert_silences (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
|
||||
rule_id BIGINT NOT NULL,
|
||||
platform VARCHAR(64) NOT NULL,
|
||||
group_id BIGINT,
|
||||
region VARCHAR(64),
|
||||
|
||||
until TIMESTAMPTZ NOT NULL,
|
||||
reason TEXT,
|
||||
|
||||
created_by BIGINT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ops_alert_silences_lookup
|
||||
ON ops_alert_silences (rule_id, platform, group_id, region, until);
|
||||
|
||||
-- +goose StatementEnd
|
||||
|
||||
-- +goose Down
|
||||
-- +goose StatementBegin
|
||||
DROP TABLE IF EXISTS ops_alert_silences;
|
||||
-- +goose StatementEnd
|
||||
@@ -0,0 +1,111 @@
|
||||
-- Add resolution tracking to ops_error_logs, persist retry results, and standardize error classification enums.
|
||||
--
|
||||
-- This migration is intentionally idempotent.
|
||||
|
||||
SET LOCAL lock_timeout = '5s';
|
||||
SET LOCAL statement_timeout = '10min';
|
||||
|
||||
-- ============================================
|
||||
-- 1) ops_error_logs: resolution fields
|
||||
-- ============================================
|
||||
|
||||
ALTER TABLE ops_error_logs
|
||||
ADD COLUMN IF NOT EXISTS resolved BOOLEAN NOT NULL DEFAULT false;
|
||||
|
||||
ALTER TABLE ops_error_logs
|
||||
ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;
|
||||
|
||||
ALTER TABLE ops_error_logs
|
||||
ADD COLUMN IF NOT EXISTS resolved_by_user_id BIGINT;
|
||||
|
||||
ALTER TABLE ops_error_logs
|
||||
ADD COLUMN IF NOT EXISTS resolved_retry_id BIGINT;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_resolved_time
|
||||
ON ops_error_logs (resolved, created_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_unresolved_time
|
||||
ON ops_error_logs (created_at DESC)
|
||||
WHERE resolved = false;
|
||||
|
||||
-- ============================================
|
||||
-- 2) ops_retry_attempts: persist execution results
|
||||
-- ============================================
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS success BOOLEAN;
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS http_status_code INT;
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS upstream_request_id VARCHAR(128);
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS used_account_id BIGINT;
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS response_preview TEXT;
|
||||
|
||||
ALTER TABLE ops_retry_attempts
|
||||
ADD COLUMN IF NOT EXISTS response_truncated BOOLEAN NOT NULL DEFAULT false;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_success_time
|
||||
ON ops_retry_attempts (success, created_at DESC);
|
||||
|
||||
-- Backfill best-effort fields for existing rows.
|
||||
UPDATE ops_retry_attempts
|
||||
SET success = (LOWER(COALESCE(status, '')) = 'succeeded')
|
||||
WHERE success IS NULL;
|
||||
|
||||
UPDATE ops_retry_attempts
|
||||
SET upstream_request_id = result_request_id
|
||||
WHERE upstream_request_id IS NULL AND result_request_id IS NOT NULL;
|
||||
|
||||
-- ============================================
|
||||
-- 3) Standardize classification enums in ops_error_logs
|
||||
--
|
||||
-- New enums:
|
||||
-- error_phase: request|auth|routing|upstream|network|internal
|
||||
-- error_owner: client|provider|platform
|
||||
-- error_source: client_request|upstream_http|gateway
|
||||
-- ============================================
|
||||
|
||||
-- Owner: legacy sub2api => platform.
|
||||
UPDATE ops_error_logs
|
||||
SET error_owner = 'platform'
|
||||
WHERE LOWER(COALESCE(error_owner, '')) = 'sub2api';
|
||||
|
||||
-- Owner: normalize empty/null to platform (best-effort).
|
||||
UPDATE ops_error_logs
|
||||
SET error_owner = 'platform'
|
||||
WHERE COALESCE(TRIM(error_owner), '') = '';
|
||||
|
||||
-- Phase: map legacy phases.
|
||||
UPDATE ops_error_logs
|
||||
SET error_phase = CASE
|
||||
WHEN COALESCE(TRIM(error_phase), '') = '' THEN 'internal'
|
||||
WHEN LOWER(error_phase) IN ('billing', 'concurrency', 'response') THEN 'request'
|
||||
WHEN LOWER(error_phase) IN ('scheduling') THEN 'routing'
|
||||
WHEN LOWER(error_phase) IN ('request', 'auth', 'routing', 'upstream', 'network', 'internal') THEN LOWER(error_phase)
|
||||
ELSE 'internal'
|
||||
END;
|
||||
|
||||
-- Source: map legacy sources.
|
||||
UPDATE ops_error_logs
|
||||
SET error_source = CASE
|
||||
WHEN COALESCE(TRIM(error_source), '') = '' THEN 'gateway'
|
||||
WHEN LOWER(error_source) IN ('billing', 'concurrency') THEN 'client_request'
|
||||
WHEN LOWER(error_source) IN ('upstream_http') THEN 'upstream_http'
|
||||
WHEN LOWER(error_source) IN ('upstream_network') THEN 'gateway'
|
||||
WHEN LOWER(error_source) IN ('internal') THEN 'gateway'
|
||||
WHEN LOWER(error_source) IN ('client_request', 'upstream_http', 'gateway') THEN LOWER(error_source)
|
||||
ELSE 'gateway'
|
||||
END;
|
||||
|
||||
-- Auto-resolve recovered upstream errors (client status < 400).
|
||||
UPDATE ops_error_logs
|
||||
SET
|
||||
resolved = true,
|
||||
resolved_at = COALESCE(resolved_at, created_at)
|
||||
WHERE resolved = false AND COALESCE(status_code, 0) > 0 AND COALESCE(status_code, 0) < 400;
|
||||
Reference in New Issue
Block a user