From 340dc9cadbdffcb1df5649d986b6fad847705813 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:02:45 +0800 Subject: [PATCH 01/86] =?UTF-8?q?feat(db):=20=E6=B7=BB=E5=8A=A0ops?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=E9=9D=99=E9=BB=98=E5=92=8C=E9=94=99=E8=AF=AF?= =?UTF-8?q?=E5=88=86=E7=B1=BB=E4=BC=98=E5=8C=96=E8=BF=81=E7=A7=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加ops告警静默功能的数据库结构 - 优化错误分类和重试结果字段标准化 --- backend/migrations/037_ops_alert_silences.sql | 28 +++++ ...results_and_standardize_classification.sql | 111 ++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 backend/migrations/037_ops_alert_silences.sql create mode 100644 backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql diff --git a/backend/migrations/037_ops_alert_silences.sql b/backend/migrations/037_ops_alert_silences.sql new file mode 100644 index 00000000..95b61a09 --- /dev/null +++ b/backend/migrations/037_ops_alert_silences.sql @@ -0,0 +1,28 @@ +-- +goose Up +-- +goose StatementBegin +-- Ops alert silences: scoped (rule_id + platform + group_id + region) + +CREATE TABLE IF NOT EXISTS ops_alert_silences ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT NOT NULL, + platform VARCHAR(64) NOT NULL, + group_id BIGINT, + region VARCHAR(64), + + until TIMESTAMPTZ NOT NULL, + reason TEXT, + + created_by BIGINT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_silences_lookup + ON ops_alert_silences (rule_id, platform, group_id, region, until); + +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE IF EXISTS ops_alert_silences; +-- +goose StatementEnd diff --git a/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql b/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql new file mode 
100644 index 00000000..adaacf1c --- /dev/null +++ b/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql @@ -0,0 +1,111 @@ +-- Add resolution tracking to ops_error_logs, persist retry results, and standardize error classification enums. +-- +-- This migration is intentionally idempotent. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: resolution fields +-- ============================================ + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved BOOLEAN NOT NULL DEFAULT false; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_by_user_id BIGINT; + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS resolved_retry_id BIGINT; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_resolved_time + ON ops_error_logs (resolved, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_unresolved_time + ON ops_error_logs (created_at DESC) + WHERE resolved = false; + +-- ============================================ +-- 2) ops_retry_attempts: persist execution results +-- ============================================ + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS success BOOLEAN; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS http_status_code INT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS upstream_request_id VARCHAR(128); + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS used_account_id BIGINT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS response_preview TEXT; + +ALTER TABLE ops_retry_attempts + ADD COLUMN IF NOT EXISTS response_truncated BOOLEAN NOT NULL DEFAULT false; + +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_success_time + ON ops_retry_attempts (success, created_at DESC); + +-- Backfill best-effort fields for existing rows. 
+UPDATE ops_retry_attempts +SET success = (LOWER(COALESCE(status, '')) = 'succeeded') +WHERE success IS NULL; + +UPDATE ops_retry_attempts +SET upstream_request_id = result_request_id +WHERE upstream_request_id IS NULL AND result_request_id IS NOT NULL; + +-- ============================================ +-- 3) Standardize classification enums in ops_error_logs +-- +-- New enums: +-- error_phase: request|auth|routing|upstream|network|internal +-- error_owner: client|provider|platform +-- error_source: client_request|upstream_http|gateway +-- ============================================ + +-- Owner: legacy sub2api => platform. +UPDATE ops_error_logs +SET error_owner = 'platform' +WHERE LOWER(COALESCE(error_owner, '')) = 'sub2api'; + +-- Owner: normalize empty/null to platform (best-effort). +UPDATE ops_error_logs +SET error_owner = 'platform' +WHERE COALESCE(TRIM(error_owner), '') = ''; + +-- Phase: map legacy phases. +UPDATE ops_error_logs +SET error_phase = CASE + WHEN COALESCE(TRIM(error_phase), '') = '' THEN 'internal' + WHEN LOWER(error_phase) IN ('billing', 'concurrency', 'response') THEN 'request' + WHEN LOWER(error_phase) IN ('scheduling') THEN 'routing' + WHEN LOWER(error_phase) IN ('request', 'auth', 'routing', 'upstream', 'network', 'internal') THEN LOWER(error_phase) + ELSE 'internal' +END; + +-- Source: map legacy sources. +UPDATE ops_error_logs +SET error_source = CASE + WHEN COALESCE(TRIM(error_source), '') = '' THEN 'gateway' + WHEN LOWER(error_source) IN ('billing', 'concurrency') THEN 'client_request' + WHEN LOWER(error_source) IN ('upstream_http') THEN 'upstream_http' + WHEN LOWER(error_source) IN ('upstream_network') THEN 'gateway' + WHEN LOWER(error_source) IN ('internal') THEN 'gateway' + WHEN LOWER(error_source) IN ('client_request', 'upstream_http', 'gateway') THEN LOWER(error_source) + ELSE 'gateway' +END; + +-- Auto-resolve recovered upstream errors (client status < 400). 
+UPDATE ops_error_logs +SET + resolved = true, + resolved_at = COALESCE(resolved_at, created_at) +WHERE resolved = false AND COALESCE(status_code, 0) > 0 AND COALESCE(status_code, 0) < 400; From 16ddc6a83b38b1ee373ebc82545012862875ce06 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:01 +0800 Subject: [PATCH 02/86] =?UTF-8?q?feat(repository):=20=E6=89=A9=E5=B1=95ops?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=AE=BF=E9=97=AE=E5=B1=82=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增告警静默相关数据库操作 - 增强错误日志查询和统计功能 - 优化重试结果和解决状态的存储 --- backend/internal/repository/ops_repo.go | 344 +++++++++++++++++- .../internal/repository/ops_repo_alerts.go | 166 ++++++++- 2 files changed, 501 insertions(+), 9 deletions(-) diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index f9cb6b4d..d9d71867 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -134,6 +134,7 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr pageSize = 500 } + // buildOpsErrorLogsWhere may mutate filter (default resolved filter). 
where, args := buildOpsErrorLogsWhere(filter) countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where @@ -150,11 +151,19 @@ SELECT created_at, error_phase, error_type, + COALESCE(error_owner, ''), + COALESCE(error_source, ''), severity, COALESCE(upstream_status_code, status_code, 0), COALESCE(platform, ''), COALESCE(model, ''), duration_ms, + COALESCE(is_retryable, false), + COALESCE(retry_count, 0), + COALESCE(resolved, false), + resolved_at, + resolved_by_user_id, + resolved_retry_id, COALESCE(client_request_id, ''), COALESCE(request_id, ''), COALESCE(error_message, ''), @@ -186,16 +195,27 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) var apiKeyID sql.NullInt64 var accountID sql.NullInt64 var groupID sql.NullInt64 + var resolvedAt sql.NullTime + var resolvedBy sql.NullInt64 + var resolvedRetryID sql.NullInt64 if err := rows.Scan( &item.ID, &item.CreatedAt, &item.Phase, &item.Type, + &item.Owner, + &item.Source, &item.Severity, &statusCode, &item.Platform, &item.Model, &latency, + &item.IsRetryable, + &item.RetryCount, + &item.Resolved, + &resolvedAt, + &resolvedBy, + &resolvedRetryID, &item.ClientRequestID, &item.RequestID, &item.Message, @@ -209,6 +229,18 @@ LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) ); err != nil { return nil, err } + if resolvedAt.Valid { + t := resolvedAt.Time + item.ResolvedAt = &t + } + if resolvedBy.Valid { + v := resolvedBy.Int64 + item.ResolvedByUserID = &v + } + if resolvedRetryID.Valid { + v := resolvedRetryID.Int64 + item.ResolvedRetryID = &v + } if latency.Valid { v := int(latency.Int64) item.LatencyMs = &v @@ -262,11 +294,19 @@ SELECT created_at, error_phase, error_type, + COALESCE(error_owner, ''), + COALESCE(error_source, ''), severity, COALESCE(upstream_status_code, status_code, 0), COALESCE(platform, ''), COALESCE(model, ''), duration_ms, + COALESCE(is_retryable, false), + COALESCE(retry_count, 0), + COALESCE(resolved, false), + resolved_at, + resolved_by_user_id, + resolved_retry_id, 
COALESCE(client_request_id, ''), COALESCE(request_id, ''), COALESCE(error_message, ''), @@ -301,6 +341,9 @@ LIMIT 1` var latency sql.NullInt64 var statusCode sql.NullInt64 var upstreamStatusCode sql.NullInt64 + var resolvedAt sql.NullTime + var resolvedBy sql.NullInt64 + var resolvedRetryID sql.NullInt64 var clientIP sql.NullString var userID sql.NullInt64 var apiKeyID sql.NullInt64 @@ -318,11 +361,19 @@ LIMIT 1` &out.CreatedAt, &out.Phase, &out.Type, + &out.Owner, + &out.Source, &out.Severity, &statusCode, &out.Platform, &out.Model, &latency, + &out.IsRetryable, + &out.RetryCount, + &out.Resolved, + &resolvedAt, + &resolvedBy, + &resolvedRetryID, &out.ClientRequestID, &out.RequestID, &out.Message, @@ -359,6 +410,18 @@ LIMIT 1` v := int(latency.Int64) out.LatencyMs = &v } + if resolvedAt.Valid { + t := resolvedAt.Time + out.ResolvedAt = &t + } + if resolvedBy.Valid { + v := resolvedBy.Int64 + out.ResolvedByUserID = &v + } + if resolvedRetryID.Valid { + v := resolvedRetryID.Int64 + out.ResolvedRetryID = &v + } if clientIP.Valid { s := clientIP.String out.ClientIP = &s @@ -487,9 +550,15 @@ SET status = $2, finished_at = $3, duration_ms = $4, - result_request_id = $5, - result_error_id = $6, - error_message = $7 + success = $5, + http_status_code = $6, + upstream_request_id = $7, + used_account_id = $8, + response_preview = $9, + response_truncated = $10, + result_request_id = $11, + result_error_id = $12, + error_message = $13 WHERE id = $1` _, err := r.db.ExecContext( @@ -499,8 +568,14 @@ WHERE id = $1` strings.TrimSpace(input.Status), nullTime(input.FinishedAt), input.DurationMs, + nullBool(input.Success), + nullInt(input.HTTPStatusCode), + opsNullString(input.UpstreamRequestID), + nullInt64(input.UsedAccountID), + opsNullString(input.ResponsePreview), + nullBool(input.ResponseTruncated), opsNullString(input.ResultRequestID), - opsNullInt64(input.ResultErrorID), + nullInt64(input.ResultErrorID), opsNullString(input.ErrorMessage), ) return err @@ -526,6 +601,12 @@ 
SELECT started_at, finished_at, duration_ms, + success, + http_status_code, + upstream_request_id, + used_account_id, + response_preview, + response_truncated, result_request_id, result_error_id, error_message @@ -540,6 +621,12 @@ LIMIT 1` var startedAt sql.NullTime var finishedAt sql.NullTime var durationMs sql.NullInt64 + var success sql.NullBool + var httpStatusCode sql.NullInt64 + var upstreamRequestID sql.NullString + var usedAccountID sql.NullInt64 + var responsePreview sql.NullString + var responseTruncated sql.NullBool var resultRequestID sql.NullString var resultErrorID sql.NullInt64 var errorMessage sql.NullString @@ -555,6 +642,12 @@ LIMIT 1` &startedAt, &finishedAt, &durationMs, + &success, + &httpStatusCode, + &upstreamRequestID, + &usedAccountID, + &responsePreview, + &responseTruncated, &resultRequestID, &resultErrorID, &errorMessage, @@ -579,6 +672,30 @@ LIMIT 1` v := durationMs.Int64 out.DurationMs = &v } + if success.Valid { + v := success.Bool + out.Success = &v + } + if httpStatusCode.Valid { + v := int(httpStatusCode.Int64) + out.HTTPStatusCode = &v + } + if upstreamRequestID.Valid { + s := upstreamRequestID.String + out.UpstreamRequestID = &s + } + if usedAccountID.Valid { + v := usedAccountID.Int64 + out.UsedAccountID = &v + } + if responsePreview.Valid { + s := responsePreview.String + out.ResponsePreview = &s + } + if responseTruncated.Valid { + v := responseTruncated.Bool + out.ResponseTruncated = &v + } if resultRequestID.Valid { s := resultRequestID.String out.ResultRequestID = &s @@ -602,18 +719,217 @@ func nullTime(t time.Time) sql.NullTime { return sql.NullTime{Time: t, Valid: true} } +func nullBool(v *bool) sql.NullBool { + if v == nil { + return sql.NullBool{} + } + return sql.NullBool{Bool: *v, Valid: true} +} + +func (r *opsRepository) ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*service.OpsRetryAttempt, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + 
} + if sourceErrorID <= 0 { + return nil, fmt.Errorf("invalid source_error_id") + } + if limit <= 0 { + limit = 50 + } + if limit > 200 { + limit = 200 + } + + q := ` +SELECT + id, + created_at, + COALESCE(requested_by_user_id, 0), + source_error_id, + COALESCE(mode, ''), + pinned_account_id, + COALESCE(status, ''), + started_at, + finished_at, + duration_ms, + success, + http_status_code, + upstream_request_id, + used_account_id, + response_preview, + response_truncated, + result_request_id, + result_error_id, + error_message +FROM ops_retry_attempts +WHERE source_error_id = $1 +ORDER BY created_at DESC +LIMIT $2` + + rows, err := r.db.QueryContext(ctx, q, sourceErrorID, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make([]*service.OpsRetryAttempt, 0, 16) + for rows.Next() { + var item service.OpsRetryAttempt + var pinnedAccountID sql.NullInt64 + var requestedBy sql.NullInt64 + var startedAt sql.NullTime + var finishedAt sql.NullTime + var durationMs sql.NullInt64 + var success sql.NullBool + var httpStatusCode sql.NullInt64 + var upstreamRequestID sql.NullString + var usedAccountID sql.NullInt64 + var responsePreview sql.NullString + var responseTruncated sql.NullBool + var resultRequestID sql.NullString + var resultErrorID sql.NullInt64 + var errorMessage sql.NullString + + if err := rows.Scan( + &item.ID, + &item.CreatedAt, + &requestedBy, + &item.SourceErrorID, + &item.Mode, + &pinnedAccountID, + &item.Status, + &startedAt, + &finishedAt, + &durationMs, + &success, + &httpStatusCode, + &upstreamRequestID, + &usedAccountID, + &responsePreview, + &responseTruncated, + &resultRequestID, + &resultErrorID, + &errorMessage, + ); err != nil { + return nil, err + } + + item.RequestedByUserID = requestedBy.Int64 + if pinnedAccountID.Valid { + v := pinnedAccountID.Int64 + item.PinnedAccountID = &v + } + if startedAt.Valid { + t := startedAt.Time + item.StartedAt = &t + } + if finishedAt.Valid { + t := finishedAt.Time + 
item.FinishedAt = &t + } + if durationMs.Valid { + v := durationMs.Int64 + item.DurationMs = &v + } + if success.Valid { + v := success.Bool + item.Success = &v + } + if httpStatusCode.Valid { + v := int(httpStatusCode.Int64) + item.HTTPStatusCode = &v + } + if upstreamRequestID.Valid { + s := upstreamRequestID.String + item.UpstreamRequestID = &s + } + if usedAccountID.Valid { + v := usedAccountID.Int64 + item.UsedAccountID = &v + } + if responsePreview.Valid { + s := responsePreview.String + item.ResponsePreview = &s + } + if responseTruncated.Valid { + v := responseTruncated.Bool + item.ResponseTruncated = &v + } + if resultRequestID.Valid { + s := resultRequestID.String + item.ResultRequestID = &s + } + if resultErrorID.Valid { + v := resultErrorID.Int64 + item.ResultErrorID = &v + } + if errorMessage.Valid { + s := errorMessage.String + item.ErrorMessage = &s + } + + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if errorID <= 0 { + return fmt.Errorf("invalid error id") + } + + q := ` +UPDATE ops_error_logs +SET + resolved = $2, + resolved_at = $3, + resolved_by_user_id = $4, + resolved_retry_id = $5 +WHERE id = $1` + + at := sql.NullTime{} + if resolvedAt != nil && !resolvedAt.IsZero() { + at = sql.NullTime{Time: resolvedAt.UTC(), Valid: true} + } else if resolved { + now := time.Now().UTC() + at = sql.NullTime{Time: now, Valid: true} + } + + _, err := r.db.ExecContext( + ctx, + q, + errorID, + resolved, + at, + nullInt64(resolvedByUserID), + nullInt64(resolvedRetryID), + ) + return err +} + func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { - clauses := make([]string, 0, 8) - args := make([]any, 0, 8) + clauses := 
make([]string, 0, 12) + args := make([]any, 0, 12) clauses = append(clauses, "1=1") phaseFilter := "" if filter != nil { phaseFilter = strings.TrimSpace(strings.ToLower(filter.Phase)) } - // ops_error_logs primarily stores client-visible error requests (status>=400), + // ops_error_logs stores client-visible error requests (status>=400), // but we also persist "recovered" upstream errors (status<400) for upstream health visibility. - // By default, keep list endpoints scoped to client errors unless explicitly filtering upstream phase. + // By default, keep list endpoints scoped to unresolved records if the caller didn't specify. + if filter != nil && filter.Resolved == nil { + f := false + filter.Resolved = &f + } + // Keep list endpoints scoped to client errors unless explicitly filtering upstream phase. if phaseFilter != "upstream" { clauses = append(clauses, "COALESCE(status_code, 0) >= 400") } @@ -643,6 +959,18 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { args = append(args, phase) clauses = append(clauses, "error_phase = $"+itoa(len(args))) } + if owner := strings.TrimSpace(strings.ToLower(filter.Owner)); owner != "" { + args = append(args, owner) + clauses = append(clauses, "LOWER(COALESCE(error_owner,'')) = $"+itoa(len(args))) + } + if source := strings.TrimSpace(strings.ToLower(filter.Source)); source != "" { + args = append(args, source) + clauses = append(clauses, "LOWER(COALESCE(error_source,'')) = $"+itoa(len(args))) + } + if filter.Resolved != nil { + args = append(args, *filter.Resolved) + clauses = append(clauses, "COALESCE(resolved,false) = $"+itoa(len(args))) + } if len(filter.StatusCodes) > 0 { args = append(args, pq.Array(filter.StatusCodes)) clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go index f601c363..bd98b7e4 100644 --- 
a/backend/internal/repository/ops_repo_alerts.go +++ b/backend/internal/repository/ops_repo_alerts.go @@ -354,7 +354,7 @@ SELECT created_at FROM ops_alert_events ` + where + ` -ORDER BY fired_at DESC +ORDER BY fired_at DESC, id DESC LIMIT ` + limitArg rows, err := r.db.QueryContext(ctx, q, args...) @@ -413,6 +413,43 @@ LIMIT ` + limitArg return out, nil } +func (r *opsRepository) GetAlertEventByID(ctx context.Context, eventID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return nil, fmt.Errorf("invalid event id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE id = $1` + + row := r.db.QueryRowContext(ctx, q, eventID) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { if r == nil || r.db == nil { return nil, fmt.Errorf("nil ops repository") @@ -591,6 +628,121 @@ type opsAlertEventRow interface { Scan(dest ...any) error } +func (r *opsRepository) CreateAlertSilence(ctx context.Context, input *service.OpsAlertSilence) (*service.OpsAlertSilence, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + if input.RuleID <= 0 { + return nil, fmt.Errorf("invalid rule_id") + } + platform := strings.TrimSpace(input.Platform) + if platform == "" { + return nil, fmt.Errorf("invalid platform") + } + if input.Until.IsZero() { + return nil, fmt.Errorf("invalid until") + } + + q := ` +INSERT INTO ops_alert_silences ( + rule_id, + platform, + 
group_id, + region, + until, + reason, + created_by, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,NOW() +) +RETURNING id, rule_id, platform, group_id, region, until, COALESCE(reason,''), created_by, created_at` + + row := r.db.QueryRowContext( + ctx, + q, + input.RuleID, + platform, + opsNullInt64(input.GroupID), + opsNullString(input.Region), + input.Until, + opsNullString(input.Reason), + opsNullInt64(input.CreatedBy), + ) + + var out service.OpsAlertSilence + var groupID sql.NullInt64 + var region sql.NullString + var createdBy sql.NullInt64 + if err := row.Scan( + &out.ID, + &out.RuleID, + &out.Platform, + &groupID, + &region, + &out.Until, + &out.Reason, + &createdBy, + &out.CreatedAt, + ); err != nil { + return nil, err + } + if groupID.Valid { + v := groupID.Int64 + out.GroupID = &v + } + if region.Valid { + v := strings.TrimSpace(region.String) + if v != "" { + out.Region = &v + } + } + if createdBy.Valid { + v := createdBy.Int64 + out.CreatedBy = &v + } + return &out, nil +} + +func (r *opsRepository) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) { + if r == nil || r.db == nil { + return false, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return false, fmt.Errorf("invalid rule id") + } + platform = strings.TrimSpace(platform) + if platform == "" { + return false, nil + } + if now.IsZero() { + now = time.Now().UTC() + } + + q := ` +SELECT 1 +FROM ops_alert_silences +WHERE rule_id = $1 + AND platform = $2 + AND (group_id IS NOT DISTINCT FROM $3) + AND (region IS NOT DISTINCT FROM $4) + AND until > $5 +LIMIT 1` + + var dummy int + err := r.db.QueryRowContext(ctx, q, ruleID, platform, opsNullInt64(groupID), opsNullString(region), now).Scan(&dummy) + if err != nil { + if err == sql.ErrNoRows { + return false, nil + } + return false, err + } + return true, nil +} + +func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) { var ev 
service.OpsAlertEvent var metricValue sql.NullFloat64 @@ -652,6 +804,10 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an args = append(args, severity) clauses = append(clauses, "severity = $"+itoa(len(args))) } + if filter.EmailSent != nil { + args = append(args, *filter.EmailSent) + clauses = append(clauses, "email_sent = $"+itoa(len(args))) + } if filter.StartTime != nil && !filter.StartTime.IsZero() { args = append(args, *filter.StartTime) clauses = append(clauses, "fired_at >= $"+itoa(len(args))) @@ -661,6 +817,14 @@ func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []an clauses = append(clauses, "fired_at < $"+itoa(len(args))) } + // Cursor pagination (descending by fired_at, then id) + if filter.BeforeFiredAt != nil && !filter.BeforeFiredAt.IsZero() && filter.BeforeID != nil && *filter.BeforeID > 0 { + args = append(args, *filter.BeforeFiredAt) + tsArg := "$" + itoa(len(args)) + args = append(args, *filter.BeforeID) + idArg := "$" + itoa(len(args)) + clauses = append(clauses, fmt.Sprintf("(fired_at < %s OR (fired_at = %s AND id < %s))", tsArg, tsArg, idArg)) + } // Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes. 
if platform := strings.TrimSpace(filter.Platform); platform != "" { args = append(args, platform) From d60176801635a188ebcba04701d40dd5085a5b90 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:16 +0800 Subject: [PATCH 03/86] =?UTF-8?q?feat(service):=20=E5=A2=9E=E5=BC=BAops?= =?UTF-8?q?=E4=B8=9A=E5=8A=A1=E9=80=BB=E8=BE=91=E5=92=8C=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 实现告警静默功能的业务逻辑 - 优化错误分类和重试机制 - 扩展告警评估和通知功能 - 完善错误解决和重试结果处理 --- .../service/ops_alert_evaluator_service.go | 11 +++ backend/internal/service/ops_alert_models.go | 29 +++++++- backend/internal/service/ops_alerts.go | 66 +++++++++++++++++ backend/internal/service/ops_models.go | 33 ++++++++- backend/internal/service/ops_port.go | 17 ++++- backend/internal/service/ops_retry.go | 32 ++++++-- backend/internal/service/ops_service.go | 74 +++++++++++++++++-- 7 files changed, 242 insertions(+), 20 deletions(-) diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go index f376c246..3efa11d2 100644 --- a/backend/internal/service/ops_alert_evaluator_service.go +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -236,6 +236,17 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { continue } + // Scoped silencing: if a matching silence exists, skip creating a firing event. 
+ if s.opsService != nil { + platform := strings.TrimSpace(scopePlatform) + region := (*string)(nil) + if platform != "" { + if ok, err := s.opsService.IsAlertSilenced(ctx, rule.ID, platform, scopeGroupID, region, now); err == nil && ok { + continue + } + } + } + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) if err != nil { log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go index 0acf13ab..a0caa990 100644 --- a/backend/internal/service/ops_alert_models.go +++ b/backend/internal/service/ops_alert_models.go @@ -8,8 +8,9 @@ import "time" // with the existing ops dashboard frontend (backup style). const ( - OpsAlertStatusFiring = "firing" - OpsAlertStatusResolved = "resolved" + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" + OpsAlertStatusManualResolved = "manual_resolved" ) type OpsAlertRule struct { @@ -58,12 +59,32 @@ type OpsAlertEvent struct { CreatedAt time.Time `json:"created_at"` } +type OpsAlertSilence struct { + ID int64 `json:"id"` + + RuleID int64 `json:"rule_id"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id,omitempty"` + Region *string `json:"region,omitempty"` + + Until time.Time `json:"until"` + Reason string `json:"reason"` + + CreatedBy *int64 `json:"created_by,omitempty"` + CreatedAt time.Time `json:"created_at"` +} + type OpsAlertEventFilter struct { Limit int + // Cursor pagination (descending by fired_at, then id). + BeforeFiredAt *time.Time + BeforeID *int64 + // Optional filters. 
- Status string - Severity string + Status string + Severity string + EmailSent *bool StartTime *time.Time EndTime *time.Time diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go index b6c3d1c3..c2bb4e7b 100644 --- a/backend/internal/service/ops_alerts.go +++ b/backend/internal/service/ops_alerts.go @@ -88,6 +88,29 @@ func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventF return s.opsRepo.ListAlertEvents(ctx, filter) } +func (s *OpsService) GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return nil, infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + ev, err := s.opsRepo.GetAlertEventByID(ctx, eventID) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found") + } + return nil, err + } + if ev == nil { + return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found") + } + return ev, nil +} + func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -101,6 +124,49 @@ func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*Op return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) } +func (s *OpsService) CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if input == nil { + return nil, infraerrors.BadRequest("INVALID_SILENCE", "invalid 
silence") + } + if input.RuleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if strings.TrimSpace(input.Platform) == "" { + return nil, infraerrors.BadRequest("INVALID_PLATFORM", "invalid platform") + } + if input.Until.IsZero() { + return nil, infraerrors.BadRequest("INVALID_UNTIL", "invalid until") + } + + created, err := s.opsRepo.CreateAlertSilence(ctx, input) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return false, err + } + if s.opsRepo == nil { + return false, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return false, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if strings.TrimSpace(platform) == "" { + return false, nil + } + return s.opsRepo.IsAlertSilenced(ctx, ruleID, platform, groupID, region, now) +} + func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index 996267fd..78f7cdd0 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -6,8 +6,16 @@ type OpsErrorLog struct { ID int64 `json:"id"` CreatedAt time.Time `json:"created_at"` - Phase string `json:"phase"` - Type string `json:"type"` + // Standardized classification + // - phase: request|auth|routing|upstream|network|internal + // - owner: client|provider|platform + // - source: client_request|upstream_http|gateway + Phase string `json:"phase"` + Type string `json:"type"` + + Owner string `json:"error_owner"` + Source string `json:"error_source"` + Severity string `json:"severity"` 
StatusCode int `json:"status_code"` @@ -16,6 +24,15 @@ type OpsErrorLog struct { LatencyMs *int `json:"latency_ms"` + IsRetryable bool `json:"is_retryable"` + RetryCount int `json:"retry_count"` + + Resolved bool `json:"resolved"` + ResolvedAt *time.Time `json:"resolved_at"` + ResolvedByUserID *int64 `json:"resolved_by_user_id"` + ResolvedRetryID *int64 `json:"resolved_retry_id"` + ResolvedStatusRaw string `json:"-"` + ClientRequestID string `json:"client_request_id"` RequestID string `json:"request_id"` Message string `json:"message"` @@ -69,6 +86,9 @@ type OpsErrorLogFilter struct { StatusCodes []int Phase string + Owner string + Source string + Resolved *bool Query string Page int @@ -96,6 +116,15 @@ type OpsRetryAttempt struct { FinishedAt *time.Time `json:"finished_at"` DurationMs *int64 `json:"duration_ms"` + // Persisted execution results (best-effort) + Success *bool `json:"success"` + HTTPStatusCode *int `json:"http_status_code"` + UpstreamRequestID *string `json:"upstream_request_id"` + UsedAccountID *int64 `json:"used_account_id"` + ResponsePreview *string `json:"response_preview"` + ResponseTruncated *bool `json:"response_truncated"` + + // Optional correlation ResultRequestID *string `json:"result_request_id"` ResultErrorID *int64 `json:"result_error_id"` diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 4df21c37..37a8107c 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -14,6 +14,8 @@ type OpsRepository interface { InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error) + ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*OpsRetryAttempt, error) + UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, 
resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error // Lightweight window stats (for realtime WS / quick sampling). GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) @@ -39,12 +41,17 @@ type OpsRepository interface { DeleteAlertRule(ctx context.Context, id int64) error ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + // Alert silences + CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) + IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. 
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error @@ -124,7 +131,15 @@ type OpsUpdateRetryAttemptInput struct { FinishedAt time.Time DurationMs int64 - // Optional correlation + // Persisted execution results (best-effort) + Success *bool + HTTPStatusCode *int + UpstreamRequestID *string + UsedAccountID *int64 + ResponsePreview *string + ResponseTruncated *bool + + // Optional correlation (legacy fields kept) ResultRequestID *string ResultErrorID *int64 diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go index 747aa3b8..2cbb8ced 100644 --- a/backend/internal/service/ops_retry.go +++ b/backend/internal/service/ops_retry.go @@ -231,16 +231,36 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er finalStatus = opsRetryStatusFailed } + success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded) + httpStatus := result.HTTPStatusCode + upstreamReqID := result.UpstreamRequestID + usedAccountID := result.UsedAccountID + preview := result.ResponsePreview + truncated := result.ResponseTruncated + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ - ID: attemptID, - Status: finalStatus, - FinishedAt: finishedAt, - DurationMs: result.DurationMs, - ResultRequestID: resultRequestID, - ErrorMessage: updateErrMsg, + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + Success: &success, + HTTPStatusCode: &httpStatus, + UpstreamRequestID: &upstreamReqID, + UsedAccountID: usedAccountID, + ResponsePreview: &preview, + ResponseTruncated: &truncated, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, }); err != nil { // Best-effort: retry itself already executed; do not fail the API response. log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } else { + // Auto-resolve the source error when a manual retry succeeds. 
+ if success { + if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { + log.Printf("[Ops] UpdateErrorResolution failed: %v", err) + } + } } return result, nil diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index 426d46f1..d9984659 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -256,6 +256,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo return detail, nil } +func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + if errorID <= 0 { + return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id") + } + items, err := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return []*OpsRetryAttempt{}, nil + } + return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(err) + } + return items, nil +} + +func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if errorID <= 0 { + return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id") + } + // Best-effort ensure the error exists + if _, err := s.opsRepo.GetErrorLogByID(ctx, errorID); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + 
return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil) +} + func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { bytesLen = len(raw) if len(raw) == 0 { @@ -296,14 +336,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr } } - // Last resort: store a minimal placeholder (still valid JSON). - placeholder := map[string]any{ - "request_body_truncated": true, + // Last resort: keep JSON shape but drop big fields. + // This avoids downstream code that expects certain top-level keys from crashing. + if root, ok := decoded.(map[string]any); ok { + placeholder := shallowCopyMap(root) + placeholder["request_body_truncated"] = true + + // Replace potentially huge arrays/strings, but keep the keys present. + for _, k := range []string{"messages", "contents", "input", "prompt"} { + if _, exists := placeholder[k]; exists { + placeholder[k] = []any{} + } + } + for _, k := range []string{"text"} { + if _, exists := placeholder[k]; exists { + placeholder[k] = "" + } + } + + encoded4, err4 := json.Marshal(placeholder) + if err4 == nil { + if len(encoded4) <= maxBytes { + return string(encoded4), true, bytesLen + } + } } - if model := extractString(decoded, "model"); model != "" { - placeholder["model"] = model - } - encoded4, err4 := json.Marshal(placeholder) + + // Final fallback: minimal valid JSON. 
+ encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true}) if err4 != nil { return "", true, bytesLen } From 659df6e220a5499959bbd6980de702534266094c Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:35 +0800 Subject: [PATCH 04/86] =?UTF-8?q?feat(handler):=20=E6=96=B0=E5=A2=9Eops?= =?UTF-8?q?=E7=AE=A1=E7=90=86=E6=8E=A5=E5=8F=A3=E5=92=8C=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加告警静默管理接口 - 扩展错误日志查询和操作接口 - 新增重试和解决状态相关端点 - 完善错误日志记录功能 --- .../handler/admin/ops_alerts_handler.go | 168 +++++++++++++++++- backend/internal/handler/admin/ops_handler.go | 105 +++++++++++ backend/internal/handler/ops_error_logger.go | 32 ++-- backend/internal/server/routes/admin.go | 5 + 4 files changed, 296 insertions(+), 14 deletions(-) diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go index 1e33ddd5..e7ad693b 100644 --- a/backend/internal/handler/admin/ops_alerts_handler.go +++ b/backend/internal/handler/admin/ops_alerts_handler.go @@ -7,8 +7,10 @@ import ( "net/http" "strconv" "strings" + "time" "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/server/middleware" "github.com/Wei-Shaw/sub2api/internal/service" "github.com/gin-gonic/gin" "github.com/gin-gonic/gin/binding" @@ -372,8 +374,135 @@ func (h *OpsHandler) DeleteAlertRule(c *gin.Context) { response.Success(c, gin.H{"deleted": true}) } +// GetAlertEvent returns a single ops alert event. 
+// GET /api/v1/admin/ops/alert-events/:id +func (h *OpsHandler) GetAlertEvent(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid event ID") + return + } + + ev, err := h.opsService.GetAlertEventByID(c.Request.Context(), id) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, ev) +} + +// UpdateAlertEventStatus updates an ops alert event status. +// PUT /api/v1/admin/ops/alert-events/:id/status +func (h *OpsHandler) UpdateAlertEventStatus(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid event ID") + return + } + + var payload struct { + Status string `json:"status"` + } + if err := c.ShouldBindJSON(&payload); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + payload.Status = strings.TrimSpace(payload.Status) + if payload.Status == "" { + response.BadRequest(c, "Invalid status") + return + } + if payload.Status != service.OpsAlertStatusResolved && payload.Status != service.OpsAlertStatusManualResolved { + response.BadRequest(c, "Invalid status") + return + } + + var resolvedAt *time.Time + if payload.Status == service.OpsAlertStatusResolved || payload.Status == service.OpsAlertStatusManualResolved { + now := time.Now().UTC() + resolvedAt = &now + } + if err := h.opsService.UpdateAlertEventStatus(c.Request.Context(), id, payload.Status, resolvedAt); err != 
nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"updated": true}) +} + // ListAlertEvents lists recent ops alert events. // GET /api/v1/admin/ops/alert-events +// CreateAlertSilence creates a scoped silence for ops alerts. +// POST /api/v1/admin/ops/alert-silences +func (h *OpsHandler) CreateAlertSilence(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var payload struct { + RuleID int64 `json:"rule_id"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + Region *string `json:"region"` + Until string `json:"until"` + Reason string `json:"reason"` + } + if err := c.ShouldBindJSON(&payload); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + until, err := time.Parse(time.RFC3339, strings.TrimSpace(payload.Until)) + if err != nil { + response.BadRequest(c, "Invalid until") + return + } + + createdBy := (*int64)(nil) + if subject, ok := middleware.GetAuthSubjectFromContext(c); ok { + uid := subject.UserID + createdBy = &uid + } + + silence := &service.OpsAlertSilence{ + RuleID: payload.RuleID, + Platform: strings.TrimSpace(payload.Platform), + GroupID: payload.GroupID, + Region: payload.Region, + Until: until, + Reason: strings.TrimSpace(payload.Reason), + CreatedBy: createdBy, + } + + created, err := h.opsService.CreateAlertSilence(c.Request.Context(), silence) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, created) +} + func (h *OpsHandler) ListAlertEvents(c *gin.Context) { if h.opsService == nil { response.Error(c, http.StatusServiceUnavailable, "Ops service not available") @@ -384,7 +513,7 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) { return } - limit := 100 + limit := 20 if raw := strings.TrimSpace(c.Query("limit")); raw != "" { 
n, err := strconv.Atoi(raw) if err != nil || n <= 0 { @@ -400,6 +529,43 @@ func (h *OpsHandler) ListAlertEvents(c *gin.Context) { Severity: strings.TrimSpace(c.Query("severity")), } + if v := strings.TrimSpace(c.Query("email_sent")); v != "" { + vv := strings.ToLower(v) + switch vv { + case "true", "1": + b := true + filter.EmailSent = &b + case "false", "0": + b := false + filter.EmailSent = &b + default: + response.BadRequest(c, "Invalid email_sent") + return + } + } + + // Cursor pagination + if rawTS := strings.TrimSpace(c.Query("before_fired_at")); rawTS != "" { + ts, err := time.Parse(time.RFC3339Nano, rawTS) + if err != nil { + if t2, err2 := time.Parse(time.RFC3339, rawTS); err2 == nil { + ts = t2 + } else { + response.BadRequest(c, "Invalid before_fired_at") + return + } + } + filter.BeforeFiredAt = &ts + } + if rawID := strings.TrimSpace(c.Query("before_id")); rawID != "" { + id, err := strconv.ParseInt(rawID, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid before_id") + return + } + filter.BeforeID = &id + } + // Optional global filter support (platform/group/time range). 
if platform := strings.TrimSpace(c.Query("platform")); platform != "" { filter.Platform = platform diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go index bff7426a..ec7a8b75 100644 --- a/backend/internal/handler/admin/ops_handler.go +++ b/backend/internal/handler/admin/ops_handler.go @@ -80,6 +80,25 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) { if phase := strings.TrimSpace(c.Query("phase")); phase != "" { filter.Phase = phase } + if owner := strings.TrimSpace(c.Query("error_owner")); owner != "" { + filter.Owner = owner + } + if source := strings.TrimSpace(c.Query("error_source")); source != "" { + filter.Source = source + } + if v := strings.TrimSpace(c.Query("resolved")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "yes": + b := true + filter.Resolved = &b + case "0", "false", "no": + b := false + filter.Resolved = &b + default: + response.BadRequest(c, "Invalid resolved") + return + } + } if q := strings.TrimSpace(c.Query("q")); q != "" { filter.Query = q } @@ -242,6 +261,11 @@ func (h *OpsHandler) ListRequestDetails(c *gin.Context) { type opsRetryRequest struct { Mode string `json:"mode"` PinnedAccountID *int64 `json:"pinned_account_id"` + Force bool `json:"force"` +} + +type opsResolveRequest struct { + Resolved bool `json:"resolved"` } // RetryErrorRequest retries a failed request using stored request_body. @@ -278,6 +302,8 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { req.Mode = service.OpsRetryModeClient } + // Force flag is currently a UI-level acknowledgement. Server may still enforce safety constraints. + _ = req.Force result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) if err != nil { response.ErrorFrom(c, err) @@ -287,6 +313,81 @@ func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { response.Success(c, result) } +// ListRetryAttempts lists retry attempts for an error log. 
+// GET /api/v1/admin/ops/errors/:id/retries +func (h *OpsHandler) ListRetryAttempts(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + limit := 50 + if v := strings.TrimSpace(c.Query("limit")); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + items, err := h.opsService.ListRetryAttemptsByErrorID(c.Request.Context(), id, limit) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, items) +} + +// UpdateErrorResolution allows manual resolve/unresolve. +// PUT /api/v1/admin/ops/errors/:id/resolve +func (h *OpsHandler) UpdateErrorResolution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + var req opsResolveRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + uid := subject.UserID + if err := h.opsService.UpdateErrorResolution(c.Request.Context(), id, req.Resolved, &uid, nil); err != nil { + response.ErrorFrom(c, err) + return + } + 
response.Success(c, gin.H{"ok": true}) +} + func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { startStr := strings.TrimSpace(c.Query("start_time")) endStr := strings.TrimSpace(c.Query("end_time")) @@ -358,6 +459,10 @@ func parseOpsDuration(v string) (time.Duration, bool) { return 6 * time.Hour, true case "24h": return 24 * time.Hour, true + case "7d": + return 7 * 24 * time.Hour, true + case "30d": + return 30 * 24 * time.Hour, true default: return 0, false } diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 13bd9d94..f101bf92 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -832,28 +832,30 @@ func normalizeOpsErrorType(errType string, code string) string { func classifyOpsPhase(errType, message, code string) string { msg := strings.ToLower(message) + // Standardized phases: request|auth|routing|upstream|network|internal + // Map billing/concurrency/response => request; scheduling => routing. 
switch strings.TrimSpace(code) { case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": - return "billing" + return "request" } switch errType { case "authentication_error": return "auth" case "billing_error", "subscription_error": - return "billing" + return "request" case "rate_limit_error": if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { - return "concurrency" + return "request" } return "upstream" case "invalid_request_error": - return "response" + return "request" case "upstream_error", "overloaded_error": return "upstream" case "api_error": if strings.Contains(msg, "no available accounts") { - return "scheduling" + return "routing" } return "internal" default: @@ -914,34 +916,38 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa } func classifyOpsErrorOwner(phase string, message string) string { + // Standardized owners: client|provider|platform switch phase { case "upstream", "network": return "provider" - case "billing", "concurrency", "auth", "response": + case "request", "auth": return "client" + case "routing", "internal": + return "platform" default: if strings.Contains(strings.ToLower(message), "upstream") { return "provider" } - return "sub2api" + return "platform" } } func classifyOpsErrorSource(phase string, message string) string { + // Standardized sources: client_request|upstream_http|gateway switch phase { case "upstream": return "upstream_http" case "network": - return "upstream_network" - case "billing": - return "billing" - case "concurrency": - return "concurrency" + return "gateway" + case "request", "auth": + return "client_request" + case "routing", "internal": + return "gateway" default: if strings.Contains(strings.ToLower(message), "upstream") { return "upstream_http" } - return "internal" + return "gateway" } } diff --git a/backend/internal/server/routes/admin.go 
b/backend/internal/server/routes/admin.go index 9bb019bb..adae7cdd 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -81,6 +81,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + ops.GET("/alert-events/:id", h.Admin.Ops.GetAlertEvent) + ops.PUT("/alert-events/:id/status", h.Admin.Ops.UpdateAlertEventStatus) + ops.POST("/alert-silences", h.Admin.Ops.CreateAlertSilence) // Email notification config (DB-backed) ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) @@ -113,7 +116,9 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { // Error logs (MVP-1) ops.GET("/errors", h.Admin.Ops.GetErrorLogs) ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.GET("/errors/:id/retries", h.Admin.Ops.ListRetryAttempts) ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + ops.PUT("/errors/:id/resolve", h.Admin.Ops.UpdateErrorResolution) // Request drilldown (success + error) ops.GET("/requests", h.Admin.Ops.ListRequestDetails) From b8da5d45cefb67ec9103507161407f752ec24199 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:45 +0800 Subject: [PATCH 05/86] =?UTF-8?q?feat(api):=20=E6=89=A9=E5=B1=95=E5=89=8D?= =?UTF-8?q?=E7=AB=AFops=20API=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增告警静默相关API调用 - 增强错误日志查询和过滤接口 - 添加重试和解决状态管理接口 --- frontend/src/api/admin/ops.ts | 108 +++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index ce0ab58d..24ac7ad3 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -17,6 +17,33 @@ export 
interface OpsRequestOptions { export interface OpsRetryRequest { mode: OpsRetryMode pinned_account_id?: number + force?: boolean +} + +export interface OpsRetryAttempt { + id: number + created_at: string + requested_by_user_id: number + source_error_id: number + mode: OpsRetryMode | string + pinned_account_id?: number | null + + status: string + started_at?: string | null + finished_at?: string | null + duration_ms?: number | null + + success?: boolean | null + http_status_code?: number | null + upstream_request_id?: string | null + used_account_id?: number | null + response_preview?: string | null + response_truncated?: boolean | null + + result_request_id?: string | null + result_error_id?: number | null + + error_message?: string | null } export interface OpsRetryResult { @@ -663,7 +690,7 @@ export interface AlertEvent { id: number rule_id: number severity: OpsSeverity | string - status: 'firing' | 'resolved' | string + status: 'firing' | 'resolved' | 'manual_resolved' | string title?: string description?: string metric_value?: number @@ -701,10 +728,10 @@ export interface EmailNotificationConfig { } export interface OpsMetricThresholds { - sla_percent_min?: number | null // SLA低于此值变红 - latency_p99_ms_max?: number | null // 延迟P99高于此值变红 - ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 - request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 + sla_percent_min?: number | null // SLA低于此值变红 + latency_p99_ms_max?: number | null // 延迟 P99 高于此值变红 + ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 + request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红 } @@ -754,13 +781,27 @@ export interface OpsAggregationSettings { export interface OpsErrorLog { id: number created_at: string + + // Standardized classification phase: OpsPhase type: string + error_owner: 'client' | 'provider' | 'platform' | string + error_source: 'client_request' | 'upstream_http' | 'gateway' | string + severity: 
OpsSeverity status_code: number platform: string model: string latency_ms?: number | null + + is_retryable: boolean + retry_count: number + + resolved: boolean + resolved_at?: string | null + resolved_by_user_id?: number | null + resolved_retry_id?: number | null + client_request_id: string request_id: string message: string @@ -899,7 +940,12 @@ export async function listErrorLogs(params: { platform?: string group_id?: number | null account_id?: number | null + phase?: string + error_owner?: string + error_source?: string + resolved?: string + q?: string status_codes?: string }): Promise { @@ -917,6 +963,15 @@ export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promi return data } +export async function listRetryAttempts(errorId: number, limit = 50): Promise { + const { data } = await apiClient.get(`/admin/ops/errors/${errorId}/retries`, { params: { limit } }) + return data +} + +export async function updateErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/errors/${errorId}/resolve`, { resolved }) +} + export async function listRequestDetails(params: OpsRequestDetailsParams): Promise { const { data } = await apiClient.get('/admin/ops/requests', { params }) return data @@ -942,11 +997,45 @@ export async function deleteAlertRule(id: number): Promise { await apiClient.delete(`/admin/ops/alert-rules/${id}`) } -export async function listAlertEvents(limit = 100): Promise { - const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } }) +export interface AlertEventsQuery { + limit?: number + status?: string + severity?: string + email_sent?: boolean + time_range?: string + start_time?: string + end_time?: string + before_fired_at?: string + before_id?: number + platform?: string + group_id?: number +} + +export async function listAlertEvents(params: AlertEventsQuery = {}): Promise { + const { data } = await apiClient.get('/admin/ops/alert-events', { params }) return data } +export 
async function getAlertEvent(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/alert-events/${id}`) + return data +} + +export async function updateAlertEventStatus(id: number, status: 'resolved' | 'manual_resolved'): Promise { + await apiClient.put(`/admin/ops/alert-events/${id}/status`, { status }) +} + +export async function createAlertSilence(payload: { + rule_id: number + platform: string + group_id?: number | null + region?: string | null + until: string + reason?: string +}): Promise { + await apiClient.post('/admin/ops/alert-silences', payload) +} + // Email notification config export async function getEmailNotificationConfig(): Promise { const { data } = await apiClient.get('/admin/ops/email-notification/config') @@ -1004,12 +1093,17 @@ export const opsAPI = { listErrorLogs, getErrorLogDetail, retryErrorRequest, + listRetryAttempts, + updateErrorResolved, listRequestDetails, listAlertRules, createAlertRule, updateAlertRule, deleteAlertRule, listAlertEvents, + getAlertEvent, + updateAlertEventStatus, + createAlertSilence, getEmailNotificationConfig, updateEmailNotificationConfig, getAlertRuntimeSettings, From f38a3e758506b98e3d09742e04bee3edf3fc6c5f Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 09:03:59 +0800 Subject: [PATCH 06/86] =?UTF-8?q?feat(ui):=20=E4=BC=98=E5=8C=96ops?= =?UTF-8?q?=E7=9B=91=E6=8E=A7=E9=9D=A2=E6=9D=BF=E5=92=8C=E7=BB=84=E4=BB=B6?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 增强告警事件卡片的交互和静默功能 - 完善错误详情弹窗的展示和操作 - 优化错误日志表格的筛选和排序 - 新增重试和解决状态的UI支持 --- frontend/src/views/admin/ops/OpsDashboard.vue | 26 +- .../ops/components/OpsAlertEventsCard.vue | 555 ++++++++++++++++-- .../ops/components/OpsErrorDetailModal.vue | 405 +++++++++++-- .../ops/components/OpsErrorDetailsModal.vue | 50 +- .../admin/ops/components/OpsErrorLogTable.vue | 63 +- 5 files changed, 1013 insertions(+), 86 
deletions(-) diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index be445a32..d059059d 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -169,7 +169,13 @@ const QUERY_KEYS = { platform: 'platform', groupId: 'group_id', queryMode: 'mode', - fullscreen: 'fullscreen' + fullscreen: 'fullscreen', + + // Deep links + openErrorDetails: 'open_error_details', + errorType: 'error_type', + alertRuleId: 'alert_rule_id', + openAlertRules: 'open_alert_rules' } as const const isApplyingRouteQuery = ref(false) @@ -249,6 +255,24 @@ const applyRouteQueryToState = () => { const fallback = adminSettingsStore.opsQueryModeDefault || 'auto' queryMode.value = allowedQueryModes.has(fallback as QueryMode) ? (fallback as QueryMode) : 'auto' } + + // Deep links + const openRules = readQueryString(QUERY_KEYS.openAlertRules) + if (openRules === '1' || openRules === 'true') { + showAlertRulesCard.value = true + } + + const ruleID = readQueryNumber(QUERY_KEYS.alertRuleId) + if (typeof ruleID === 'number' && ruleID > 0) { + showAlertRulesCard.value = true + } + + const openErr = readQueryString(QUERY_KEYS.openErrorDetails) + if (openErr === '1' || openErr === 'true') { + const typ = readQueryString(QUERY_KEYS.errorType) + errorDetailsType.value = typ === 'upstream' ? 
'upstream' : 'request' + showErrorDetails.value = true + } } applyRouteQueryToState() diff --git a/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue b/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue index 58a91355..ef6e8f80 100644 --- a/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue @@ -3,42 +3,326 @@ import { computed, onMounted, ref, watch } from 'vue' import { useI18n } from 'vue-i18n' import { useAppStore } from '@/stores/app' import Select from '@/components/common/Select.vue' -import { opsAPI } from '@/api/admin/ops' +import BaseDialog from '@/components/common/BaseDialog.vue' +import Icon from '@/components/icons/Icon.vue' +import { opsAPI, type AlertEventsQuery } from '@/api/admin/ops' import type { AlertEvent } from '../types' import { formatDateTime } from '../utils/opsFormatters' const { t } = useI18n() const appStore = useAppStore() -const loading = ref(false) -const events = ref([]) +const PAGE_SIZE = 20 -const limit = ref(100) -const limitOptions = computed(() => [ - { value: 50, label: '50' }, - { value: 100, label: '100' }, - { value: 200, label: '200' } +const loading = ref(false) +const loadingMore = ref(false) +const events = ref([]) +const hasMore = ref(true) + +// Detail modal +const showDetail = ref(false) +const selected = ref(null) +const detailLoading = ref(false) +const detailActionLoading = ref(false) +const historyLoading = ref(false) +const history = ref([]) +const historyRange = ref('7d') +const historyRangeOptions = computed(() => [ + { value: '7d', label: t('admin.ops.timeRange.7d') }, + { value: '30d', label: t('admin.ops.timeRange.30d') } ]) -async function load() { +const silenceDuration = ref('1h') +const silenceDurationOptions = computed(() => [ + { value: '1h', label: t('admin.ops.timeRange.1h') }, + { value: '24h', label: t('admin.ops.timeRange.24h') }, + { value: '7d', label: t('admin.ops.timeRange.7d') } +]) + +// 
Filters +const timeRange = ref('24h') +const timeRangeOptions = computed(() => [ + { value: '5m', label: t('admin.ops.timeRange.5m') }, + { value: '30m', label: t('admin.ops.timeRange.30m') }, + { value: '1h', label: t('admin.ops.timeRange.1h') }, + { value: '6h', label: t('admin.ops.timeRange.6h') }, + { value: '24h', label: t('admin.ops.timeRange.24h') }, + { value: '7d', label: t('admin.ops.timeRange.7d') }, + { value: '30d', label: t('admin.ops.timeRange.30d') } +]) + +const severity = ref('') +const severityOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'P0', label: 'P0' }, + { value: 'P1', label: 'P1' }, + { value: 'P2', label: 'P2' }, + { value: 'P3', label: 'P3' } +]) + +const status = ref('') +const statusOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'firing', label: t('admin.ops.alertEvents.status.firing') }, + { value: 'resolved', label: t('admin.ops.alertEvents.status.resolved') }, + { value: 'manual_resolved', label: t('admin.ops.alertEvents.status.manualResolved') } +]) + +const emailSent = ref('') +const emailSentOptions = computed(() => [ + { value: '', label: t('common.all') }, + { value: 'true', label: t('admin.ops.alertEvents.table.emailSent') }, + { value: 'false', label: t('admin.ops.alertEvents.table.emailIgnored') } +]) + +function buildQuery(overrides: Partial = {}): AlertEventsQuery { + const q: AlertEventsQuery = { + limit: PAGE_SIZE, + time_range: timeRange.value + } + if (severity.value) q.severity = severity.value + if (status.value) q.status = status.value + if (emailSent.value === 'true') q.email_sent = true + if (emailSent.value === 'false') q.email_sent = false + return { ...q, ...overrides } +} + +async function loadFirstPage() { loading.value = true try { - events.value = await opsAPI.listAlertEvents(limit.value) + const data = await opsAPI.listAlertEvents(buildQuery()) + events.value = data + hasMore.value = data.length === PAGE_SIZE } catch (err: any) { 
console.error('[OpsAlertEventsCard] Failed to load alert events', err) appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.loadFailed')) events.value = [] + hasMore.value = false } finally { loading.value = false } } +async function loadMore() { + if (loadingMore.value || loading.value) return + if (!hasMore.value) return + const last = events.value[events.value.length - 1] + if (!last) return + + loadingMore.value = true + try { + const data = await opsAPI.listAlertEvents( + buildQuery({ before_fired_at: last.fired_at || last.created_at, before_id: last.id }) + ) + if (!data.length) { + hasMore.value = false + return + } + events.value = [...events.value, ...data] + if (data.length < PAGE_SIZE) hasMore.value = false + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load more alert events', err) + hasMore.value = false + } finally { + loadingMore.value = false + } +} + +function onScroll(e: Event) { + const el = e.target as HTMLElement | null + if (!el) return + const nearBottom = el.scrollTop + el.clientHeight >= el.scrollHeight - 120 + if (nearBottom) loadMore() +} + +function getDimensionString(event: AlertEvent | null | undefined, key: string): string { + const v = event?.dimensions?.[key] + if (v == null) return '' + if (typeof v === 'string') return v + if (typeof v === 'number' || typeof v === 'boolean') return String(v) + return '' +} + +function formatDurationMs(ms: number): string { + const safe = Math.max(0, Math.floor(ms)) + const sec = Math.floor(safe / 1000) + if (sec < 60) return `${sec}s` + const min = Math.floor(sec / 60) + if (min < 60) return `${min}m` + const hr = Math.floor(min / 60) + if (hr < 24) return `${hr}h` + const day = Math.floor(hr / 24) + return `${day}d` +} + +function formatDurationLabel(event: AlertEvent): string { + const firedAt = new Date(event.fired_at || event.created_at) + if (Number.isNaN(firedAt.getTime())) return '-' + const resolvedAtStr = event.resolved_at || null + const 
status = String(event.status || '').trim().toLowerCase() + + if (resolvedAtStr) { + const resolvedAt = new Date(resolvedAtStr) + if (!Number.isNaN(resolvedAt.getTime())) { + const ms = resolvedAt.getTime() - firedAt.getTime() + const prefix = status === 'manual_resolved' + ? t('admin.ops.alertEvents.status.manualResolved') + : t('admin.ops.alertEvents.status.resolved') + return `${prefix} ${formatDurationMs(ms)}` + } + } + + const now = Date.now() + const ms = now - firedAt.getTime() + return `${t('admin.ops.alertEvents.status.firing')} ${formatDurationMs(ms)}` +} + +function formatDimensionsSummary(event: AlertEvent): string { + const parts: string[] = [] + const platform = getDimensionString(event, 'platform') + if (platform) parts.push(`platform=${platform}`) + const groupId = event.dimensions?.group_id + if (groupId != null && groupId !== '') parts.push(`group_id=${String(groupId)}`) + const region = getDimensionString(event, 'region') + if (region) parts.push(`region=${region}`) + return parts.length ? parts.join(' ') : '-' +} + +function closeDetail() { + showDetail.value = false + selected.value = null + history.value = [] +} + +async function openDetail(row: AlertEvent) { + showDetail.value = true + selected.value = row + detailLoading.value = true + historyLoading.value = true + + try { + const detail = await opsAPI.getAlertEvent(row.id) + selected.value = detail + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load alert detail', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.loadFailed')) + } finally { + detailLoading.value = false + } + + await loadHistory() +} + +async function loadHistory() { + const ev = selected.value + if (!ev) { + history.value = [] + historyLoading.value = false + return + } + + historyLoading.value = true + try { + const platform = getDimensionString(ev, 'platform') + const groupIdRaw = ev.dimensions?.group_id + const groupId = typeof groupIdRaw === 'number' ? 
groupIdRaw : undefined + + const items = await opsAPI.listAlertEvents({ + limit: 20, + time_range: historyRange.value, + platform: platform || undefined, + group_id: groupId, + status: '' + }) + + // Best-effort: narrow to same rule_id + dimensions + history.value = items.filter((it) => { + if (it.rule_id !== ev.rule_id) return false + const p1 = getDimensionString(it, 'platform') + const p2 = getDimensionString(ev, 'platform') + if ((p1 || '') !== (p2 || '')) return false + const g1 = it.dimensions?.group_id + const g2 = ev.dimensions?.group_id + return (g1 ?? null) === (g2 ?? null) + }) + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to load alert history', err) + history.value = [] + } finally { + historyLoading.value = false + } +} + +function durationToUntilRFC3339(duration: string): string { + const now = Date.now() + if (duration === '1h') return new Date(now + 60 * 60 * 1000).toISOString() + if (duration === '24h') return new Date(now + 24 * 60 * 60 * 1000).toISOString() + if (duration === '7d') return new Date(now + 7 * 24 * 60 * 60 * 1000).toISOString() + return new Date(now + 60 * 60 * 1000).toISOString() +} + +async function silenceAlert() { + const ev = selected.value + if (!ev) return + if (detailActionLoading.value) return + detailActionLoading.value = true + try { + const platform = getDimensionString(ev, 'platform') + const groupIdRaw = ev.dimensions?.group_id + const groupId = typeof groupIdRaw === 'number' ? groupIdRaw : null + const region = getDimensionString(ev, 'region') || null + + await opsAPI.createAlertSilence({ + rule_id: ev.rule_id, + platform: platform || '', + group_id: groupId ?? undefined, + region: region ?? 
undefined, + until: durationToUntilRFC3339(silenceDuration.value), + reason: `silence from UI (${silenceDuration.value})` + }) + + appStore.showSuccess(t('admin.ops.alertEvents.detail.silenceSuccess')) + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to silence alert', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.silenceFailed')) + } finally { + detailActionLoading.value = false + } +} + +async function manualResolve() { + if (!selected.value) return + if (detailActionLoading.value) return + detailActionLoading.value = true + try { + await opsAPI.updateAlertEventStatus(selected.value.id, 'manual_resolved') + appStore.showSuccess(t('admin.ops.alertEvents.detail.manualResolvedSuccess')) + + // Refresh detail + first page to reflect new status + const detail = await opsAPI.getAlertEvent(selected.value.id) + selected.value = detail + await loadFirstPage() + await loadHistory() + } catch (err: any) { + console.error('[OpsAlertEventsCard] Failed to resolve alert', err) + appStore.showError(err?.response?.data?.detail || t('admin.ops.alertEvents.detail.manualResolvedFailed')) + } finally { + detailActionLoading.value = false + } +} + onMounted(() => { - load() + loadFirstPage() }) -watch(limit, () => { - load() +watch([timeRange, severity, status, emailSent], () => { + events.value = [] + hasMore.value = true + loadFirstPage() +}) + +watch(historyRange, () => { + if (showDetail.value) loadHistory() }) function severityBadgeClass(severity: string | undefined): string { @@ -54,9 +338,19 @@ function statusBadgeClass(status: string | undefined): string { const s = String(status || '').trim().toLowerCase() if (s === 'firing') return 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-300 dark:ring-red-500/30' if (s === 'resolved') return 'bg-green-50 text-green-700 ring-green-600/20 dark:bg-green-900/30 dark:text-green-300 dark:ring-green-500/30' + if (s === 'manual_resolved') return 
'bg-slate-50 text-slate-700 ring-slate-600/20 dark:bg-slate-900/30 dark:text-slate-300 dark:ring-slate-500/30' return 'bg-gray-50 text-gray-700 ring-gray-600/20 dark:bg-gray-900/30 dark:text-gray-300 dark:ring-gray-500/30' } +function formatStatusLabel(status: string | undefined): string { + const s = String(status || '').trim().toLowerCase() + if (!s) return '-' + if (s === 'firing') return t('admin.ops.alertEvents.status.firing') + if (s === 'resolved') return t('admin.ops.alertEvents.status.resolved') + if (s === 'manual_resolved') return t('admin.ops.alertEvents.status.manualResolved') + return s.toUpperCase() +} + const empty = computed(() => events.value.length === 0 && !loading.value) @@ -69,11 +363,14 @@ const empty = computed(() => events.value.length === 0 && !loading.value)
- + + + +
+ + + + + + +
+
+
{{ t('admin.ops.alertEvents.detail.firedAt') }}
+
{{ formatDateTime(selected.fired_at || selected.created_at) }}
+
+
+
{{ t('admin.ops.alertEvents.detail.resolvedAt') }}
+
{{ selected.resolved_at ? formatDateTime(selected.resolved_at) : '-' }}
+
+
+
{{ t('admin.ops.alertEvents.detail.ruleId') }}
+ +
+
+
{{ t('admin.ops.alertEvents.detail.dimensions') }}
+
+
platform={{ getDimensionString(selected, 'platform') }}
+
group_id={{ selected.dimensions.group_id }}
+
region={{ getDimensionString(selected, 'region') }}
+
+
+
+ + +
+
+
+
{{ t('admin.ops.alertEvents.detail.historyTitle') }}
+
{{ t('admin.ops.alertEvents.detail.historyHint') }}
+
+ + + + +
+
+
{{ t('admin.ops.errorDetail.compareB') || 'Compare B' }}
+ +
+
+ +
+
+
{{ selectedA ? `#${selectedA.id} · ${selectedA.mode} · ${selectedA.status}` : '—' }}
+
http: {{ selectedA?.http_status_code ?? '—' }} · used: {{ selectedA?.used_account_id ?? '—' }}
+
{{ selectedA?.response_preview || '' }}
+
{{ selectedA.error_message }}
+
+
+
{{ selectedB ? `#${selectedB.id} · ${selectedB.mode} · ${selectedB.status}` : '—' }}
+
http: {{ selectedB?.http_status_code ?? '—' }} · used: {{ selectedB?.used_account_id ?? '—' }}
+
{{ selectedB?.response_preview || '' }}
+
{{ selectedB.error_message }}
+
+
+ +
+
+
+
#{{ a.id }} · {{ a.mode }} · {{ a.status }}
+
{{ a.created_at }}
+
+
+
success: {{ a.success ?? '—' }}
+
http: {{ a.http_status_code ?? '—' }}
+
pinned: {{ a.pinned_account_id ?? '—' }}
+
used: {{ a.used_account_id ?? '—' }}
+
+
{{ a.response_preview }}
+
{{ a.error_message }}
+
+
+ + + + +
+
+

{{ t('admin.ops.errorDetail.requestBody') }}

+
{{ prettyJSON(detail.request_body) }}
+
+
+ +
+
+

{{ t('admin.ops.errorDetail.responseBody') || 'Response' }}

+
+ {{ responseTabHint }} +
+
{{ prettyJSON(responseTabBody) }}
+
+
+ + + + +
+
+ +
+
+ + \ No newline at end of file diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue index 1d9859d4..0abe183a 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue @@ -33,14 +33,6 @@ const statusCode = ref(null) const phase = ref('') const errorOwner = ref('') const resolvedStatus = ref('unresolved') -const accountIdInput = ref('') - -const accountId = computed(() => { - const raw = String(accountIdInput.value || '').trim() - if (!raw) return null - const n = Number.parseInt(raw, 10) - return Number.isFinite(n) && n > 0 ? n : null -}) const modalTitle = computed(() => { return props.errorType === 'upstream' ? t('admin.ops.errorDetails.upstreamErrors') : t('admin.ops.errorDetails.requestErrors') @@ -105,7 +97,6 @@ async function fetchErrorLogs() { if (q.value.trim()) params.q = q.value.trim() if (typeof statusCode.value === 'number') params.status_codes = String(statusCode.value) - if (typeof accountId.value === 'number') params.account_id = accountId.value const phaseVal = String(phase.value || '').trim() if (phaseVal) params.phase = phaseVal @@ -136,7 +127,6 @@ function resetFilters() { phase.value = props.errorType === 'upstream' ? 
'upstream' : '' errorOwner.value = '' resolvedStatus.value = 'unresolved' - accountIdInput.value = '' page.value = 1 fetchErrorLogs() } @@ -189,15 +179,6 @@ watch( fetchErrorLogs() } ) - -watch( - () => accountId.value, - () => { - if (!props.show) return - page.value = 1 - fetchErrorLogs() - } -) + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue index 6ef455e9..3e7424df 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue @@ -1,61 +1,48 @@ @@ -212,39 +173,32 @@ import Pagination from '@/components/common/Pagination.vue' import type { OpsErrorLog } from '@/api/admin/ops' import { getSeverityClass, formatDateTime } from '../utils/opsFormatters' +const { t } = useI18n() + function getTypeBadge(log: OpsErrorLog): { label: string; className: string } { const phase = String(log.phase || '').toLowerCase() - const owner = String((log as any).error_owner || '').toLowerCase() + const owner = String(log.error_owner || '').toLowerCase() - // Mapping aligned with the design: - // - upstream/provider => 🔴 上游 - // - request/client => 🟡 请求 - // - auth/client => 🔵 认证 - // - routing/platform => 🟣 路由 - // - internal/platform => ⚫ 内部 if (phase === 'upstream' && owner === 'provider') { - return { label: '🔴 上游', className: 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-400 dark:ring-red-500/30' } + return { label: t('admin.ops.errorLog.typeUpstream'), className: 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-400 dark:ring-red-500/30' } } if (phase === 'request' && owner === 'client') { - return { label: '🟡 请求', className: 'bg-amber-50 text-amber-700 ring-amber-600/20 dark:bg-amber-900/30 dark:text-amber-400 dark:ring-amber-500/30' } + return { label: t('admin.ops.errorLog.typeRequest'), className: 'bg-amber-50 text-amber-700 ring-amber-600/20 
dark:bg-amber-900/30 dark:text-amber-400 dark:ring-amber-500/30' } } if (phase === 'auth' && owner === 'client') { - return { label: '🔵 认证', className: 'bg-blue-50 text-blue-700 ring-blue-600/20 dark:bg-blue-900/30 dark:text-blue-400 dark:ring-blue-500/30' } + return { label: t('admin.ops.errorLog.typeAuth'), className: 'bg-blue-50 text-blue-700 ring-blue-600/20 dark:bg-blue-900/30 dark:text-blue-400 dark:ring-blue-500/30' } } if (phase === 'routing' && owner === 'platform') { - return { label: '🟣 路由', className: 'bg-purple-50 text-purple-700 ring-purple-600/20 dark:bg-purple-900/30 dark:text-purple-400 dark:ring-purple-500/30' } + return { label: t('admin.ops.errorLog.typeRouting'), className: 'bg-purple-50 text-purple-700 ring-purple-600/20 dark:bg-purple-900/30 dark:text-purple-400 dark:ring-purple-500/30' } } if (phase === 'internal' && owner === 'platform') { - return { label: '⚫ 内部', className: 'bg-gray-100 text-gray-800 ring-gray-600/20 dark:bg-dark-700 dark:text-gray-200 dark:ring-dark-500/40' } + return { label: t('admin.ops.errorLog.typeInternal'), className: 'bg-gray-100 text-gray-800 ring-gray-600/20 dark:bg-dark-700 dark:text-gray-200 dark:ring-dark-500/40' } } - // Fallback: show phase/owner for unknown combos. 
const fallback = phase || owner || 'unknown' return { label: fallback, className: 'bg-gray-50 text-gray-700 ring-gray-600/10 dark:bg-dark-900 dark:text-gray-300 dark:ring-dark-700' } } -const { t } = useI18n() - interface Props { rows: OpsErrorLog[] total: number @@ -269,14 +223,6 @@ function getStatusClass(code: number): string { return 'bg-gray-50 text-gray-700 ring-gray-600/20 dark:bg-gray-900/30 dark:text-gray-400 dark:ring-gray-500/30' } -function getLatencyClass(latency: number | null): string { - if (!latency) return 'text-gray-400' - if (latency > 10000) return 'text-red-600 font-black' - if (latency > 5000) return 'text-red-500 font-bold' - if (latency > 2000) return 'text-orange-500 font-medium' - return 'text-gray-600 dark:text-gray-400' -} - function formatSmartMessage(msg: string): string { if (!msg) return '' @@ -298,4 +244,4 @@ function formatSmartMessage(msg: string): string { return msg.length > 200 ? msg.substring(0, 200) + '...' : msg } - + \ No newline at end of file diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue index c8291313..4d737c1b 100644 --- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue +++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue @@ -480,11 +480,31 @@ async function saveAllSettings() {

- 启用后,count_tokens 请求的错误将不计入运维监控的统计和告警中(但仍会存储在数据库中) + 启用后,count_tokens 请求的错误将不会写入错误日志

+ +
+
+ +

+ 启用后,客户端主动断开连接(context canceled)的错误将不会写入错误日志 +

+
+ +
+ +
+
+ +

+ 启用后,"No available accounts" 错误将不会写入错误日志(不推荐,这通常是配置问题) +

+
+ +
From 7c4309ea240056b69bbcf33aec0583d185abe3bc Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:29:01 +0800 Subject: [PATCH 15/86] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0ops=20hand?= =?UTF-8?q?ler=E5=92=8C=E8=B7=AF=E7=94=B1=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/internal/handler/admin/ops_handler.go | 354 ++++++++++++++++++ backend/internal/server/routes/admin.go | 15 +- 2 files changed, 368 insertions(+), 1 deletion(-) diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go index ec7a8b75..630d5665 100644 --- a/backend/internal/handler/admin/ops_handler.go +++ b/backend/internal/handler/admin/ops_handler.go @@ -19,6 +19,29 @@ type OpsHandler struct { opsService *service.OpsService } +const ( + opsListViewErrors = "errors" + opsListViewExcluded = "excluded" + opsListViewAll = "all" +) + +func parseOpsViewParam(c *gin.Context) string { + if c == nil { + return "" + } + v := strings.ToLower(strings.TrimSpace(c.Query("view"))) + switch v { + case "", opsListViewErrors: + return opsListViewErrors + case opsListViewExcluded: + return opsListViewExcluded + case opsListViewAll: + return opsListViewAll + default: + return opsListViewErrors + } +} + func NewOpsHandler(opsService *service.OpsService) *OpsHandler { return &OpsHandler{opsService: opsService} } @@ -86,6 +109,7 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) { if source := strings.TrimSpace(c.Query("error_source")); source != "" { filter.Source = source } + filter.View = parseOpsViewParam(c) if v := strings.TrimSpace(c.Query("resolved")); v != "" { switch strings.ToLower(v) { case "1", "true", "yes": @@ -157,6 +181,336 @@ func (h *OpsHandler) GetErrorLogByID(c *gin.Context) { response.Success(c, detail) } +// ==================== New split endpoints ==================== + +// ListRequestErrors lists 
client-visible request errors. +// GET /api/v1/admin/ops/request-errors +func (h *OpsHandler) ListRequestErrors(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + if pageSize > 500 { + pageSize = 500 + } + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize} + if !startTime.IsZero() { + filter.StartTime = &startTime + } + if !endTime.IsZero() { + filter.EndTime = &endTime + } + filter.View = parseOpsViewParam(c) + filter.Phase = strings.TrimSpace(c.Query("phase")) + filter.Owner = strings.TrimSpace(c.Query("error_owner")) + filter.Source = strings.TrimSpace(c.Query("error_source")) + filter.Query = strings.TrimSpace(c.Query("q")) + + // Force request errors: client-visible status >= 400. + // buildOpsErrorLogsWhere already applies this for non-upstream phase. 
+ if strings.EqualFold(strings.TrimSpace(filter.Phase), "upstream") { + filter.Phase = "" + } + + if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + + if v := strings.TrimSpace(c.Query("resolved")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "yes": + b := true + filter.Resolved = &b + case "0", "false", "no": + b := false + filter.Resolved = &b + default: + response.BadRequest(c, "Invalid resolved") + return + } + } + if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" { + parts := strings.Split(statusCodesStr, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts { + p := strings.TrimSpace(part) + if p == "" { + continue + } + n, err := strconv.Atoi(p) + if err != nil || n < 0 { + response.BadRequest(c, "Invalid status_codes") + return + } + out = append(out, n) + } + filter.StatusCodes = out + } + + result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize) +} + +// GetRequestError returns request error detail. +// GET /api/v1/admin/ops/request-errors/:id +func (h *OpsHandler) GetRequestError(c *gin.Context) { + // same storage; just proxy to existing detail + h.GetErrorLogByID(c) +} + +// RetryRequestErrorClient retries the client request based on stored request body. 
+// POST /api/v1/admin/ops/request-errors/:id/retry-client +func (h *OpsHandler) RetryRequestErrorClient(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, service.OpsRetryModeClient, nil) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, result) +} + +// RetryRequestErrorUpstreamEvent retries a specific upstream attempt using captured upstream_request_body. 
+// POST /api/v1/admin/ops/request-errors/:id/upstream-errors/:idx/retry +func (h *OpsHandler) RetryRequestErrorUpstreamEvent(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + idxStr := strings.TrimSpace(c.Param("idx")) + idx, err := strconv.Atoi(idxStr) + if err != nil || idx < 0 { + response.BadRequest(c, "Invalid upstream idx") + return + } + + result, err := h.opsService.RetryUpstreamEvent(c.Request.Context(), subject.UserID, id, idx) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, result) +} + +// ResolveRequestError toggles resolved status. +// PUT /api/v1/admin/ops/request-errors/:id/resolve +func (h *OpsHandler) ResolveRequestError(c *gin.Context) { + h.UpdateErrorResolution(c) +} + +// ListUpstreamErrors lists independent upstream errors. 
+// GET /api/v1/admin/ops/upstream-errors +func (h *OpsHandler) ListUpstreamErrors(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + if pageSize > 500 { + pageSize = 500 + } + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsErrorLogFilter{Page: page, PageSize: pageSize} + if !startTime.IsZero() { + filter.StartTime = &startTime + } + if !endTime.IsZero() { + filter.EndTime = &endTime + } + + filter.View = parseOpsViewParam(c) + filter.Phase = "upstream" + filter.Owner = "provider" + filter.Source = strings.TrimSpace(c.Query("error_source")) + filter.Query = strings.TrimSpace(c.Query("q")) + + if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + + if v := strings.TrimSpace(c.Query("resolved")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "yes": + b := true + filter.Resolved = &b + case "0", "false", "no": + b := false + filter.Resolved = &b + default: + response.BadRequest(c, "Invalid resolved") + return + } + } + if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" { + parts := strings.Split(statusCodesStr, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts 
{ + p := strings.TrimSpace(part) + if p == "" { + continue + } + n, err := strconv.Atoi(p) + if err != nil || n < 0 { + response.BadRequest(c, "Invalid status_codes") + return + } + out = append(out, n) + } + filter.StatusCodes = out + } + + result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize) +} + +// GetUpstreamError returns upstream error detail. +// GET /api/v1/admin/ops/upstream-errors/:id +func (h *OpsHandler) GetUpstreamError(c *gin.Context) { + h.GetErrorLogByID(c) +} + +// RetryUpstreamError retries upstream error using the original account_id. +// POST /api/v1/admin/ops/upstream-errors/:id/retry +func (h *OpsHandler) RetryUpstreamError(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, service.OpsRetryModeUpstream, nil) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, result) +} + +// ResolveUpstreamError toggles resolved status. +// PUT /api/v1/admin/ops/upstream-errors/:id/resolve +func (h *OpsHandler) ResolveUpstreamError(c *gin.Context) { + h.UpdateErrorResolution(c) +} + +// ==================== Existing endpoints ==================== + // ListRequestDetails returns a request-level list (success + error) for drill-down. 
// GET /api/v1/admin/ops/requests func (h *OpsHandler) ListRequestDetails(c *gin.Context) { diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index adae7cdd..53702766 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -113,13 +113,26 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { ws.GET("/qps", h.Admin.Ops.QPSWSHandler) } - // Error logs (MVP-1) + // Error logs (legacy) ops.GET("/errors", h.Admin.Ops.GetErrorLogs) ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) ops.GET("/errors/:id/retries", h.Admin.Ops.ListRetryAttempts) ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) ops.PUT("/errors/:id/resolve", h.Admin.Ops.UpdateErrorResolution) + // Request errors (client-visible failures) + ops.GET("/request-errors", h.Admin.Ops.ListRequestErrors) + ops.GET("/request-errors/:id", h.Admin.Ops.GetRequestError) + ops.POST("/request-errors/:id/retry-client", h.Admin.Ops.RetryRequestErrorClient) + ops.POST("/request-errors/:id/upstream-errors/:idx/retry", h.Admin.Ops.RetryRequestErrorUpstreamEvent) + ops.PUT("/request-errors/:id/resolve", h.Admin.Ops.ResolveRequestError) + + // Upstream errors (independent upstream failures) + ops.GET("/upstream-errors", h.Admin.Ops.ListUpstreamErrors) + ops.GET("/upstream-errors/:id", h.Admin.Ops.GetUpstreamError) + ops.POST("/upstream-errors/:id/retry", h.Admin.Ops.RetryUpstreamError) + ops.PUT("/upstream-errors/:id/resolve", h.Admin.Ops.ResolveUpstreamError) + // Request drilldown (success + error) ops.GET("/requests", h.Admin.Ops.ListRequestDetails) From 7f317b9093363cd1130c3b7af2569223f0bb7cce Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:29:19 +0800 Subject: [PATCH 16/86] =?UTF-8?q?feat(ops):=20=E5=A2=9E=E5=BC=BAops?= =?UTF-8?q?=E6=A0=B8=E5=BF=83=E6=9C=8D=E5=8A=A1=E5=8A=9F=E8=83=BD=E5=92=8C?= 
=?UTF-8?q?=E9=87=8D=E8=AF=95=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/internal/service/ops_models.go | 16 ++- backend/internal/service/ops_retry.go | 101 +++++++++++++++--- backend/internal/service/ops_service.go | 19 ++++ .../internal/service/ops_upstream_context.go | 32 ++++++ 4 files changed, 148 insertions(+), 20 deletions(-) diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index 98be759f..c48c9b56 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -25,12 +25,12 @@ type OpsErrorLog struct { IsRetryable bool `json:"is_retryable"` RetryCount int `json:"retry_count"` - Resolved bool `json:"resolved"` - ResolvedAt *time.Time `json:"resolved_at"` - ResolvedByUserID *int64 `json:"resolved_by_user_id"` + Resolved bool `json:"resolved"` + ResolvedAt *time.Time `json:"resolved_at"` + ResolvedByUserID *int64 `json:"resolved_by_user_id"` ResolvedByUserName string `json:"resolved_by_user_name"` - ResolvedRetryID *int64 `json:"resolved_retry_id"` - ResolvedStatusRaw string `json:"-"` + ResolvedRetryID *int64 `json:"resolved_retry_id"` + ResolvedStatusRaw string `json:"-"` ClientRequestID string `json:"client_request_id"` RequestID string `json:"request_id"` @@ -93,6 +93,12 @@ type OpsErrorLogFilter struct { Resolved *bool Query string + // View controls error categorization for list endpoints. 
+ // - errors: show actionable errors (exclude business-limited / 429 / 529) + // - excluded: only show excluded errors + // - all: show everything + View string + Page int PageSize int } diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go index f52e2b77..25c10af6 100644 --- a/backend/internal/service/ops_retry.go +++ b/backend/internal/service/ops_retry.go @@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool { return w.totalWritten > int64(w.limit) } +const ( + OpsRetryModeUpstreamEvent = "upstream_event" +) + func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err @@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") } + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog) +} + +// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors. +// idx is 0-based. 
It always pins the original event account_id. +func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if idx < 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx") + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if errorLog == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + + events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors) + if err != nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors") + } + if idx >= len(events) { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range") + } + ev := events[idx] + if ev == nil { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing") + } + if ev.AccountID <= 0 { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") + } + + upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody) + if upstreamBody == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry") + } + + override := *errorLog + override.RequestBody = upstreamBody + pinned := ev.AccountID + + // Persist as upstream_event, execute as upstream pinned retry. 
+ return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override) +} + +func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) { latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) if err != nil && !errors.Is(err, sql.ErrNoRows) { return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) @@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er } } - errorLog, err := s.GetErrorLogByID(ctx, errorID) - if err != nil { - return nil, err - } - if strings.TrimSpace(errorLog.RequestBody) == "" { + if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" { return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") } var pinned *int64 - if mode == OpsRetryModeUpstream { + if execMode == OpsRetryModeUpstream { if pinnedAccountID != nil && *pinnedAccountID > 0 { pinned = pinnedAccountID } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { pinned = errorLog.AccountID } else { - return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry") } } @@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) defer cancel() - execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + execRes := s.executeRetry(execCtx, errorLog, execMode, pinned) finishedAt := time.Now() result.FinishedAt = finishedAt @@ -249,14 +324,10 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er 
ResultRequestID: resultRequestID, ErrorMessage: updateErrMsg, }); err != nil { - // Best-effort: retry itself already executed; do not fail the API response. log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) - } else { - // Auto-resolve the source error when a manual retry succeeds. - if success { - if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { - log.Printf("[Ops] UpdateErrorResolution failed: %v", err) - } + } else if success { + if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil { + log.Printf("[Ops] UpdateErrorResolution failed: %v", err) } } diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index f1240aaf..d606ba09 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn out.Detail = "" } + out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody) + if out.UpstreamRequestBody != "" { + // Reuse the same sanitization/trimming strategy as request body storage. + // Keep it small so it is safe to persist in ops_error_logs JSON. + sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024) + if sanitized != "" { + out.UpstreamRequestBody = sanitized + if truncated { + out.Kind = strings.TrimSpace(out.Kind) + if out.Kind == "" { + out.Kind = "upstream" + } + out.Kind = out.Kind + ":request_body_truncated" + } + } else { + out.UpstreamRequestBody = "" + } + } + // Drop fully-empty events (can happen if only status code was known). 
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { continue diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go index 20c0ea11..20ca6469 100644 --- a/backend/internal/service/ops_upstream_context.go +++ b/backend/internal/service/ops_upstream_context.go @@ -15,6 +15,11 @@ const ( OpsUpstreamErrorMessageKey = "ops_upstream_error_message" OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" OpsUpstreamErrorsKey = "ops_upstream_errors" + + // Best-effort capture of the current upstream request body so ops can + // retry the specific upstream attempt (not just the client request). + // This value is sanitized+trimmed before being persisted. + OpsUpstreamRequestBodyKey = "ops_upstream_request_body" ) func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { @@ -46,6 +51,10 @@ type OpsUpstreamErrorEvent struct { UpstreamStatusCode int `json:"upstream_status_code,omitempty"` UpstreamRequestID string `json:"upstream_request_id,omitempty"` + // Best-effort upstream request capture (sanitized+trimmed). + // Required for retrying a specific upstream attempt. 
+ UpstreamRequestBody string `json:"upstream_request_body,omitempty"` + // Kind: http_error | request_error | retry_exhausted | failover Kind string `json:"kind,omitempty"` @@ -62,6 +71,7 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { } ev.Platform = strings.TrimSpace(ev.Platform) ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody) ev.Kind = strings.TrimSpace(ev.Kind) ev.Message = strings.TrimSpace(ev.Message) ev.Detail = strings.TrimSpace(ev.Detail) @@ -69,6 +79,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { ev.Message = sanitizeUpstreamErrorMessage(ev.Message) } + // If the caller didn't explicitly pass upstream request body but the gateway + // stored it on the context, attach it so ops can retry this specific attempt. + if ev.UpstreamRequestBody == "" { + if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok { + if s, ok := v.(string); ok { + ev.UpstreamRequestBody = strings.TrimSpace(s) + } + } + } + var existing []*OpsUpstreamErrorEvent if v, ok := c.Get(OpsUpstreamErrorsKey); ok { if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { @@ -93,3 +113,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { s := string(raw) return &s } + +func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return []*OpsUpstreamErrorEvent{}, nil + } + var out []*OpsUpstreamErrorEvent + if err := json.Unmarshal([]byte(raw), &out); err != nil { + return nil, err + } + return out, nil +} From 7158b38897a90b856db9468c883fc3df25b9900e Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:29:39 +0800 Subject: [PATCH 17/86] =?UTF-8?q?refactor(ops):=20=E4=BC=98=E5=8C=96ops=20?= =?UTF-8?q?repository=E6=95=B0=E6=8D=AE=E8=AE=BF=E9=97=AE=E5=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- backend/internal/repository/ops_repo.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index 9d290f08..c9cca1d5 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -984,6 +984,26 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { args = append(args, *resolvedFilter) clauses = append(clauses, "COALESCE(resolved,false) = $"+itoa(len(args))) } + + // View filter: errors vs excluded vs all. + // Excluded = upstream 429/529 and business-limited (quota/concurrency/billing) errors. + view := "" + if filter != nil { + view = strings.ToLower(strings.TrimSpace(filter.View)) + } + switch view { + case "", "errors": + clauses = append(clauses, "COALESCE(is_business_limited,false) = false") + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)") + case "excluded": + clauses = append(clauses, "(COALESCE(is_business_limited,false) = true OR COALESCE(upstream_status_code, status_code, 0) IN (429, 529))") + case "all": + // no-op + default: + // treat unknown as default 'errors' + clauses = append(clauses, "COALESCE(is_business_limited,false) = false") + clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)") + } if len(filter.StatusCodes) > 0 { args = append(args, pq.Array(filter.StatusCodes)) clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") From 63711067e6bd3a089455532d13a9719d4753c334 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:30:00 +0800 Subject: [PATCH 18/86] =?UTF-8?q?refactor(ops):=20=E5=AE=8C=E5=96=84gatewa?= =?UTF-8?q?y=E6=9C=8D=E5=8A=A1ops=E9=9B=86=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
--- .../service/antigravity_gateway_service.go | 28 +++++++++------- backend/internal/service/gateway_service.go | 13 +++++--- .../service/gemini_messages_compat_service.go | 32 +++++++++++++------ .../service/openai_gateway_service.go | 11 +++++-- 4 files changed, 54 insertions(+), 30 deletions(-) diff --git a/backend/internal/service/antigravity_gateway_service.go b/backend/internal/service/antigravity_gateway_service.go index c49f6f3f..7f3e97a2 100644 --- a/backend/internal/service/antigravity_gateway_service.go +++ b/backend/internal/service/antigravity_gateway_service.go @@ -564,6 +564,10 @@ urlFallbackLoop: } upstreamReq, err := antigravity.NewAPIRequestWithURL(ctx, baseURL, action, accessToken, geminiBody) + // Capture upstream request body for ops retry of this attempt. + if c != nil { + c.Set(OpsUpstreamRequestBodyKey, string(geminiBody)) + } if err != nil { return nil, err } @@ -574,7 +578,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -616,7 +620,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -647,7 +651,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -700,7 +704,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, 
UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "signature_error", @@ -744,7 +748,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "signature_retry_request_error", Message: sanitizeUpstreamErrorMessage(retryErr.Error()), @@ -775,7 +779,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: retryResp.StatusCode, UpstreamRequestID: retryResp.Header.Get("x-request-id"), Kind: kind, @@ -823,7 +827,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover", @@ -1378,7 +1382,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -1420,7 +1424,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -1451,7 +1455,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -1553,7 +1557,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: 
account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1570,7 +1574,7 @@ urlFallbackLoop: appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "http_error", diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index 0361405c..121b59e8 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1450,6 +1450,9 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A for attempt := 1; attempt <= maxRetryAttempts; attempt++ { // 构建上游请求(每次重试需要重新构建,因为请求体需要重新读取) upstreamReq, err := s.buildUpstreamRequest(ctx, c, account, body, token, tokenType, reqModel) + // Capture upstream request body for ops retry of this attempt. 
+ c.Set(OpsUpstreamRequestBodyKey, string(body)) + if err != nil { return nil, err } @@ -1491,7 +1494,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "signature_error", @@ -1543,7 +1546,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: retryResp.StatusCode, UpstreamRequestID: retryResp.Header.Get("x-request-id"), Kind: "signature_retry_thinking", @@ -1572,7 +1575,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "signature_retry_tools_request_error", Message: sanitizeUpstreamErrorMessage(retryErr2.Error()), @@ -1631,7 +1634,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "retry", @@ -1747,7 +1750,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover_on_400", diff --git 
a/backend/internal/service/gemini_messages_compat_service.go b/backend/internal/service/gemini_messages_compat_service.go index 12068357..75de90f2 100644 --- a/backend/internal/service/gemini_messages_compat_service.go +++ b/backend/internal/service/gemini_messages_compat_service.go @@ -545,13 +545,19 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex } requestIDHeader = idHeader + // Capture upstream request body for ops retry of this attempt. + if c != nil { + // In this code path `body` is already the JSON sent to upstream. + c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { safeErr := sanitizeUpstreamErrorMessage(err.Error()) appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -589,7 +595,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "signature_error", @@ -664,7 +670,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "retry", @@ -714,7 +720,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, 
UpstreamRequestID: upstreamReqID, Kind: "failover", @@ -741,7 +747,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "failover", @@ -977,13 +983,19 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. } requestIDHeader = idHeader + // Capture upstream request body for ops retry of this attempt. + if c != nil { + // In this code path `body` is already the JSON sent to upstream. + c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { safeErr := sanitizeUpstreamErrorMessage(err.Error()) appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -1042,7 +1054,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: upstreamReqID, Kind: "retry", @@ -1127,7 +1139,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1151,7 +1163,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "failover", @@ -1177,7 +1189,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: requestID, Kind: "http_error", diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 9fe1df0f..e7f3f9d1 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -631,6 +631,11 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco proxyURL = account.Proxy.URL() } + // Capture upstream request body for ops retry of this attempt. 
+ if c != nil { + c.Set(OpsUpstreamRequestBodyKey, string(body)) + } + // Send request resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { @@ -640,7 +645,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: 0, Kind: "request_error", Message: safeErr, @@ -675,7 +680,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "failover", @@ -836,7 +841,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ Platform: account.Platform, AccountID: account.ID, - AccountName: account.Name, + AccountName: account.Name, UpstreamStatusCode: resp.StatusCode, UpstreamRequestID: resp.Header.Get("x-request-id"), Kind: "http_error", From 918a2538513287e1f33208823e4f2c21844468e6 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:30:18 +0800 Subject: [PATCH 19/86] =?UTF-8?q?feat(frontend):=20=E5=AE=8C=E5=96=84ops?= =?UTF-8?q?=E7=9B=91=E6=8E=A7=E9=9D=A2=E6=9D=BF=E5=92=8C=E7=BB=84=E4=BB=B6?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/api/admin/ops.ts | 69 +++++++++++++- frontend/src/views/admin/ops/OpsDashboard.vue | 2 +- .../ops/components/OpsErrorDetailModal.vue | 92 +++++++++++++------ .../ops/components/OpsErrorDetailsModal.vue | 47 +++++++--- 4 files changed, 164 insertions(+), 46 deletions(-) diff --git 
a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 5b6d202c..4ec560a4 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -54,6 +54,7 @@ export type OpsUpstreamErrorEvent = { account_name?: string upstream_status_code?: number upstream_request_id?: string + upstream_request_body?: string kind?: string message?: string detail?: string @@ -944,7 +945,9 @@ export async function getErrorDistribution( return data } -export async function listErrorLogs(params: { +export type OpsErrorListView = 'errors' | 'excluded' | 'all' + +export type OpsErrorListQueryParams = { page?: number page_size?: number time_range?: string @@ -958,10 +961,14 @@ export async function listErrorLogs(params: { error_owner?: string error_source?: string resolved?: string + view?: OpsErrorListView q?: string status_codes?: string -}): Promise { +} + +// Legacy unified endpoints +export async function listErrorLogs(params: OpsErrorListQueryParams): Promise { const { data } = await apiClient.get('/admin/ops/errors', { params }) return data } @@ -985,6 +992,50 @@ export async function updateErrorResolved(errorId: number, resolved: boolean): P await apiClient.put(`/admin/ops/errors/${errorId}/resolve`, { resolved }) } +// New split endpoints +export async function listRequestErrors(params: OpsErrorListQueryParams): Promise { + const { data } = await apiClient.get('/admin/ops/request-errors', { params }) + return data +} + +export async function listUpstreamErrors(params: OpsErrorListQueryParams): Promise { + const { data } = await apiClient.get('/admin/ops/upstream-errors', { params }) + return data +} + +export async function getRequestErrorDetail(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/request-errors/${id}`) + return data +} + +export async function getUpstreamErrorDetail(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/upstream-errors/${id}`) + return data +} + +export async function 
retryRequestErrorClient(id: number): Promise { + const { data } = await apiClient.post(`/admin/ops/request-errors/${id}/retry-client`, {}) + return data +} + +export async function retryRequestErrorUpstreamEvent(id: number, idx: number): Promise { + const { data } = await apiClient.post(`/admin/ops/request-errors/${id}/upstream-errors/${idx}/retry`, {}) + return data +} + +export async function retryUpstreamError(id: number): Promise { + const { data } = await apiClient.post(`/admin/ops/upstream-errors/${id}/retry`, {}) + return data +} + +export async function updateRequestErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/request-errors/${errorId}/resolve`, { resolved }) +} + +export async function updateUpstreamErrorResolved(errorId: number, resolved: boolean): Promise { + await apiClient.put(`/admin/ops/upstream-errors/${errorId}/resolve`, { resolved }) +} + export async function listRequestDetails(params: OpsRequestDetailsParams): Promise { const { data } = await apiClient.get('/admin/ops/requests', { params }) return data @@ -1103,11 +1154,25 @@ export const opsAPI = { getAccountAvailabilityStats, getRealtimeTrafficSummary, subscribeQPS, + + // Legacy unified endpoints listErrorLogs, getErrorLogDetail, retryErrorRequest, listRetryAttempts, updateErrorResolved, + + // New split endpoints + listRequestErrors, + listUpstreamErrors, + getRequestErrorDetail, + getUpstreamErrorDetail, + retryRequestErrorClient, + retryRequestErrorUpstreamEvent, + retryUpstreamError, + updateRequestErrorResolved, + updateUpstreamErrorResolved, + listRequestDetails, listAlertRules, createAlertRule, diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index d059059d..f6f18f3d 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -94,7 +94,7 @@ @openErrorDetail="openError" /> - + @@ -263,12 +263,12 @@ -
+
- +
- {{ t('admin.ops.errorDetail.retryNote2') }} + pinned to original account_id
@@ -327,8 +327,20 @@
#{{ idx + 1 }} {{ ev.kind }}
-
- {{ ev.at_unix_ms ? formatDateTime(new Date(ev.at_unix_ms)) : '' }} +
+ +
+ {{ ev.at_unix_ms ? formatDateTime(new Date(ev.at_unix_ms)) : '' }} +
@@ -526,13 +538,14 @@ import { useI18n } from 'vue-i18n' import BaseDialog from '@/components/common/BaseDialog.vue' import ConfirmDialog from '@/components/common/ConfirmDialog.vue' import { useAppStore } from '@/stores' -import { opsAPI, type OpsErrorDetail, type OpsRetryMode, type OpsRetryAttempt } from '@/api/admin/ops' +import { opsAPI, type OpsErrorDetail, type OpsRetryAttempt } from '@/api/admin/ops' import { formatDateTime } from '@/utils/format' import { getSeverityClass } from '../utils/opsFormatters' interface Props { show: boolean errorId: number | null + errorType?: 'request' | 'upstream' } interface Emits { @@ -552,7 +565,7 @@ const activeTab = ref<'overview' | 'retries' | 'request' | 'response'>('overview const retrying = ref(false) const showRetryConfirm = ref(false) -const pendingRetryMode = ref('client') +const pendingRetryMode = ref<'client' | 'upstream' | 'upstream_event'>('client') const forceRetryAck = ref(false) const retryHistory = ref([]) @@ -563,12 +576,6 @@ const compareA = ref(null) const compareB = ref(null) const pinnedAccountIdInput = ref('') -const pinnedAccountId = computed(() => { - const raw = String(pinnedAccountIdInput.value || '').trim() - if (!raw) return null - const n = Number.parseInt(raw, 10) - return Number.isFinite(n) && n > 0 ? n : null -}) const title = computed(() => { if (!props.errorId) return 'Error Detail' @@ -584,6 +591,7 @@ type UpstreamErrorEvent = { account_name?: string upstream_status_code?: number upstream_request_id?: string + upstream_request_body?: string kind?: string message?: string detail?: string @@ -641,15 +649,12 @@ const handlingSuggestion = computed(() => { async function fetchDetail(id: number) { loading.value = true try { - const d = await opsAPI.getErrorLogDetail(id) + const kind = props.errorType || (detail.value?.phase === 'upstream' ? 'upstream' : 'request') + const d = kind === 'upstream' ? 
await opsAPI.getUpstreamErrorDetail(id) : await opsAPI.getRequestErrorDetail(id) detail.value = d - // Default pinned account from error log if present. - if (d.account_id && d.account_id > 0) { - pinnedAccountIdInput.value = String(d.account_id) - } else { - pinnedAccountIdInput.value = '' - } + // Keep showing original account_id (read-only hint for upstream retries). + pinnedAccountIdInput.value = d.account_id && d.account_id > 0 ? String(d.account_id) : '' } catch (err: any) { detail.value = null appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDetail')) @@ -679,7 +684,7 @@ watch( { immediate: true } ) -function openRetryConfirm(mode: OpsRetryMode) { +function openRetryConfirm(mode: 'client' | 'upstream' | 'upstream_event') { pendingRetryMode.value = mode // Force-ack required only when backend says not retryable. forceRetryAck.value = false @@ -733,7 +738,12 @@ const responseTabHint = computed(() => { async function markResolved(resolved: boolean) { if (!props.errorId) return try { - await opsAPI.updateErrorResolved(props.errorId, resolved) + const kind = props.errorType || (detail.value?.phase === 'upstream' ? 'upstream' : 'request') + if (kind === 'upstream') { + await opsAPI.updateUpstreamErrorResolved(props.errorId, resolved) + } else { + await opsAPI.updateRequestErrorResolved(props.errorId, resolved) + } await fetchDetail(props.errorId) appStore.showSuccess(resolved ? (t('admin.ops.errorDetails.resolved') || 'Resolved') : (t('admin.ops.errorDetails.unresolved') || 'Unresolved')) } catch (err: any) { @@ -779,12 +789,20 @@ async function runConfirmedRetry() { retrying.value = true try { - const req = - mode === 'upstream' - ? { mode, pinned_account_id: pinnedAccountId.value ?? undefined, force: !retryable ? true : undefined } - : { mode, force: !retryable ? true : undefined } + const kind = props.errorType || (detail.value?.phase === 'upstream' ? 
'upstream' : 'request') + + let res + if (kind === 'upstream') { + // Upstream error retries always pin the original account_id. + res = await opsAPI.retryUpstreamError(props.errorId) + } else { + if (mode === 'client') { + res = await opsAPI.retryRequestErrorClient(props.errorId) + } else { + throw new Error('Unsupported retry mode') + } + } - const res = await opsAPI.retryErrorRequest(props.errorId, req) const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed') appStore.showSuccess(summary) @@ -798,6 +816,22 @@ async function runConfirmedRetry() { } } +async function retryUpstreamEvent(idx: number) { + if (!props.errorId) return + try { + retrying.value = true + const res = await opsAPI.retryRequestErrorUpstreamEvent(props.errorId, idx) + const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed') + appStore.showSuccess(summary) + await fetchDetail(props.errorId) + await loadRetryHistory() + } catch (err: any) { + appStore.showError(err?.message || t('admin.ops.retryFailed')) + } finally { + retrying.value = false + } +} + function cancelRetry() { showRetryConfirm.value = false } diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue index 0abe183a..4ff2ec0f 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue @@ -32,7 +32,9 @@ const q = ref('') const statusCode = ref(null) const phase = ref('') const errorOwner = ref('') -const resolvedStatus = ref('unresolved') + const resolvedStatus = ref('unresolved') + const viewMode = ref<'errors' | 'excluded' | 'all'>('errors') + const modalTitle = computed(() => { return props.errorType === 'upstream' ? 
t('admin.ops.errorDetails.upstreamErrors') : t('admin.ops.errorDetails.requestErrors') @@ -63,6 +65,14 @@ const resolvedSelectOptions = computed(() => { ] }) +const viewModeSelectOptions = computed(() => { + return [ + { value: 'errors', label: t('admin.ops.errorDetails.viewErrors') || 'errors' }, + { value: 'excluded', label: t('admin.ops.errorDetails.viewExcluded') || 'excluded' }, + { value: 'all', label: t('common.all') } + ] +}) + const phaseSelectOptions = computed(() => { const options = [ { value: '', label: t('common.all') }, @@ -88,7 +98,8 @@ async function fetchErrorLogs() { const params: Record = { page: page.value, page_size: pageSize.value, - time_range: props.timeRange + time_range: props.timeRange, + view: viewMode.value } const platform = String(props.platform || '').trim() @@ -109,7 +120,9 @@ async function fetchErrorLogs() { else if (resolvedVal === 'unresolved') params.resolved = 'false' // 'all' -> omit - const res = await opsAPI.listErrorLogs(params) + const res = props.errorType === 'upstream' + ? await opsAPI.listUpstreamErrors(params) + : await opsAPI.listRequestErrors(params) rows.value = res.items || [] total.value = res.total || 0 } catch (err) { @@ -121,15 +134,17 @@ async function fetchErrorLogs() { } } -function resetFilters() { - q.value = '' - statusCode.value = null - phase.value = props.errorType === 'upstream' ? 'upstream' : '' - errorOwner.value = '' - resolvedStatus.value = 'unresolved' - page.value = 1 - fetchErrorLogs() -} + function resetFilters() { + q.value = '' + statusCode.value = null + phase.value = props.errorType === 'upstream' ? 
'upstream' : '' + errorOwner.value = '' + resolvedStatus.value = 'unresolved' + viewMode.value = 'errors' + page.value = 1 + fetchErrorLogs() + } + watch( () => props.show, @@ -172,7 +187,7 @@ watch( ) watch( - () => [statusCode.value, phase.value, errorOwner.value, resolvedStatus.value] as const, + () => [statusCode.value, phase.value, errorOwner.value, resolvedStatus.value, viewMode.value] as const, () => { if (!props.show) return page.value = 1 @@ -186,7 +201,7 @@ watch(
-
+
@@ -224,6 +239,10 @@ watch( +
+
@@ -121,12 +122,15 @@

${{ formatCost(stats.summary.avg_daily_cost) }}

-

+

{{ t('admin.accounts.stats.basedOnActualDays', { days: stats.summary.actual_days_used }) }} + + ({{ t('usage.userBilled') }}: ${{ formatCost(stats.summary.avg_daily_user_cost) }}) +

@@ -189,13 +193,17 @@
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.today?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.today?.user_cost || 0) }} +
{{ t('admin.accounts.stats.requests') @@ -240,13 +248,17 @@ }}
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.highest_cost_day?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.highest_cost_day?.user_cost || 0) }} +
{{ t('admin.accounts.stats.requests') @@ -291,13 +303,17 @@ }}
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.highest_request_day?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.highest_request_day?.user_cost || 0) }} +
@@ -397,13 +413,17 @@ }}
- {{ - t('admin.accounts.stats.todayCost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.today?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.today?.user_cost || 0) }} +
@@ -517,14 +537,24 @@ const trendChartData = computed(() => { labels: stats.value.history.map((h) => h.label), datasets: [ { - label: t('admin.accounts.stats.cost') + ' (USD)', - data: stats.value.history.map((h) => h.cost), + label: t('usage.accountBilled') + ' (USD)', + data: stats.value.history.map((h) => h.actual_cost), borderColor: '#3b82f6', backgroundColor: 'rgba(59, 130, 246, 0.1)', fill: true, tension: 0.3, yAxisID: 'y' }, + { + label: t('usage.userBilled') + ' (USD)', + data: stats.value.history.map((h) => h.user_cost), + borderColor: '#10b981', + backgroundColor: 'rgba(16, 185, 129, 0.08)', + fill: false, + tension: 0.3, + borderDash: [5, 5], + yAxisID: 'y' + }, { label: t('admin.accounts.stats.requests'), data: stats.value.history.map((h) => h.requests), @@ -602,7 +632,7 @@ const lineChartOptions = computed(() => ({ }, title: { display: true, - text: t('admin.accounts.stats.cost') + ' (USD)', + text: t('usage.accountBilled') + ' (USD)', color: '#3b82f6', font: { size: 11 diff --git a/frontend/src/components/account/AccountTodayStatsCell.vue b/frontend/src/components/account/AccountTodayStatsCell.vue index b8bbc618..a920f314 100644 --- a/frontend/src/components/account/AccountTodayStatsCell.vue +++ b/frontend/src/components/account/AccountTodayStatsCell.vue @@ -32,15 +32,20 @@ formatTokens(stats.tokens) }}
- +
- {{ t('admin.accounts.stats.cost') }}: + {{ t('usage.accountBilled') }}: {{ formatCurrency(stats.cost) }}
+ +
+ {{ t('usage.userBilled') }}: + {{ + formatCurrency(stats.user_cost) + }} +
diff --git a/frontend/src/components/account/BulkEditAccountModal.vue b/frontend/src/components/account/BulkEditAccountModal.vue index 9ccf6130..fb776e96 100644 --- a/frontend/src/components/account/BulkEditAccountModal.vue +++ b/frontend/src/components/account/BulkEditAccountModal.vue @@ -459,7 +459,7 @@ -
+
+
+
+ + +
+ +

{{ t('admin.accounts.billingRateMultiplierHint') }}

+
@@ -655,6 +685,7 @@ const enableInterceptWarmup = ref(false) const enableProxy = ref(false) const enableConcurrency = ref(false) const enablePriority = ref(false) +const enableRateMultiplier = ref(false) const enableStatus = ref(false) const enableGroups = ref(false) @@ -670,6 +701,7 @@ const interceptWarmupRequests = ref(false) const proxyId = ref(null) const concurrency = ref(1) const priority = ref(1) +const rateMultiplier = ref(1) const status = ref<'active' | 'inactive'>('active') const groupIds = ref([]) @@ -863,6 +895,10 @@ const buildUpdatePayload = (): Record | null => { updates.priority = priority.value } + if (enableRateMultiplier.value) { + updates.rate_multiplier = rateMultiplier.value + } + if (enableStatus.value) { updates.status = status.value } @@ -923,6 +959,7 @@ const handleSubmit = async () => { enableProxy.value || enableConcurrency.value || enablePriority.value || + enableRateMultiplier.value || enableStatus.value || enableGroups.value @@ -977,6 +1014,7 @@ watch( enableProxy.value = false enableConcurrency.value = false enablePriority.value = false + enableRateMultiplier.value = false enableStatus.value = false enableGroups.value = false @@ -991,6 +1029,7 @@ watch( proxyId.value = null concurrency.value = 1 priority.value = 1 + rateMultiplier.value = 1 status.value = 'active' groupIds.value = [] } diff --git a/frontend/src/components/account/CreateAccountModal.vue b/frontend/src/components/account/CreateAccountModal.vue index a56a987f..c81de00e 100644 --- a/frontend/src/components/account/CreateAccountModal.vue +++ b/frontend/src/components/account/CreateAccountModal.vue @@ -1196,7 +1196,7 @@
-
+
@@ -1212,6 +1212,11 @@ />

{{ t('admin.accounts.priorityHint') }}

+
+ + +

{{ t('admin.accounts.billingRateMultiplierHint') }}

+
@@ -1832,6 +1837,7 @@ const form = reactive({ proxy_id: null as number | null, concurrency: 10, priority: 1, + rate_multiplier: 1, group_ids: [] as number[], expires_at: null as number | null }) @@ -2119,6 +2125,7 @@ const resetForm = () => { form.proxy_id = null form.concurrency = 10 form.priority = 1 + form.rate_multiplier = 1 form.group_ids = [] form.expires_at = null accountCategory.value = 'oauth-based' @@ -2272,6 +2279,7 @@ const createAccountAndFinish = async ( proxy_id: form.proxy_id, concurrency: form.concurrency, priority: form.priority, + rate_multiplier: form.rate_multiplier, group_ids: form.group_ids, expires_at: form.expires_at, auto_pause_on_expired: autoPauseOnExpired.value @@ -2490,6 +2498,7 @@ const handleCookieAuth = async (sessionKey: string) => { proxy_id: form.proxy_id, concurrency: form.concurrency, priority: form.priority, + rate_multiplier: form.rate_multiplier, group_ids: form.group_ids, expires_at: form.expires_at, auto_pause_on_expired: autoPauseOnExpired.value diff --git a/frontend/src/components/account/EditAccountModal.vue b/frontend/src/components/account/EditAccountModal.vue index 7cb740bd..00cd9b24 100644 --- a/frontend/src/components/account/EditAccountModal.vue +++ b/frontend/src/components/account/EditAccountModal.vue @@ -549,7 +549,7 @@
-
+
@@ -564,6 +564,11 @@ data-tour="account-form-priority" />
+
+ + +

{{ t('admin.accounts.billingRateMultiplierHint') }}

+
@@ -807,6 +812,7 @@ const form = reactive({ proxy_id: null as number | null, concurrency: 1, priority: 1, + rate_multiplier: 1, status: 'active' as 'active' | 'inactive', group_ids: [] as number[], expires_at: null as number | null @@ -834,6 +840,7 @@ watch( form.proxy_id = newAccount.proxy_id form.concurrency = newAccount.concurrency form.priority = newAccount.priority + form.rate_multiplier = newAccount.rate_multiplier ?? 1 form.status = newAccount.status as 'active' | 'inactive' form.group_ids = newAccount.group_ids || [] form.expires_at = newAccount.expires_at ?? null diff --git a/frontend/src/components/account/UsageProgressBar.vue b/frontend/src/components/account/UsageProgressBar.vue index 1b3561ef..93844295 100644 --- a/frontend/src/components/account/UsageProgressBar.vue +++ b/frontend/src/components/account/UsageProgressBar.vue @@ -15,7 +15,13 @@ {{ formatTokens }} - ${{ formatCost }} + A ${{ formatAccountCost }} + + U ${{ formatUserCost }} +
@@ -149,8 +155,13 @@ const formatTokens = computed(() => { return t.toString() }) -const formatCost = computed(() => { +const formatAccountCost = computed(() => { if (!props.windowStats) return '0.00' return props.windowStats.cost.toFixed(2) }) + +const formatUserCost = computed(() => { + if (!props.windowStats || props.windowStats.user_cost == null) return '0.00' + return props.windowStats.user_cost.toFixed(2) +}) diff --git a/frontend/src/components/admin/account/AccountStatsModal.vue b/frontend/src/components/admin/account/AccountStatsModal.vue index 138f5811..72a71d36 100644 --- a/frontend/src/components/admin/account/AccountStatsModal.vue +++ b/frontend/src/components/admin/account/AccountStatsModal.vue @@ -61,11 +61,12 @@

{{ t('admin.accounts.stats.accumulatedCost') }} - ({{ t('admin.accounts.stats.standardCost') }}: ${{ + + ({{ t('usage.userBilled') }}: ${{ formatCost(stats.summary.total_user_cost) }} · + {{ t('admin.accounts.stats.standardCost') }}: ${{ formatCost(stats.summary.total_standard_cost) - }}) + }}) +

@@ -108,12 +109,15 @@

${{ formatCost(stats.summary.avg_daily_cost) }}

-

+

{{ t('admin.accounts.stats.basedOnActualDays', { days: stats.summary.actual_days_used }) }} + + ({{ t('usage.userBilled') }}: ${{ formatCost(stats.summary.avg_daily_user_cost) }}) +

@@ -164,13 +168,17 @@
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.today?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.today?.user_cost || 0) }} +
{{ t('admin.accounts.stats.requests') @@ -210,13 +218,17 @@ }}
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.highest_cost_day?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.highest_cost_day?.user_cost || 0) }} +
{{ t('admin.accounts.stats.requests') @@ -260,13 +272,17 @@ }}
- {{ - t('admin.accounts.stats.cost') - }} + {{ t('usage.accountBilled') }} ${{ formatCost(stats.summary.highest_request_day?.cost || 0) }}
+
+ {{ t('usage.userBilled') }} + ${{ formatCost(stats.summary.highest_request_day?.user_cost || 0) }} +
@@ -485,14 +501,24 @@ const trendChartData = computed(() => { labels: stats.value.history.map((h) => h.label), datasets: [ { - label: t('admin.accounts.stats.cost') + ' (USD)', - data: stats.value.history.map((h) => h.cost), + label: t('usage.accountBilled') + ' (USD)', + data: stats.value.history.map((h) => h.actual_cost), borderColor: '#3b82f6', backgroundColor: 'rgba(59, 130, 246, 0.1)', fill: true, tension: 0.3, yAxisID: 'y' }, + { + label: t('usage.userBilled') + ' (USD)', + data: stats.value.history.map((h) => h.user_cost), + borderColor: '#10b981', + backgroundColor: 'rgba(16, 185, 129, 0.08)', + fill: false, + tension: 0.3, + borderDash: [5, 5], + yAxisID: 'y' + }, { label: t('admin.accounts.stats.requests'), data: stats.value.history.map((h) => h.requests), @@ -570,7 +596,7 @@ const lineChartOptions = computed(() => ({ }, title: { display: true, - text: t('admin.accounts.stats.cost') + ' (USD)', + text: t('usage.accountBilled') + ' (USD)', color: '#3b82f6', font: { size: 11 diff --git a/frontend/src/components/admin/usage/UsageStatsCards.vue b/frontend/src/components/admin/usage/UsageStatsCards.vue index 16ce6619..cd962a09 100644 --- a/frontend/src/components/admin/usage/UsageStatsCards.vue +++ b/frontend/src/components/admin/usage/UsageStatsCards.vue @@ -27,9 +27,18 @@

{{ t('usage.totalCost') }}

-

${{ (stats?.total_actual_cost || 0).toFixed(4) }}

-

- {{ t('usage.standardCost') }}: ${{ (stats?.total_cost || 0).toFixed(4) }} +

+ ${{ ((stats?.total_account_cost ?? stats?.total_actual_cost) || 0).toFixed(4) }} +

+

+ {{ t('usage.userBilled') }}: + ${{ (stats?.total_actual_cost || 0).toFixed(4) }} + · {{ t('usage.standardCost') }}: + ${{ (stats?.total_cost || 0).toFixed(4) }} +

+

+ {{ t('usage.standardCost') }}: + ${{ (stats?.total_cost || 0).toFixed(4) }}

diff --git a/frontend/src/components/admin/usage/UsageTable.vue b/frontend/src/components/admin/usage/UsageTable.vue index a66e4b7b..d2260c59 100644 --- a/frontend/src/components/admin/usage/UsageTable.vue +++ b/frontend/src/components/admin/usage/UsageTable.vue @@ -81,18 +81,23 @@ @@ -202,14 +207,24 @@ {{ t('usage.rate') }} {{ (tooltipData?.rate_multiplier || 1).toFixed(2) }}x +
+ {{ t('usage.accountMultiplier') }} + {{ (tooltipData?.account_rate_multiplier ?? 1).toFixed(2) }}x +
{{ t('usage.original') }} ${{ tooltipData?.total_cost?.toFixed(6) || '0.000000' }}
-
- {{ t('usage.billed') }} +
+ {{ t('usage.userBilled') }} ${{ tooltipData?.actual_cost?.toFixed(6) || '0.000000' }}
+
+ {{ t('usage.accountBilled') }} + + ${{ (((tooltipData?.total_cost || 0) * (tooltipData?.account_rate_multiplier ?? 1)) || 0).toFixed(6) }} + +
diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index bd17a7f1..bb2c05bc 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -429,6 +429,9 @@ export default { totalCost: 'Total Cost', standardCost: 'Standard', actualCost: 'Actual', + userBilled: 'User billed', + accountBilled: 'Account billed', + accountMultiplier: 'Account rate', avgDuration: 'Avg Duration', inSelectedRange: 'in selected range', perRequest: 'per request', @@ -1058,6 +1061,7 @@ export default { concurrencyStatus: 'Concurrency', notes: 'Notes', priority: 'Priority', + billingRateMultiplier: 'Billing Rate', weight: 'Weight', status: 'Status', schedulable: 'Schedulable', @@ -1225,6 +1229,8 @@ export default { concurrency: 'Concurrency', priority: 'Priority', priorityHint: 'Lower value accounts are used first', + billingRateMultiplier: 'Billing Rate Multiplier', + billingRateMultiplierHint: '>=0, 0 means free. Affects account billing only', expiresAt: 'Expires At', expiresAtHint: 'Leave empty for no expiration', higherPriorityFirst: 'Lower value means higher priority', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 9724a55c..dcb0a812 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -426,6 +426,9 @@ export default { totalCost: '总消费', standardCost: '标准', actualCost: '实际', + userBilled: '用户扣费', + accountBilled: '账号计费', + accountMultiplier: '账号倍率', avgDuration: '平均耗时', inSelectedRange: '所选范围内', perRequest: '每次请求', @@ -1108,6 +1111,7 @@ export default { concurrencyStatus: '并发', notes: '备注', priority: '优先级', + billingRateMultiplier: '账号倍率', weight: '权重', status: '状态', schedulable: '调度', @@ -1359,6 +1363,8 @@ export default { concurrency: '并发数', priority: '优先级', priorityHint: '优先级越小的账号优先使用', + billingRateMultiplier: '账号计费倍率', + billingRateMultiplierHint: '>=0,0 表示该账号计费为 0;仅影响账号计费口径', expiresAt: '过期时间', expiresAtHint: '留空表示不过期', higherPriorityFirst: '数值越小优先级越高', diff --git 
a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 5eb74596..66f71b7e 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -428,6 +428,7 @@ export interface Account { concurrency: number current_concurrency?: number // Real-time concurrency count from Redis priority: number + rate_multiplier?: number // Account billing multiplier (>=0, 0 means free) status: 'active' | 'inactive' | 'error' error_message: string | null last_used_at: string | null @@ -457,7 +458,9 @@ export interface Account { export interface WindowStats { requests: number tokens: number - cost: number + cost: number // Account cost (account multiplier) + standard_cost?: number + user_cost?: number } export interface UsageProgress { @@ -522,6 +525,7 @@ export interface CreateAccountRequest { proxy_id?: number | null concurrency?: number priority?: number + rate_multiplier?: number // Account billing multiplier (>=0, 0 means free) group_ids?: number[] expires_at?: number | null auto_pause_on_expired?: boolean @@ -537,6 +541,7 @@ export interface UpdateAccountRequest { proxy_id?: number | null concurrency?: number priority?: number + rate_multiplier?: number // Account billing multiplier (>=0, 0 means free) schedulable?: boolean status?: 'active' | 'inactive' group_ids?: number[] @@ -593,6 +598,7 @@ export interface UsageLog { total_cost: number actual_cost: number rate_multiplier: number + account_rate_multiplier?: number | null stream: boolean duration_ms: number @@ -852,23 +858,27 @@ export interface AccountUsageHistory { requests: number tokens: number cost: number - actual_cost: number + actual_cost: number // Account cost (account multiplier) + user_cost: number // User/API key billed cost (group multiplier) } export interface AccountUsageSummary { days: number actual_days_used: number - total_cost: number + total_cost: number // Account cost (account multiplier) + total_user_cost: number total_standard_cost: number total_requests: number total_tokens: 
number - avg_daily_cost: number + avg_daily_cost: number // Account cost + avg_daily_user_cost: number avg_daily_requests: number avg_daily_tokens: number avg_duration_ms: number today: { date: string cost: number + user_cost: number requests: number tokens: number } | null @@ -876,6 +886,7 @@ export interface AccountUsageSummary { date: string label: string cost: number + user_cost: number requests: number } | null highest_request_day: { @@ -883,6 +894,7 @@ export interface AccountUsageSummary { label: string requests: number cost: number + user_cost: number } | null } diff --git a/frontend/src/views/admin/AccountsView.vue b/frontend/src/views/admin/AccountsView.vue index 8a5268ca..ce9e9125 100644 --- a/frontend/src/views/admin/AccountsView.vue +++ b/frontend/src/views/admin/AccountsView.vue @@ -61,6 +61,11 @@ + @@ -190,10 +195,11 @@ const cols = computed(() => { if (!authStore.isSimpleMode) { c.push({ key: 'groups', label: t('admin.accounts.columns.groups'), sortable: false }) } - c.push( - { key: 'usage', label: t('admin.accounts.columns.usageWindows'), sortable: false }, - { key: 'priority', label: t('admin.accounts.columns.priority'), sortable: true }, - { key: 'last_used_at', label: t('admin.accounts.columns.lastUsed'), sortable: true }, + c.push( + { key: 'usage', label: t('admin.accounts.columns.usageWindows'), sortable: false }, + { key: 'priority', label: t('admin.accounts.columns.priority'), sortable: true }, + { key: 'rate_multiplier', label: t('admin.accounts.columns.billingRateMultiplier'), sortable: true }, + { key: 'last_used_at', label: t('admin.accounts.columns.lastUsed'), sortable: true }, { key: 'expires_at', label: t('admin.accounts.columns.expiresAt'), sortable: true }, { key: 'notes', label: t('admin.accounts.columns.notes'), sortable: false }, { key: 'actions', label: t('admin.accounts.columns.actions'), sortable: false } diff --git a/frontend/src/views/admin/UsageView.vue b/frontend/src/views/admin/UsageView.vue index fbde13fd..749e9dbd 
100644 --- a/frontend/src/views/admin/UsageView.vue +++ b/frontend/src/views/admin/UsageView.vue @@ -94,7 +94,7 @@ const exportToExcel = async () => { t('admin.usage.cacheReadTokens'), t('admin.usage.cacheCreationTokens'), t('admin.usage.inputCost'), t('admin.usage.outputCost'), t('admin.usage.cacheReadCost'), t('admin.usage.cacheCreationCost'), - t('usage.rate'), t('usage.original'), t('usage.billed'), + t('usage.rate'), t('usage.accountMultiplier'), t('usage.original'), t('usage.userBilled'), t('usage.accountBilled'), t('usage.firstToken'), t('usage.duration'), t('admin.usage.requestId'), t('usage.userAgent'), t('admin.usage.ipAddress') ] @@ -115,8 +115,10 @@ const exportToExcel = async () => { log.cache_read_cost?.toFixed(6) || '0.000000', log.cache_creation_cost?.toFixed(6) || '0.000000', log.rate_multiplier?.toFixed(2) || '1.00', + (log.account_rate_multiplier ?? 1).toFixed(2), log.total_cost?.toFixed(6) || '0.000000', log.actual_cost?.toFixed(6) || '0.000000', + (log.total_cost * (log.account_rate_multiplier ?? 1)).toFixed(6), log.first_token_ms ?? 
'', log.duration_ms, log.request_id || '', From 55e469c7fe35ee4fcfab714e132d17bd1f8d16b6 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:26:33 +0800 Subject: [PATCH 23/86] =?UTF-8?q?fix(ops):=20=E4=BC=98=E5=8C=96=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E6=97=A5=E5=BF=97=E8=BF=87=E6=BB=A4=E5=92=8C=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 后端改动: - 添加 resolved 参数默认值处理(向后兼容,默认显示未解决错误) - 新增 status_codes_other 查询参数支持 - 移除 service 层的高级设置过滤逻辑,简化错误日志查询流程 前端改动: - 完善错误日志相关组件的国际化支持 - 优化 Ops 监控面板和设置对话框的用户体验 --- backend/internal/handler/admin/ops_handler.go | 17 +++ backend/internal/repository/ops_repo.go | 12 +- backend/internal/service/ops_models.go | 13 +- backend/internal/service/ops_service.go | 55 -------- frontend/src/api/admin/ops.ts | 1 + frontend/src/i18n/locales/en.ts | 87 ++++++++++++- frontend/src/i18n/locales/zh.ts | 87 ++++++++++++- .../ops/components/OpsDashboardHeader.vue | 64 ++++----- .../ops/components/OpsErrorDetailModal.vue | 122 +++++++++--------- .../ops/components/OpsErrorDetailsModal.vue | 26 ++-- .../admin/ops/components/OpsErrorLogTable.vue | 14 +- .../ops/components/OpsRuntimeSettingsCard.vue | 28 ++-- .../ops/components/OpsSettingsDialog.vue | 40 +++--- 13 files changed, 349 insertions(+), 217 deletions(-) diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go index 9349838a..c76b6a60 100644 --- a/backend/internal/handler/admin/ops_handler.go +++ b/backend/internal/handler/admin/ops_handler.go @@ -110,6 +110,12 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) { filter.Source = source } filter.View = parseOpsViewParam(c) + + // Legacy endpoint default: unresolved only (backward-compatible). 
+ { + b := false + filter.Resolved = &b + } if v := strings.TrimSpace(c.Query("resolved")); v != "" { switch strings.ToLower(v) { case "1", "true", "yes": @@ -143,6 +149,17 @@ func (h *OpsHandler) GetErrorLogs(c *gin.Context) { } filter.StatusCodes = out } + if v := strings.TrimSpace(c.Query("status_codes_other")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "yes": + filter.StatusCodesOther = true + case "0", "false", "no": + filter.StatusCodesOther = false + default: + response.BadRequest(c, "Invalid status_codes_other") + return + } + } result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) if err != nil { diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index c9cca1d5..0535547d 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -132,7 +132,6 @@ func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsEr pageSize = 500 } - // buildOpsErrorLogsWhere may mutate filter (default resolved filter). where, args := buildOpsErrorLogsWhere(filter) countSQL := "SELECT COUNT(*) FROM ops_error_logs e " + where @@ -933,15 +932,11 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { } // ops_error_logs stores client-visible error requests (status>=400), // but we also persist "recovered" upstream errors (status<400) for upstream health visibility. - // By default, keep list endpoints scoped to unresolved records if the caller didn't specify. + // If Resolved is not specified, do not filter by resolved state (backward-compatible). resolvedFilter := (*bool)(nil) if filter != nil { resolvedFilter = filter.Resolved } - if resolvedFilter == nil { - f := false - resolvedFilter = &f - } // Keep list endpoints scoped to client errors unless explicitly filtering upstream phase. 
if phaseFilter != "upstream" { clauses = append(clauses, "COALESCE(status_code, 0) >= 400") @@ -1007,6 +1002,11 @@ func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { if len(filter.StatusCodes) > 0 { args = append(args, pq.Array(filter.StatusCodes)) clauses = append(clauses, "COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+")") + } else if filter.StatusCodesOther { + // "Other" means: status codes not in the common list. + known := []int{400, 401, 403, 404, 409, 422, 429, 500, 502, 503, 504, 529} + args = append(args, pq.Array(known)) + clauses = append(clauses, "NOT (COALESCE(upstream_status_code, status_code, 0) = ANY($"+itoa(len(args))+"))") } if q := strings.TrimSpace(filter.Query); q != "" { like := "%" + q + "%" diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index c48c9b56..ebdf148f 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -86,12 +86,13 @@ type OpsErrorLogFilter struct { GroupID *int64 AccountID *int64 - StatusCodes []int - Phase string - Owner string - Source string - Resolved *bool - Query string + StatusCodes []int + StatusCodesOther bool + Phase string + Owner string + Source string + Resolved *bool + Query string // View controls error categorization for list endpoints. // - errors: show actionable errors (exclude business-limited / 429 / 529) diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index d606ba09..915be5df 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -261,64 +261,9 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter return nil, err } - // Apply error filtering based on settings (for historical data) - result = s.filterErrorLogsBySettings(ctx, result) return result, nil } -// filterErrorLogsBySettings filters error logs based on advanced settings. 
-// This ensures that historical errors are also filtered when viewing the dashboard. -func (s *OpsService) filterErrorLogsBySettings(ctx context.Context, result *OpsErrorLogList) *OpsErrorLogList { - if result == nil || len(result.Errors) == 0 { - return result - } - - settings, err := s.GetOpsAdvancedSettings(ctx) - if err != nil || settings == nil { - // If we can't get settings, return unfiltered (fail open) - return result - } - - filtered := make([]*OpsErrorLog, 0, len(result.Errors)) - for _, errLog := range result.Errors { - if shouldFilterErrorLog(settings, errLog) { - continue // Skip this error - } - filtered = append(filtered, errLog) - } - - // Update total count to reflect filtered results - result.Errors = filtered - result.Total = len(filtered) - return result -} - -// shouldFilterErrorLog determines if an error log should be filtered based on settings. -func shouldFilterErrorLog(settings *OpsAdvancedSettings, errLog *OpsErrorLog) bool { - if settings == nil || errLog == nil { - return false - } - - msgLower := strings.ToLower(errLog.Message) - - // Check if count_tokens errors should be ignored - if settings.IgnoreCountTokensErrors && strings.Contains(errLog.RequestPath, "/count_tokens") { - return true - } - - // Check if context canceled errors should be ignored - if settings.IgnoreContextCanceled && strings.Contains(msgLower, "context canceled") { - return true - } - - // Check if "no available accounts" errors should be ignored - if settings.IgnoreNoAvailableAccounts && strings.Contains(msgLower, "no available accounts") { - return true - } - - return false -} - func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { if err := s.RequireMonitoringEnabled(ctx); err != nil { return nil, err diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 4ec560a4..0ac54db6 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -965,6 +965,7 @@ export type 
OpsErrorListQueryParams = { q?: string status_codes?: string + status_codes_other?: string } // Legacy unified endpoints diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 993a18c2..936d6bfa 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -2009,6 +2009,11 @@ export default { // Error Log errorLog: { timeId: 'Time / ID', + commonErrors: { + contextDeadlineExceeded: 'context deadline exceeded', + connectionRefused: 'connection refused', + rateLimit: 'rate limit' + }, time: 'Time', type: 'Type', context: 'Context', @@ -2038,12 +2043,64 @@ export default { requestErrors: 'Request Errors', unresolved: 'Unresolved', resolved: 'Resolved', + viewErrors: 'Errors', + viewExcluded: 'Excluded', + statusCodeOther: 'Other', + owner: { + provider: 'Provider', + client: 'Client', + platform: 'Platform' + }, + phase: { + request: 'Request', + auth: 'Auth', + routing: 'Routing', + upstream: 'Upstream', + network: 'Network', + internal: 'Internal' + }, total: 'Total:', searchPlaceholder: 'Search request_id / client_request_id / message', accountIdPlaceholder: 'account_id' }, // Error Detail Modal errorDetail: { + title: 'Error Detail', + titleWithId: 'Error #{id}', + noErrorSelected: 'No error selected.', + resolution: 'Resolved:', + pinnedToOriginalAccountId: 'Pinned to original account_id', + missingUpstreamRequestBody: 'Missing upstream request body', + failedToLoadRetryHistory: 'Failed to load retry history', + failedToUpdateResolvedStatus: 'Failed to update resolved status', + unsupportedRetryMode: 'Unsupported retry mode', + classificationKeys: { + phase: 'Phase', + owner: 'Owner', + source: 'Source', + retryable: 'Retryable', + resolvedAt: 'Resolved At', + resolvedBy: 'Resolved By', + resolvedRetryId: 'Resolved Retry', + retryCount: 'Retry Count' + }, + upstreamKeys: { + status: 'Status', + message: 'Message', + detail: 'Detail', + upstreamErrors: 'Upstream Errors' + }, + upstreamEvent: { + account: 
'Account', + status: 'Status', + requestId: 'Request ID' + }, + retryMeta: { + http: 'HTTP', + used: 'Used', + success: 'Success', + pinned: 'Pinned' + }, loading: 'Loading…', requestId: 'Request ID', time: 'Time', @@ -2053,6 +2110,8 @@ export default { basicInfo: 'Basic Info', platform: 'Platform', model: 'Model', + group: 'Group', + account: 'Account', latency: 'Request Duration', ttft: 'TTFT', businessLimited: 'Business Limited', @@ -2083,6 +2142,7 @@ export default { retryNote1: 'Retry will use the same request body and parameters', retryNote2: 'If the original request failed due to account issues, pinned retry may still fail', retryNote3: 'Client retry will reselect an account', + retryNote4: 'You can force retry for non-retryable errors, but it is not recommended', confirmRetryMessage: 'Confirm retry this request?', confirmRetryHint: 'Will resend with the same request parameters', forceRetry: 'I understand and want to force retry', @@ -2337,7 +2397,11 @@ export default { lockKeyRequired: 'Distributed lock key is required when lock is enabled', lockKeyPrefix: 'Distributed lock key must start with "{prefix}"', lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts', - lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds' + lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds', + slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', + ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', + requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' } }, email: { @@ -2420,9 +2484,28 @@ export default { aggregation: 'Pre-aggregation Tasks', enableAggregation: 'Enable Pre-aggregation', aggregationHint: 'Pre-aggregation improves query performance for long time windows', + errorFiltering: 'Error Filtering', + ignoreCountTokensErrors: 'Ignore count_tokens errors', + 
ignoreCountTokensErrorsHint: 'When enabled, errors from count_tokens requests will not be written to the error log.', + ignoreContextCanceled: 'Ignore client disconnect errors', + ignoreContextCanceledHint: 'When enabled, client disconnect (context canceled) errors will not be written to the error log.', + ignoreNoAvailableAccounts: 'Ignore no available accounts errors', + ignoreNoAvailableAccountsHint: 'When enabled, "No available accounts" errors will not be written to the error log (not recommended; usually a config issue).', + autoRefresh: 'Auto Refresh', + enableAutoRefresh: 'Enable auto refresh', + enableAutoRefreshHint: 'Automatically refresh dashboard data at a fixed interval.', + refreshInterval: 'Refresh Interval', + refreshInterval15s: '15 seconds', + refreshInterval30s: '30 seconds', + refreshInterval60s: '60 seconds', + autoRefreshCountdown: 'Auto refresh: {seconds}s', validation: { title: 'Please fix the following issues', - retentionDaysRange: 'Retention days must be between 1-365 days' + retentionDaysRange: 'Retention days must be between 1-365 days', + slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100', + ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0', + requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100', + upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100' } }, concurrency: { diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index fcc84b19..85270e02 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -2153,6 +2153,11 @@ export default { // Error Log errorLog: { timeId: '时间 / ID', + commonErrors: { + contextDeadlineExceeded: '请求超时', + connectionRefused: '连接被拒绝', + rateLimit: '触发限流' + }, time: '时间', type: '类型', context: '上下文', @@ -2182,12 +2187,64 @@ export default { requestErrors: '请求错误', unresolved: '未解决', resolved: '已解决', + viewErrors: '错误', + viewExcluded: '排除项', + statusCodeOther: '其他', + 
owner: { + provider: '服务商', + client: '客户端', + platform: '平台' + }, + phase: { + request: '请求', + auth: '认证', + routing: '路由', + upstream: '上游', + network: '网络', + internal: '内部' + }, total: '总计:', searchPlaceholder: '搜索 request_id / client_request_id / message', accountIdPlaceholder: 'account_id' }, // Error Detail Modal errorDetail: { + title: '错误详情', + titleWithId: '错误 #{id}', + noErrorSelected: '未选择错误。', + resolution: '已解决:', + pinnedToOriginalAccountId: '固定到原 account_id', + missingUpstreamRequestBody: '缺少上游请求体', + failedToLoadRetryHistory: '加载重试历史失败', + failedToUpdateResolvedStatus: '更新解决状态失败', + unsupportedRetryMode: '不支持的重试模式', + classificationKeys: { + phase: '阶段', + owner: '归属方', + source: '来源', + retryable: '可重试', + resolvedAt: '解决时间', + resolvedBy: '解决人', + resolvedRetryId: '解决重试ID', + retryCount: '重试次数' + }, + upstreamKeys: { + status: '状态码', + message: '消息', + detail: '详情', + upstreamErrors: '上游错误列表' + }, + upstreamEvent: { + account: '账号', + status: '状态码', + requestId: '请求ID' + }, + retryMeta: { + http: 'HTTP', + used: '使用账号', + success: '成功', + pinned: '固定账号' + }, loading: '加载中…', requestId: '请求 ID', time: '时间', @@ -2197,6 +2254,8 @@ export default { basicInfo: '基本信息', platform: '平台', model: '模型', + group: '分组', + account: '账号', latency: '请求时长', ttft: 'TTFT', businessLimited: '业务限制', @@ -2227,6 +2286,7 @@ export default { retryNote1: '重试会使用相同的请求体和参数', retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败', retryNote3: '客户端重试会重新选择账号', + retryNote4: '对不可重试的错误可以强制重试,但不推荐', confirmRetryMessage: '确认要重试该请求吗?', confirmRetryHint: '将使用相同的请求参数重新发送', forceRetry: '我已确认并理解强制重试风险', @@ -2481,7 +2541,11 @@ export default { lockKeyRequired: '启用分布式锁时必须填写 Lock Key', lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头', lockKeyHint: '建议以「{prefix}」开头以避免冲突', - lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间' + lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间', + slaMinPercentRange: 'SLA 最低值必须在 0-100 之间', + ttftP99MaxRange: 'TTFT P99 最大值必须大于或等于 0', + requestErrorRateMaxRange: '请求错误率最大值必须在 0-100 之间', + 
upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间' } }, email: { @@ -2564,9 +2628,28 @@ export default { aggregation: '预聚合任务', enableAggregation: '启用预聚合任务', aggregationHint: '预聚合可提升长时间窗口查询性能', + errorFiltering: '错误过滤', + ignoreCountTokensErrors: '忽略 count_tokens 错误', + ignoreCountTokensErrorsHint: '启用后,count_tokens 请求的错误将不会写入错误日志。', + ignoreContextCanceled: '忽略客户端断连错误', + ignoreContextCanceledHint: '启用后,客户端主动断开连接(context canceled)的错误将不会写入错误日志。', + ignoreNoAvailableAccounts: '忽略无可用账号错误', + ignoreNoAvailableAccountsHint: '启用后,“No available accounts” 错误将不会写入错误日志(不推荐,这通常是配置问题)。', + autoRefresh: '自动刷新', + enableAutoRefresh: '启用自动刷新', + enableAutoRefreshHint: '自动刷新仪表板数据,启用后会定期拉取最新数据。', + refreshInterval: '刷新间隔', + refreshInterval15s: '15 秒', + refreshInterval30s: '30 秒', + refreshInterval60s: '60 秒', + autoRefreshCountdown: '自动刷新:{seconds}s', validation: { title: '请先修正以下问题', - retentionDaysRange: '保留天数必须在1-365天之间' + retentionDaysRange: '保留天数必须在1-365天之间', + slaMinPercentRange: 'SLA最低百分比必须在0-100之间', + ttftP99MaxRange: 'TTFT P99最大值必须大于等于0', + requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间', + upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间' } }, concurrency: { diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index f92c6c50..c50524ac 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -826,7 +826,7 @@ function handleToolbarRefresh() { - 自动刷新: {{ props.autoRefreshCountdown }}s + {{ t('admin.ops.settings.autoRefreshCountdown', { seconds: props.autoRefreshCountdown }) }} @@ -1084,11 +1084,11 @@ function handleToolbarRefresh() {
{{ displayRealTimeQps.toFixed(1) }} - QPS + {{ t('admin.ops.qps') }}
{{ displayRealTimeTps.toFixed(1) }} - TPS + {{ t('admin.ops.tps') }}
@@ -1101,11 +1101,11 @@ function handleToolbarRefresh() {
{{ realtimeQpsPeakLabel }} - QPS + {{ t('admin.ops.qps') }}
{{ realtimeTpsPeakLabel }} - TPS + {{ t('admin.ops.tps') }}
@@ -1116,11 +1116,11 @@ function handleToolbarRefresh() {
{{ realtimeQpsAvgLabel }} - QPS + {{ t('admin.ops.qps') }}
{{ realtimeTpsAvgLabel }} - TPS + {{ t('admin.ops.tps') }}
@@ -1195,7 +1195,7 @@ function handleToolbarRefresh() {
- SLA + {{ t('admin.ops.sla') }}
@@ -1242,33 +1242,33 @@ function handleToolbarRefresh() {
{{ durationP99Ms ?? '-' }}
- ms (P99) + {{ t('admin.ops.msP99') }}
- P95: + {{ t('admin.ops.p95') }} {{ durationP95Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
- P90: + {{ t('admin.ops.p90') }} {{ durationP90Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
- P50: + {{ t('admin.ops.p50') }} {{ durationP50Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
Avg: {{ durationAvgMs ?? '-' }} - ms + {{ t('admin.ops.ms') }}
Max: {{ durationMaxMs ?? '-' }} - ms + {{ t('admin.ops.ms') }}
@@ -1277,14 +1277,14 @@ function handleToolbarRefresh() {
- TTFT + {{ t('admin.ops.ttft') }}
@@ -1293,33 +1293,33 @@ function handleToolbarRefresh() {
{{ ttftP99Ms ?? '-' }}
- ms (P99) + {{ t('admin.ops.msP99') }}
- P95: + {{ t('admin.ops.p95') }} {{ ttftP95Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
- P90: + {{ t('admin.ops.p90') }} {{ ttftP90Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
- P50: + {{ t('admin.ops.p50') }} {{ ttftP50Ms ?? '-' }} - ms + {{ t('admin.ops.ms') }}
Avg: {{ ttftAvgMs ?? '-' }} - ms + {{ t('admin.ops.ms') }}
Max: {{ ttftMaxMs ?? '-' }} - ms + {{ t('admin.ops.ms') }}
@@ -1384,7 +1384,7 @@ function handleToolbarRefresh() {
-
CPU
+
{{ t('admin.ops.cpu') }}
@@ -1398,7 +1398,7 @@ function handleToolbarRefresh() {
-
MEM
+
{{ t('admin.ops.mem') }}
@@ -1416,7 +1416,7 @@ function handleToolbarRefresh() {
-
DB
+
{{ t('admin.ops.db') }}
@@ -1433,7 +1433,7 @@ function handleToolbarRefresh() {
-
Redis
+
{{ t('admin.ops.redis') }}
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue index db9cb80c..88af52e5 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue @@ -15,9 +15,9 @@
- Resolved: + {{ t('admin.ops.errorDetail.resolution') }} - {{ (detail as any).resolved ? 'true' : 'false' }} + {{ (detail as any).resolved ? t('admin.ops.errorDetails.resolved') : t('admin.ops.errorDetails.unresolved') }}
@@ -28,7 +28,7 @@ :disabled="loading" @click="markResolved(true)" > - {{ t('admin.ops.errorDetail.markResolved') || 'Mark resolved' }} + {{ t('admin.ops.errorDetail.markResolved') }}
- - - - + + + +
@@ -102,7 +102,7 @@
-

{{ t('admin.ops.errorDetail.suggestion') || 'Suggestion' }}

+

{{ t('admin.ops.errorDetail.suggestion') }}

{{ handlingSuggestion }}
@@ -110,41 +110,41 @@
-

{{ t('admin.ops.errorDetail.classification') || 'Classification' }}

+

{{ t('admin.ops.errorDetail.classification') }}

-
phase
+
{{ t('admin.ops.errorDetail.classificationKeys.phase') }}
{{ detail.phase || '—' }}
-
owner
+
{{ t('admin.ops.errorDetail.classificationKeys.owner') }}
{{ (detail as any).error_owner || '—' }}
-
source
+
{{ t('admin.ops.errorDetail.classificationKeys.source') }}
{{ (detail as any).error_source || '—' }}
-
retryable
-
{{ (detail as any).is_retryable ? '✓' : '✗' }}
+
{{ t('admin.ops.errorDetail.classificationKeys.retryable') }}
+
{{ (detail as any).is_retryable ? t('common.yes') : t('common.no') }}
-
resolved_at
+
{{ t('admin.ops.errorDetail.classificationKeys.resolvedAt') }}
{{ (detail as any).resolved_at || '—' }}
-
resolved_by
+
{{ t('admin.ops.errorDetail.classificationKeys.resolvedBy') }}
{{ (detail as any).resolved_by_user_id ?? '—' }}
-
resolved_retry_id
+
{{ t('admin.ops.errorDetail.classificationKeys.resolvedRetryId') }}
{{ (detail as any).resolved_retry_id ?? '—' }}
-
retry_count
+
{{ t('admin.ops.errorDetail.classificationKeys.retryCount') }}
{{ (detail as any).retry_count ?? '—' }}
@@ -165,7 +165,7 @@
{{ t('admin.ops.errorDetail.group') }}
- + {{ detail.group_name || detail.group_id }} @@ -174,7 +174,7 @@
{{ t('admin.ops.errorDetail.account') }}
- + {{ detail.account_name || detail.account_id }} @@ -257,7 +257,7 @@
@@ -268,7 +268,7 @@
- pinned to original account_id + {{ t('admin.ops.errorDetail.pinnedToOriginalAccountId') }}
@@ -294,13 +294,13 @@
-
status
+
{{ t('admin.ops.errorDetail.upstreamKeys.status') }}
{{ detail.upstream_status_code != null ? detail.upstream_status_code : '—' }}
-
message
+
{{ t('admin.ops.errorDetail.upstreamKeys.message') }}
{{ detail.upstream_error_message || '—' }}
@@ -308,14 +308,14 @@
-
detail
+
{{ t('admin.ops.errorDetail.upstreamKeys.detail') }}
{{ prettyJSON(detail.upstream_error_detail) }}
-
upstream_errors
+
{{ t('admin.ops.errorDetail.upstreamKeys.upstreamErrors') }}
{{ t('admin.ops.errorDetail.retryUpstream') }} #{{ idx + 1 }} @@ -346,15 +346,15 @@
- account: - + {{ t('admin.ops.errorDetail.upstreamEvent.account') }}: + {{ ev.account_name || ev.account_id }}
-
status: {{ ev.upstream_status_code ?? '—' }}
+
{{ t('admin.ops.errorDetail.upstreamEvent.status') }}: {{ ev.upstream_status_code ?? '—' }}
- request_id: {{ ev.upstream_request_id || '—' }} + {{ t('admin.ops.errorDetail.upstreamEvent.requestId') }}: {{ ev.upstream_request_id || '—' }}
@@ -403,7 +403,7 @@
-
{{ t('admin.ops.errorDetail.retryHistory') || 'Retry History' }}
+
{{ t('admin.ops.errorDetail.retryHistory') }}
@@ -415,14 +415,14 @@
-
{{ t('admin.ops.errorDetail.compareA') || 'Compare A' }}
+
{{ t('admin.ops.errorDetail.compareA') }}
-
{{ t('admin.ops.errorDetail.compareB') || 'Compare B' }}
+
{{ t('admin.ops.errorDetail.compareB') }}
- {{ t('admin.ops.errorDetail.forceRetry') || 'I understand and want to force retry' }} + {{ t('admin.ops.errorDetail.forceRetry') }}
@@ -578,11 +578,11 @@ const compareB = ref(null) const pinnedAccountIdInput = ref('') const title = computed(() => { - if (!props.errorId) return 'Error Detail' - return `Error #${props.errorId}` + if (!props.errorId) return t('admin.ops.errorDetail.title') + return t('admin.ops.errorDetail.titleWithId', { id: String(props.errorId) }) }) -const emptyText = computed(() => 'No error selected.') +const emptyText = computed(() => t('admin.ops.errorDetail.noErrorSelected')) type UpstreamErrorEvent = { at_unix_ms?: number @@ -630,20 +630,20 @@ const handlingSuggestion = computed(() => { if (owner === 'provider' && phase === 'upstream') { if (retryHistory.value.some((r) => r.success === true) && d.resolved) { - return t('admin.ops.errorDetail.suggestUpstreamResolved') || '✓ Upstream error resolved by retry; no action needed.' + return t('admin.ops.errorDetail.suggestUpstreamResolved') } - return t('admin.ops.errorDetail.suggestUpstream') || 'Upstream instability: consider checking upstream account status, switching accounts, or retrying.' + return t('admin.ops.errorDetail.suggestUpstream') } if (owner === 'client' && phase === 'request') { - return t('admin.ops.errorDetail.suggestRequest') || 'Client request validation error: contact customer to fix request parameters.' + return t('admin.ops.errorDetail.suggestRequest') } if (owner === 'client' && phase === 'auth') { - return t('admin.ops.errorDetail.suggestAuth') || 'Auth failed: verify API key/credentials.' + return t('admin.ops.errorDetail.suggestAuth') } if (owner === 'platform') { - return t('admin.ops.errorDetail.suggestPlatform') || 'Platform error: prioritize investigation and fix.' + return t('admin.ops.errorDetail.suggestPlatform') } - return t('admin.ops.errorDetail.suggestGeneric') || 'See details for more context.' 
+ return t('admin.ops.errorDetail.suggestGeneric') }) async function fetchDetail(id: number) { @@ -709,7 +709,7 @@ async function loadRetryHistory() { retryHistory.value = [] compareA.value = null compareB.value = null - appStore.showError(err?.message || 'Failed to load retry history') + appStore.showError(err?.message || t('admin.ops.errorDetail.failedToLoadRetryHistory')) } finally { retryHistoryLoading.value = false } @@ -732,7 +732,7 @@ const responseTabHint = computed(() => { if (succeeded?.response_preview) { return t('admin.ops.errorDetail.responseHintSucceeded', { id: String(succeeded.id) }) || `Showing succeeded retry response_preview (#${succeeded.id})` } - return t('admin.ops.errorDetail.responseHintFallback') || 'No succeeded retry found; showing stored error_body' + return t('admin.ops.errorDetail.responseHintFallback') }) async function markResolved(resolved: boolean) { @@ -745,9 +745,9 @@ async function markResolved(resolved: boolean) { await opsAPI.updateRequestErrorResolved(props.errorId, resolved) } await fetchDetail(props.errorId) - appStore.showSuccess(resolved ? (t('admin.ops.errorDetails.resolved') || 'Resolved') : (t('admin.ops.errorDetails.unresolved') || 'Unresolved')) + appStore.showSuccess(resolved ? t('admin.ops.errorDetails.resolved') : t('admin.ops.errorDetails.unresolved')) } catch (err: any) { - appStore.showError(err?.message || 'Failed to update resolved status') + appStore.showError(err?.message || t('admin.ops.errorDetail.failedToUpdateResolvedStatus')) } } @@ -755,7 +755,7 @@ const retryConfirmMessage = computed(() => { const mode = pendingRetryMode.value const retryable = !!(detail.value as any)?.is_retryable if (!retryable) { - return t('admin.ops.errorDetail.forceRetryHint') || 'This error is not recommended to retry. Check the box to force retry.' 
+ return t('admin.ops.errorDetail.forceRetryHint') } if (mode === 'upstream') { return t('admin.ops.errorDetail.confirmRetryMessage') @@ -781,7 +781,7 @@ async function runConfirmedRetry() { const mode = pendingRetryMode.value const retryable = !!(detail.value as any)?.is_retryable if (!retryable && !forceRetryAck.value) { - appStore.showError(t('admin.ops.errorDetail.forceRetryNeedAck') || 'Please confirm you want to force retry') + appStore.showError(t('admin.ops.errorDetail.forceRetryNeedAck')) return } @@ -799,7 +799,7 @@ async function runConfirmedRetry() { if (mode === 'client') { res = await opsAPI.retryRequestErrorClient(props.errorId) } else { - throw new Error('Unsupported retry mode') + throw new Error(t('admin.ops.errorDetail.unsupportedRetryMode')) } } diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue index 4ff2ec0f..8c6c116b 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue @@ -29,7 +29,7 @@ const page = ref(1) const pageSize = ref(20) const q = ref('') -const statusCode = ref(null) +const statusCode = ref(null) const phase = ref('') const errorOwner = ref('') const resolvedStatus = ref('unresolved') @@ -44,16 +44,17 @@ const statusCodeSelectOptions = computed(() => { const codes = [400, 401, 403, 404, 409, 422, 429, 500, 502, 503, 504, 529] return [ { value: null, label: t('common.all') }, - ...codes.map((c) => ({ value: c, label: String(c) })) + ...codes.map((c) => ({ value: c, label: String(c) })), + { value: 'other', label: t('admin.ops.errorDetails.statusCodeOther') || 'Other' } ] }) const ownerSelectOptions = computed(() => { return [ { value: '', label: t('common.all') }, - { value: 'provider', label: 'provider' }, - { value: 'client', label: 'client' }, - { value: 'platform', label: 'platform' } + { value: 'provider', label: 
t('admin.ops.errorDetails.owner.provider') || 'provider' }, + { value: 'client', label: t('admin.ops.errorDetails.owner.client') || 'client' }, + { value: 'platform', label: t('admin.ops.errorDetails.owner.platform') || 'platform' } ] }) @@ -76,12 +77,12 @@ const viewModeSelectOptions = computed(() => { const phaseSelectOptions = computed(() => { const options = [ { value: '', label: t('common.all') }, - { value: 'request', label: 'request' }, - { value: 'auth', label: 'auth' }, - { value: 'routing', label: 'routing' }, - { value: 'upstream', label: 'upstream' }, - { value: 'network', label: 'network' }, - { value: 'internal', label: 'internal' } + { value: 'request', label: t('admin.ops.errorDetails.phase.request') || 'request' }, + { value: 'auth', label: t('admin.ops.errorDetails.phase.auth') || 'auth' }, + { value: 'routing', label: t('admin.ops.errorDetails.phase.routing') || 'routing' }, + { value: 'upstream', label: t('admin.ops.errorDetails.phase.upstream') || 'upstream' }, + { value: 'network', label: t('admin.ops.errorDetails.phase.network') || 'network' }, + { value: 'internal', label: t('admin.ops.errorDetails.phase.internal') || 'internal' } ] return options }) @@ -107,7 +108,8 @@ async function fetchErrorLogs() { if (typeof props.groupId === 'number' && props.groupId > 0) params.group_id = props.groupId if (q.value.trim()) params.q = q.value.trim() - if (typeof statusCode.value === 'number') params.status_codes = String(statusCode.value) + if (statusCode.value === 'other') params.status_codes_other = '1' + else if (typeof statusCode.value === 'number') params.status_codes = String(statusCode.value) const phaseVal = String(phase.value || '').trim() if (phaseVal) params.phase = phaseVal diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue index 3e7424df..76922524 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue +++ 
b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue @@ -93,7 +93,7 @@ - + {{ log.group_name || '-' }} @@ -103,7 +103,7 @@ - + {{ log.account_name || '-' }} @@ -195,8 +195,8 @@ function getTypeBadge(log: OpsErrorLog): { label: string; className: string } { return { label: t('admin.ops.errorLog.typeInternal'), className: 'bg-gray-100 text-gray-800 ring-gray-600/20 dark:bg-dark-700 dark:text-gray-200 dark:ring-dark-500/40' } } - const fallback = phase || owner || 'unknown' - return { label: fallback, className: 'bg-gray-50 text-gray-700 ring-gray-600/10 dark:bg-dark-900 dark:text-gray-300 dark:ring-dark-700' } + const fallback = phase || owner || t('common.unknown') + return { label: fallback, className: 'bg-gray-50 text-gray-700 ring-gray-600/10 dark:bg-dark-900 dark:text-gray-300 dark:ring-dark-700' } } interface Props { @@ -238,9 +238,9 @@ function formatSmartMessage(msg: string): string { } } - if (msg.includes('context deadline exceeded')) return 'context deadline exceeded' - if (msg.includes('connection refused')) return 'connection refused' - if (msg.toLowerCase().includes('rate limit')) return 'rate limit' + if (msg.includes('context deadline exceeded')) return t('admin.ops.errorLog.commonErrors.contextDeadlineExceeded') + if (msg.includes('connection refused')) return t('admin.ops.errorLog.commonErrors.connectionRefused') + if (msg.toLowerCase().includes('rate limit')) return t('admin.ops.errorLog.commonErrors.rateLimit') return msg.length > 200 ? msg.substring(0, 200) + '...' 
: msg } diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue index d64ae390..d9bcbd51 100644 --- a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue @@ -50,22 +50,22 @@ function validateRuntimeSettings(settings: OpsAlertRuntimeSettings): ValidationR if (thresholds) { if (thresholds.sla_percent_min != null) { if (!Number.isFinite(thresholds.sla_percent_min) || thresholds.sla_percent_min < 0 || thresholds.sla_percent_min > 100) { - errors.push('SLA 最低值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.slaMinPercentRange')) } } if (thresholds.ttft_p99_ms_max != null) { if (!Number.isFinite(thresholds.ttft_p99_ms_max) || thresholds.ttft_p99_ms_max < 0) { - errors.push('TTFT P99 最大值必须大于或等于 0') + errors.push(t('admin.ops.runtime.validation.ttftP99MaxRange')) } } if (thresholds.request_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.request_error_rate_percent_max) || thresholds.request_error_rate_percent_max < 0 || thresholds.request_error_rate_percent_max > 100) { - errors.push('请求错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.requestErrorRateMaxRange')) } } if (thresholds.upstream_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.upstream_error_rate_percent_max) || thresholds.upstream_error_rate_percent_max < 0 || thresholds.upstream_error_rate_percent_max > 100) { - errors.push('上游错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.upstreamErrorRateMaxRange')) } } } @@ -329,12 +329,12 @@ onMounted(() => {
-
指标阈值配置
-

配置各项指标的告警阈值。超出阈值的指标将在看板上以红色显示。

+
{{ t('admin.ops.runtime.metricThresholds') }}
+

{{ t('admin.ops.runtime.metricThresholdsHint') }}

-
SLA 最低值 (%)
+
{{ t('admin.ops.runtime.slaMinPercent') }}
{ class="input" placeholder="99.5" /> -

SLA 低于此值时将显示为红色

+

{{ t('admin.ops.runtime.slaMinPercentHint') }}

-
TTFT P99 最大值 (ms)
+
{{ t('admin.ops.runtime.ttftP99MaxMs') }}
{ class="input" placeholder="500" /> -

TTFT P99 高于此值时将显示为红色

+

{{ t('admin.ops.runtime.ttftP99MaxMsHint') }}

-
请求错误率最大值 (%)
+
{{ t('admin.ops.runtime.requestErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

请求错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.requestErrorRateMaxPercentHint') }}

-
上游错误率最大值 (%)
+
{{ t('admin.ops.runtime.upstreamErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

上游错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.upstreamErrorRateMaxPercentHint') }}

diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue index 4d737c1b..53ab6683 100644 --- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue +++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue @@ -157,16 +157,16 @@ const validation = computed(() => { // 验证指标阈值 if (metricThresholds.value.sla_percent_min != null && (metricThresholds.value.sla_percent_min < 0 || metricThresholds.value.sla_percent_min > 100)) { - errors.push('SLA最低百分比必须在0-100之间') + errors.push(t('admin.ops.settings.validation.slaMinPercentRange')) } if (metricThresholds.value.ttft_p99_ms_max != null && metricThresholds.value.ttft_p99_ms_max < 0) { - errors.push('TTFT P99最大值必须大于等于0') + errors.push(t('admin.ops.settings.validation.ttftP99MaxRange')) } if (metricThresholds.value.request_error_rate_percent_max != null && (metricThresholds.value.request_error_rate_percent_max < 0 || metricThresholds.value.request_error_rate_percent_max > 100)) { - errors.push('请求错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.requestErrorRateMaxRange')) } if (metricThresholds.value.upstream_error_rate_percent_max != null && (metricThresholds.value.upstream_error_rate_percent_max < 0 || metricThresholds.value.upstream_error_rate_percent_max > 100)) { - errors.push('上游错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.upstreamErrorRateMaxRange')) } return { valid: errors.length === 0, errors } @@ -472,15 +472,15 @@ async function saveAllSettings() {
- +
-
错误过滤
+
{{ t('admin.ops.settings.errorFiltering') }}
- +

- 启用后,count_tokens 请求的错误将不会写入错误日志 + {{ t('admin.ops.settings.ignoreCountTokensErrorsHint') }}

@@ -488,9 +488,9 @@ async function saveAllSettings() {
- +

- 启用后,客户端主动断开连接(context canceled)的错误将不会写入错误日志 + {{ t('admin.ops.settings.ignoreContextCanceledHint') }}

@@ -498,37 +498,37 @@ async function saveAllSettings() {
- +

- 启用后,"No available accounts" 错误将不会写入错误日志(不推荐,这通常是配置问题) + {{ t('admin.ops.settings.ignoreNoAvailableAccountsHint') }}

- +
-
自动刷新
+
{{ t('admin.ops.settings.autoRefresh') }}
- +

- 自动刷新仪表板数据,启用后会定期拉取最新数据 + {{ t('admin.ops.settings.enableAutoRefreshHint') }}

- + +
+ +
+ +
+ +
+ +
+
@@ -231,18 +245,26 @@ watch( {{ t('admin.ops.errorDetails.total') }} {{ total }}
- + +
+ + diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue index 416bdba9..28868552 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue @@ -1,55 +1,48 @@ @@ -184,6 +187,36 @@ import { getSeverityClass, formatDateTime } from '../utils/opsFormatters' const { t } = useI18n() +function isUpstreamRow(log: OpsErrorLog): boolean { + const phase = String(log.phase || '').toLowerCase() + const owner = String(log.error_owner || '').toLowerCase() + return phase === 'upstream' && owner === 'provider' +} + +function getTypeBadge(log: OpsErrorLog): { label: string; className: string } { + const phase = String(log.phase || '').toLowerCase() + const owner = String(log.error_owner || '').toLowerCase() + + if (isUpstreamRow(log)) { + return { label: t('admin.ops.errorLog.typeUpstream'), className: 'bg-red-50 text-red-700 ring-red-600/20 dark:bg-red-900/30 dark:text-red-400 dark:ring-red-500/30' } + } + if (phase === 'request' && owner === 'client') { + return { label: t('admin.ops.errorLog.typeRequest'), className: 'bg-amber-50 text-amber-700 ring-amber-600/20 dark:bg-amber-900/30 dark:text-amber-400 dark:ring-amber-500/30' } + } + if (phase === 'auth' && owner === 'client') { + return { label: t('admin.ops.errorLog.typeAuth'), className: 'bg-blue-50 text-blue-700 ring-blue-600/20 dark:bg-blue-900/30 dark:text-blue-400 dark:ring-blue-500/30' } + } + if (phase === 'routing' && owner === 'platform') { + return { label: t('admin.ops.errorLog.typeRouting'), className: 'bg-purple-50 text-purple-700 ring-purple-600/20 dark:bg-purple-900/30 dark:text-purple-400 dark:ring-purple-500/30' } + } + if (phase === 'internal' && owner === 'platform') { + return { label: t('admin.ops.errorLog.typeInternal'), className: 'bg-gray-100 text-gray-800 ring-gray-600/20 dark:bg-dark-700 dark:text-gray-200 dark:ring-dark-500/40' } + } + + 
const fallback = phase || owner || t('common.unknown') + return { label: fallback, className: 'bg-gray-50 text-gray-700 ring-gray-600/10 dark:bg-dark-900 dark:text-gray-300 dark:ring-dark-700' } +} + interface Props { rows: OpsErrorLog[] total: number @@ -208,14 +241,6 @@ function getStatusClass(code: number): string { return 'bg-gray-50 text-gray-700 ring-gray-600/20 dark:bg-gray-900/30 dark:text-gray-400 dark:ring-gray-500/30' } -function getLatencyClass(latency: number | null): string { - if (!latency) return 'text-gray-400' - if (latency > 10000) return 'text-red-600 font-black' - if (latency > 5000) return 'text-red-500 font-bold' - if (latency > 2000) return 'text-orange-500 font-medium' - return 'text-gray-600 dark:text-gray-400' -} - function formatSmartMessage(msg: string): string { if (!msg) return '' @@ -231,10 +256,11 @@ function formatSmartMessage(msg: string): string { } } - if (msg.includes('context deadline exceeded')) return 'context deadline exceeded' - if (msg.includes('connection refused')) return 'connection refused' - if (msg.toLowerCase().includes('rate limit')) return 'rate limit' + if (msg.includes('context deadline exceeded')) return t('admin.ops.errorLog.commonErrors.contextDeadlineExceeded') + if (msg.includes('connection refused')) return t('admin.ops.errorLog.commonErrors.connectionRefused') + if (msg.toLowerCase().includes('rate limit')) return t('admin.ops.errorLog.commonErrors.rateLimit') return msg.length > 200 ? msg.substring(0, 200) + '...' 
: msg + } - + \ No newline at end of file diff --git a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue index d3edd745..3a70b4f2 100644 --- a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue @@ -38,7 +38,7 @@ const loading = ref(false) const items = ref([]) const total = ref(0) const page = ref(1) -const pageSize = ref(20) +const pageSize = ref(10) const close = () => emit('update:modelValue', false) @@ -95,7 +95,7 @@ watch( (open) => { if (open) { page.value = 1 - pageSize.value = 20 + pageSize.value = 10 fetchData() } } diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue index 1dcab4b3..82c19f4f 100644 --- a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue +++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue @@ -50,27 +50,22 @@ function validateRuntimeSettings(settings: OpsAlertRuntimeSettings): ValidationR if (thresholds) { if (thresholds.sla_percent_min != null) { if (!Number.isFinite(thresholds.sla_percent_min) || thresholds.sla_percent_min < 0 || thresholds.sla_percent_min > 100) { - errors.push('SLA 最低值必须在 0-100 之间') - } - } - if (thresholds.latency_p99_ms_max != null) { - if (!Number.isFinite(thresholds.latency_p99_ms_max) || thresholds.latency_p99_ms_max < 0) { - errors.push('延迟 P99 最大值必须大于或等于 0') + errors.push(t('admin.ops.runtime.validation.slaMinPercentRange')) } } if (thresholds.ttft_p99_ms_max != null) { if (!Number.isFinite(thresholds.ttft_p99_ms_max) || thresholds.ttft_p99_ms_max < 0) { - errors.push('TTFT P99 最大值必须大于或等于 0') + errors.push(t('admin.ops.runtime.validation.ttftP99MaxRange')) } } if (thresholds.request_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.request_error_rate_percent_max) || 
thresholds.request_error_rate_percent_max < 0 || thresholds.request_error_rate_percent_max > 100) { - errors.push('请求错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.requestErrorRateMaxRange')) } } if (thresholds.upstream_error_rate_percent_max != null) { if (!Number.isFinite(thresholds.upstream_error_rate_percent_max) || thresholds.upstream_error_rate_percent_max < 0 || thresholds.upstream_error_rate_percent_max > 100) { - errors.push('上游错误率最大值必须在 0-100 之间') + errors.push(t('admin.ops.runtime.validation.upstreamErrorRateMaxRange')) } } } @@ -163,7 +158,6 @@ function openAlertEditor() { if (!draftAlert.value.thresholds) { draftAlert.value.thresholds = { sla_percent_min: 99.5, - latency_p99_ms_max: 2000, ttft_p99_ms_max: 500, request_error_rate_percent_max: 5, upstream_error_rate_percent_max: 5 @@ -335,12 +329,12 @@ onMounted(() => {
-
指标阈值配置
-

配置各项指标的告警阈值。超出阈值的指标将在看板上以红色显示。

+
{{ t('admin.ops.runtime.metricThresholds') }}
+

{{ t('admin.ops.runtime.metricThresholdsHint') }}

-
SLA 最低值 (%)
+
{{ t('admin.ops.runtime.slaMinPercent') }}
{ class="input" placeholder="99.5" /> -

SLA 低于此值时将显示为红色

+

{{ t('admin.ops.runtime.slaMinPercentHint') }}

-
-
延迟 P99 最大值 (ms)
- -

延迟 P99 高于此值时将显示为红色

-
+
-
TTFT P99 最大值 (ms)
+
{{ t('admin.ops.runtime.ttftP99MaxMs') }}
{ class="input" placeholder="500" /> -

TTFT P99 高于此值时将显示为红色

+

{{ t('admin.ops.runtime.ttftP99MaxMsHint') }}

-
请求错误率最大值 (%)
+
{{ t('admin.ops.runtime.requestErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

请求错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.requestErrorRateMaxPercentHint') }}

-
上游错误率最大值 (%)
+
{{ t('admin.ops.runtime.upstreamErrorRateMaxPercent') }}
{ class="input" placeholder="5" /> -

上游错误率高于此值时将显示为红色

+

{{ t('admin.ops.runtime.upstreamErrorRateMaxPercentHint') }}

@@ -424,7 +407,7 @@ onMounted(() => { v-model="draftAlert.silencing.global_until_rfc3339" type="text" class="input font-mono text-sm" - :placeholder="t('admin.ops.runtime.silencing.untilPlaceholder')" + placeholder="2026-01-05T00:00:00Z" />

{{ t('admin.ops.runtime.silencing.untilHint') }}

@@ -496,7 +479,7 @@ onMounted(() => { v-model="(entry as any).until_rfc3339" type="text" class="input font-mono text-sm" - :placeholder="t('admin.ops.runtime.silencing.untilPlaceholder')" + placeholder="2026-01-05T00:00:00Z" />
diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue index 1f64f253..53ab6683 100644 --- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue +++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue @@ -32,7 +32,6 @@ const advancedSettings = ref(null) // 指标阈值配置 const metricThresholds = ref({ sla_percent_min: 99.5, - latency_p99_ms_max: 2000, ttft_p99_ms_max: 500, request_error_rate_percent_max: 5, upstream_error_rate_percent_max: 5 @@ -53,13 +52,12 @@ async function loadAllSettings() { advancedSettings.value = advanced // 如果后端返回了阈值,使用后端的值;否则保持默认值 if (thresholds && Object.keys(thresholds).length > 0) { - metricThresholds.value = { - sla_percent_min: thresholds.sla_percent_min ?? 99.5, - latency_p99_ms_max: thresholds.latency_p99_ms_max ?? 2000, - ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500, - request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5, - upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 5 - } + metricThresholds.value = { + sla_percent_min: thresholds.sla_percent_min ?? 99.5, + ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500, + request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5, + upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 
5 + } } } catch (err: any) { console.error('[OpsSettingsDialog] Failed to load settings', err) @@ -159,19 +157,16 @@ const validation = computed(() => { // 验证指标阈值 if (metricThresholds.value.sla_percent_min != null && (metricThresholds.value.sla_percent_min < 0 || metricThresholds.value.sla_percent_min > 100)) { - errors.push('SLA最低百分比必须在0-100之间') - } - if (metricThresholds.value.latency_p99_ms_max != null && metricThresholds.value.latency_p99_ms_max < 0) { - errors.push('延迟P99最大值必须大于等于0') + errors.push(t('admin.ops.settings.validation.slaMinPercentRange')) } if (metricThresholds.value.ttft_p99_ms_max != null && metricThresholds.value.ttft_p99_ms_max < 0) { - errors.push('TTFT P99最大值必须大于等于0') + errors.push(t('admin.ops.settings.validation.ttftP99MaxRange')) } if (metricThresholds.value.request_error_rate_percent_max != null && (metricThresholds.value.request_error_rate_percent_max < 0 || metricThresholds.value.request_error_rate_percent_max > 100)) { - errors.push('请求错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.requestErrorRateMaxRange')) } if (metricThresholds.value.upstream_error_rate_percent_max != null && (metricThresholds.value.upstream_error_rate_percent_max < 0 || metricThresholds.value.upstream_error_rate_percent_max > 100)) { - errors.push('上游错误率最大值必须在0-100之间') + errors.push(t('admin.ops.settings.validation.upstreamErrorRateMaxRange')) } return { valid: errors.length === 0, errors } @@ -362,17 +357,6 @@ async function saveAllSettings() {

{{ t('admin.ops.settings.slaMinPercentHint') }}

-
- - -

{{ t('admin.ops.settings.latencyP99MaxMsHint') }}

-
@@ -488,43 +472,63 @@ async function saveAllSettings() {
- +
-
错误过滤
+
{{ t('admin.ops.settings.errorFiltering') }}
- +

- 启用后,count_tokens 请求的错误将不计入运维监控的统计和告警中(但仍会存储在数据库中) + {{ t('admin.ops.settings.ignoreCountTokensErrorsHint') }}

-
- - -
-
自动刷新
- +

- 自动刷新仪表板数据,启用后会定期拉取最新数据 + {{ t('admin.ops.settings.ignoreContextCanceledHint') }} +

+
+ +
+ +
+
+ +

+ {{ t('admin.ops.settings.ignoreNoAvailableAccountsHint') }} +

+
+ +
+
+ + +
+
{{ t('admin.ops.settings.autoRefresh') }}
+ +
+
+ +

+ {{ t('admin.ops.settings.enableAutoRefreshHint') }}

- +