diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go index 713e0eb9..f1e57c38 100644 --- a/backend/internal/repository/ops_repo_metrics.go +++ b/backend/internal/repository/ops_repo_metrics.go @@ -43,6 +43,7 @@ INSERT INTO ops_system_metrics ( upstream_529_count, token_consumed, + account_switch_count, qps, tps, @@ -81,14 +82,14 @@ INSERT INTO ops_system_metrics ( $1,$2,$3,$4, $5,$6,$7,$8, $9,$10,$11, - $12,$13,$14, - $15,$16,$17,$18,$19,$20, - $21,$22,$23,$24,$25,$26, - $27,$28,$29,$30, - $31,$32, - $33,$34, - $35,$36,$37, - $38,$39 + $12,$13,$14,$15, + $16,$17,$18,$19,$20,$21, + $22,$23,$24,$25,$26,$27, + $28,$29,$30,$31, + $32,$33, + $34,$35, + $36,$37,$38, + $39,$40 )` _, err := r.db.ExecContext( @@ -109,6 +110,7 @@ INSERT INTO ops_system_metrics ( input.Upstream529Count, input.TokenConsumed, + input.AccountSwitchCount, opsNullFloat64(input.QPS), opsNullFloat64(input.TPS), @@ -177,7 +179,8 @@ SELECT db_conn_waiting, goroutine_count, - concurrency_queue_depth + concurrency_queue_depth, + account_switch_count FROM ops_system_metrics WHERE window_minutes = $1 AND platform IS NULL @@ -199,6 +202,7 @@ LIMIT 1` var dbWaiting sql.NullInt64 var goroutines sql.NullInt64 var queueDepth sql.NullInt64 + var accountSwitchCount sql.NullInt64 if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan( &out.ID, @@ -217,6 +221,7 @@ LIMIT 1` &dbWaiting, &goroutines, &queueDepth, + &accountSwitchCount, ); err != nil { return nil, err } @@ -273,6 +278,10 @@ LIMIT 1` v := int(queueDepth.Int64) out.ConcurrencyQueueDepth = &v } + if accountSwitchCount.Valid { + v := accountSwitchCount.Int64 + out.AccountSwitchCount = &v + } return &out, nil } diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go index 022d1187..3be490dd 100644 --- a/backend/internal/repository/ops_repo_trends.go +++ b/backend/internal/repository/ops_repo_trends.go @@ -56,18 +56,44 @@ error_buckets AS ( AND COALESCE(status_code, 0) >= 400 GROUP BY 1 ), +switch_buckets AS ( + SELECT ` + errorBucketExpr + ` AS bucket, + COALESCE(SUM(CASE + WHEN ev->>'kind' IN ('failover', 'retry_exhausted_failover', 'failover_on_400') THEN 1 + ELSE 0 + END), 0) AS switch_count + FROM ops_error_logs + CROSS JOIN LATERAL jsonb_array_elements( + COALESCE(NULLIF(upstream_errors, 'null'::jsonb), '[]'::jsonb) + ) AS ev + ` + errorWhere + ` + AND upstream_errors IS NOT NULL + GROUP BY 1 +), combined AS ( - SELECT COALESCE(u.bucket, e.bucket) AS bucket, - COALESCE(u.success_count, 0) AS success_count, - COALESCE(e.error_count, 0) AS error_count, - COALESCE(u.token_consumed, 0) AS token_consumed - FROM usage_buckets u - FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket + SELECT + bucket, + SUM(success_count) AS success_count, + SUM(error_count) AS error_count, + SUM(token_consumed) AS token_consumed, + SUM(switch_count) AS switch_count + FROM ( + SELECT bucket, success_count, 0 AS error_count, token_consumed, 0 AS switch_count + FROM usage_buckets + UNION ALL + SELECT bucket, 0, error_count, 0, 0 + FROM error_buckets + UNION ALL + SELECT bucket, 0, 0, 0, switch_count + FROM switch_buckets + ) t + GROUP BY bucket ) SELECT bucket, (success_count + error_count) AS request_count, - token_consumed + token_consumed, + switch_count FROM combined ORDER BY bucket ASC` @@ -84,13 +110,18 @@ ORDER BY bucket ASC` var bucket time.Time var requests int64 var tokens sql.NullInt64 - if err := rows.Scan(&bucket, &requests, &tokens); err != nil { + var switches sql.NullInt64 + if err := rows.Scan(&bucket, &requests, &tokens, &switches); err != nil { return nil, err } tokenConsumed := int64(0) if tokens.Valid { tokenConsumed = tokens.Int64 } + switchCount := int64(0) + if switches.Valid { + switchCount = switches.Int64 + } denom := float64(bucketSeconds) if denom <= 0 { @@ -103,6 +134,7 @@ ORDER BY bucket ASC` BucketStart: bucket.UTC(), RequestCount: requests, TokenConsumed: tokenConsumed, + SwitchCount: switchCount, QPS: qps, TPS: tps, }) @@ -385,6 +417,7 @@ func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points [] BucketStart: cursor, RequestCount: 0, TokenConsumed: 0, + SwitchCount: 0, QPS: 0, TPS: 0, }) diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go index edf32cf2..73ad1fb0 100644 --- a/backend/internal/service/ops_metrics_collector.go +++ b/backend/internal/service/ops_metrics_collector.go @@ -285,6 +285,11 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { return fmt.Errorf("query error counts: %w", err) } + accountSwitchCount, err := c.queryAccountSwitchCount(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query account switch counts: %w", err) + } + windowSeconds := windowEnd.Sub(windowStart).Seconds() if windowSeconds <= 0 { windowSeconds = 60 @@ -310,6 +315,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { Upstream529Count: upstream529, TokenConsumed: tokenConsumed, + AccountSwitchCount: accountSwitchCount, QPS: float64Ptr(roundTo1DP(qps)), TPS: float64Ptr(roundTo1DP(tps)), @@ -551,6 +557,27 @@ WHERE created_at >= $1 AND created_at < $2` return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil } +func (c *OpsMetricsCollector) queryAccountSwitchCount(ctx context.Context, start, end time.Time) (int64, error) { + q := ` +SELECT + COALESCE(SUM(CASE + WHEN ev->>'kind' IN ('failover', 'retry_exhausted_failover', 'failover_on_400') THEN 1 + ELSE 0 + END), 0) AS switch_count +FROM ops_error_logs o +CROSS JOIN LATERAL jsonb_array_elements( + COALESCE(NULLIF(o.upstream_errors, 'null'::jsonb), '[]'::jsonb) +) AS ev +WHERE o.created_at >= $1 AND o.created_at < $2 + AND o.is_count_tokens = FALSE` + + var count int64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&count); err != nil { + return 0, err + } + return count, nil +} + type opsCollectedSystemStats struct { cpuUsagePercent *float64 memoryUsedMB *int64 diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 515b47bb..1de9c8e9 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -162,6 +162,7 @@ type OpsInsertSystemMetricsInput struct { Upstream529Count int64 TokenConsumed int64 + AccountSwitchCount int64 QPS *float64 TPS *float64 @@ -225,6 +226,7 @@ type OpsSystemMetricsSnapshot struct { GoroutineCount *int `json:"goroutine_count"` ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"` + AccountSwitchCount *int64 `json:"account_switch_count"` } type OpsUpsertJobHeartbeatInput struct { diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go index f6d07c14..97bbfebe 100644 --- a/backend/internal/service/ops_trend_models.go +++ b/backend/internal/service/ops_trend_models.go @@ -6,6 +6,7 @@ type OpsThroughputTrendPoint struct { BucketStart time.Time `json:"bucket_start"` RequestCount int64 `json:"request_count"` TokenConsumed int64 `json:"token_consumed"` + SwitchCount int64 `json:"switch_count"` QPS float64 `json:"qps"` TPS float64 `json:"tps"` } diff --git a/backend/migrations/042_add_ops_system_metrics_switch_count.sql b/backend/migrations/042_add_ops_system_metrics_switch_count.sql new file mode 100644 index 00000000..6d9f48e5 --- /dev/null +++ b/backend/migrations/042_add_ops_system_metrics_switch_count.sql @@ -0,0 +1,3 @@ +-- ops_system_metrics 增加账号切换次数统计(按分钟窗口) +ALTER TABLE ops_system_metrics + ADD COLUMN IF NOT EXISTS account_switch_count BIGINT NOT NULL DEFAULT 0; diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 6e048436..4214450f 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -136,6 +136,7 @@ export interface OpsThroughputTrendPoint { bucket_start: string request_count: number token_consumed: number + switch_count?: number qps: number tps: number } @@ -284,6 +285,7 @@ export interface OpsSystemMetricsSnapshot { goroutine_count?: number | null concurrency_queue_depth?: number | null + account_switch_count?: number | null } export interface OpsJobHeartbeat { diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 18e7d7d3..abbd4ff6 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1955,6 +1955,7 @@ export default { waiting: 'waiting', conns: 'conns', queue: 'queue', + accountSwitches: 'Account switches', ok: 'ok', lastRun: 'last_run:', lastSuccess: 'last_success:', @@ -2003,6 +2004,7 @@ export default { failedToLoadData: 'Failed to load ops data.', failedToLoadOverview: 'Failed to load overview', failedToLoadThroughputTrend: 'Failed to load throughput trend', + failedToLoadSwitchTrend: 'Failed to load avg account switches trend', failedToLoadLatencyHistogram: 'Failed to load request duration histogram', failedToLoadErrorTrend: 'Failed to load error trend', failedToLoadErrorDistribution: 'Failed to load error distribution', @@ -2011,9 +2013,11 @@ export default { tpsK: 'TPS (K)', top: 'Top:', throughputTrend: 'Throughput Trend', + switchRateTrend: 'Avg Account Switches', latencyHistogram: 'Request Duration Histogram', errorTrend: 'Error Trend', errorDistribution: 'Error Distribution', + switchRate: 'Avg switches', // Health Score & Diagnosis health: 'Health', healthCondition: 'Health Condition', @@ -2633,6 +2637,7 @@ export default { tooltips: { totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', + switchRateTrend: 'Trend of account switches / total requests over the last 5 hours (avg switches).', latencyHistogram: 'Request duration distribution (ms) for successful requests.', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorDistribution: 'Error distribution by status code.', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index cb3a4c4c..1b398e7a 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -2103,6 +2103,7 @@ export default { waiting: '等待', conns: '连接', queue: '队列', + accountSwitches: '账号切换', ok: '正常', lastRun: '最近运行', lastSuccess: '最近成功', @@ -2152,6 +2153,7 @@ export default { failedToLoadData: '加载运维数据失败', failedToLoadOverview: '加载概览数据失败', failedToLoadThroughputTrend: '加载吞吐趋势失败', + failedToLoadSwitchTrend: '加载平均账号切换趋势失败', failedToLoadLatencyHistogram: '加载请求时长分布失败', failedToLoadErrorTrend: '加载错误趋势失败', failedToLoadErrorDistribution: '加载错误分布失败', @@ -2160,9 +2162,11 @@ export default { tpsK: 'TPS(千)', top: '最高:', throughputTrend: '吞吐趋势', + switchRateTrend: '平均账号切换趋势', latencyHistogram: '请求时长分布', errorTrend: '错误趋势', errorDistribution: '错误分布', + switchRate: '平均账号切换', // Health Score & Diagnosis health: '健康', healthCondition: '健康状况', @@ -2787,6 +2791,7 @@ export default { tooltips: { totalRequests: '当前时间窗口内的总请求数和Token消耗量。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', + switchRateTrend: '近5小时内账号切换次数 / 请求总数的趋势(平均切换次数)。', latencyHistogram: '成功请求的请求时长分布(毫秒)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', errorDistribution: '按状态码统计的错误分布。', diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index 72cb2607..927fee94 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -40,10 +40,18 @@ /> -
+
+
+ +
(null) const queryMode = ref('auto') const customStartTime = ref(null) const customEndTime = ref(null) +const switchTrendWindowHours = 5 +const switchTrendTimeRange = `${switchTrendWindowHours}h` +const switchTrendWindowMs = switchTrendWindowHours * 60 * 60 * 1000 const QUERY_KEYS = { timeRange: 'tr', @@ -322,6 +334,9 @@ const metricThresholds = ref(null) const throughputTrend = ref(null) const loadingTrend = ref(false) +const switchTrend = ref(null) +const loadingSwitchTrend = ref(false) + const latencyHistogram = ref(null) const loadingLatency = ref(false) @@ -491,6 +506,19 @@ function buildApiParams() { return params } +function buildSwitchTrendParams() { + const params: any = { + platform: platform.value || undefined, + group_id: groupId.value ?? undefined, + mode: queryMode.value + } + const endTime = new Date() + const startTime = new Date(endTime.getTime() - switchTrendWindowMs) + params.start_time = startTime.toISOString() + params.end_time = endTime.toISOString() + return params +} + async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) { if (!opsEnabled.value) return try { @@ -504,6 +532,24 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) } } +async function refreshSwitchTrendWithCancel(fetchSeq: number, signal: AbortSignal) { + if (!opsEnabled.value) return + loadingSwitchTrend.value = true + try { + const data = await opsAPI.getThroughputTrend(buildSwitchTrendParams(), { signal }) + if (fetchSeq !== dashboardFetchSeq) return + switchTrend.value = data + } catch (err: any) { + if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return + switchTrend.value = null + appStore.showError(err?.message || t('admin.ops.failedToLoadSwitchTrend')) + } finally { + if (fetchSeq === dashboardFetchSeq) { + loadingSwitchTrend.value = false + } + } +} + async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortSignal) { if (!opsEnabled.value) return loadingTrend.value = true @@ -600,6 +646,7 @@ async function fetchData() { await Promise.all([ refreshOverviewWithCancel(fetchSeq, dashboardFetchController.signal), refreshThroughputTrendWithCancel(fetchSeq, dashboardFetchController.signal), + refreshSwitchTrendWithCancel(fetchSeq, dashboardFetchController.signal), refreshLatencyHistogramWithCancel(fetchSeq, dashboardFetchController.signal), refreshErrorTrendWithCancel(fetchSeq, dashboardFetchController.signal), refreshErrorDistributionWithCancel(fetchSeq, dashboardFetchController.signal) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue index cffdd8a1..6df1e888 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue @@ -50,7 +50,11 @@ const props = withDefaults(defineProps(), {
-
+
+
+
+
+
@@ -96,4 +100,3 @@ const props = withDefaults(defineProps(), {
- diff --git a/frontend/src/views/admin/ops/components/OpsSwitchRateTrendChart.vue b/frontend/src/views/admin/ops/components/OpsSwitchRateTrendChart.vue new file mode 100644 index 00000000..391ab8e1 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsSwitchRateTrendChart.vue @@ -0,0 +1,150 @@ + + +