feat(ops): add account switch metrics and trend

This commit is contained in:
song
2026-01-23 19:39:48 +08:00
parent 3002c7a17f
commit 316f2fee21
12 changed files with 307 additions and 20 deletions

View File

@@ -43,6 +43,7 @@ INSERT INTO ops_system_metrics (
upstream_529_count,
token_consumed,
account_switch_count,
qps,
tps,
@@ -81,14 +82,14 @@ INSERT INTO ops_system_metrics (
$1,$2,$3,$4,
$5,$6,$7,$8,
$9,$10,$11,
$12,$13,$14,
$15,$16,$17,$18,$19,$20,
$21,$22,$23,$24,$25,$26,
$27,$28,$29,$30,
$31,$32,
$33,$34,
$35,$36,$37,
$38,$39
$12,$13,$14,$15,
$16,$17,$18,$19,$20,$21,
$22,$23,$24,$25,$26,$27,
$28,$29,$30,$31,
$32,$33,
$34,$35,
$36,$37,$38,
$39,$40
)`
_, err := r.db.ExecContext(
@@ -109,6 +110,7 @@ INSERT INTO ops_system_metrics (
input.Upstream529Count,
input.TokenConsumed,
input.AccountSwitchCount,
opsNullFloat64(input.QPS),
opsNullFloat64(input.TPS),
@@ -177,7 +179,8 @@ SELECT
db_conn_waiting,
goroutine_count,
concurrency_queue_depth
concurrency_queue_depth,
account_switch_count
FROM ops_system_metrics
WHERE window_minutes = $1
AND platform IS NULL
@@ -199,6 +202,7 @@ LIMIT 1`
var dbWaiting sql.NullInt64
var goroutines sql.NullInt64
var queueDepth sql.NullInt64
var accountSwitchCount sql.NullInt64
if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan(
&out.ID,
@@ -217,6 +221,7 @@ LIMIT 1`
&dbWaiting,
&goroutines,
&queueDepth,
&accountSwitchCount,
); err != nil {
return nil, err
}
@@ -273,6 +278,10 @@ LIMIT 1`
v := int(queueDepth.Int64)
out.ConcurrencyQueueDepth = &v
}
if accountSwitchCount.Valid {
v := accountSwitchCount.Int64
out.AccountSwitchCount = &v
}
return &out, nil
}

View File

@@ -56,18 +56,44 @@ error_buckets AS (
AND COALESCE(status_code, 0) >= 400
GROUP BY 1
),
switch_buckets AS (
SELECT ` + errorBucketExpr + ` AS bucket,
COALESCE(SUM(CASE
WHEN ev->>'kind' IN ('failover', 'retry_exhausted_failover', 'failover_on_400') THEN 1
ELSE 0
END), 0) AS switch_count
FROM ops_error_logs
CROSS JOIN LATERAL jsonb_array_elements(
COALESCE(NULLIF(upstream_errors, 'null'::jsonb), '[]'::jsonb)
) AS ev
` + errorWhere + `
AND upstream_errors IS NOT NULL
GROUP BY 1
),
combined AS (
SELECT COALESCE(u.bucket, e.bucket) AS bucket,
COALESCE(u.success_count, 0) AS success_count,
COALESCE(e.error_count, 0) AS error_count,
COALESCE(u.token_consumed, 0) AS token_consumed
FROM usage_buckets u
FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
SELECT
bucket,
SUM(success_count) AS success_count,
SUM(error_count) AS error_count,
SUM(token_consumed) AS token_consumed,
SUM(switch_count) AS switch_count
FROM (
SELECT bucket, success_count, 0 AS error_count, token_consumed, 0 AS switch_count
FROM usage_buckets
UNION ALL
SELECT bucket, 0, error_count, 0, 0
FROM error_buckets
UNION ALL
SELECT bucket, 0, 0, 0, switch_count
FROM switch_buckets
) t
GROUP BY bucket
)
SELECT
bucket,
(success_count + error_count) AS request_count,
token_consumed
token_consumed,
switch_count
FROM combined
ORDER BY bucket ASC`
@@ -84,13 +110,18 @@ ORDER BY bucket ASC`
var bucket time.Time
var requests int64
var tokens sql.NullInt64
if err := rows.Scan(&bucket, &requests, &tokens); err != nil {
var switches sql.NullInt64
if err := rows.Scan(&bucket, &requests, &tokens, &switches); err != nil {
return nil, err
}
tokenConsumed := int64(0)
if tokens.Valid {
tokenConsumed = tokens.Int64
}
switchCount := int64(0)
if switches.Valid {
switchCount = switches.Int64
}
denom := float64(bucketSeconds)
if denom <= 0 {
@@ -103,6 +134,7 @@ ORDER BY bucket ASC`
BucketStart: bucket.UTC(),
RequestCount: requests,
TokenConsumed: tokenConsumed,
SwitchCount: switchCount,
QPS: qps,
TPS: tps,
})
@@ -385,6 +417,7 @@ func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []
BucketStart: cursor,
RequestCount: 0,
TokenConsumed: 0,
SwitchCount: 0,
QPS: 0,
TPS: 0,
})

View File

@@ -285,6 +285,11 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
return fmt.Errorf("query error counts: %w", err)
}
accountSwitchCount, err := c.queryAccountSwitchCount(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query account switch counts: %w", err)
}
windowSeconds := windowEnd.Sub(windowStart).Seconds()
if windowSeconds <= 0 {
windowSeconds = 60
@@ -310,6 +315,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
Upstream529Count: upstream529,
TokenConsumed: tokenConsumed,
AccountSwitchCount: accountSwitchCount,
QPS: float64Ptr(roundTo1DP(qps)),
TPS: float64Ptr(roundTo1DP(tps)),
@@ -551,6 +557,27 @@ WHERE created_at >= $1 AND created_at < $2`
return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
// queryAccountSwitchCount returns how many upstream error events of kind
// "failover", "retry_exhausted_failover" or "failover_on_400" are recorded in
// ops_error_logs rows with created_at in [start, end) and is_count_tokens = FALSE.
//
// Each row's upstream_errors JSONB array is expanded element by element; a SQL
// NULL or JSON null value is treated as an empty array and so contributes
// nothing. The surrounding COALESCE makes the query yield 0 rather than NULL
// when no element matches. Any query or scan error is returned unwrapped.
func (c *OpsMetricsCollector) queryAccountSwitchCount(ctx context.Context, start, end time.Time) (int64, error) {
	const query = `
SELECT
COALESCE(SUM(CASE
WHEN ev->>'kind' IN ('failover', 'retry_exhausted_failover', 'failover_on_400') THEN 1
ELSE 0
END), 0) AS switch_count
FROM ops_error_logs o
CROSS JOIN LATERAL jsonb_array_elements(
COALESCE(NULLIF(o.upstream_errors, 'null'::jsonb), '[]'::jsonb)
) AS ev
WHERE o.created_at >= $1 AND o.created_at < $2
AND o.is_count_tokens = FALSE`

	var total int64
	row := c.db.QueryRowContext(ctx, query, start, end)
	err := row.Scan(&total)
	if err != nil {
		return 0, err
	}
	return total, nil
}
type opsCollectedSystemStats struct {
cpuUsagePercent *float64
memoryUsedMB *int64

View File

@@ -162,6 +162,7 @@ type OpsInsertSystemMetricsInput struct {
Upstream529Count int64
TokenConsumed int64
AccountSwitchCount int64
QPS *float64
TPS *float64
@@ -225,6 +226,7 @@ type OpsSystemMetricsSnapshot struct {
GoroutineCount *int `json:"goroutine_count"`
ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
AccountSwitchCount *int64 `json:"account_switch_count"`
}
type OpsUpsertJobHeartbeatInput struct {

View File

@@ -6,6 +6,7 @@ type OpsThroughputTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
SwitchCount int64 `json:"switch_count"`
QPS float64 `json:"qps"`
TPS float64 `json:"tps"`
}

View File

@@ -0,0 +1,3 @@
-- ops_system_metrics: add an account switch count column (per-minute window).
-- IF NOT EXISTS keeps the statement safe to re-run; the column is NOT NULL
-- with DEFAULT 0.
ALTER TABLE ops_system_metrics
ADD COLUMN IF NOT EXISTS account_switch_count BIGINT NOT NULL DEFAULT 0;

View File

@@ -136,6 +136,7 @@ export interface OpsThroughputTrendPoint {
bucket_start: string
request_count: number
token_consumed: number
switch_count?: number
qps: number
tps: number
}
@@ -284,6 +285,7 @@ export interface OpsSystemMetricsSnapshot {
goroutine_count?: number | null
concurrency_queue_depth?: number | null
account_switch_count?: number | null
}
export interface OpsJobHeartbeat {

View File

@@ -1955,6 +1955,7 @@ export default {
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
accountSwitches: 'Account switches',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
@@ -2003,6 +2004,7 @@ export default {
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadSwitchTrend: 'Failed to load avg account switches trend',
failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
@@ -2011,9 +2013,11 @@ export default {
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
switchRateTrend: 'Avg Account Switches',
latencyHistogram: 'Request Duration Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
switchRate: 'Avg switches',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
@@ -2633,6 +2637,7 @@ export default {
tooltips: {
totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
switchRateTrend: 'Trend of account switches / total requests over the last 5 hours (avg switches).',
latencyHistogram: 'Request duration distribution (ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',

View File

@@ -2103,6 +2103,7 @@ export default {
waiting: '等待',
conns: '连接',
queue: '队列',
accountSwitches: '账号切换',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
@@ -2152,6 +2153,7 @@ export default {
failedToLoadData: '加载运维数据失败',
failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadSwitchTrend: '加载平均账号切换趋势失败',
failedToLoadLatencyHistogram: '加载请求时长分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
@@ -2160,9 +2162,11 @@ export default {
tpsK: 'TPS',
top: '最高:',
throughputTrend: '吞吐趋势',
switchRateTrend: '平均账号切换趋势',
latencyHistogram: '请求时长分布',
errorTrend: '错误趋势',
errorDistribution: '错误分布',
switchRate: '平均账号切换',
// Health Score & Diagnosis
health: '健康',
healthCondition: '健康状况',
@@ -2787,6 +2791,7 @@ export default {
tooltips: {
totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
switchRateTrend: '近5小时内账号切换次数 / 请求总数的趋势(平均切换次数)。',
latencyHistogram: '成功请求的请求时长分布(毫秒)。',
errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。',
errorDistribution: '按状态码统计的错误分布。',

View File

@@ -40,10 +40,18 @@
/>
<!-- Row: Concurrency + Throughput -->
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-3">
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-4">
<div class="lg:col-span-1 min-h-[360px]">
<OpsConcurrencyCard :platform-filter="platform" :group-id-filter="groupId" :refresh-token="dashboardRefreshToken" />
</div>
<div class="lg:col-span-1 min-h-[360px]">
<OpsSwitchRateTrendChart
:points="switchTrend?.points ?? []"
:loading="loadingSwitchTrend"
:time-range="switchTrendTimeRange"
:fullscreen="isFullscreen"
/>
</div>
<div class="lg:col-span-2 min-h-[360px]">
<OpsThroughputTrendChart
:points="throughputTrend?.points ?? []"
@@ -138,6 +146,7 @@ import OpsErrorDetailsModal from './components/OpsErrorDetailsModal.vue'
import OpsErrorTrendChart from './components/OpsErrorTrendChart.vue'
import OpsLatencyChart from './components/OpsLatencyChart.vue'
import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue'
import OpsSwitchRateTrendChart from './components/OpsSwitchRateTrendChart.vue'
import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue'
import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue'
import OpsSettingsDialog from './components/OpsSettingsDialog.vue'
@@ -168,6 +177,9 @@ const groupId = ref<number | null>(null)
const queryMode = ref<QueryMode>('auto')
const customStartTime = ref<string | null>(null)
const customEndTime = ref<string | null>(null)
// Fixed look-back window, in hours, for the account-switch trend request.
// The switchRateTrend tooltip text also mentions a 5-hour window; keep the two in sync.
const switchTrendWindowHours = 5
// Range string passed to OpsSwitchRateTrendChart as its time-range prop.
const switchTrendTimeRange = `${switchTrendWindowHours}h`
// The same window in milliseconds; buildSwitchTrendParams subtracts it from "now".
const switchTrendWindowMs = switchTrendWindowHours * 60 * 60 * 1000
const QUERY_KEYS = {
timeRange: 'tr',
@@ -322,6 +334,9 @@ const metricThresholds = ref<OpsMetricThresholds | null>(null)
const throughputTrend = ref<OpsThroughputTrendResponse | null>(null)
const loadingTrend = ref(false)
const switchTrend = ref<OpsThroughputTrendResponse | null>(null)
const loadingSwitchTrend = ref(false)
const latencyHistogram = ref<OpsLatencyHistogramResponse | null>(null)
const loadingLatency = ref(false)
@@ -491,6 +506,19 @@ function buildApiParams() {
return params
}
/**
 * Builds the query parameters for the account-switch trend request.
 *
 * The platform, group and query-mode filters come from the dashboard refs.
 * The time range always ends at the moment of the call and starts
 * switchTrendWindowMs earlier, both serialized as ISO-8601 strings.
 */
function buildSwitchTrendParams() {
  const windowEnd = new Date()
  const windowStart = new Date(windowEnd.getTime() - switchTrendWindowMs)

  const params: any = {
    platform: platform.value || undefined,
    group_id: groupId.value ?? undefined,
    mode: queryMode.value,
    start_time: windowStart.toISOString(),
    end_time: windowEnd.toISOString()
  }
  return params
}
async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) {
if (!opsEnabled.value) return
try {
@@ -504,6 +532,24 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal)
}
}
/**
 * Loads the account-switch trend and stores it in switchTrend.
 *
 * Does nothing when ops is disabled. Results and errors are ignored when a
 * newer dashboard fetch has started (fetchSeq no longer matches
 * dashboardFetchSeq); canceled requests are ignored too. The loading flag is
 * cleared only by the fetch that is still current.
 *
 * @param fetchSeq sequence number of the dashboard fetch that issued this call
 * @param signal   abort signal forwarded to the API request
 */
async function refreshSwitchTrendWithCancel(fetchSeq: number, signal: AbortSignal) {
  if (!opsEnabled.value) return

  // Evaluated lazily on every use: dashboardFetchSeq may change while awaiting.
  const isStale = () => fetchSeq !== dashboardFetchSeq

  loadingSwitchTrend.value = true
  try {
    const response = await opsAPI.getThroughputTrend(buildSwitchTrendParams(), { signal })
    if (!isStale()) {
      switchTrend.value = response
    }
  } catch (err: any) {
    if (isStale() || isCanceledRequest(err)) return
    switchTrend.value = null
    appStore.showError(err?.message || t('admin.ops.failedToLoadSwitchTrend'))
  } finally {
    if (!isStale()) {
      loadingSwitchTrend.value = false
    }
  }
}
async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortSignal) {
if (!opsEnabled.value) return
loadingTrend.value = true
@@ -600,6 +646,7 @@ async function fetchData() {
await Promise.all([
refreshOverviewWithCancel(fetchSeq, dashboardFetchController.signal),
refreshThroughputTrendWithCancel(fetchSeq, dashboardFetchController.signal),
refreshSwitchTrendWithCancel(fetchSeq, dashboardFetchController.signal),
refreshLatencyHistogramWithCancel(fetchSeq, dashboardFetchController.signal),
refreshErrorTrendWithCancel(fetchSeq, dashboardFetchController.signal),
refreshErrorDistributionWithCancel(fetchSeq, dashboardFetchController.signal)

View File

@@ -50,7 +50,11 @@ const props = withDefaults(defineProps<Props>(), {
</div>
<!-- Row: Concurrency + Throughput (matches OpsDashboard.vue) -->
<div class="grid grid-cols-1 gap-6 lg:grid-cols-3">
<div class="grid grid-cols-1 gap-6 lg:grid-cols-4">
<div :class="['min-h-[360px] rounded-3xl bg-white shadow-sm ring-1 ring-gray-900/5 dark:bg-dark-800 dark:ring-dark-700 lg:col-span-1', props.fullscreen ? 'p-8' : 'p-6']">
<div class="h-4 w-44 animate-pulse rounded bg-gray-200 dark:bg-dark-700"></div>
<div class="mt-6 h-72 animate-pulse rounded-2xl bg-gray-100 dark:bg-dark-700/70"></div>
</div>
<div :class="['min-h-[360px] rounded-3xl bg-white shadow-sm ring-1 ring-gray-900/5 dark:bg-dark-800 dark:ring-dark-700 lg:col-span-1', props.fullscreen ? 'p-8' : 'p-6']">
<div class="h-4 w-44 animate-pulse rounded bg-gray-200 dark:bg-dark-700"></div>
<div class="mt-6 h-72 animate-pulse rounded-2xl bg-gray-100 dark:bg-dark-700/70"></div>
@@ -96,4 +100,3 @@ const props = withDefaults(defineProps<Props>(), {
</div>
</div>
</template>

View File

@@ -0,0 +1,150 @@
<script setup lang="ts">
import { computed } from 'vue'
import { useI18n } from 'vue-i18n'
import {
Chart as ChartJS,
CategoryScale,
Filler,
Legend,
LineElement,
LinearScale,
PointElement,
Title,
Tooltip
} from 'chart.js'
import { Line } from 'vue-chartjs'
import type { OpsThroughputTrendPoint } from '@/api/admin/ops'
import type { ChartState } from '../types'
import { formatHistoryLabel, sumNumbers } from '../utils/opsFormatters'
import HelpTooltip from '@/components/common/HelpTooltip.vue'
import EmptyState from '@/components/common/EmptyState.vue'
// Register the Chart.js components imported above before the chart is rendered.
ChartJS.register(Title, Tooltip, Legend, LineElement, LinearScale, PointElement, CategoryScale, Filler)
// Props accepted by the account-switch trend chart.
interface Props {
// Trend buckets; bucket_start, request_count and switch_count are read from each point.
points: OpsThroughputTrendPoint[]
// Selects the "loading" placeholder when there is no chart data to show.
loading: boolean
// Passed to formatHistoryLabel when building the x-axis labels.
timeRange: string
// When true, the help tooltip next to the title is not rendered.
fullscreen?: boolean
}
const props = defineProps<Props>()
// Translation helper used for the title, dataset label and placeholder texts.
const { t } = useI18n()
// True when the root <html> element carries the "dark" class.
// NOTE(review): the DOM class list is not a Vue reactive source, so this
// computed is presumably evaluated once and cached — confirm that toggling
// the theme re-mounts the chart or otherwise refreshes it.
const isDarkMode = computed(() => {
  return document.documentElement.classList.contains('dark')
})

// Palette for the chart: fixed teal line/fill plus theme-dependent grid and text colors.
const colors = computed(() => {
  const dark = isDarkMode.value
  return {
    teal: '#14b8a6',
    tealAlpha: '#14b8a620',
    grid: dark ? '#374151' : '#f3f4f6',
    text: dark ? '#9ca3af' : '#6b7280'
  }
})
// Sum of request_count across all points; used to detect an all-empty series.
const totalRequests = computed(() => {
  const requestCounts = props.points.map((point) => point.request_count)
  return sumNumbers(requestCounts)
})

// Chart.js dataset for the line chart, or null when there is nothing to plot
// (no points, or no requests at all in the window).
const chartData = computed(() => {
  if (props.points.length === 0 || totalRequests.value <= 0) return null

  const labels = props.points.map((point) => formatHistoryLabel(point.bucket_start, props.timeRange))

  // Per-bucket ratio of switches to requests; buckets without requests plot as 0.
  const ratios = props.points.map((point) => {
    const requests = point.request_count ?? 0
    const switches = point.switch_count ?? 0
    return requests <= 0 ? 0 : switches / requests
  })

  const palette = colors.value
  return {
    labels,
    datasets: [
      {
        label: t('admin.ops.switchRate'),
        data: ratios,
        borderColor: palette.teal,
        backgroundColor: palette.tealAlpha,
        fill: true,
        tension: 0.35,
        pointRadius: 0,
        pointHitRadius: 10
      }
    ]
  }
})
// Render state: chart data wins over the loading flag, so an already loaded
// chart stays visible while a refresh is in flight.
const state = computed<ChartState>(() => {
  if (chartData.value) return 'ready'
  return props.loading ? 'loading' : 'empty'
})
// Chart.js options for the line chart, derived from the current palette.
const options = computed(() => {
  const palette = colors.value
  const dark = isDarkMode.value

  const legend = {
    position: 'top' as const,
    align: 'end' as const,
    labels: { color: palette.text, usePointStyle: true, boxWidth: 6, font: { size: 10 } }
  }

  const tooltip = {
    backgroundColor: dark ? '#1f2937' : '#ffffff',
    titleColor: dark ? '#f3f4f6' : '#111827',
    bodyColor: dark ? '#d1d5db' : '#4b5563',
    borderColor: palette.grid,
    borderWidth: 1,
    padding: 10,
    displayColors: true,
    callbacks: {
      // Show the ratio with three decimals; non-numeric values are shown as 0.
      label: (context: any) => {
        const y = context?.parsed?.y
        const ratio = typeof y === 'number' ? y : 0
        return `${t('admin.ops.switchRate')}: ${ratio.toFixed(3)}`
      }
    }
  }

  const xAxis = {
    type: 'category' as const,
    grid: { display: false },
    ticks: {
      color: palette.text,
      font: { size: 10 },
      maxTicksLimit: 8,
      autoSkip: true,
      autoSkipPadding: 10
    }
  }

  const yAxis = {
    type: 'linear' as const,
    display: true,
    position: 'left' as const,
    grid: { color: palette.grid, borderDash: [4, 4] },
    ticks: {
      color: palette.text,
      font: { size: 10 },
      // Tick labels use the same three-decimal format as the tooltip.
      callback: (value: any) => Number(value).toFixed(3)
    }
  }

  return {
    responsive: true,
    maintainAspectRatio: false,
    interaction: { intersect: false, mode: 'index' as const },
    plugins: { legend, tooltip },
    scales: { x: xAxis, y: yAxis }
  }
})
</script>
<template>
<div class="flex h-full flex-col rounded-3xl bg-white p-6 shadow-sm ring-1 ring-gray-900/5 dark:bg-dark-800 dark:ring-dark-700">
<!-- Header: icon, translated title, and a help tooltip that is hidden in fullscreen mode. -->
<div class="mb-4 flex shrink-0 items-center justify-between">
<h3 class="flex items-center gap-2 text-sm font-bold text-gray-900 dark:text-white">
<svg class="h-4 w-4 text-teal-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 7h10M7 12h6m-6 5h3" />
</svg>
{{ t('admin.ops.switchRateTrend') }}
<HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.switchRateTrend')" />
</h3>
</div>
<!-- Body: the line chart when state is "ready"; otherwise a loading text or the empty-state placeholder. -->
<div class="min-h-0 flex-1">
<Line v-if="state === 'ready' && chartData" :data="chartData" :options="options" />
<div v-else class="flex h-full items-center justify-center">
<div v-if="state === 'loading'" class="animate-pulse text-sm text-gray-400">{{ t('common.loading') }}</div>
<EmptyState v-else :title="t('common.noData')" :description="t('admin.ops.charts.emptyRequest')" />
</div>
</div>
</div>
</template>