feat(运维监控): 重构仪表板布局和增强数据展示

主要改动:
- 重构仪表板为左右布局(5:7比例)
- 左侧:健康评分 + 实时信息(当前/峰值/平均 QPS/TPS)
- 右侧:6个卡片展示详细指标(3列x2行)
  - 总请求:请求数、Token数、平均QPS/TPS、平均延迟/TTFT
  - SLA:百分比、异常数、进度条
  - 延迟:P99/P95/P90/P50/Avg/Max(带颜色编码)
  - TTFT:P99/P95/P90/P50/Avg/Max(带颜色编码)
  - 请求错误:错误率、错误数、业务限制数
  - 上游错误:错误率、错误数(排除429/529)、429/529数
- 添加延迟/TTFT颜色编码(<500ms绿色,<1s黄色,<2s橙色,≥2s红色)
- 添加实时窗口选择器(1min/5min/30min/1h)
- 优化时间段选择器标签("近5分钟"等)
- 完善中英文i18n翻译
- 数据库:添加Redis连接池字段(redis_conn_total, redis_conn_idle)
This commit is contained in:
IanShaw027
2026-01-10 02:17:38 +08:00
parent 585257d340
commit c48dc097ff
5 changed files with 1104 additions and 111 deletions

View File

@@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules (
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- Ops Monitoring vNext: extend system metrics snapshots with Redis pool stats.
-- Re-runnable: each column is added only when it is not already present, and
-- COMMENT ON simply overwrites any previous comment text.
ALTER TABLE ops_system_metrics
    ADD COLUMN IF NOT EXISTS redis_conn_total INTEGER,
    ADD COLUMN IF NOT EXISTS redis_conn_idle INTEGER;

COMMENT ON COLUMN ops_system_metrics.redis_conn_total
    IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle
    IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';

View File

@@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot {
db_ok?: boolean | null
redis_ok?: boolean | null
// Config-derived limits (best-effort) for rendering "current vs max".
db_max_open_conns?: number | null
redis_pool_size?: number | null
redis_conn_total?: number | null
redis_conn_idle?: number | null

View File

@@ -1737,6 +1737,8 @@ export default {
active: 'active',
idle: 'idle',
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
@@ -1750,6 +1752,17 @@ export default {
tps: 'TPS:',
current: 'current',
peak: 'peak',
average: 'average',
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
errorCount: 'Error Count',
upstreamErrors: 'Upstream Errors',
errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
@@ -1792,6 +1805,42 @@ export default {
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
// English strings grouped under `realtime`: a section title plus one label per
// connection state key (connected / connecting / reconnecting / offline / closed).
realtime: {
title: 'Realtime',
connected: 'Connected',
connecting: 'Connecting',
reconnecting: 'Reconnecting',
offline: 'Offline',
closed: 'Closed',
// `{seconds}` looks like a named interpolation token filled in by the caller;
// keep the token name unchanged across locales — TODO confirm against the call site.
reconnectIn: 'Reconnect in {seconds}s'
},
// English explanatory strings grouped under `tooltips`, one entry per metric key.
tooltips: {
qps: 'Queries per second - real-time request rate',
// NOTE(review): this text defines SLA by latency, while the sibling `sla` label in
// this locale reads 'SLA (excl business limits)' — confirm which definition the
// backend actually uses.
sla: 'Service Level Agreement - percentage of requests within acceptable latency',
latency: 'Request duration from start to finish',
ttft: 'Time to First Token - latency until first response token',
errors: 'Request errors within SLA scope',
// NOTE(review): "rate limits" presumably refers to the 429/529 responses named in
// the `errorCountExcl429529` label — verify.
upstreamErrors: 'Errors from upstream services (excluding rate limits)',
totalRequests: 'Total requests and tokens consumed in this time window',
cpu: 'CPU usage percentage',
memory: 'Memory usage percentage',
db: 'Database connection pool status',
redis: 'Redis connection pool status',
goroutines: 'Go routine count (concurrent tasks)',
jobs: 'Background job health status'
},
// English labels for the time-range options. The keys are quoted because they
// start with a digit and so are not valid bare identifiers.
timeRange: {
'5m': 'Last 5 minutes',
'30m': 'Last 30 minutes',
'1h': 'Last 1 hour',
'6h': 'Last 6 hours',
'24h': 'Last 24 hours'
},
// English display names for the `auto`, `raw` and `preagg` query-mode keys.
queryMode: {
auto: 'Auto',
raw: 'Raw Query',
preagg: 'Pre-aggregated'
},
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',

View File

@@ -1882,6 +1882,8 @@ export default {
active: '活跃',
idle: '空闲',
waiting: '等待',
conns: '连接',
queue: '队列',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
@@ -1895,6 +1897,17 @@ export default {
tps: 'TPS',
current: '当前',
peak: '峰值',
average: '平均',
totalRequests: '总请求',
avgQps: '平均 QPS',
avgTps: '平均 TPS',
avgLatency: '平均延迟',
avgTtft: '平均首字延迟',
exceptions: '异常数',
requestErrors: '请求错误',
errorCount: '错误数',
upstreamErrors: '上游错误',
errorCountExcl429529: '错误数排除429/529',
sla: 'SLA排除业务限制',
businessLimited: '业务限制:',
errors: '错误',
@@ -1937,6 +1950,42 @@ export default {
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
// Simplified Chinese strings grouped under `realtime`: a section title plus one
// label per connection state key (connected / connecting / reconnecting / offline / closed).
realtime: {
title: '实时信息',
connected: '已连接',
connecting: '连接中',
reconnecting: '重连中',
offline: '离线',
closed: '已关闭',
// `{seconds}` looks like a named interpolation token filled in by the caller;
// it must stay untranslated — TODO confirm against the call site.
reconnectIn: '{seconds}秒后重连'
},
// Simplified Chinese explanatory strings grouped under `tooltips`, one entry per metric key.
tooltips: {
qps: '每秒查询数 - 实时请求速率',
// NOTE(review): this text defines SLA by latency, while the sibling `sla` label in
// this locale mentions excluding business limits — confirm which definition the
// backend actually uses.
sla: '服务等级协议 - 可接受延迟范围内的请求百分比',
latency: '从开始到结束的请求持续时间',
ttft: '首字延迟 - 直到第一个响应令牌的延迟',
errors: 'SLA 范围内的请求错误',
upstreamErrors: '上游服务错误(不包括速率限制)',
totalRequests: '此时间窗口内的总请求数和消耗的令牌数',
cpu: 'CPU 使用率',
memory: '内存使用率',
db: '数据库连接池状态',
redis: 'Redis 连接池状态',
goroutines: 'Go 协程数(并发任务)',
jobs: '后台任务健康状态'
},
// Simplified Chinese labels for the time-range options. The keys are quoted
// because they start with a digit and so are not valid bare identifiers.
timeRange: {
'5m': '近5分钟',
'30m': '近30分钟',
'1h': '近1小时',
'6h': '近6小时',
'24h': '近24小时'
},
// Simplified Chinese display names for the `auto`, `raw` and `preagg` query-mode keys.
queryMode: {
auto: '自动',
raw: '原始查询',
preagg: '预聚合'
},
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',

File diff suppressed because it is too large Load Diff