feat(运维监控): 重构仪表板布局和增强数据展示

主要改动：
- 重构仪表板为左右布局（5:7比例）
  - 左侧：健康评分 + 实时信息（当前/峰值/平均 QPS/TPS）
  - 右侧：6个卡片展示详细指标（3列x2行）
    - 总请求：请求数、Token数、平均QPS/TPS、平均延迟/TTFT
    - SLA：百分比、异常数、进度条
    - 延迟：P99/P95/P90/P50/Avg/Max（带颜色编码）
    - TTFT：P99/P95/P90/P50/Avg/Max（带颜色编码）
    - 请求错误：错误率、错误数、业务限制数
    - 上游错误：错误率、错误数（排除429/529）、429/529数
- 添加延迟/TTFT颜色编码（<500ms绿色，<1s黄色，<2s橙色，≥2s红色）
- 添加实时窗口选择器（1min/5min/30min/1h）
- 优化时间段选择器标签（"近5分钟"等）
- 完善中英文i18n翻译
- 数据库：添加Redis连接池字段（redis_conn_total, redis_conn_idle）
2026-01-10 02:17:38 +08:00
parent 585257d340
commit c48dc097ff
5 changed files with 1104 additions and 111 deletions
--- a/backend/migrations/030_ops_monitoring_vnext.sql
+++ b/backend/migrations/030_ops_monitoring_vnext.sql
@@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules (
    '当错误率超过 20% 且持续 1 分钟时触发告警（服务严重异常）',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
 ) ON CONFLICT (name) DO NOTHING;
+
+-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
+-- This migration is intentionally idempotent: ADD COLUMN IF NOT EXISTS makes it safe to re-run.
+
+ALTER TABLE ops_system_metrics
+  ADD COLUMN IF NOT EXISTS redis_conn_total INT,
+  ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
+
+COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
+COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot {
  db_ok?: boolean | null
  redis_ok?: boolean | null

+  // Config-derived limits (best-effort) for rendering "current vs max".
+  db_max_open_conns?: number | null
+  redis_pool_size?: number | null
+
  redis_conn_total?: number | null
  redis_conn_idle?: number | null

--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -1737,6 +1737,8 @@ export default {
      active: 'active',
      idle: 'idle',
      waiting: 'waiting',
+      conns: 'conns',
+      queue: 'queue',
      ok: 'ok',
      lastRun: 'last_run:',
      lastSuccess: 'last_success:',
@@ -1750,6 +1752,17 @@ export default {
      tps: 'TPS:',
      current: 'current',
      peak: 'peak',
+      average: 'average',
+      totalRequests: 'Total Requests',
+      avgQps: 'Avg QPS',
+      avgTps: 'Avg TPS',
+      avgLatency: 'Avg Latency',
+      avgTtft: 'Avg TTFT',
+      exceptions: 'Exceptions',
+      requestErrors: 'Request Errors',
+      errorCount: 'Error Count',
+      upstreamErrors: 'Upstream Errors',
+      errorCountExcl429529: 'Error Count (excl 429/529)',
      sla: 'SLA (excl business limits)',
      businessLimited: 'business_limited:',
      errors: 'Errors',
@@ -1792,6 +1805,42 @@ export default {
      healthyStatus: 'Healthy',
      riskyStatus: 'At Risk',
      idleStatus: 'Idle',
+      realtime: {
+        title: 'Realtime',
+        connected: 'Connected',
+        connecting: 'Connecting',
+        reconnecting: 'Reconnecting',
+        offline: 'Offline',
+        closed: 'Closed',
+        reconnectIn: 'Reconnect in {seconds}s'
+      },
+      tooltips: {
+        qps: 'Queries per second - real-time request rate',
+        sla: 'Service Level Agreement - percentage of requests within acceptable latency',
+        latency: 'Request duration from start to finish',
+        ttft: 'Time to First Token - latency until first response token',
+        errors: 'Request errors within SLA scope',
+        upstreamErrors: 'Errors from upstream services (excluding rate limits)',
+        totalRequests: 'Total requests and tokens consumed in this time window',
+        cpu: 'CPU usage percentage',
+        memory: 'Memory usage percentage',
+        db: 'Database connection pool status',
+        redis: 'Redis connection pool status',
+        goroutines: 'Go routine count (concurrent tasks)',
+        jobs: 'Background job health status'
+      },
+      timeRange: {
+        '5m': 'Last 5 minutes',
+        '30m': 'Last 30 minutes',
+        '1h': 'Last 1 hour',
+        '6h': 'Last 6 hours',
+        '24h': 'Last 24 hours'
+      },
+      queryMode: {
+        auto: 'Auto',
+        raw: 'Raw Query',
+        preagg: 'Pre-aggregated'
+      },
      diagnosis: {
        title: 'Smart Diagnosis',
        footer: 'Automated diagnostic suggestions based on current metrics',
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -1882,6 +1882,8 @@ export default {
      active: '活跃',
      idle: '空闲',
      waiting: '等待',
+      conns: '连接',
+      queue: '队列',
      ok: '正常',
      lastRun: '最近运行',
      lastSuccess: '最近成功',
@@ -1895,6 +1897,17 @@ export default {
      tps: 'TPS',
      current: '当前',
      peak: '峰值',
+      average: '平均',
+      totalRequests: '总请求',
+      avgQps: '平均 QPS',
+      avgTps: '平均 TPS',
+      avgLatency: '平均延迟',
+      avgTtft: '平均首字延迟',
+      exceptions: '异常数',
+      requestErrors: '请求错误',
+      errorCount: '错误数',
+      upstreamErrors: '上游错误',
+      errorCountExcl429529: '错误数（排除429/529）',
      sla: 'SLA（排除业务限制）',
      businessLimited: '业务限制：',
      errors: '错误',
@@ -1937,6 +1950,42 @@ export default {
      healthyStatus: '健康',
      riskyStatus: '风险',
      idleStatus: '待机',
+      realtime: {
+        title: '实时信息',
+        connected: '已连接',
+        connecting: '连接中',
+        reconnecting: '重连中',
+        offline: '离线',
+        closed: '已关闭',
+        reconnectIn: '{seconds}秒后重连'
+      },
+      tooltips: {
+        qps: '每秒查询数 - 实时请求速率',
+        sla: '服务等级协议 - 可接受延迟范围内的请求百分比',
+        latency: '从开始到结束的请求持续时间',
+        ttft: '首字延迟 - 直到第一个响应令牌的延迟',
+        errors: 'SLA 范围内的请求错误',
+        upstreamErrors: '上游服务错误（不包括速率限制）',
+        totalRequests: '此时间窗口内的总请求数和消耗的令牌数',
+        cpu: 'CPU 使用率',
+        memory: '内存使用率',
+        db: '数据库连接池状态',
+        redis: 'Redis 连接池状态',
+        goroutines: 'Go 协程数（并发任务）',
+        jobs: '后台任务健康状态'
+      },
+      timeRange: {
+        '5m': '近5分钟',
+        '30m': '近30分钟',
+        '1h': '近1小时',
+        '6h': '近6小时',
+        '24h': '近24小时'
+      },
+      queryMode: {
+        auto: '自动',
+        raw: '原始查询',
+        preagg: '预聚合'
+      },
      diagnosis: {
        title: '智能诊断',
        footer: '基于当前指标的自动诊断建议',
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue