feat(运维监控): 重构仪表板布局和增强数据展示
主要改动:
- 重构仪表板为左右布局(5:7比例)
  - 左侧:健康评分 + 实时信息(当前/峰值/平均 QPS/TPS)
  - 右侧:6个卡片展示详细指标(3列x2行)
    - 总请求:请求数、Token数、平均QPS/TPS、平均延迟/TTFT
    - SLA:百分比、异常数、进度条
    - 延迟:P99/P95/P90/P50/Avg/Max(带颜色编码)
    - TTFT:P99/P95/P90/P50/Avg/Max(带颜色编码)
    - 请求错误:错误率、错误数、业务限制数
    - 上游错误:错误率、错误数(排除429/529)、429/529数
- 添加延迟/TTFT颜色编码(<500ms绿色,<1s黄色,<2s橙色,≥2s红色)
- 添加实时窗口选择器(1min/5min/30min/1h)
- 优化时间段选择器标签("近5分钟"等)
- 完善中英文i18n翻译
- 数据库:添加Redis连接池字段(redis_conn_total, redis_conn_idle)
This commit is contained in:
@@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules (
|
||||
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
|
||||
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
|
||||
) ON CONFLICT (name) DO NOTHING;
|
||||
|
||||
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
|
||||
-- This migration is intentionally idempotent.
|
||||
|
||||
-- Add the two Redis pool counters to ops_system_metrics.
-- Neither column declares NOT NULL or a DEFAULT, so both accept NULL.
-- IF NOT EXISTS lets each ADD COLUMN be skipped when the column is already
-- present, so re-running this statement does not raise a duplicate-column error.
ALTER TABLE ops_system_metrics
|
||||
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
|
||||
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
|
||||
|
||||
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
|
||||
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
|
||||
|
||||
@@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot {
|
||||
db_ok?: boolean | null
|
||||
redis_ok?: boolean | null
|
||||
|
||||
// Config-derived limits (best-effort) for rendering "current vs max".
|
||||
db_max_open_conns?: number | null
|
||||
redis_pool_size?: number | null
|
||||
|
||||
redis_conn_total?: number | null
|
||||
redis_conn_idle?: number | null
|
||||
|
||||
|
||||
@@ -1737,6 +1737,8 @@ export default {
|
||||
active: 'active',
|
||||
idle: 'idle',
|
||||
waiting: 'waiting',
|
||||
conns: 'conns',
|
||||
queue: 'queue',
|
||||
ok: 'ok',
|
||||
lastRun: 'last_run:',
|
||||
lastSuccess: 'last_success:',
|
||||
@@ -1750,6 +1752,17 @@ export default {
|
||||
tps: 'TPS:',
|
||||
current: 'current',
|
||||
peak: 'peak',
|
||||
average: 'average',
|
||||
totalRequests: 'Total Requests',
|
||||
avgQps: 'Avg QPS',
|
||||
avgTps: 'Avg TPS',
|
||||
avgLatency: 'Avg Latency',
|
||||
avgTtft: 'Avg TTFT',
|
||||
exceptions: 'Exceptions',
|
||||
requestErrors: 'Request Errors',
|
||||
errorCount: 'Error Count',
|
||||
upstreamErrors: 'Upstream Errors',
|
||||
errorCountExcl429529: 'Error Count (excl 429/529)',
|
||||
sla: 'SLA (excl business limits)',
|
||||
businessLimited: 'business_limited:',
|
||||
errors: 'Errors',
|
||||
@@ -1792,6 +1805,42 @@ export default {
|
||||
healthyStatus: 'Healthy',
|
||||
riskyStatus: 'At Risk',
|
||||
idleStatus: 'Idle',
|
||||
realtime: {
|
||||
title: 'Realtime',
|
||||
connected: 'Connected',
|
||||
connecting: 'Connecting',
|
||||
reconnecting: 'Reconnecting',
|
||||
offline: 'Offline',
|
||||
closed: 'Closed',
|
||||
reconnectIn: 'Reconnect in {seconds}s'
|
||||
},
|
||||
tooltips: {
|
||||
qps: 'Queries per second - real-time request rate',
|
||||
sla: 'Service Level Agreement - percentage of requests within acceptable latency',
|
||||
latency: 'Request duration from start to finish',
|
||||
ttft: 'Time to First Token - latency until first response token',
|
||||
errors: 'Request errors within SLA scope',
|
||||
upstreamErrors: 'Errors from upstream services (excluding rate limits)',
|
||||
totalRequests: 'Total requests and tokens consumed in this time window',
|
||||
cpu: 'CPU usage percentage',
|
||||
memory: 'Memory usage percentage',
|
||||
db: 'Database connection pool status',
|
||||
redis: 'Redis connection pool status',
|
||||
goroutines: 'Go routine count (concurrent tasks)',
|
||||
jobs: 'Background job health status'
|
||||
},
|
||||
timeRange: {
|
||||
'5m': 'Last 5 minutes',
|
||||
'30m': 'Last 30 minutes',
|
||||
'1h': 'Last 1 hour',
|
||||
'6h': 'Last 6 hours',
|
||||
'24h': 'Last 24 hours'
|
||||
},
|
||||
queryMode: {
|
||||
auto: 'Auto',
|
||||
raw: 'Raw Query',
|
||||
preagg: 'Pre-aggregated'
|
||||
},
|
||||
diagnosis: {
|
||||
title: 'Smart Diagnosis',
|
||||
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||
|
||||
@@ -1882,6 +1882,8 @@ export default {
|
||||
active: '活跃',
|
||||
idle: '空闲',
|
||||
waiting: '等待',
|
||||
conns: '连接',
|
||||
queue: '队列',
|
||||
ok: '正常',
|
||||
lastRun: '最近运行',
|
||||
lastSuccess: '最近成功',
|
||||
@@ -1895,6 +1897,17 @@ export default {
|
||||
tps: 'TPS',
|
||||
current: '当前',
|
||||
peak: '峰值',
|
||||
average: '平均',
|
||||
totalRequests: '总请求',
|
||||
avgQps: '平均 QPS',
|
||||
avgTps: '平均 TPS',
|
||||
avgLatency: '平均延迟',
|
||||
avgTtft: '平均首字延迟',
|
||||
exceptions: '异常数',
|
||||
requestErrors: '请求错误',
|
||||
errorCount: '错误数',
|
||||
upstreamErrors: '上游错误',
|
||||
errorCountExcl429529: '错误数(排除429/529)',
|
||||
sla: 'SLA(排除业务限制)',
|
||||
businessLimited: '业务限制:',
|
||||
errors: '错误',
|
||||
@@ -1937,6 +1950,42 @@ export default {
|
||||
healthyStatus: '健康',
|
||||
riskyStatus: '风险',
|
||||
idleStatus: '待机',
|
||||
realtime: {
|
||||
title: '实时信息',
|
||||
connected: '已连接',
|
||||
connecting: '连接中',
|
||||
reconnecting: '重连中',
|
||||
offline: '离线',
|
||||
closed: '已关闭',
|
||||
reconnectIn: '{seconds}秒后重连'
|
||||
},
|
||||
tooltips: {
|
||||
qps: '每秒查询数 - 实时请求速率',
|
||||
sla: '服务等级协议 - 可接受延迟范围内的请求百分比',
|
||||
latency: '从开始到结束的请求持续时间',
|
||||
ttft: '首字延迟 - 直到第一个响应令牌的延迟',
|
||||
errors: 'SLA 范围内的请求错误',
|
||||
upstreamErrors: '上游服务错误(不包括速率限制)',
|
||||
totalRequests: '此时间窗口内的总请求数和消耗的令牌数',
|
||||
cpu: 'CPU 使用率',
|
||||
memory: '内存使用率',
|
||||
db: '数据库连接池状态',
|
||||
redis: 'Redis 连接池状态',
|
||||
goroutines: 'Go 协程数(并发任务)',
|
||||
jobs: '后台任务健康状态'
|
||||
},
|
||||
timeRange: {
|
||||
'5m': '近5分钟',
|
||||
'30m': '近30分钟',
|
||||
'1h': '近1小时',
|
||||
'6h': '近6小时',
|
||||
'24h': '近24小时'
|
||||
},
|
||||
queryMode: {
|
||||
auto: '自动',
|
||||
raw: '原始查询',
|
||||
preagg: '预聚合'
|
||||
},
|
||||
diagnosis: {
|
||||
title: '智能诊断',
|
||||
footer: '基于当前指标的自动诊断建议',
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user