feat(运维监控): 重构仪表板布局和增强数据展示
主要改动:
- 重构仪表板为左右布局(5:7比例)
  - 左侧:健康评分 + 实时信息(当前/峰值/平均 QPS/TPS)
  - 右侧:6个卡片展示详细指标(3列x2行)
    - 总请求:请求数、Token数、平均QPS/TPS、平均延迟/TTFT
    - SLA:百分比、异常数、进度条
    - 延迟:P99/P95/P90/P50/Avg/Max(带颜色编码)
    - TTFT:P99/P95/P90/P50/Avg/Max(带颜色编码)
    - 请求错误:错误率、错误数、业务限制数
    - 上游错误:错误率、错误数(排除429/529)、429/529数
- 添加延迟/TTFT颜色编码(<500ms绿色,<1s黄色,<2s橙色,≥2s红色)
- 添加实时窗口选择器(1min/5min/30min/1h)
- 优化时间段选择器标签("近5分钟"等)
- 完善中英文i18n翻译
- 数据库:添加Redis连接池字段(redis_conn_total, redis_conn_idle)
This commit is contained in:
@@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules (
|
|||||||
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
|
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
|
||||||
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
|
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
|
||||||
) ON CONFLICT (name) DO NOTHING;
|
) ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
|
||||||
|
-- This migration is intentionally idempotent.
|
||||||
|
|
||||||
|
ALTER TABLE ops_system_metrics
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
|
||||||
|
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
|
||||||
|
|||||||
@@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot {
|
|||||||
db_ok?: boolean | null
|
db_ok?: boolean | null
|
||||||
redis_ok?: boolean | null
|
redis_ok?: boolean | null
|
||||||
|
|
||||||
|
// Config-derived limits (best-effort) for rendering "current vs max".
|
||||||
|
db_max_open_conns?: number | null
|
||||||
|
redis_pool_size?: number | null
|
||||||
|
|
||||||
redis_conn_total?: number | null
|
redis_conn_total?: number | null
|
||||||
redis_conn_idle?: number | null
|
redis_conn_idle?: number | null
|
||||||
|
|
||||||
|
|||||||
@@ -1737,6 +1737,8 @@ export default {
|
|||||||
active: 'active',
|
active: 'active',
|
||||||
idle: 'idle',
|
idle: 'idle',
|
||||||
waiting: 'waiting',
|
waiting: 'waiting',
|
||||||
|
conns: 'conns',
|
||||||
|
queue: 'queue',
|
||||||
ok: 'ok',
|
ok: 'ok',
|
||||||
lastRun: 'last_run:',
|
lastRun: 'last_run:',
|
||||||
lastSuccess: 'last_success:',
|
lastSuccess: 'last_success:',
|
||||||
@@ -1750,6 +1752,17 @@ export default {
|
|||||||
tps: 'TPS:',
|
tps: 'TPS:',
|
||||||
current: 'current',
|
current: 'current',
|
||||||
peak: 'peak',
|
peak: 'peak',
|
||||||
|
average: 'average',
|
||||||
|
totalRequests: 'Total Requests',
|
||||||
|
avgQps: 'Avg QPS',
|
||||||
|
avgTps: 'Avg TPS',
|
||||||
|
avgLatency: 'Avg Latency',
|
||||||
|
avgTtft: 'Avg TTFT',
|
||||||
|
exceptions: 'Exceptions',
|
||||||
|
requestErrors: 'Request Errors',
|
||||||
|
errorCount: 'Error Count',
|
||||||
|
upstreamErrors: 'Upstream Errors',
|
||||||
|
errorCountExcl429529: 'Error Count (excl 429/529)',
|
||||||
sla: 'SLA (excl business limits)',
|
sla: 'SLA (excl business limits)',
|
||||||
businessLimited: 'business_limited:',
|
businessLimited: 'business_limited:',
|
||||||
errors: 'Errors',
|
errors: 'Errors',
|
||||||
@@ -1792,6 +1805,42 @@ export default {
|
|||||||
healthyStatus: 'Healthy',
|
healthyStatus: 'Healthy',
|
||||||
riskyStatus: 'At Risk',
|
riskyStatus: 'At Risk',
|
||||||
idleStatus: 'Idle',
|
idleStatus: 'Idle',
|
||||||
|
realtime: {
|
||||||
|
title: 'Realtime',
|
||||||
|
connected: 'Connected',
|
||||||
|
connecting: 'Connecting',
|
||||||
|
reconnecting: 'Reconnecting',
|
||||||
|
offline: 'Offline',
|
||||||
|
closed: 'Closed',
|
||||||
|
reconnectIn: 'Reconnect in {seconds}s'
|
||||||
|
},
|
||||||
|
tooltips: {
|
||||||
|
qps: 'Queries per second - real-time request rate',
|
||||||
|
sla: 'Service Level Agreement - percentage of requests within acceptable latency',
|
||||||
|
latency: 'Request duration from start to finish',
|
||||||
|
ttft: 'Time to First Token - latency until first response token',
|
||||||
|
errors: 'Request errors within SLA scope',
|
||||||
|
upstreamErrors: 'Errors from upstream services (excluding rate limits)',
|
||||||
|
totalRequests: 'Total requests and tokens consumed in this time window',
|
||||||
|
cpu: 'CPU usage percentage',
|
||||||
|
memory: 'Memory usage percentage',
|
||||||
|
db: 'Database connection pool status',
|
||||||
|
redis: 'Redis connection pool status',
|
||||||
|
goroutines: 'Go routine count (concurrent tasks)',
|
||||||
|
jobs: 'Background job health status'
|
||||||
|
},
|
||||||
|
timeRange: {
|
||||||
|
'5m': 'Last 5 minutes',
|
||||||
|
'30m': 'Last 30 minutes',
|
||||||
|
'1h': 'Last 1 hour',
|
||||||
|
'6h': 'Last 6 hours',
|
||||||
|
'24h': 'Last 24 hours'
|
||||||
|
},
|
||||||
|
queryMode: {
|
||||||
|
auto: 'Auto',
|
||||||
|
raw: 'Raw Query',
|
||||||
|
preagg: 'Pre-aggregated'
|
||||||
|
},
|
||||||
diagnosis: {
|
diagnosis: {
|
||||||
title: 'Smart Diagnosis',
|
title: 'Smart Diagnosis',
|
||||||
footer: 'Automated diagnostic suggestions based on current metrics',
|
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||||
|
|||||||
@@ -1882,6 +1882,8 @@ export default {
|
|||||||
active: '活跃',
|
active: '活跃',
|
||||||
idle: '空闲',
|
idle: '空闲',
|
||||||
waiting: '等待',
|
waiting: '等待',
|
||||||
|
conns: '连接',
|
||||||
|
queue: '队列',
|
||||||
ok: '正常',
|
ok: '正常',
|
||||||
lastRun: '最近运行',
|
lastRun: '最近运行',
|
||||||
lastSuccess: '最近成功',
|
lastSuccess: '最近成功',
|
||||||
@@ -1895,6 +1897,17 @@ export default {
|
|||||||
tps: 'TPS',
|
tps: 'TPS',
|
||||||
current: '当前',
|
current: '当前',
|
||||||
peak: '峰值',
|
peak: '峰值',
|
||||||
|
average: '平均',
|
||||||
|
totalRequests: '总请求',
|
||||||
|
avgQps: '平均 QPS',
|
||||||
|
avgTps: '平均 TPS',
|
||||||
|
avgLatency: '平均延迟',
|
||||||
|
avgTtft: '平均首字延迟',
|
||||||
|
exceptions: '异常数',
|
||||||
|
requestErrors: '请求错误',
|
||||||
|
errorCount: '错误数',
|
||||||
|
upstreamErrors: '上游错误',
|
||||||
|
errorCountExcl429529: '错误数(排除429/529)',
|
||||||
sla: 'SLA(排除业务限制)',
|
sla: 'SLA(排除业务限制)',
|
||||||
businessLimited: '业务限制:',
|
businessLimited: '业务限制:',
|
||||||
errors: '错误',
|
errors: '错误',
|
||||||
@@ -1937,6 +1950,42 @@ export default {
|
|||||||
healthyStatus: '健康',
|
healthyStatus: '健康',
|
||||||
riskyStatus: '风险',
|
riskyStatus: '风险',
|
||||||
idleStatus: '待机',
|
idleStatus: '待机',
|
||||||
|
realtime: {
|
||||||
|
title: '实时信息',
|
||||||
|
connected: '已连接',
|
||||||
|
connecting: '连接中',
|
||||||
|
reconnecting: '重连中',
|
||||||
|
offline: '离线',
|
||||||
|
closed: '已关闭',
|
||||||
|
reconnectIn: '{seconds}秒后重连'
|
||||||
|
},
|
||||||
|
tooltips: {
|
||||||
|
qps: '每秒查询数 - 实时请求速率',
|
||||||
|
sla: '服务等级协议 - 可接受延迟范围内的请求百分比',
|
||||||
|
latency: '从开始到结束的请求持续时间',
|
||||||
|
ttft: '首字延迟 - 直到第一个响应令牌的延迟',
|
||||||
|
errors: 'SLA 范围内的请求错误',
|
||||||
|
upstreamErrors: '上游服务错误(不包括速率限制)',
|
||||||
|
totalRequests: '此时间窗口内的总请求数和消耗的令牌数',
|
||||||
|
cpu: 'CPU 使用率',
|
||||||
|
memory: '内存使用率',
|
||||||
|
db: '数据库连接池状态',
|
||||||
|
redis: 'Redis 连接池状态',
|
||||||
|
goroutines: 'Go 协程数(并发任务)',
|
||||||
|
jobs: '后台任务健康状态'
|
||||||
|
},
|
||||||
|
timeRange: {
|
||||||
|
'5m': '近5分钟',
|
||||||
|
'30m': '近30分钟',
|
||||||
|
'1h': '近1小时',
|
||||||
|
'6h': '近6小时',
|
||||||
|
'24h': '近24小时'
|
||||||
|
},
|
||||||
|
queryMode: {
|
||||||
|
auto: '自动',
|
||||||
|
raw: '原始查询',
|
||||||
|
preagg: '预聚合'
|
||||||
|
},
|
||||||
diagnosis: {
|
diagnosis: {
|
||||||
title: '智能诊断',
|
title: '智能诊断',
|
||||||
footer: '基于当前指标的自动诊断建议',
|
footer: '基于当前指标的自动诊断建议',
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user