refactor(ops): 移除duration相关告警指标,简化监控配置

主要改动:
- 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型
- 移除配置中的 latency_p99_ms_max 阈值设置
- 简化健康分数计算(移除latency权重,重新归一化SLA和错误率)
- 移除duration相关的诊断规则和阈值检查
- 统一术语:延迟 → 请求时长
- 保留duration数据展示,但不再用于告警判断
- 聚焦TTFT作为主要的响应速度告警指标

影响范围:
- Backend: handler, service, models, tests
- Frontend: API types, i18n, components
This commit is contained in:
IanShaw027
2026-01-14 10:52:56 +08:00
parent 33f58d583d
commit 182683814b
14 changed files with 92 additions and 227 deletions

View File

@@ -1887,7 +1887,7 @@ export default {
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgLatency: 'Avg Request Duration',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
@@ -1899,7 +1899,7 @@ export default {
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
latencyDuration: 'Request Duration (ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
@@ -1919,7 +1919,7 @@ export default {
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
@@ -1927,7 +1927,7 @@ export default {
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
latencyHistogram: 'Request Duration Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
@@ -1973,14 +1973,7 @@ export default {
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHigh: 'Time to first token elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
@@ -2020,7 +2013,7 @@ export default {
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
latency: 'Request Duration',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
@@ -2049,7 +2042,7 @@ export default {
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
latency: 'Request Duration',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
@@ -2398,8 +2391,6 @@ export default {
metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red',
slaMinPercent: 'SLA Minimum Percentage',
slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
latencyP99MaxMs: 'Latency P99 Maximum (ms)',
latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
@@ -2458,7 +2449,7 @@ export default {
tooltips: {
totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
latencyHistogram: 'Request duration distribution (ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
@@ -2473,7 +2464,7 @@ export default {
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},

View File

@@ -2031,7 +2031,7 @@ export default {
totalRequests: '总请求',
avgQps: '平均 QPS',
avgTps: '平均 TPS',
avgLatency: '平均延迟',
avgLatency: '平均请求时长',
avgTtft: '平均首字延迟',
exceptions: '异常数',
requestErrors: '请求错误',
@@ -2043,7 +2043,7 @@ export default {
errors: '错误',
errorRate: '错误率:',
upstreamRate: '上游错误率:',
latencyDuration: '延迟(毫秒)',
latencyDuration: '请求时长(毫秒)',
ttftLabel: '首字延迟(毫秒)',
p50: 'p50',
p90: 'p90',
@@ -2063,7 +2063,7 @@ export default {
failedToLoadData: '加载运维数据失败',
failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败',
failedToLoadLatencyHistogram: '加载请求时长分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败',
@@ -2071,7 +2071,7 @@ export default {
tpsK: 'TPS',
top: '最高:',
throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布',
latencyHistogram: '请求时长分布',
errorTrend: '错误趋势',
errorDistribution: '错误分布',
// Health Score & Diagnosis
@@ -2117,15 +2117,8 @@ export default {
memoryHigh: '内存使用率偏高 ({usage}%)',
memoryHighImpact: '内存压力较大,需要关注',
memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
// Latency diagnostics
latencyCritical: '响应延迟严重过高 ({latency}ms)',
latencyCriticalImpact: '用户体验极差,大量请求超时',
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
latencyHigh: '响应延迟偏高 ({latency}ms)',
latencyHighImpact: '用户体验下降,需要优化',
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
ttftHigh: '首字节时间偏高 ({ttft}ms)',
ttftHighImpact: '用户感知延迟增加',
ttftHighImpact: '用户感知时长增加',
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
// Error rate diagnostics
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
@@ -2143,13 +2136,13 @@ export default {
// SLA diagnostics
slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损',
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
slaCriticalAction: '紧急排查错误原因,必要时采取限流保护',
slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量',
slaLowAction: '分析SLA下降原因优化系统性能',
// Health score diagnostics
healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与资源使用情况',
healthCriticalAction: '全面检查系统状态优先处理critical级别问题',
healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
@@ -2164,7 +2157,7 @@ export default {
context: '上下文',
status: '状态码',
message: '消息',
latency: '延迟',
latency: '请求时长',
action: '操作',
noErrors: '该窗口内暂无错误。',
grp: 'GRP',
@@ -2193,7 +2186,7 @@ export default {
basicInfo: '基本信息',
platform: '平台',
model: '模型',
latency: '延迟',
latency: '请求时长',
ttft: 'TTFT',
businessLimited: '业务限制',
requestPath: '请求路径',
@@ -2351,8 +2344,8 @@ export default {
successRate: '成功率 (%)',
errorRate: '错误率 (%)',
upstreamErrorRate: '上游错误率 (%)',
p95: 'P95 延迟 (ms)',
p99: 'P99 延迟 (ms)',
p95: 'P95 请求时长 (ms)',
p99: 'P99 请求时长 (ms)',
cpu: 'CPU 使用率 (%)',
memory: '内存使用率 (%)',
queueDepth: '并发排队深度',
@@ -2542,8 +2535,6 @@ export default {
metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示',
slaMinPercent: 'SLA最低百分比',
slaMinPercentHint: 'SLA低于此值时显示为红色默认99.5%',
latencyP99MaxMs: '延迟P99最大值毫秒',
latencyP99MaxMsHint: '延迟P99高于此值时显示为红色默认2000ms',
ttftP99MaxMs: 'TTFT P99最大值毫秒',
ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色默认500ms',
requestErrorRateMaxPercent: '请求错误率最大值(%',
@@ -2602,12 +2593,12 @@ export default {
tooltips: {
totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(毫秒)。',
latencyHistogram: '成功请求的请求时长分布(毫秒)。',
errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。',
errorDistribution: '按状态码统计的错误分布。',
upstreamErrors: '上游服务返回的错误包括API提供商的错误响应排除429/529限流错误。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对安全值,建议以历史基线为准。经验参考:<2000 常见2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
'Go 运行时的协程数量(轻量级线程)。没有绝对"安全值",建议以历史基线为准。经验参考:<2000 常见2000-8000 需关注;>8000 且伴随队列上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
@@ -2617,7 +2608,7 @@ export default {
tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
latency: '请求时长统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟Time To First Token衡量流式响应的首字节返回速度。',
health: '系统健康评分0-100综合考虑 SLA、错误率和资源使用情况。'
},