refactor(ops): 移除duration相关告警指标,简化监控配置

主要改动:
- 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型
- 移除配置中的 latency_p99_ms_max 阈值设置
- 简化健康分数计算(移除latency权重,重新归一化SLA和错误率)
- 移除duration相关的诊断规则和阈值检查
- 统一术语:延迟 → 请求时长
- 保留duration数据展示,但不再用于告警判断
- 聚焦TTFT作为主要的响应速度告警指标

影响范围:
- Backend: handler, service, models, tests
- Frontend: API types, i18n, components
This commit is contained in:
IanShaw027
2026-01-14 10:52:56 +08:00
parent 33f58d583d
commit 182683814b
14 changed files with 92 additions and 227 deletions

View File

@@ -1887,7 +1887,7 @@ export default {
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgLatency: 'Avg Request Duration',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
@@ -1899,7 +1899,7 @@ export default {
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
latencyDuration: 'Request Duration (ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
@@ -1919,7 +1919,7 @@ export default {
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
@@ -1927,7 +1927,7 @@ export default {
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
latencyHistogram: 'Request Duration Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
@@ -1973,14 +1973,7 @@ export default {
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHigh: 'Time to first token elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
@@ -2020,7 +2013,7 @@ export default {
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
latency: 'Request Duration',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
@@ -2049,7 +2042,7 @@ export default {
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
latency: 'Request Duration',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
@@ -2398,8 +2391,6 @@ export default {
metricThresholdsHint: 'Configure alert thresholds for metrics; values outside their thresholds will be displayed in red',
slaMinPercent: 'SLA Minimum Percentage',
slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
latencyP99MaxMs: 'Latency P99 Maximum (ms)',
latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
@@ -2458,7 +2449,7 @@ export default {
tooltips: {
totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
latencyHistogram: 'Request duration distribution (ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
@@ -2473,7 +2464,7 @@ export default {
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring how quickly the first token is returned in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},