refactor(ops): 移除duration相关告警指标,简化监控配置

主要改动:
- 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型
- 移除配置中的 latency_p99_ms_max 阈值设置
- 简化健康分数计算(移除latency权重,并对SLA和错误率的权重重新归一化)
- 移除duration相关的诊断规则和阈值检查
- 统一术语:延迟 → 请求时长
- 保留duration数据展示,但不再用于告警判断
- 以TTFT作为主要的响应速度告警指标

影响范围:
- Backend: handler, service, models, tests
- Frontend: API types, i18n, components
This commit is contained in:
IanShaw027
2026-01-14 10:52:56 +08:00
parent 33f58d583d
commit 182683814b
14 changed files with 92 additions and 227 deletions

View File

@@ -482,13 +482,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
func defaultOpsMetricThresholds() *OpsMetricThresholds {
slaMin := 99.5
latencyMax := 2000.0
ttftMax := 500.0
reqErrMax := 5.0
upstreamErrMax := 5.0
return &OpsMetricThresholds{
SLAPercentMin: &slaMin,
LatencyP99MsMax: &latencyMax,
TTFTp99MsMax: &ttftMax,
RequestErrorRatePercentMax: &reqErrMax,
UpstreamErrorRatePercentMax: &upstreamErrMax,
@@ -538,9 +536,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
return nil, errors.New("sla_percent_min must be between 0 and 100")
}
if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
return nil, errors.New("latency_p99_ms_max must be >= 0")
}
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
return nil, errors.New("ttft_p99_ms_max must be >= 0")
}