refactor(ops): 移除duration相关告警指标，简化监控配置

主要改动： - 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型 - 移除配置中的 latency_p99_ms_max 阈值设置 - 简化健康分数计算（移除latency权重，重新归一化SLA和错误率） - 移除duration相关的诊断规则和阈值检查 - 统一术语：延迟 → 请求时长 - 保留duration数据展示，但不再用于告警判断 - 聚焦TTFT作为主要的响应速度告警指标。 影响范围： - Backend: handler, service, models, tests - Frontend: API types, i18n, components
2026-01-14 10:52:56 +08:00
parent 33f58d583d
commit 182683814b
14 changed files with 92 additions and 227 deletions
--- a/backend/internal/handler/admin/ops_alerts_handler.go
+++ b/backend/internal/handler/admin/ops_alerts_handler.go
@@ -20,8 +20,6 @@ var validOpsAlertMetricTypes = []string{
 	"success_rate",
 	"error_rate",
 	"upstream_error_rate",
-	"p95_latency_ms",
-	"p99_latency_ms",
 	"cpu_usage_percent",
 	"memory_usage_percent",
 	"concurrency_queue_depth",
--- a/backend/internal/service/ops_alert_evaluator_service.go
+++ b/backend/internal/service/ops_alert_evaluator_service.go
@@ -523,16 +523,6 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric(
 			return 0, false
 		}
 		return overview.UpstreamErrorRate * 100, true
-	case "p95_latency_ms":
-		if overview.Duration.P95 == nil {
-			return 0, false
-		}
-		return float64(*overview.Duration.P95), true
-	case "p99_latency_ms":
-		if overview.Duration.P99 == nil {
-			return 0, false
-		}
-		return float64(*overview.Duration.P99), true
 	default:
 		return 0, false
 	}
--- a/backend/internal/service/ops_health_score.go
+++ b/backend/internal/service/ops_health_score.go
@@ -32,7 +32,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 }

 // computeBusinessHealth calculates business health score (0-100)
-// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
+// Components: SLA (weight 0.5) + Error Rate (weight 0.3), renormalized by 0.8 → effective 62.5% + 37.5%
 func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
 	// SLA score: 99.5% → 100, 95% → 0 (linear)
 	slaScore := 100.0
@@ -59,22 +59,9 @@ func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
 		}
 	}

-	// Latency score: 1s → 100, 10s → 0 (linear)
-	// Uses P99 of duration (TTFT is less critical for overall health)
-	latencyScore := 100.0
-	if overview.Duration.P99 != nil {
-		p99 := float64(*overview.Duration.P99)
-		if p99 > 1000 {
-			if p99 <= 10000 {
-				latencyScore = (10000 - p99) / 9000 * 100
-			} else {
-				latencyScore = 0
-			}
-		}
-	}
-
-	// Weighted combination
-	return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
+	// Weighted combination (renormalized after removing duration)
+	const weightSum = 0.8
+	return (slaScore*0.5 + errorScore*0.3) / weightSum
 }

 // computeInfraHealth calculates infrastructure health score (0-100)
--- a/backend/internal/service/ops_health_score_test.go
+++ b/backend/internal/service/ops_health_score_test.go
@@ -291,17 +291,6 @@ func TestComputeBusinessHealth(t *testing.T) {
 			wantMin: 95,
 			wantMax: 100,
 		},
-		{
-			name: "latency boundary 1000ms",
-			overview: &OpsDashboardOverview{
-				SLA:               0.995,
-				ErrorRate:         0,
-				UpstreamErrorRate: 0,
-				Duration:          OpsPercentiles{P99: intPtr(1000)},
-			},
-			wantMin: 95,
-			wantMax: 100,
-		},
 		{
 			name: "upstream error dominates",
 			overview: &OpsDashboardOverview{
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -482,13 +482,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"

 func defaultOpsMetricThresholds() *OpsMetricThresholds {
 	slaMin := 99.5
-	latencyMax := 2000.0
 	ttftMax := 500.0
 	reqErrMax := 5.0
 	upstreamErrMax := 5.0
 	return &OpsMetricThresholds{
 		SLAPercentMin:               &slaMin,
-		LatencyP99MsMax:             &latencyMax,
 		TTFTp99MsMax:                &ttftMax,
 		RequestErrorRatePercentMax:  &reqErrMax,
 		UpstreamErrorRatePercentMax: &upstreamErrMax,
@@ -538,9 +536,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
 	if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
 		return nil, errors.New("sla_percent_min must be between 0 and 100")
 	}
-	if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
-		return nil, errors.New("latency_p99_ms_max must be >= 0")
-	}
 	if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
 		return nil, errors.New("ttft_p99_ms_max must be >= 0")
 	}
--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {

 type OpsMetricThresholds struct {
 	SLAPercentMin               *float64 `json:"sla_percent_min,omitempty"`                 // SLA低于此值变红
-	LatencyP99MsMax             *float64 `json:"latency_p99_ms_max,omitempty"`              // 延迟P99高于此值变红
 	TTFTp99MsMax                *float64 `json:"ttft_p99_ms_max,omitempty"`                 // TTFT P99高于此值变红
 	RequestErrorRatePercentMax  *float64 `json:"request_error_rate_percent_max,omitempty"`  // 请求错误率高于此值变红
 	UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红