feat(ops): 添加 count_tokens 错误过滤功能

功能特性：
- 自动识别并标记 count_tokens 请求的错误
- 支持配置是否在统计中忽略 count_tokens 错误
- 错误数据完整保留，仅在统计时动态过滤

技术实现：
- ops_error_logger.go: 自动标记 count_tokens 请求
- ops_repo.go: INSERT 语句添加 is_count_tokens 字段
- ops_repo_dashboard.go: buildErrorWhere 核心过滤函数
- ops_repo_preagg.go: 预聚合统计中添加过滤
- ops_repo_trends.go: 趋势统计查询添加过滤（2 处）
- ops_settings_models.go: 添加 ignore_count_tokens_errors 配置
- ops_settings.go: 配置验证和默认值设置
- ops_port.go: 错误日志模型添加 IsCountTokens 字段

业务价值：
- count_tokens 是探测性请求，其错误不影响真实业务 SLA
- 用户可根据需求灵活控制是否计入统计
- 提升错误率、告警等运维指标的准确性

影响范围：
- Dashboard 概览统计
- 错误趋势图表
- 告警规则评估
- 预聚合指标（hourly/daily）
- 健康分数计算
2026-01-12 16:50:41 +08:00
parent c02c120579
commit 345a965fa3
8 changed files with 37 additions and 5 deletions
--- a/backend/internal/handler/ops_error_logger.go
+++ b/backend/internal/handler/ops_error_logger.go
@@ -489,6 +489,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
 				Severity:          classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
 				StatusCode:        status,
 				IsBusinessLimited: false,
 				IsCountTokens:     isCountTokensRequest(c),
 				ErrorMessage: recoveredMsg,
 				ErrorBody:    "",
@@ -598,6 +599,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
 			Severity:          classifyOpsSeverity(parsed.ErrorType, status),
 			StatusCode:        status,
 			IsBusinessLimited: isBusinessLimited,
 			IsCountTokens:     isCountTokensRequest(c),
 			ErrorMessage: parsed.Message,
 			// Keep the full captured error body (capture is already capped at 64KB) so the
@@ -704,6 +706,14 @@ var opsRetryRequestHeaderAllowlist = []string{
 	"anthropic-version",
 }
 // isCountTokensRequest reports whether the request carried by c targets a
 // count_tokens endpoint, judged by its URL path containing "/count_tokens".
 // A nil context, a nil request, or a nil request URL yields false.
 func isCountTokensRequest(c *gin.Context) bool {
 	if c == nil {
 		return false
 	}
 	req := c.Request
 	if req == nil || req.URL == nil {
 		return false
 	}
 	// Only the path is inspected; the query string is not examined.
 	return strings.Contains(req.URL.Path, "/count_tokens")
 }
 func extractOpsRetryRequestHeaders(c *gin.Context) *string {
 	if c == nil || c.Request == nil {
 		return nil
--- a/backend/internal/repository/ops_repo.go
+++ b/backend/internal/repository/ops_repo.go
@@ -46,6 +46,7 @@ INSERT INTO ops_error_logs (
  severity,
  status_code,
  is_business_limited,
  is_count_tokens,
  error_message,
  error_body,
  error_source,
@@ -64,7 +65,7 @@ INSERT INTO ops_error_logs (
  retry_count,
  created_at
 ) VALUES (
-  $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
+  $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35
 ) RETURNING id`
 	var id int64
@@ -88,6 +89,7 @@ INSERT INTO ops_error_logs (
 		opsNullString(input.Severity),
 		opsNullInt(input.StatusCode),
 		input.IsBusinessLimited,
 		input.IsCountTokens,
 		opsNullString(input.ErrorMessage),
 		opsNullString(input.ErrorBody),
 		opsNullString(input.ErrorSource),
--- a/backend/internal/repository/ops_repo_dashboard.go
+++ b/backend/internal/repository/ops_repo_dashboard.go
@@ -964,8 +964,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
 	}
 	idx := startIndex
-	clauses := make([]string, 0, 4)
+	clauses := make([]string, 0, 5)
-	args = make([]any, 0, 4)
+	args = make([]any, 0, 5)
 	args = append(args, start)
 	clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx))
@@ -974,6 +974,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
 	clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx))
 	idx++
 	clauses = append(clauses, "is_count_tokens = FALSE")
 	if groupID != nil && *groupID > 0 {
 		args = append(args, *groupID)
 		clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx))
--- a/backend/internal/repository/ops_repo_preagg.go
+++ b/backend/internal/repository/ops_repo_preagg.go
@@ -78,7 +78,9 @@ error_base AS (
    status_code AS client_status_code,
    COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
  FROM ops_error_logs
  -- Exclude count_tokens requests from error metrics as they are informational probes
  WHERE created_at >= $1 AND created_at < $2
    AND is_count_tokens = FALSE
 ),
 error_agg AS (
  SELECT
--- a/backend/internal/repository/ops_repo_trends.go
+++ b/backend/internal/repository/ops_repo_trends.go
@@ -170,6 +170,7 @@ error_totals AS (
  FROM ops_error_logs
  WHERE created_at >= $1 AND created_at < $2
    AND COALESCE(status_code, 0) >= 400
    AND is_count_tokens = FALSE  -- 排除 count_tokens 请求的错误
  GROUP BY 1
 ),
 combined AS (
@@ -243,6 +244,7 @@ error_totals AS (
    AND platform = $3
    AND group_id IS NOT NULL
    AND COALESCE(status_code, 0) >= 400
    AND is_count_tokens = FALSE  -- 排除 count_tokens 请求的错误
  GROUP BY 1
 ),
 combined AS (
--- a/backend/internal/service/ops_port.go
+++ b/backend/internal/service/ops_port.go
@@ -73,6 +73,7 @@ type OpsInsertErrorLogInput struct {
 	Severity          string
 	StatusCode        int
 	IsBusinessLimited bool
 	IsCountTokens     bool // 是否为 count_tokens 请求
 	ErrorMessage string
 	ErrorBody    string
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -368,6 +368,9 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
 		Aggregation: OpsAggregationSettings{
 			AggregationEnabled: false,
 		},
 		IgnoreCountTokensErrors:  false,
 		AutoRefreshEnabled:       false,
 		AutoRefreshIntervalSec:   30,
 	}
 }
@@ -388,6 +391,10 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
 	if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
 		cfg.DataRetention.HourlyMetricsRetentionDays = 30
 	}
 	// Normalize auto refresh interval (default 30 seconds)
 	if cfg.AutoRefreshIntervalSec <= 0 {
 		cfg.AutoRefreshIntervalSec = 30
 	}
 }
 func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
@@ -403,6 +410,9 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
 	if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
 		return errors.New("hourly_metrics_retention_days must be between 1 and 365")
 	}
 	if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
 		return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
 	}
 	return nil
 }
--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -79,8 +79,11 @@ type OpsAlertRuntimeSettings struct {
 // OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
 type OpsAdvancedSettings struct {
-	DataRetention OpsDataRetentionSettings `json:"data_retention"`
+	DataRetention            OpsDataRetentionSettings `json:"data_retention"`
-	Aggregation   OpsAggregationSettings   `json:"aggregation"`
+	Aggregation              OpsAggregationSettings   `json:"aggregation"`
 	IgnoreCountTokensErrors  bool                     `json:"ignore_count_tokens_errors"`
 	AutoRefreshEnabled       bool                     `json:"auto_refresh_enabled"`
 	AutoRefreshIntervalSec   int                      `json:"auto_refresh_interval_seconds"`
 }
 type OpsDataRetentionSettings struct {