From 345a965fa38d91a03b599c1ee58787c45c9d99b8 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Mon, 12 Jan 2026 16:50:41 +0800 Subject: [PATCH] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0=20count=5Ftoken?= =?UTF-8?q?s=20=E9=94=99=E8=AF=AF=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 功能特性: - 自动识别并标记 count_tokens 请求的错误 - 支持配置是否在统计中忽略 count_tokens 错误(注:本补丁仅新增 ignore_count_tokens_errors 配置项,统计查询中固定使用 is_count_tokens = FALSE 过滤,查询改动中尚未读取该配置) - 错误数据完整保留,仅在统计时动态过滤 技术实现: - ops_error_logger.go: 自动标记 count_tokens 请求 - ops_repo.go: INSERT 语句添加 is_count_tokens 字段 - ops_repo_dashboard.go: buildErrorWhere 核心过滤函数 - ops_repo_preagg.go: 预聚合统计中添加过滤 - ops_repo_trends.go: 趋势统计查询添加过滤(2 处) - ops_settings_models.go: 添加 ignore_count_tokens_errors、auto_refresh_enabled、auto_refresh_interval_seconds 配置 - ops_settings.go: 新增配置的默认值设置;auto_refresh_interval_seconds 默认 30 秒,并校验范围 15–300 - ops_port.go: 错误日志模型添加 IsCountTokens 字段 业务价值: - count_tokens 是探测性请求,其错误不影响真实业务 SLA - 用户可根据需求灵活控制是否计入统计 - 提升错误率、告警等运维指标的准确性 影响范围: - Dashboard 概览统计 - 错误趋势图表 - 告警规则评估 - 预聚合指标(hourly/daily) - 健康分数计算 --- backend/internal/handler/ops_error_logger.go | 10 ++++++++++ backend/internal/repository/ops_repo.go | 4 +++- backend/internal/repository/ops_repo_dashboard.go | 6 ++++-- backend/internal/repository/ops_repo_preagg.go | 2 ++ backend/internal/repository/ops_repo_trends.go | 2 ++ backend/internal/service/ops_port.go | 1 + backend/internal/service/ops_settings.go | 10 ++++++++++ backend/internal/service/ops_settings_models.go | 7 +++++-- 8 files changed, 37 insertions(+), 5 deletions(-) diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 7115059a..f0c271bd 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -489,6 +489,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus), StatusCode: status, IsBusinessLimited: false, + IsCountTokens: isCountTokensRequest(c), ErrorMessage: 
recoveredMsg, ErrorBody: "", @@ -598,6 +599,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { Severity: classifyOpsSeverity(parsed.ErrorType, status), StatusCode: status, IsBusinessLimited: isBusinessLimited, + IsCountTokens: isCountTokensRequest(c), ErrorMessage: parsed.Message, // Keep the full captured error body (capture is already capped at 64KB) so the @@ -704,6 +706,14 @@ var opsRetryRequestHeaderAllowlist = []string{ "anthropic-version", } +// isCountTokensRequest checks if the request is a count_tokens request +func isCountTokensRequest(c *gin.Context) bool { + if c == nil || c.Request == nil || c.Request.URL == nil { + return false + } + return strings.Contains(c.Request.URL.Path, "/count_tokens") +} + func extractOpsRetryRequestHeaders(c *gin.Context) *string { if c == nil || c.Request == nil { return nil diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index 8e157dbf..f9cb6b4d 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -46,6 +46,7 @@ INSERT INTO ops_error_logs ( severity, status_code, is_business_limited, + is_count_tokens, error_message, error_body, error_source, @@ -64,7 +65,7 @@ INSERT INTO ops_error_logs ( retry_count, created_at ) VALUES ( - $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34 + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35 ) RETURNING id` var id int64 @@ -88,6 +89,7 @@ INSERT INTO ops_error_logs ( opsNullString(input.Severity), opsNullInt(input.StatusCode), input.IsBusinessLimited, + input.IsCountTokens, opsNullString(input.ErrorMessage), opsNullString(input.ErrorBody), opsNullString(input.ErrorSource), diff --git a/backend/internal/repository/ops_repo_dashboard.go b/backend/internal/repository/ops_repo_dashboard.go index 194020bb..85791a9a 
100644 --- a/backend/internal/repository/ops_repo_dashboard.go +++ b/backend/internal/repository/ops_repo_dashboard.go @@ -964,8 +964,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s } idx := startIndex - clauses := make([]string, 0, 4) - args = make([]any, 0, 4) + clauses := make([]string, 0, 5) + args = make([]any, 0, 5) args = append(args, start) clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx)) @@ -974,6 +974,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx)) idx++ + clauses = append(clauses, "is_count_tokens = FALSE") + if groupID != nil && *groupID > 0 { args = append(args, *groupID) clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx)) diff --git a/backend/internal/repository/ops_repo_preagg.go b/backend/internal/repository/ops_repo_preagg.go index fc74e4f6..60f6da0f 100644 --- a/backend/internal/repository/ops_repo_preagg.go +++ b/backend/internal/repository/ops_repo_preagg.go @@ -78,7 +78,9 @@ error_base AS ( status_code AS client_status_code, COALESCE(upstream_status_code, status_code, 0) AS effective_status_code FROM ops_error_logs + -- Exclude count_tokens requests from error metrics as they are informational probes WHERE created_at >= $1 AND created_at < $2 + AND is_count_tokens = FALSE ), error_agg AS ( SELECT diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go index e4ac96d3..022d1187 100644 --- a/backend/internal/repository/ops_repo_trends.go +++ b/backend/internal/repository/ops_repo_trends.go @@ -170,6 +170,7 @@ error_totals AS ( FROM ops_error_logs WHERE created_at >= $1 AND created_at < $2 AND COALESCE(status_code, 0) >= 400 + AND is_count_tokens = FALSE -- 排除 count_tokens 请求的错误 GROUP BY 1 ), combined AS ( @@ -243,6 +244,7 @@ error_totals AS ( AND platform = $3 AND group_id IS NOT NULL AND COALESCE(status_code, 0) >= 400 + AND 
is_count_tokens = FALSE -- 排除 count_tokens 请求的错误 GROUP BY 1 ), combined AS ( diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 4549214d..4df21c37 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -73,6 +73,7 @@ type OpsInsertErrorLogInput struct { Severity string StatusCode int IsBusinessLimited bool + IsCountTokens bool // 是否为 count_tokens 请求 ErrorMessage string ErrorBody string diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go index bb8052bb..d69e4b87 100644 --- a/backend/internal/service/ops_settings.go +++ b/backend/internal/service/ops_settings.go @@ -368,6 +368,9 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings { Aggregation: OpsAggregationSettings{ AggregationEnabled: false, }, + IgnoreCountTokensErrors: false, + AutoRefreshEnabled: false, + AutoRefreshIntervalSec: 30, } } @@ -388,6 +391,10 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) { if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 { cfg.DataRetention.HourlyMetricsRetentionDays = 30 } + // Normalize auto refresh interval (default 30 seconds) + if cfg.AutoRefreshIntervalSec <= 0 { + cfg.AutoRefreshIntervalSec = 30 + } } func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error { @@ -403,6 +410,9 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error { if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 { return errors.New("hourly_metrics_retention_days must be between 1 and 365") } + if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 { + return errors.New("auto_refresh_interval_seconds must be between 15 and 300") + } return nil } diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go index 0de28358..1d9ef445 100644 --- a/backend/internal/service/ops_settings_models.go +++ 
b/backend/internal/service/ops_settings_models.go @@ -79,8 +79,11 @@ type OpsAlertRuntimeSettings struct { // OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation). type OpsAdvancedSettings struct { - DataRetention OpsDataRetentionSettings `json:"data_retention"` - Aggregation OpsAggregationSettings `json:"aggregation"` + DataRetention OpsDataRetentionSettings `json:"data_retention"` + Aggregation OpsAggregationSettings `json:"aggregation"` + IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"` + AutoRefreshEnabled bool `json:"auto_refresh_enabled"` + AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"` } type OpsDataRetentionSettings struct {