feat(ops): 添加 count_tokens 错误过滤功能
功能特性:
- 自动识别并标记 count_tokens 请求的错误
- 支持配置是否在统计中忽略 count_tokens 错误
- 错误数据完整保留,仅在统计时动态过滤

技术实现:
- ops_error_logger.go: 自动标记 count_tokens 请求
- ops_repo.go: INSERT 语句添加 is_count_tokens 字段
- ops_repo_dashboard.go: buildErrorWhere 核心过滤函数
- ops_repo_preagg.go: 预聚合统计中添加过滤
- ops_repo_trends.go: 趋势统计查询添加过滤(2 处)
- ops_settings_models.go: 添加 ignore_count_tokens_errors 配置
- ops_settings.go: 配置验证和默认值设置
- ops_port.go: 错误日志模型添加 IsCountTokens 字段

业务价值:
- count_tokens 是探测性请求,其错误不影响真实业务 SLA
- 用户可根据需求灵活控制是否计入统计
- 提升错误率、告警等运维指标的准确性

影响范围:
- Dashboard 概览统计
- 错误趋势图表
- 告警规则评估
- 预聚合指标(hourly/daily)
- 健康分数计算
This commit is contained in:
@@ -489,6 +489,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
|||||||
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
|
Severity: classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
|
||||||
StatusCode: status,
|
StatusCode: status,
|
||||||
IsBusinessLimited: false,
|
IsBusinessLimited: false,
|
||||||
|
IsCountTokens: isCountTokensRequest(c),
|
||||||
|
|
||||||
ErrorMessage: recoveredMsg,
|
ErrorMessage: recoveredMsg,
|
||||||
ErrorBody: "",
|
ErrorBody: "",
|
||||||
@@ -598,6 +599,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
|||||||
Severity: classifyOpsSeverity(parsed.ErrorType, status),
|
Severity: classifyOpsSeverity(parsed.ErrorType, status),
|
||||||
StatusCode: status,
|
StatusCode: status,
|
||||||
IsBusinessLimited: isBusinessLimited,
|
IsBusinessLimited: isBusinessLimited,
|
||||||
|
IsCountTokens: isCountTokensRequest(c),
|
||||||
|
|
||||||
ErrorMessage: parsed.Message,
|
ErrorMessage: parsed.Message,
|
||||||
// Keep the full captured error body (capture is already capped at 64KB) so the
|
// Keep the full captured error body (capture is already capped at 64KB) so the
|
||||||
@@ -704,6 +706,14 @@ var opsRetryRequestHeaderAllowlist = []string{
|
|||||||
"anthropic-version",
|
"anthropic-version",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isCountTokensRequest checks if the request is a count_tokens request
|
||||||
|
func isCountTokensRequest(c *gin.Context) bool {
|
||||||
|
if c == nil || c.Request == nil || c.Request.URL == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.Contains(c.Request.URL.Path, "/count_tokens")
|
||||||
|
}
|
||||||
|
|
||||||
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
|
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
|
||||||
if c == nil || c.Request == nil {
|
if c == nil || c.Request == nil {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ INSERT INTO ops_error_logs (
|
|||||||
severity,
|
severity,
|
||||||
status_code,
|
status_code,
|
||||||
is_business_limited,
|
is_business_limited,
|
||||||
|
is_count_tokens,
|
||||||
error_message,
|
error_message,
|
||||||
error_body,
|
error_body,
|
||||||
error_source,
|
error_source,
|
||||||
@@ -64,7 +65,7 @@ INSERT INTO ops_error_logs (
|
|||||||
retry_count,
|
retry_count,
|
||||||
created_at
|
created_at
|
||||||
) VALUES (
|
) VALUES (
|
||||||
$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
|
$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35
|
||||||
) RETURNING id`
|
) RETURNING id`
|
||||||
|
|
||||||
var id int64
|
var id int64
|
||||||
@@ -88,6 +89,7 @@ INSERT INTO ops_error_logs (
|
|||||||
opsNullString(input.Severity),
|
opsNullString(input.Severity),
|
||||||
opsNullInt(input.StatusCode),
|
opsNullInt(input.StatusCode),
|
||||||
input.IsBusinessLimited,
|
input.IsBusinessLimited,
|
||||||
|
input.IsCountTokens,
|
||||||
opsNullString(input.ErrorMessage),
|
opsNullString(input.ErrorMessage),
|
||||||
opsNullString(input.ErrorBody),
|
opsNullString(input.ErrorBody),
|
||||||
opsNullString(input.ErrorSource),
|
opsNullString(input.ErrorSource),
|
||||||
|
|||||||
@@ -964,8 +964,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
|
|||||||
}
|
}
|
||||||
|
|
||||||
idx := startIndex
|
idx := startIndex
|
||||||
clauses := make([]string, 0, 4)
|
clauses := make([]string, 0, 5)
|
||||||
args = make([]any, 0, 4)
|
args = make([]any, 0, 5)
|
||||||
|
|
||||||
args = append(args, start)
|
args = append(args, start)
|
||||||
clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx))
|
clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx))
|
||||||
@@ -974,6 +974,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
|
|||||||
clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx))
|
clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx))
|
||||||
idx++
|
idx++
|
||||||
|
|
||||||
|
clauses = append(clauses, "is_count_tokens = FALSE")
|
||||||
|
|
||||||
if groupID != nil && *groupID > 0 {
|
if groupID != nil && *groupID > 0 {
|
||||||
args = append(args, *groupID)
|
args = append(args, *groupID)
|
||||||
clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx))
|
clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx))
|
||||||
|
|||||||
@@ -78,7 +78,9 @@ error_base AS (
|
|||||||
status_code AS client_status_code,
|
status_code AS client_status_code,
|
||||||
COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
|
COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
|
||||||
FROM ops_error_logs
|
FROM ops_error_logs
|
||||||
|
-- Exclude count_tokens requests from error metrics as they are informational probes
|
||||||
WHERE created_at >= $1 AND created_at < $2
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
|
AND is_count_tokens = FALSE
|
||||||
),
|
),
|
||||||
error_agg AS (
|
error_agg AS (
|
||||||
SELECT
|
SELECT
|
||||||
|
|||||||
@@ -170,6 +170,7 @@ error_totals AS (
|
|||||||
FROM ops_error_logs
|
FROM ops_error_logs
|
||||||
WHERE created_at >= $1 AND created_at < $2
|
WHERE created_at >= $1 AND created_at < $2
|
||||||
AND COALESCE(status_code, 0) >= 400
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
AND is_count_tokens = FALSE -- 排除 count_tokens 请求的错误
|
||||||
GROUP BY 1
|
GROUP BY 1
|
||||||
),
|
),
|
||||||
combined AS (
|
combined AS (
|
||||||
@@ -243,6 +244,7 @@ error_totals AS (
|
|||||||
AND platform = $3
|
AND platform = $3
|
||||||
AND group_id IS NOT NULL
|
AND group_id IS NOT NULL
|
||||||
AND COALESCE(status_code, 0) >= 400
|
AND COALESCE(status_code, 0) >= 400
|
||||||
|
AND is_count_tokens = FALSE -- 排除 count_tokens 请求的错误
|
||||||
GROUP BY 1
|
GROUP BY 1
|
||||||
),
|
),
|
||||||
combined AS (
|
combined AS (
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ type OpsInsertErrorLogInput struct {
|
|||||||
Severity string
|
Severity string
|
||||||
StatusCode int
|
StatusCode int
|
||||||
IsBusinessLimited bool
|
IsBusinessLimited bool
|
||||||
|
IsCountTokens bool // 是否为 count_tokens 请求
|
||||||
|
|
||||||
ErrorMessage string
|
ErrorMessage string
|
||||||
ErrorBody string
|
ErrorBody string
|
||||||
|
|||||||
@@ -368,6 +368,9 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
|
|||||||
Aggregation: OpsAggregationSettings{
|
Aggregation: OpsAggregationSettings{
|
||||||
AggregationEnabled: false,
|
AggregationEnabled: false,
|
||||||
},
|
},
|
||||||
|
IgnoreCountTokensErrors: false,
|
||||||
|
AutoRefreshEnabled: false,
|
||||||
|
AutoRefreshIntervalSec: 30,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -388,6 +391,10 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
|
|||||||
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
|
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
|
||||||
cfg.DataRetention.HourlyMetricsRetentionDays = 30
|
cfg.DataRetention.HourlyMetricsRetentionDays = 30
|
||||||
}
|
}
|
||||||
|
// Normalize auto refresh interval (default 30 seconds)
|
||||||
|
if cfg.AutoRefreshIntervalSec <= 0 {
|
||||||
|
cfg.AutoRefreshIntervalSec = 30
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
||||||
@@ -403,6 +410,9 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
|
|||||||
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
|
||||||
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
|
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
|
||||||
}
|
}
|
||||||
|
if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
|
||||||
|
return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -79,8 +79,11 @@ type OpsAlertRuntimeSettings struct {
|
|||||||
|
|
||||||
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
|
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
|
||||||
type OpsAdvancedSettings struct {
|
type OpsAdvancedSettings struct {
|
||||||
DataRetention OpsDataRetentionSettings `json:"data_retention"`
|
DataRetention OpsDataRetentionSettings `json:"data_retention"`
|
||||||
Aggregation OpsAggregationSettings `json:"aggregation"`
|
Aggregation OpsAggregationSettings `json:"aggregation"`
|
||||||
|
IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"`
|
||||||
|
AutoRefreshEnabled bool `json:"auto_refresh_enabled"`
|
||||||
|
AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type OpsDataRetentionSettings struct {
|
type OpsDataRetentionSettings struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user