Files
sub2api/backend/internal/service/ops_port.go
IanShaw027 345a965fa3 feat(ops): 添加 count_tokens 错误过滤功能
功能特性:
- 自动识别并标记 count_tokens 请求的错误
- 支持配置是否在统计中忽略 count_tokens 错误
- 错误数据完整保留,仅在统计时动态过滤

技术实现:
- ops_error_logger.go: 自动标记 count_tokens 请求
- ops_repo.go: INSERT 语句添加 is_count_tokens 字段
- ops_repo_dashboard.go: buildErrorWhere 核心过滤函数
- ops_repo_preagg.go: 预聚合统计中添加过滤
- ops_repo_trends.go: 趋势统计查询添加过滤(2 处)
- ops_settings_models.go: 添加 ignore_count_tokens_errors 配置
- ops_settings.go: 配置验证和默认值设置
- ops_port.go: 错误日志模型添加 IsCountTokens 字段

业务价值:
- count_tokens 是探测性请求,其错误不影响真实业务 SLA
- 用户可根据需求灵活控制是否计入统计
- 提升错误率、告警等运维指标的准确性

影响范围:
- Dashboard 概览统计
- 错误趋势图表
- 告警规则评估
- 预聚合指标(hourly/daily)
- 健康分数计算
2026-01-12 17:06:12 +08:00

246 lines
7.5 KiB
Go

package service
import (
"context"
"time"
)
// OpsRepository is the repository interface for ops data. Its methods cover
// error logs, retry attempts, dashboard queries, system metrics, job
// heartbeats, alert rules/events, and hourly/daily pre-aggregation.
type OpsRepository interface {
	// Error logs and request details.
	// NOTE(review): the int64 returned by the Insert* methods is presumably the
	// new record ID, and the int64 from ListRequestDetails presumably a total
	// count — confirm against the implementation.
	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)

	// Retry attempts, looked up by the ID of their source error.
	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)

	// GetWindowStats returns lightweight window stats (for realtime WS / quick
	// sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)

	// GetRealtimeTrafficSummary returns a lightweight realtime traffic summary
	// (for the Ops dashboard header card).
	GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error)

	// Dashboard queries, all scoped by an OpsDashboardFilter.
	// NOTE(review): bucketSeconds is presumably the trend bucket width in
	// seconds — confirm against the implementation.
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)

	// System metrics.
	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)

	// Job heartbeats.
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)

	// Alerts (rules + events).
	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error
	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

	// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
	UpsertHourlyMetrics(ctx context.Context, startTime time.Time, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime time.Time, endTime time.Time) error
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
// OpsInsertErrorLogInput is the payload accepted by
// OpsRepository.InsertErrorLog for recording one error log entry.
type OpsInsertErrorLogInput struct {
	// Request correlation identifiers.
	RequestID, ClientRequestID string

	// Pointer-typed references; each may be nil.
	UserID, APIKeyID, AccountID, GroupID *int64
	ClientIP                             *string

	// Attributes of the request.
	Platform, Model, RequestPath string
	Stream                       bool
	UserAgent                    string

	// Classification of the error.
	ErrorPhase, ErrorType, Severity string
	StatusCode                      int
	IsBusinessLimited               bool
	// IsCountTokens reports whether the request was a count_tokens request.
	IsCountTokens bool

	// Error text and attribution.
	ErrorMessage, ErrorBody, ErrorSource, ErrorOwner string

	// Upstream error details; each may be nil.
	UpstreamStatusCode                        *int
	UpstreamErrorMessage, UpstreamErrorDetail *string

	// UpstreamErrors captures all upstream error attempts observed while
	// handling this request. It is populated during request processing (gin
	// context) and sanitized+serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent
	// UpstreamErrorsJSON is the sanitized JSON string stored into
	// ops_error_logs.upstream_errors. OpsService.RecordError sets it before
	// persisting.
	UpstreamErrorsJSON *string

	DurationMs         *int
	TimeToFirstTokenMs *int64

	// RequestBodyJSON is a sanitized JSON string (not raw bytes).
	RequestBodyJSON      *string
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	// RequestHeadersJSON is an optional JSON string.
	RequestHeadersJSON *string

	IsRetryable bool
	RetryCount  int
	CreatedAt   time.Time
}
// OpsInsertRetryAttemptInput is the payload accepted by
// OpsRepository.InsertRetryAttempt.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID, SourceErrorID int64

	Mode            string
	PinnedAccountID *int64 // pointer-typed; may be nil

	// Status holds the attempt state: running, queued, etc.
	Status    string
	StartedAt time.Time
}
// OpsUpdateRetryAttemptInput is the payload accepted by
// OpsRepository.UpdateRetryAttempt.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// Status is one of: succeeded, failed.
	Status     string
	FinishedAt time.Time
	DurationMs int64

	// Optional correlation fields; each may be nil.
	ResultRequestID *string
	ResultErrorID   *int64
	ErrorMessage    *string
}
// OpsInsertSystemMetricsInput is the payload accepted by
// OpsRepository.InsertSystemMetrics. Pointer-typed fields may be nil.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int
	Platform      *string
	GroupID       *int64

	// Request and token counters.
	SuccessCount, ErrorCountTotal, BusinessLimitedCount, ErrorCountSLA int64
	UpstreamErrorCountExcl429529, Upstream429Count, Upstream529Count   int64
	TokenConsumed                                                      int64

	// Rates.
	QPS, TPS *float64

	// Duration statistics (percentiles, average, maximum).
	DurationP50Ms, DurationP90Ms, DurationP95Ms, DurationP99Ms *int
	DurationAvgMs                                              *float64
	DurationMaxMs                                              *int

	// TTFT statistics (percentiles, average, maximum).
	TTFTP50Ms, TTFTP90Ms, TTFTP95Ms, TTFTP99Ms *int
	TTFTAvgMs                                  *float64
	TTFTMaxMs                                  *int

	// CPU and memory usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// DB and Redis status flags.
	DBOK, RedisOK *bool

	// Redis and DB connection counts.
	RedisConnTotal, RedisConnIdle           *int
	DBConnActive, DBConnIdle, DBConnWaiting *int

	GoroutineCount, ConcurrencyQueueDepth *int
}
// OpsSystemMetricsSnapshot is the JSON-serializable system metrics record
// returned by OpsRepository.GetLatestSystemMetrics. Pointer-typed fields are
// encoded as JSON null when nil.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Config-derived limits (best-effort). These are not historical metrics;
	// they help the UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`
	DBConnActive   *int `json:"db_conn_active"`
	DBConnIdle     *int `json:"db_conn_idle"`
	DBConnWaiting  *int `json:"db_conn_waiting"`

	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
// OpsUpsertJobHeartbeatInput is the payload accepted by
// OpsRepository.UpsertJobHeartbeat. Every field except JobName is
// pointer-typed and may be nil.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt, LastSuccessAt, LastErrorAt *time.Time

	LastError      *string
	LastDurationMs *int64
}
// OpsJobHeartbeat is the JSON-serializable heartbeat record for one job; it is
// the element type returned by OpsRepository.ListJobHeartbeats. Pointer-typed
// fields are encoded as JSON null when nil.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt     *time.Time `json:"last_run_at"`
	LastSuccessAt *time.Time `json:"last_success_at"`
	LastErrorAt   *time.Time `json:"last_error_at"`

	LastError      *string `json:"last_error"`
	LastDurationMs *int64  `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
// OpsWindowStats is the JSON-serializable result returned by
// OpsRepository.GetWindowStats: a start/end time pair plus success, error and
// token counters.
type OpsWindowStats struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}