Files
yinghuoapi/backend/internal/service/ops_port.go
IanShaw027 d601768016 feat(service): 增强ops业务逻辑和告警功能
- 实现告警静默功能的业务逻辑
- 优化错误分类和重试机制
- 扩展告警评估和通知功能
- 完善错误解决和重试结果处理
2026-01-14 09:03:16 +08:00

261 lines
8.3 KiB
Go

package service
import (
"context"
"time"
)
// OpsRepository declares the data-access operations that the ops features of
// this package depend on: error logs, retry attempts, dashboard queries,
// system metrics, job heartbeats, alerting and metric pre-aggregation.
//
// Every method takes a context.Context as its first parameter. What the
// returned values mean in detail (for example what the int64 returned by the
// Insert* methods identifies) is decided by the implementation, which is not
// visible in this file.
type OpsRepository interface {
	// ---- Error logs and request details ----

	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	// ListRequestDetails returns a slice of details plus an int64
	// (presumably the total match count; confirm against the implementation).
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)

	// ---- Retry attempts and error resolution ----

	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
	ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*OpsRetryAttempt, error)
	UpdateErrorResolution(
		ctx context.Context,
		errorID int64,
		resolved bool,
		resolvedByUserID, resolvedRetryID *int64,
		resolvedAt *time.Time,
	) error

	// ---- Dashboard queries ----

	// GetWindowStats returns lightweight window stats
	// (for realtime WS / quick sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
	// GetRealtimeTrafficSummary returns a lightweight realtime traffic
	// summary (for the Ops dashboard header card).
	GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error)
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)

	// ---- System metrics and job heartbeats ----

	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)

	// ---- Alerts (rules + events) ----

	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error
	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

	// ---- Alert silences ----

	CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error)
	IsAlertSilenced(
		ctx context.Context,
		ruleID int64,
		platform string,
		groupID *int64,
		region *string,
		now time.Time,
	) (bool, error)

	// ---- Pre-aggregation (hourly/daily) ----
	//
	// Used for long-window dashboard performance.

	UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
	// The bool result of the two GetLatest* methods below presumably reports
	// whether any bucket exists; confirm against the implementation.
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
// OpsInsertErrorLogInput is the parameter object accepted by
// OpsRepository.InsertErrorLog. It carries everything recorded for a single
// error log entry.
type OpsInsertErrorLogInput struct {
	// Request identifiers.
	RequestID       string
	ClientRequestID string

	// Pointer-typed (nullable) references and client address.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string

	// Request attributes.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string

	// Error classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool
	IsCountTokens     bool // whether this is a count_tokens request

	// Error payload and attribution.
	ErrorMessage string
	ErrorBody    string
	ErrorSource  string
	ErrorOwner   string

	// Upstream error summary.
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string

	// UpstreamErrors holds every upstream error attempt observed while this
	// request was being handled. It is filled in during request processing
	// (gin context) and is sanitized and serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent

	// UpstreamErrorsJSON is the sanitized JSON string that is stored into
	// ops_error_logs.upstream_errors. OpsService.RecordError sets it before
	// persisting.
	UpstreamErrorsJSON *string

	// Timings (the Ms suffix presumably means milliseconds).
	DurationMs         *int
	TimeToFirstTokenMs *int64

	// Captured request data.
	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string

	// Retry bookkeeping.
	IsRetryable bool
	RetryCount  int

	CreatedAt time.Time
}
// OpsInsertRetryAttemptInput is the parameter object accepted by
// OpsRepository.InsertRetryAttempt.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	PinnedAccountID   *int64

	// Status is the attempt state: running|queued etc.
	Status string

	StartedAt time.Time
}
// OpsUpdateRetryAttemptInput is the parameter object accepted by
// OpsRepository.UpdateRetryAttempt.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// Status is the final state: succeeded|failed.
	Status string

	FinishedAt time.Time
	DurationMs int64

	// Execution results that get persisted (best-effort).
	Success           *bool
	HTTPStatusCode    *int
	UpstreamRequestID *string
	UsedAccountID     *int64
	ResponsePreview   *string
	ResponseTruncated *bool

	// Optional correlation data (legacy fields kept).
	ResultRequestID *string
	ResultErrorID   *int64
	ErrorMessage    *string
}
// OpsInsertSystemMetricsInput is the parameter object accepted by
// OpsRepository.InsertSystemMetrics. Pointer-typed fields may be nil.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Pointer-typed (nullable) scope of the sample.
	Platform *string
	GroupID  *int64

	// Counters.
	SuccessCount                 int64
	ErrorCountTotal              int64
	BusinessLimitedCount         int64
	ErrorCountSLA                int64
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64
	TokenConsumed                int64

	// Rates.
	QPS *float64
	TPS *float64

	// Duration statistics (the Ms suffix presumably means milliseconds).
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// TTFT statistics (TTFT presumably abbreviates time-to-first-token).
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// CPU and memory.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency status flags.
	DBOK    *bool
	RedisOK *bool

	// Redis connection counts.
	RedisConnTotal *int
	RedisConnIdle  *int

	// DB connection counts.
	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	// Runtime gauges.
	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
// OpsSystemMetricsSnapshot is the result type of
// OpsRepository.GetLatestSystemMetrics. Every field carries a JSON tag.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// CPU and memory.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency status flags.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Limits derived from configuration (best-effort). They are not
	// historical metrics; they let the UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	// Connection counts.
	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`
	DBConnActive   *int `json:"db_conn_active"`
	DBConnIdle     *int `json:"db_conn_idle"`
	DBConnWaiting  *int `json:"db_conn_waiting"`

	// Runtime gauges.
	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
// OpsUpsertJobHeartbeatInput is the parameter object accepted by
// OpsRepository.UpsertJobHeartbeat. All fields except JobName are pointers
// and may be nil.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	// Timestamps of the last run, last success and last error.
	LastRunAt     *time.Time
	LastSuccessAt *time.Time
	LastErrorAt   *time.Time

	LastError      *string
	LastDurationMs *int64
}
// OpsJobHeartbeat is the element type returned by
// OpsRepository.ListJobHeartbeats. Every field carries a JSON tag.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	// Timestamps of the last run, last success and last error (nullable).
	LastRunAt     *time.Time `json:"last_run_at"`
	LastSuccessAt *time.Time `json:"last_success_at"`
	LastErrorAt   *time.Time `json:"last_error_at"`

	LastError      *string `json:"last_error"`
	LastDurationMs *int64  `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
// OpsWindowStats is the result type of OpsRepository.GetWindowStats: the
// bounds of a time window together with its request and token counters.
type OpsWindowStats struct {
	// Window bounds.
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	// Counters for the window.
	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}