// sub2api/backend/internal/service/ops_port.go

package service

import (
	"context"
	"time"
)

// OpsRepository declares the storage operations the ops feature depends on:
// error logs, retry attempts, dashboard queries, system metrics, job
// heartbeats, alert rules/events and pre-aggregated metrics.
//
// Every method takes a context.Context as its first argument. This file only
// declares the contract; cancellation behavior, not-found semantics and
// transaction handling are up to the implementation and are not visible here.
type OpsRepository interface {
	// Error logs and request details.
	// InsertErrorLog returns an int64 alongside the error — presumably the ID
	// of the inserted row (TODO confirm against the implementation).
	InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
	ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
	GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
	// ListRequestDetails returns the matching rows plus an int64 — presumably
	// the total match count used for pagination (TODO confirm).
	ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)

	// Retry attempts made against a previously recorded error.
	// InsertRetryAttempt returns an int64 — presumably the new attempt ID (TODO confirm).
	InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
	UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
	GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)

	// Lightweight window stats (for realtime WS / quick sampling).
	GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
	// Lightweight realtime traffic summary (for the Ops dashboard header card).
	GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error)

	// Dashboard queries. The trend methods additionally take bucketSeconds,
	// the width of each time bucket in seconds.
	GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
	GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
	GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
	GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
	GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)

	// System metrics snapshots, selected by aggregation window size in minutes.
	InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
	GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)

	// Background job heartbeats.
	UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
	ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)

	// Alerts (rules + events)
	ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
	CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
	DeleteAlertRule(ctx context.Context, id int64) error

	ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
	// resolvedAt is a pointer, so it may be nil — presumably for statuses
	// that are not a resolution (TODO confirm).
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

	// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
	UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
	UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
	// The GetLatest* methods return a bool next to the time — presumably
	// reporting whether any bucket exists yet (TODO confirm).
	GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
	GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}

// OpsInsertErrorLogInput is the argument of OpsRepository.InsertErrorLog and
// carries everything recorded for one failed request.
//
// Pointer-typed fields are optional: nil means the value is not set.
type OpsInsertErrorLogInput struct {
	// Request correlation identifiers.
	RequestID       string
	ClientRequestID string

	// Who/what the request belonged to; each is optional.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string

	// Request attributes.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string

	// Error classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool
	IsCountTokens     bool // whether this is a count_tokens request

	ErrorMessage string
	ErrorBody    string

	ErrorSource string
	ErrorOwner  string

	// Upstream error information, when available.
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string
	// UpstreamErrors captures all upstream error attempts observed during handling this request.
	// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
	UpstreamErrors []*OpsUpstreamErrorEvent
	// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
	// It is set by OpsService.RecordError before persisting.
	UpstreamErrorsJSON *string

	// Timings in milliseconds (per the Ms suffix); optional.
	DurationMs         *int
	TimeToFirstTokenMs *int64

	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string

	IsRetryable bool
	RetryCount  int

	CreatedAt time.Time
}

// OpsInsertRetryAttemptInput is the argument of OpsRepository.InsertRetryAttempt
// and describes a new retry of a previously recorded error.
type OpsInsertRetryAttemptInput struct {
	// RequestedByUserID identifies the user who asked for the retry.
	RequestedByUserID int64
	// SourceErrorID is the ID of the error being retried; the same kind of
	// ID is accepted by OpsRepository.GetLatestRetryAttemptForError.
	SourceErrorID int64
	// Mode selects the retry mode; the accepted values are not defined in this file.
	Mode string
	// PinnedAccountID is optional (nil when unset) — presumably forces the
	// retry onto a specific account (TODO confirm).
	PinnedAccountID *int64

	// running|queued etc.
	Status    string
	StartedAt time.Time
}

// OpsUpdateRetryAttemptInput is the argument of OpsRepository.UpdateRetryAttempt
// and records the outcome of a retry attempt.
type OpsUpdateRetryAttemptInput struct {
	// ID identifies the retry attempt to update.
	ID int64

	// succeeded|failed
	Status     string
	FinishedAt time.Time
	// DurationMs is in milliseconds (per the Ms suffix).
	DurationMs int64

	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64

	// ErrorMessage is optional; nil when unset.
	ErrorMessage *string
}

// OpsInsertSystemMetricsInput is the argument of OpsRepository.InsertSystemMetrics:
// one sample of request counters, latency statistics and process/dependency
// health for a window of WindowMinutes minutes.
//
// Pointer-typed fields are optional: nil means the value is not set.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Optional scope of the sample — presumably nil means "all platforms" /
	// "all groups" (TODO confirm).
	Platform *string
	GroupID  *int64

	// Request outcome counters for the window.
	SuccessCount         int64
	ErrorCountTotal      int64
	BusinessLimitedCount int64
	ErrorCountSLA        int64

	// Upstream error counters. Going by the names, 429 and 529 responses are
	// counted separately and excluded from the first counter (TODO confirm).
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64

	TokenConsumed int64

	// Rates — presumably queries per second and tokens per second (TODO confirm).
	QPS *float64
	TPS *float64

	// Request duration statistics in milliseconds (percentiles, average, max).
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// TTFT statistics in milliseconds — presumably time to first token (TODO confirm).
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// Host resource usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency health flags.
	DBOK    *bool
	RedisOK *bool

	// Redis connection pool counters.
	RedisConnTotal *int
	RedisConnIdle  *int

	// Database connection pool counters.
	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	// Runtime gauges.
	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}

// OpsSystemMetricsSnapshot is the result of OpsRepository.GetLatestSystemMetrics.
// It is JSON-serializable via the field tags below.
//
// Pointer-typed fields are optional; with no omitempty in the tags, a nil
// pointer is encoded as JSON null rather than omitted.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// Host resource usage.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency health flags.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	// Redis connection pool counters.
	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`

	// Database connection pool counters.
	DBConnActive  *int `json:"db_conn_active"`
	DBConnIdle    *int `json:"db_conn_idle"`
	DBConnWaiting *int `json:"db_conn_waiting"`

	// Runtime gauges.
	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}

// OpsUpsertJobHeartbeatInput is the argument of OpsRepository.UpsertJobHeartbeat
// and reports the latest state of the background job named JobName.
//
// All fields other than JobName are optional pointers. How nil values are
// merged with an existing record is decided by the implementation and is not
// visible in this file.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt     *time.Time
	LastSuccessAt *time.Time
	LastErrorAt   *time.Time
	LastError     *string
	// LastDurationMs is in milliseconds (per the Ms suffix).
	LastDurationMs *int64
}

// OpsJobHeartbeat is one element of the OpsRepository.ListJobHeartbeats
// result: the last known state of a background job. It is JSON-serializable
// via the field tags below.
//
// Pointer-typed fields are optional; with no omitempty in the tags, a nil
// pointer is encoded as JSON null rather than omitted.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}

// OpsWindowStats is the result of OpsRepository.GetWindowStats: request
// counters and token usage for the window from StartTime to EndTime. It is
// JSON-serializable via the field tags below.
type OpsWindowStats struct {
	// Window bounds. Whether each bound is inclusive is not specified in this file.
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}