245 lines
7.5 KiB
Go
245 lines
7.5 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
)
|
|
|
|
type OpsRepository interface {
|
|
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
|
|
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
|
|
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
|
|
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
|
|
|
|
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
|
|
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
|
|
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
|
|
|
|
// Lightweight window stats (for realtime WS / quick sampling).
|
|
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
|
|
// Lightweight realtime traffic summary (for the Ops dashboard header card).
|
|
GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error)
|
|
|
|
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
|
|
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
|
|
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
|
|
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
|
|
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
|
|
|
|
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
|
|
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
|
|
|
|
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
|
|
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
|
|
|
|
// Alerts (rules + events)
|
|
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
|
|
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
|
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
|
DeleteAlertRule(ctx context.Context, id int64) error
|
|
|
|
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
|
|
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
|
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
|
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
|
|
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
|
|
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
|
|
|
|
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
|
|
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
|
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
|
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
|
|
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
|
|
}
|
|
|
|
type OpsInsertErrorLogInput struct {
|
|
RequestID string
|
|
ClientRequestID string
|
|
|
|
UserID *int64
|
|
APIKeyID *int64
|
|
AccountID *int64
|
|
GroupID *int64
|
|
ClientIP *string
|
|
|
|
Platform string
|
|
Model string
|
|
RequestPath string
|
|
Stream bool
|
|
UserAgent string
|
|
|
|
ErrorPhase string
|
|
ErrorType string
|
|
Severity string
|
|
StatusCode int
|
|
IsBusinessLimited bool
|
|
|
|
ErrorMessage string
|
|
ErrorBody string
|
|
|
|
ErrorSource string
|
|
ErrorOwner string
|
|
|
|
UpstreamStatusCode *int
|
|
UpstreamErrorMessage *string
|
|
UpstreamErrorDetail *string
|
|
// UpstreamErrors captures all upstream error attempts observed during handling this request.
|
|
// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
|
|
UpstreamErrors []*OpsUpstreamErrorEvent
|
|
// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
|
|
// It is set by OpsService.RecordError before persisting.
|
|
UpstreamErrorsJSON *string
|
|
|
|
DurationMs *int
|
|
TimeToFirstTokenMs *int64
|
|
|
|
RequestBodyJSON *string // sanitized json string (not raw bytes)
|
|
RequestBodyTruncated bool
|
|
RequestBodyBytes *int
|
|
RequestHeadersJSON *string // optional json string
|
|
|
|
IsRetryable bool
|
|
RetryCount int
|
|
|
|
CreatedAt time.Time
|
|
}
|
|
|
|
// OpsInsertRetryAttemptInput is the argument type of
// OpsRepository.InsertRetryAttempt: who requested a retry, which error log it
// targets, and the attempt's initial state.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	// NOTE(review): presumably nil when the retry is not pinned to a specific
	// account — confirm against callers.
	PinnedAccountID *int64

	// running|queued etc.
	Status    string
	StartedAt time.Time
}
|
|
|
|
// OpsUpdateRetryAttemptInput is the argument type of
// OpsRepository.UpdateRetryAttempt: the outcome to record for the retry
// attempt identified by ID.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// succeeded|failed
	Status     string
	FinishedAt time.Time
	DurationMs int64

	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64

	ErrorMessage *string
}
|
|
|
|
// OpsInsertSystemMetricsInput is the argument type of
// OpsRepository.InsertSystemMetrics: one system-metrics sample for a window of
// WindowMinutes minutes.
//
// NOTE(review): pointer-typed fields presumably mean "not measured" when nil —
// confirm against the collector that fills this struct.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Optional scope of the sample.
	Platform *string
	GroupID  *int64

	// Request outcome counters.
	SuccessCount         int64
	ErrorCountTotal      int64
	BusinessLimitedCount int64
	ErrorCountSLA        int64

	// Upstream error counters; 429 and 529 are counted separately from the
	// remaining upstream errors, per the field names.
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64

	TokenConsumed int64

	QPS *float64
	TPS *float64

	// Request duration statistics, in milliseconds per the field names.
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// TTFT statistics, in milliseconds per the field names.
	// NOTE(review): TTFT is presumably "time to first token" — confirm.
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// Host resource usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency health flags.
	DBOK    *bool
	RedisOK *bool

	RedisConnTotal *int
	RedisConnIdle  *int

	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
|
|
|
|
// OpsSystemMetricsSnapshot is the result type of
// OpsRepository.GetLatestSystemMetrics. It serializes to JSON with snake_case
// keys; pointer-typed fields have no omitempty option, so a nil pointer is
// encoded as JSON null.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// Host resource usage.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency health flags.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
	DBMaxOpenConns *int `json:"db_max_open_conns"`
	RedisPoolSize  *int `json:"redis_pool_size"`

	RedisConnTotal *int `json:"redis_conn_total"`
	RedisConnIdle  *int `json:"redis_conn_idle"`

	DBConnActive  *int `json:"db_conn_active"`
	DBConnIdle    *int `json:"db_conn_idle"`
	DBConnWaiting *int `json:"db_conn_waiting"`

	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
|
|
|
|
// OpsUpsertJobHeartbeatInput is the argument type of
// OpsRepository.UpsertJobHeartbeat: the latest run information for the job
// named JobName.
//
// NOTE(review): nil pointer fields presumably leave the stored value
// untouched or unset — confirm against the repository implementation.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt      *time.Time
	LastSuccessAt  *time.Time
	LastErrorAt    *time.Time
	LastError      *string
	LastDurationMs *int64
}
|
|
|
|
// OpsJobHeartbeat is the element type returned by
// OpsRepository.ListJobHeartbeats. It serializes to JSON with snake_case keys;
// pointer-typed fields have no omitempty option, so a nil pointer is encoded
// as JSON null.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
|
|
|
// OpsWindowStats is the result type of OpsRepository.GetWindowStats: request
// outcome and token counters for the window bounded by StartTime and EndTime.
// It serializes to JSON with snake_case keys.
//
// NOTE(review): whether EndTime is inclusive or exclusive is not visible
// here — confirm against the query that fills this struct.
type OpsWindowStats struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}
|