feat(service): 实现运维监控业务逻辑层
- 新增 ops 主服务(ops_service.go)和端口定义(ops_port.go)
- 实现账号可用性检查服务(ops_account_availability.go)
- 实现数据聚合服务(ops_aggregation_service.go)
- 实现告警评估服务(ops_alert_evaluator_service.go)
- 实现告警管理服务(ops_alerts.go)
- 实现数据清理服务(ops_cleanup_service.go)
- 实现并发控制服务(ops_concurrency.go)
- 实现仪表板服务(ops_dashboard.go)
- 实现错误处理服务(ops_errors.go)
- 实现直方图服务(ops_histograms.go)
- 实现指标采集服务(ops_metrics_collector.go)
- 实现查询模式服务(ops_query_mode.go)
- 实现实时监控服务(ops_realtime.go)
- 实现请求详情服务(ops_request_details.go)
- 实现重试机制服务(ops_retry.go)
- 实现配置管理服务(ops_settings.go)
- 实现趋势分析服务(ops_trends.go)
- 实现窗口统计服务(ops_window_stats.go)
- 添加 ops 相关领域常量
- 注册 service 依赖注入
This commit is contained in:
226
backend/internal/service/ops_port.go
Normal file
226
backend/internal/service/ops_port.go
Normal file
@@ -0,0 +1,226 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
type OpsRepository interface {
|
||||
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
|
||||
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
|
||||
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
|
||||
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
|
||||
|
||||
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
|
||||
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
|
||||
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
|
||||
|
||||
// Lightweight window stats (for realtime WS / quick sampling).
|
||||
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
|
||||
|
||||
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
|
||||
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
|
||||
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
|
||||
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
|
||||
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
|
||||
|
||||
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
|
||||
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
|
||||
|
||||
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
|
||||
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
|
||||
|
||||
// Alerts (rules + events)
|
||||
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
|
||||
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
||||
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
|
||||
DeleteAlertRule(ctx context.Context, id int64) error
|
||||
|
||||
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
|
||||
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
|
||||
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
|
||||
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
|
||||
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
|
||||
|
||||
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
|
||||
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
|
||||
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
|
||||
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
|
||||
}
|
||||
|
||||
// OpsInsertErrorLogInput is the argument type of OpsRepository.InsertErrorLog.
// It carries one error-log record. Pointer-typed fields can be left nil;
// presumably nil means "not recorded" — confirm with the repository
// implementation.
type OpsInsertErrorLogInput struct {
	// Request identifiers.
	RequestID       string
	ClientRequestID string

	// Caller identity and origin.
	UserID    *int64
	APIKeyID  *int64
	AccountID *int64
	GroupID   *int64
	ClientIP  *string

	// Request attributes.
	Platform    string
	Model       string
	RequestPath string
	Stream      bool
	UserAgent   string

	// Error classification.
	ErrorPhase        string
	ErrorType         string
	Severity          string
	StatusCode        int
	IsBusinessLimited bool

	ErrorMessage string
	ErrorBody    string

	ErrorSource string
	ErrorOwner  string

	// Upstream error details.
	UpstreamStatusCode   *int
	UpstreamErrorMessage *string
	UpstreamErrorDetail  *string

	// Timings; the Ms suffix suggests milliseconds.
	DurationMs         *int
	TimeToFirstTokenMs *int64

	RequestBodyJSON      *string // sanitized json string (not raw bytes)
	RequestBodyTruncated bool
	RequestBodyBytes     *int
	RequestHeadersJSON   *string // optional json string

	IsRetryable bool
	RetryCount  int

	CreatedAt time.Time
}
|
||||
|
||||
// OpsInsertRetryAttemptInput is the argument type of
// OpsRepository.InsertRetryAttempt. It describes a retry attempt for the error
// log identified by SourceErrorID.
type OpsInsertRetryAttemptInput struct {
	RequestedByUserID int64
	SourceErrorID     int64
	Mode              string
	PinnedAccountID   *int64

	// running|queued etc.
	Status    string
	StartedAt time.Time
}
|
||||
|
||||
// OpsUpdateRetryAttemptInput is the argument type of
// OpsRepository.UpdateRetryAttempt. ID selects the retry attempt; the remaining
// fields carry its outcome.
type OpsUpdateRetryAttemptInput struct {
	ID int64

	// succeeded|failed
	Status     string
	FinishedAt time.Time
	DurationMs int64

	// Optional correlation
	ResultRequestID *string
	ResultErrorID   *int64

	ErrorMessage *string
}
|
||||
|
||||
// OpsInsertSystemMetricsInput is the argument type of
// OpsRepository.InsertSystemMetrics. It carries one metrics sample for a window
// of WindowMinutes minutes. Pointer-typed fields can be left nil; presumably
// nil means "not measured" — confirm with the repository implementation.
type OpsInsertSystemMetricsInput struct {
	CreatedAt     time.Time
	WindowMinutes int

	// Optional scope of the sample.
	Platform *string
	GroupID  *int64

	// Request outcome counters.
	SuccessCount         int64
	ErrorCountTotal      int64
	BusinessLimitedCount int64
	ErrorCountSLA        int64

	// Upstream error counters; 429 and 529 responses are counted separately
	// from the "Excl429529" total, as the field names indicate.
	UpstreamErrorCountExcl429529 int64
	Upstream429Count             int64
	Upstream529Count             int64

	TokenConsumed int64

	// NOTE(review): presumably queries and tokens per second — confirm.
	QPS *float64
	TPS *float64

	// Request duration percentiles, average and maximum; the Ms suffix suggests
	// milliseconds.
	DurationP50Ms *int
	DurationP90Ms *int
	DurationP95Ms *int
	DurationP99Ms *int
	DurationAvgMs *float64
	DurationMaxMs *int

	// NOTE(review): TTFT presumably stands for time to first token — confirm.
	TTFTP50Ms *int
	TTFTP90Ms *int
	TTFTP95Ms *int
	TTFTP99Ms *int
	TTFTAvgMs *float64
	TTFTMaxMs *int

	// Host resource usage.
	CPUUsagePercent    *float64
	MemoryUsedMB       *int64
	MemoryTotalMB      *int64
	MemoryUsagePercent *float64

	// Dependency health flags.
	DBOK    *bool
	RedisOK *bool

	// Database connection pool counters.
	DBConnActive  *int
	DBConnIdle    *int
	DBConnWaiting *int

	GoroutineCount        *int
	ConcurrencyQueueDepth *int
}
|
||||
|
||||
// OpsSystemMetricsSnapshot is the result type of
// OpsRepository.GetLatestSystemMetrics. Every field has a snake_case JSON tag,
// and pointer-typed fields have no omitempty option, so a nil pointer is
// encoded as JSON null.
type OpsSystemMetricsSnapshot struct {
	ID            int64     `json:"id"`
	CreatedAt     time.Time `json:"created_at"`
	WindowMinutes int       `json:"window_minutes"`

	// Host resource usage.
	CPUUsagePercent    *float64 `json:"cpu_usage_percent"`
	MemoryUsedMB       *int64   `json:"memory_used_mb"`
	MemoryTotalMB      *int64   `json:"memory_total_mb"`
	MemoryUsagePercent *float64 `json:"memory_usage_percent"`

	// Dependency health flags.
	DBOK    *bool `json:"db_ok"`
	RedisOK *bool `json:"redis_ok"`

	// Database connection pool counters.
	DBConnActive  *int `json:"db_conn_active"`
	DBConnIdle    *int `json:"db_conn_idle"`
	DBConnWaiting *int `json:"db_conn_waiting"`

	GoroutineCount        *int `json:"goroutine_count"`
	ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
|
||||
|
||||
// OpsUpsertJobHeartbeatInput is the argument type of
// OpsRepository.UpsertJobHeartbeat. JobName names the job; the remaining
// fields are optional pointers.
// NOTE(review): how the repository treats nil fields (skip vs. overwrite) is
// not visible here — confirm.
type OpsUpsertJobHeartbeatInput struct {
	JobName string

	LastRunAt      *time.Time
	LastSuccessAt  *time.Time
	LastErrorAt    *time.Time
	LastError      *string
	LastDurationMs *int64
}
|
||||
|
||||
// OpsJobHeartbeat is the element type returned by
// OpsRepository.ListJobHeartbeats. Every field has a snake_case JSON tag, and
// pointer-typed fields have no omitempty option, so a nil pointer is encoded
// as JSON null.
type OpsJobHeartbeat struct {
	JobName string `json:"job_name"`

	LastRunAt      *time.Time `json:"last_run_at"`
	LastSuccessAt  *time.Time `json:"last_success_at"`
	LastErrorAt    *time.Time `json:"last_error_at"`
	LastError      *string    `json:"last_error"`
	LastDurationMs *int64     `json:"last_duration_ms"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// OpsWindowStats is the result type of OpsRepository.GetWindowStats: counters
// for the window delimited by StartTime and EndTime.
// NOTE(review): whether the window bounds are inclusive or exclusive is not
// visible here — confirm with the repository implementation.
type OpsWindowStats struct {
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	SuccessCount    int64 `json:"success_count"`
	ErrorCountTotal int64 `json:"error_count_total"`
	TokenConsumed   int64 `json:"token_consumed"`
}
|
||||
Reference in New Issue
Block a user