feat(数据库): 添加运维监控数据模型和数据库迁移脚本

- 新增 ops 监控数据库迁移脚本（表结构定义）
- 定义核心数据模型（ops_models.go）
- 定义告警相关模型（ops_alert_models.go）
- 定义仪表板数据模型（ops_dashboard_models.go）
- 定义实时监控数据模型（ops_realtime_models.go）
- 定义配置相关模型（ops_settings_models.go）
- 定义趋势分析数据模型（ops_trend_models.go）
2026-01-09 20:52:17 +08:00
parent 4b9e47cec9
commit d55866d375
7 changed files with 1199 additions and 0 deletions
--- a/backend/internal/service/ops_alert_models.go
+++ b/backend/internal/service/ops_alert_models.go
@@ -0,0 +1,75 @@
+package service
+
+import "time"
+
+// Ops alert rule/event models.
+//
+// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
+// with the existing ops dashboard frontend (backup style).
+
+const (
+	OpsAlertStatusFiring   = "firing"   // NOTE(review): presumably a value for OpsAlertEvent.Status — confirm
+	OpsAlertStatusResolved = "resolved" // NOTE(review): presumably a value for OpsAlertEvent.Status — confirm
+)
+
+type OpsAlertRule struct { // Alert rule DTO; each field maps to an explicit snake_case JSON key.
+	ID          int64  `json:"id"`
+	Name        string `json:"name"`
+	Description string `json:"description"`
+
+	Enabled  bool   `json:"enabled"`
+	Severity string `json:"severity"` // plain string; NOTE(review): the allowed levels are not defined in this file — confirm
+
+	MetricType string  `json:"metric_type"`
+	Operator   string  `json:"operator"` // NOTE(review): presumably the comparison applied against Threshold — confirm accepted values
+	Threshold  float64 `json:"threshold"`
+
+	WindowMinutes    int `json:"window_minutes"`
+	SustainedMinutes int `json:"sustained_minutes"`
+	CooldownMinutes  int `json:"cooldown_minutes"` // the three durations above are plain ints; the names give the unit (minutes)
+
+	NotifyEmail bool `json:"notify_email"`
+
+	Filters map[string]any `json:"filters,omitempty"` // untyped map; left out of the JSON when nil or empty
+
+	LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"` // nil pointer is left out of the JSON
+	CreatedAt       time.Time  `json:"created_at"`
+	UpdatedAt       time.Time  `json:"updated_at"`
+}
+
+type OpsAlertEvent struct { // Alert event DTO; each field maps to an explicit snake_case JSON key.
+	ID       int64  `json:"id"`
+	RuleID   int64  `json:"rule_id"` // NOTE(review): presumably the ID of the OpsAlertRule that produced the event — confirm
+	Severity string `json:"severity"`
+	Status   string `json:"status"` // NOTE(review): presumably OpsAlertStatusFiring or OpsAlertStatusResolved — confirm
+
+	Title       string `json:"title"`
+	Description string `json:"description"`
+
+	MetricValue    *float64 `json:"metric_value,omitempty"`    // nil pointer is left out of the JSON
+	ThresholdValue *float64 `json:"threshold_value,omitempty"` // nil pointer is left out of the JSON
+
+	Dimensions map[string]any `json:"dimensions,omitempty"` // untyped map; left out of the JSON when nil or empty
+
+	FiredAt    time.Time  `json:"fired_at"`
+	ResolvedAt *time.Time `json:"resolved_at,omitempty"` // nil pointer is left out of the JSON
+
+	EmailSent bool      `json:"email_sent"`
+	CreatedAt time.Time `json:"created_at"`
+}
+
+type OpsAlertEventFilter struct { // Filter parameters for alert events; fields carry no JSON tags.
+	Limit int // NOTE(review): presumably caps the number of events returned — confirm how zero is handled
+
+	// Optional filters.
+	Status   string
+	Severity string
+
+	StartTime *time.Time // pointer, so the bound can be absent (nil)
+	EndTime   *time.Time // pointer, so the bound can be absent (nil)
+
+	// Dimensions filters (best-effort).
+	Platform string
+	GroupID  *int64 // pointer, so the group can be absent (nil)
+}
+
--- a/backend/internal/service/ops_dashboard_models.go
+++ b/backend/internal/service/ops_dashboard_models.go
@@ -0,0 +1,83 @@
+package service
+
+import "time"
+
+type OpsDashboardFilter struct { // Dashboard query parameters; fields carry no JSON tags.
+	StartTime time.Time
+	EndTime   time.Time
+
+	Platform string
+	GroupID  *int64 // pointer, so the group can be absent (nil); NOTE(review): presumably nil means "all groups" — confirm
+
+	// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
+	// Expected values: auto/raw/preagg (see OpsQueryMode).
+	QueryMode OpsQueryMode // OpsQueryMode is not declared in this file
+}
+
+type OpsRateSummary struct { // Three float values for one rate, serialized as "current", "peak" and "avg".
+	Current float64 `json:"current"`
+	Peak    float64 `json:"peak"`
+	Avg     float64 `json:"avg"` // NOTE(review): the window these values are computed over is not visible here — confirm
+}
+
+type OpsPercentiles struct { // Latency summary; every JSON key ends in "_ms" and no field uses omitempty.
+	P50 *int `json:"p50_ms"` // like every field here: a nil pointer encodes as JSON null
+	P90 *int `json:"p90_ms"`
+	P95 *int `json:"p95_ms"`
+	P99 *int `json:"p99_ms"`
+	Avg *int `json:"avg_ms"`
+	Max *int `json:"max_ms"`
+}
+
+type OpsDashboardOverview struct { // Overview payload for one time range; serialized with snake_case JSON keys.
+	StartTime time.Time `json:"start_time"`
+	EndTime   time.Time `json:"end_time"`
+	Platform  string    `json:"platform"`
+	GroupID   *int64    `json:"group_id"` // no omitempty: a nil pointer encodes as JSON null
+
+	// Latest system-level snapshot (window=1m, global).
+	SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` // no omitempty: nil encodes as null; the type is not declared in this file
+
+	// Background jobs health (heartbeats).
+	JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"` // no omitempty: a nil slice encodes as null; the type is not declared in this file
+
+	SuccessCount         int64 `json:"success_count"`
+	ErrorCountTotal      int64 `json:"error_count_total"`
+	BusinessLimitedCount int64 `json:"business_limited_count"`
+
+	ErrorCountSLA     int64 `json:"error_count_sla"`
+	RequestCountTotal int64 `json:"request_count_total"`
+	RequestCountSLA   int64 `json:"request_count_sla"` // NOTE(review): how the *_sla counters differ from the *_total ones is not visible here — confirm
+
+	TokenConsumed int64 `json:"token_consumed"`
+
+	SLA                          float64 `json:"sla"` // NOTE(review): scale (fraction vs. percent) is not visible here; same for the two rates below — confirm
+	ErrorRate                    float64 `json:"error_rate"`
+	UpstreamErrorRate            float64 `json:"upstream_error_rate"`
+	UpstreamErrorCountExcl429529 int64   `json:"upstream_error_count_excl_429_529"` // the JSON key spells the exclusion out: excl_429_529
+	Upstream429Count             int64   `json:"upstream_429_count"`
+	Upstream529Count             int64   `json:"upstream_529_count"`
+
+	QPS OpsRateSummary `json:"qps"`
+	TPS OpsRateSummary `json:"tps"`
+
+	Duration OpsPercentiles `json:"duration"`
+	TTFT     OpsPercentiles `json:"ttft"` // same shape as Duration; both are nested objects with *_ms keys
+}
+
+type OpsLatencyHistogramBucket struct { // One histogram bucket: a range label plus a count.
+	Range string `json:"range"` // NOTE(review): the label format is not defined in this file — confirm against the producer
+	Count int64  `json:"count"`
+}
+
+// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
+// It is used by the Ops dashboard to quickly identify tail latency regressions.
+type OpsLatencyHistogramResponse struct {
+	StartTime time.Time `json:"start_time"`
+	EndTime   time.Time `json:"end_time"`
+	Platform  string    `json:"platform"`
+	GroupID   *int64    `json:"group_id"` // no omitempty: a nil pointer encodes as JSON null
+
+	TotalRequests int64                        `json:"total_requests"`
+	Buckets       []*OpsLatencyHistogramBucket `json:"buckets"` // no omitempty: a nil slice encodes as JSON null
+}
--- a/backend/internal/service/ops_models.go
+++ b/backend/internal/service/ops_models.go
@@ -0,0 +1,118 @@
+package service
+
+import "time"
+
+type OpsErrorLog struct { // Error log entry; pointer fields have no omitempty, so nil encodes as JSON null.
+	ID        int64     `json:"id"`
+	CreatedAt time.Time `json:"created_at"`
+
+	Phase    string `json:"phase"`
+	Type     string `json:"type"`
+	Severity string `json:"severity"` // Phase, Type and Severity are plain strings; NOTE(review): allowed values are not defined in this file — confirm
+
+	StatusCode int    `json:"status_code"`
+	Platform   string `json:"platform"`
+	Model      string `json:"model"`
+
+	LatencyMs *int `json:"latency_ms"` // nil encodes as null
+
+	ClientRequestID string `json:"client_request_id"`
+	RequestID       string `json:"request_id"`
+	Message         string `json:"message"`
+
+	UserID    *int64 `json:"user_id"`
+	APIKeyID  *int64 `json:"api_key_id"`
+	AccountID *int64 `json:"account_id"`
+	GroupID   *int64 `json:"group_id"` // the four IDs above are pointers: nil encodes as null
+
+	ClientIP    *string `json:"client_ip"` // nil encodes as null
+	RequestPath string  `json:"request_path"`
+	Stream      bool    `json:"stream"`
+}
+
+type OpsErrorLogDetail struct { // OpsErrorLog plus the extra fields declared below.
+	OpsErrorLog // embedded without a tag, so its fields appear at the top level of the same JSON object
+
+	ErrorBody string `json:"error_body"`
+	UserAgent string `json:"user_agent"`
+
+	// Timings (optional): pointers without omitempty, so an unset timing encodes as JSON null.
+	AuthLatencyMs       *int64 `json:"auth_latency_ms"`
+	RoutingLatencyMs    *int64 `json:"routing_latency_ms"`
+	UpstreamLatencyMs   *int64 `json:"upstream_latency_ms"`
+	ResponseLatencyMs   *int64 `json:"response_latency_ms"`
+	TimeToFirstTokenMs  *int64 `json:"time_to_first_token_ms"`
+
+	// Retry context
+	RequestBody          string `json:"request_body"`
+	RequestBodyTruncated bool   `json:"request_body_truncated"`
+	RequestBodyBytes     *int   `json:"request_body_bytes"` // nil encodes as null
+	RequestHeaders       string `json:"request_headers,omitempty"` // the only omitempty field here: left out of the JSON when empty
+
+	// vNext metric semantics
+	IsBusinessLimited bool `json:"is_business_limited"`
+}
+
+type OpsErrorLogFilter struct { // Filter and paging parameters for error logs; fields carry no JSON tags.
+	StartTime *time.Time // pointer, so the bound can be absent (nil)
+	EndTime   *time.Time // pointer, so the bound can be absent (nil)
+
+	Platform string
+	GroupID  *int64
+	AccountID *int64 // GroupID and AccountID are pointers, so either can be absent (nil)
+
+	StatusCodes []int
+	Phase       string
+	Query       string // NOTE(review): presumably free-text search; which columns it matches is not visible here — confirm
+
+	Page     int
+	PageSize int // NOTE(review): whether Page is 0- or 1-based, and any PageSize cap, are not visible here — confirm
+}
+
+type OpsErrorLogList struct { // One page of error logs together with paging fields.
+	Errors   []*OpsErrorLog `json:"errors"` // no omitempty: a nil slice encodes as JSON null
+	Total    int           `json:"total"`
+	Page     int           `json:"page"`
+	PageSize int           `json:"page_size"`
+}
+
+type OpsRetryAttempt struct { // Record of one retry attempt; pointer fields have no omitempty, so nil encodes as JSON null.
+	ID        int64     `json:"id"`
+	CreatedAt time.Time `json:"created_at"`
+
+	RequestedByUserID int64  `json:"requested_by_user_id"`
+	SourceErrorID     int64  `json:"source_error_id"` // NOTE(review): presumably the OpsErrorLog.ID being retried — confirm
+	Mode              string `json:"mode"`
+	PinnedAccountID   *int64 `json:"pinned_account_id"`
+
+	Status     string     `json:"status"` // NOTE(review): the set of status values is not defined in this file — confirm
+	StartedAt  *time.Time `json:"started_at"`
+	FinishedAt *time.Time `json:"finished_at"`
+	DurationMs *int64     `json:"duration_ms"`
+
+	ResultRequestID *string `json:"result_request_id"`
+	ResultErrorID   *int64  `json:"result_error_id"`
+
+	ErrorMessage *string `json:"error_message"`
+}
+
+type OpsRetryResult struct { // Result payload of a retry; unlike OpsRetryAttempt, timestamps and duration are non-pointer values.
+	AttemptID int64 `json:"attempt_id"`
+	Mode      string `json:"mode"`
+	Status    string `json:"status"`
+
+	PinnedAccountID *int64 `json:"pinned_account_id"`
+	UsedAccountID   *int64 `json:"used_account_id"` // both account IDs are pointers without omitempty: nil encodes as JSON null
+
+	HTTPStatusCode int    `json:"http_status_code"`
+	UpstreamRequestID string `json:"upstream_request_id"`
+
+	ResponsePreview     string `json:"response_preview"`
+	ResponseTruncated   bool   `json:"response_truncated"` // NOTE(review): presumably set when ResponsePreview was cut short — confirm
+
+	ErrorMessage string `json:"error_message"`
+
+	StartedAt  time.Time `json:"started_at"`
+	FinishedAt time.Time `json:"finished_at"`
+	DurationMs int64     `json:"duration_ms"`
+}
--- a/backend/internal/service/ops_realtime_models.go
+++ b/backend/internal/service/ops_realtime_models.go
@@ -0,0 +1,81 @@
+package service
+
+import "time"
+
+// PlatformConcurrencyInfo aggregates concurrency usage by platform.
+type PlatformConcurrencyInfo struct {
+	Platform       string  `json:"platform"`
+	CurrentInUse   int64   `json:"current_in_use"`
+	MaxCapacity    int64   `json:"max_capacity"`
+	LoadPercentage float64 `json:"load_percentage"` // NOTE(review): presumably CurrentInUse relative to MaxCapacity; scale is not visible here — confirm
+	WaitingInQueue int64   `json:"waiting_in_queue"`
+}
+
+// GroupConcurrencyInfo aggregates concurrency usage by group.
+//
+// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
+type GroupConcurrencyInfo struct {
+	GroupID        int64   `json:"group_id"`
+	GroupName      string  `json:"group_name"`
+	Platform       string  `json:"platform"`
+	CurrentInUse   int64   `json:"current_in_use"`
+	MaxCapacity    int64   `json:"max_capacity"`
+	LoadPercentage float64 `json:"load_percentage"` // NOTE(review): presumably CurrentInUse relative to MaxCapacity; scale is not visible here — confirm
+	WaitingInQueue int64   `json:"waiting_in_queue"`
+}
+
+// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
+type AccountConcurrencyInfo struct {
+	AccountID      int64   `json:"account_id"`
+	AccountName    string  `json:"account_name"`
+	Platform       string  `json:"platform"`
+	GroupID        int64   `json:"group_id"` // a single ID here; NOTE(review): which group is reported for a multi-group account is not visible — confirm
+	GroupName      string  `json:"group_name"`
+	CurrentInUse   int64   `json:"current_in_use"`
+	MaxCapacity    int64   `json:"max_capacity"`
+	LoadPercentage float64 `json:"load_percentage"` // NOTE(review): presumably CurrentInUse relative to MaxCapacity; scale is not visible here — confirm
+	WaitingInQueue int64   `json:"waiting_in_queue"`
+}
+
+// PlatformAvailability aggregates account availability by platform.
+type PlatformAvailability struct {
+	Platform       string `json:"platform"`
+	TotalAccounts  int64  `json:"total_accounts"`
+	AvailableCount int64  `json:"available_count"`
+	RateLimitCount int64  `json:"rate_limit_count"`
+	ErrorCount     int64  `json:"error_count"` // NOTE(review): whether the three counts add up to TotalAccounts is not visible here — confirm
+}
+
+// GroupAvailability aggregates account availability by group.
+type GroupAvailability struct {
+	GroupID        int64  `json:"group_id"`
+	GroupName      string `json:"group_name"`
+	Platform       string `json:"platform"`
+	TotalAccounts  int64  `json:"total_accounts"`
+	AvailableCount int64  `json:"available_count"`
+	RateLimitCount int64  `json:"rate_limit_count"`
+	ErrorCount     int64  `json:"error_count"` // NOTE(review): whether the three counts add up to TotalAccounts is not visible here — confirm
+}
+
+// AccountAvailability represents current availability for a single account.
+type AccountAvailability struct {
+	AccountID   int64  `json:"account_id"`
+	AccountName string `json:"account_name"`
+	Platform    string `json:"platform"`
+	GroupID     int64  `json:"group_id"`
+	GroupName   string `json:"group_name"`
+
+	Status string `json:"status"` // plain string; NOTE(review): the set of status values is not defined in this file — confirm
+
+	IsAvailable   bool `json:"is_available"`
+	IsRateLimited bool `json:"is_rate_limited"`
+	IsOverloaded  bool `json:"is_overloaded"`
+	HasError      bool `json:"has_error"` // NOTE(review): whether these four flags are mutually exclusive is not visible here — confirm
+
+	RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
+	RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
+	OverloadUntil          *time.Time `json:"overload_until"`
+	OverloadRemainingSec   *int64     `json:"overload_remaining_sec"` // the four pointers above have no omitempty: nil encodes as JSON null
+	ErrorMessage           string     `json:"error_message"`
+	TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"` // the only omitempty field here: nil is left out of the JSON
+}
--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -0,0 +1,70 @@
+package service
+
+// Ops settings models stored in DB `settings` table (JSON blobs).
+
+type OpsEmailNotificationConfig struct { // Email notification settings: two nested value sections, "alert" and "report".
+	Alert  OpsEmailAlertConfig  `json:"alert"`
+	Report OpsEmailReportConfig `json:"report"`
+}
+
+type OpsEmailAlertConfig struct { // The "alert" section of OpsEmailNotificationConfig.
+	Enabled               bool     `json:"enabled"`
+	Recipients            []string `json:"recipients"` // no omitempty: a nil slice encodes as JSON null
+	MinSeverity           string   `json:"min_severity"`
+	RateLimitPerHour      int      `json:"rate_limit_per_hour"`
+	BatchingWindowSeconds int      `json:"batching_window_seconds"` // NOTE(review): how zero is treated for this and RateLimitPerHour is not visible here — confirm
+	IncludeResolvedAlerts bool     `json:"include_resolved_alerts"`
+}
+
+type OpsEmailReportConfig struct { // The "report" section of OpsEmailNotificationConfig; four report kinds, each with an enable flag and a schedule string.
+	Enabled                         bool     `json:"enabled"`
+	Recipients                      []string `json:"recipients"` // no omitempty: a nil slice encodes as JSON null
+	DailySummaryEnabled             bool     `json:"daily_summary_enabled"`
+	DailySummarySchedule            string   `json:"daily_summary_schedule"` // NOTE(review): the schedule syntax (e.g. cron) is not defined in this file; applies to all *Schedule fields — confirm
+	WeeklySummaryEnabled            bool     `json:"weekly_summary_enabled"`
+	WeeklySummarySchedule           string   `json:"weekly_summary_schedule"`
+	ErrorDigestEnabled              bool     `json:"error_digest_enabled"`
+	ErrorDigestSchedule             string   `json:"error_digest_schedule"`
+	ErrorDigestMinCount             int      `json:"error_digest_min_count"`
+	AccountHealthEnabled            bool     `json:"account_health_enabled"`
+	AccountHealthSchedule           string   `json:"account_health_schedule"`
+	AccountHealthErrorRateThreshold float64  `json:"account_health_error_rate_threshold"` // NOTE(review): scale (fraction vs. percent) is not visible here — confirm
+}
+
+// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
+// frontend can still send the full config shape.
+type OpsEmailNotificationConfigUpdateRequest struct {
+	Alert  *OpsEmailAlertConfig  `json:"alert"`  // pointer: stays nil when the request has no "alert" key (or sends null)
+	Report *OpsEmailReportConfig `json:"report"` // pointer: stays nil when the request has no "report" key (or sends null)
+}
+
+type OpsDistributedLockSettings struct { // Lock settings nested under "distributed_lock" in OpsAlertRuntimeSettings.
+	Enabled    bool   `json:"enabled"`
+	Key        string `json:"key"`
+	TTLSeconds int    `json:"ttl_seconds"` // plain int; the name gives the unit (seconds)
+}
+
+type OpsAlertSilenceEntry struct { // One element of OpsAlertSilencingSettings.Entries.
+	RuleID     *int64   `json:"rule_id,omitempty"`    // nil pointer is left out of the JSON
+	Severities []string `json:"severities,omitempty"` // nil or empty slice is left out of the JSON
+
+	UntilRFC3339 string `json:"until_rfc3339"` // stored as text; NOTE(review): the name suggests an RFC 3339 timestamp, parsed elsewhere — confirm
+	Reason       string `json:"reason"`
+}
+
+type OpsAlertSilencingSettings struct { // Silencing settings nested under "silencing" in OpsAlertRuntimeSettings.
+	Enabled bool `json:"enabled"`
+
+	GlobalUntilRFC3339 string `json:"global_until_rfc3339"` // stored as text; NOTE(review): the name suggests an RFC 3339 timestamp, parsed elsewhere — confirm
+	GlobalReason       string `json:"global_reason"`
+
+	Entries []OpsAlertSilenceEntry `json:"entries,omitempty"` // nil or empty slice is left out of the JSON
+}
+
+type OpsAlertRuntimeSettings struct { // Alert runtime settings: an interval plus two nested value sections.
+	EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"` // plain int; the name gives the unit (seconds)
+
+	DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
+	Silencing       OpsAlertSilencingSettings  `json:"silencing"`
+}
+
--- a/backend/internal/service/ops_trend_models.go
+++ b/backend/internal/service/ops_trend_models.go
@@ -0,0 +1,65 @@
+package service
+
+import "time"
+
+type OpsThroughputTrendPoint struct { // One element of OpsThroughputTrendResponse.Points.
+	BucketStart   time.Time `json:"bucket_start"`
+	RequestCount  int64     `json:"request_count"`
+	TokenConsumed int64     `json:"token_consumed"`
+	QPS           float64   `json:"qps"`
+	TPS           float64   `json:"tps"` // NOTE(review): presumably the counts divided by the bucket length — confirm against the query
+}
+
+type OpsThroughputPlatformBreakdownItem struct { // One element of OpsThroughputTrendResponse.ByPlatform.
+	Platform      string `json:"platform"`
+	RequestCount  int64  `json:"request_count"`
+	TokenConsumed int64  `json:"token_consumed"`
+}
+
+type OpsThroughputGroupBreakdownItem struct { // One element of OpsThroughputTrendResponse.TopGroups.
+	GroupID       int64  `json:"group_id"`
+	GroupName     string `json:"group_name"`
+	RequestCount  int64  `json:"request_count"`
+	TokenConsumed int64  `json:"token_consumed"`
+}
+
+type OpsThroughputTrendResponse struct { // Throughput trend payload: bucket label, points, and two optional breakdown lists.
+	Bucket string `json:"bucket"` // NOTE(review): presumably names the bucket size; the accepted values are not defined in this file — confirm
+
+	Points []*OpsThroughputTrendPoint `json:"points"` // no omitempty: a nil slice encodes as JSON null
+
+	// Optional drilldown helpers:
+	// - When no platform/group is selected: returns totals by platform.
+	// - When platform is selected but group is not: returns top groups in that platform.
+	ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"` // left out of the JSON when nil or empty
+	TopGroups  []*OpsThroughputGroupBreakdownItem    `json:"top_groups,omitempty"`  // left out of the JSON when nil or empty
+}
+
+type OpsErrorTrendPoint struct { // One element of OpsErrorTrendResponse.Points; counter names and JSON keys match those on OpsDashboardOverview.
+	BucketStart time.Time `json:"bucket_start"`
+
+	ErrorCountTotal      int64 `json:"error_count_total"`
+	BusinessLimitedCount int64 `json:"business_limited_count"`
+	ErrorCountSLA        int64 `json:"error_count_sla"` // NOTE(review): presumably the total minus business-limited errors — confirm against the query
+
+	UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` // the JSON key spells the exclusion out: excl_429_529
+	Upstream429Count             int64 `json:"upstream_429_count"`
+	Upstream529Count             int64 `json:"upstream_529_count"`
+}
+
+type OpsErrorTrendResponse struct { // Error trend payload: bucket label plus points.
+	Bucket string                `json:"bucket"` // NOTE(review): presumably names the bucket size; accepted values are not defined in this file — confirm
+	Points []*OpsErrorTrendPoint `json:"points"` // no omitempty: a nil slice encodes as JSON null
+}
+
+type OpsErrorDistributionItem struct { // One element of OpsErrorDistributionResponse.Items: three counters for one status code.
+	StatusCode      int   `json:"status_code"`
+	Total           int64 `json:"total"`
+	SLA             int64 `json:"sla"` // an int64 count here, unlike the float64 SLA field on OpsDashboardOverview
+	BusinessLimited int64 `json:"business_limited"`
+}
+
+type OpsErrorDistributionResponse struct { // Error distribution payload: a total plus per-status-code items.
+	Total int64                       `json:"total"` // NOTE(review): presumably the sum of Items[i].Total — confirm against the query
+	Items []*OpsErrorDistributionItem `json:"items"` // no omitempty: a nil slice encodes as JSON null
+}