feat(数据库): 添加运维监控数据模型和数据库迁移脚本

- 新增 ops 监控数据库迁移脚本(表结构定义)
- 定义核心数据模型(ops_models.go)
- 定义告警相关模型(ops_alert_models.go)
- 定义仪表板数据模型(ops_dashboard_models.go)
- 定义实时监控数据模型(ops_realtime_models.go)
- 定义配置相关模型(ops_settings_models.go)
- 定义趋势分析数据模型(ops_trend_models.go)
This commit is contained in:
IanShaw027
2026-01-09 20:52:17 +08:00
parent 4b9e47cec9
commit d55866d375
7 changed files with 1199 additions and 0 deletions

View File

@@ -0,0 +1,75 @@
package service
import "time"
// Ops alert rule/event models.
//
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
// with the existing ops dashboard frontend (backup style).
// Lifecycle states of an alert event; stored in OpsAlertEvent.Status.
const (
OpsAlertStatusFiring = "firing"
OpsAlertStatusResolved = "resolved"
)
// OpsAlertRule is the admin-facing DTO for a single alert rule.
// The condition is "MetricType Operator Threshold" evaluated over a
// WindowMinutes window; SustainedMinutes and CooldownMinutes are also
// expressed in minutes. Filters optionally scopes the rule (per the
// migration: platform/group filters etc., stored as JSON).
type OpsAlertRule struct {
ID int64 `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Enabled bool `json:"enabled"`
Severity string `json:"severity"`
MetricType string `json:"metric_type"`
Operator string `json:"operator"`
Threshold float64 `json:"threshold"`
WindowMinutes int `json:"window_minutes"`
SustainedMinutes int `json:"sustained_minutes"`
CooldownMinutes int `json:"cooldown_minutes"`
NotifyEmail bool `json:"notify_email"`
Filters map[string]any `json:"filters,omitempty"`
LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// OpsAlertEvent is one firing (and possible resolution) of an alert rule.
// Status holds one of the OpsAlertStatus* constants; ResolvedAt is nil
// while the event is still firing. MetricValue/ThresholdValue are pointers
// so "not recorded" is distinguishable from zero.
type OpsAlertEvent struct {
ID int64 `json:"id"`
RuleID int64 `json:"rule_id"`
Severity string `json:"severity"`
Status string `json:"status"`
Title string `json:"title"`
Description string `json:"description"`
MetricValue *float64 `json:"metric_value,omitempty"`
ThresholdValue *float64 `json:"threshold_value,omitempty"`
Dimensions map[string]any `json:"dimensions,omitempty"`
FiredAt time.Time `json:"fired_at"`
ResolvedAt *time.Time `json:"resolved_at,omitempty"`
EmailSent bool `json:"email_sent"`
CreatedAt time.Time `json:"created_at"`
}
// OpsAlertEventFilter narrows alert-event queries. Zero values mean
// "no filter"; Limit bounds the result size.
type OpsAlertEventFilter struct {
Limit int
// Optional filters.
Status string
Severity string
StartTime *time.Time
EndTime *time.Time
// Dimensions filters (best-effort).
Platform string
GroupID *int64
}

View File

@@ -0,0 +1,83 @@
package service
import "time"
// OpsDashboardFilter carries the global dashboard filter dimensions
// (time range / platform / group) shared by all ops dashboard queries.
// A nil GroupID or empty Platform means "all".
type OpsDashboardFilter struct {
StartTime time.Time
EndTime time.Time
Platform string
GroupID *int64
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
// Expected values: auto/raw/preagg (see OpsQueryMode).
QueryMode OpsQueryMode
}
// OpsRateSummary summarizes a rate metric (e.g. QPS/TPS in
// OpsDashboardOverview) as current, peak, and average values.
type OpsRateSummary struct {
Current float64 `json:"current"`
Peak float64 `json:"peak"`
Avg float64 `json:"avg"`
}
// OpsPercentiles holds latency percentile/aggregate values in milliseconds.
// Fields are pointers so an unavailable value can be distinguished from 0
// (JSON emits null rather than omitting the key).
type OpsPercentiles struct {
P50 *int `json:"p50_ms"`
P90 *int `json:"p90_ms"`
P95 *int `json:"p95_ms"`
P99 *int `json:"p99_ms"`
Avg *int `json:"avg_ms"`
Max *int `json:"max_ms"`
}
// OpsDashboardOverview is the aggregated dashboard overview for the selected
// time range and filter dimensions. Counts follow vNext semantics: SLA-scope
// counts exclude business-limited requests, and upstream 429/529 errors are
// broken out from other upstream errors.
type OpsDashboardOverview struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
// Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
// Background jobs health (heartbeats).
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
RequestCountTotal int64 `json:"request_count_total"`
RequestCountSLA int64 `json:"request_count_sla"`
TokenConsumed int64 `json:"token_consumed"`
SLA float64 `json:"sla"`
ErrorRate float64 `json:"error_rate"`
UpstreamErrorRate float64 `json:"upstream_error_rate"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
QPS OpsRateSummary `json:"qps"`
TPS OpsRateSummary `json:"tps"`
Duration OpsPercentiles `json:"duration"`
TTFT OpsPercentiles `json:"ttft"`
}
// OpsLatencyHistogramBucket is one bucket of the latency histogram:
// a human-readable range label and the request count that fell into it.
type OpsLatencyHistogramBucket struct {
Range string `json:"range"`
Count int64 `json:"count"`
}
// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
// It is used by the Ops dashboard to quickly identify tail latency regressions.
// TotalRequests is the total across all Buckets for the echoed filter window.
type OpsLatencyHistogramResponse struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
TotalRequests int64 `json:"total_requests"`
Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
}

View File

@@ -0,0 +1,118 @@
package service
import "time"
// OpsErrorLog is the list-view DTO for a row of the ops_error_logs table.
// Optional correlation IDs (UserID, AccountID, GroupID, ...) are pointers so
// absent values serialize as null. See OpsErrorLogDetail for the full record.
type OpsErrorLog struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
Phase string `json:"phase"`
Type string `json:"type"`
Severity string `json:"severity"`
StatusCode int `json:"status_code"`
Platform string `json:"platform"`
Model string `json:"model"`
LatencyMs *int `json:"latency_ms"`
ClientRequestID string `json:"client_request_id"`
RequestID string `json:"request_id"`
Message string `json:"message"`
UserID *int64 `json:"user_id"`
APIKeyID *int64 `json:"api_key_id"`
AccountID *int64 `json:"account_id"`
GroupID *int64 `json:"group_id"`
ClientIP *string `json:"client_ip"`
RequestPath string `json:"request_path"`
Stream bool `json:"stream"`
}
// OpsErrorLogDetail extends OpsErrorLog (embedded) with the full error body,
// per-phase timings, and the retry context captured at ingest time.
type OpsErrorLogDetail struct {
OpsErrorLog
ErrorBody string `json:"error_body"`
UserAgent string `json:"user_agent"`
// Timings (optional)
AuthLatencyMs *int64 `json:"auth_latency_ms"`
RoutingLatencyMs *int64 `json:"routing_latency_ms"`
UpstreamLatencyMs *int64 `json:"upstream_latency_ms"`
ResponseLatencyMs *int64 `json:"response_latency_ms"`
TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
// Retry context
RequestBody string `json:"request_body"`
RequestBodyTruncated bool `json:"request_body_truncated"`
RequestBodyBytes *int `json:"request_body_bytes"`
RequestHeaders string `json:"request_headers,omitempty"`
// vNext metric semantics
IsBusinessLimited bool `json:"is_business_limited"`
}
// OpsErrorLogFilter narrows error-log queries; zero values mean "no filter".
// Page/PageSize select the page of results to return.
type OpsErrorLogFilter struct {
StartTime *time.Time
EndTime *time.Time
Platform string
GroupID *int64
AccountID *int64
StatusCodes []int
Phase string
Query string
Page int
PageSize int
}
// OpsErrorLogList is one page of error logs plus pagination metadata.
type OpsErrorLogList struct {
Errors []*OpsErrorLog `json:"errors"`
Total int `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
// OpsRetryAttempt is the audit record of one retry of a logged error
// (row of ops_retry_attempts). Mode distinguishes client retry vs pinned
// upstream retry (PinnedAccountID set); result fields are nil until the
// attempt finishes.
type OpsRetryAttempt struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
RequestedByUserID int64 `json:"requested_by_user_id"`
SourceErrorID int64 `json:"source_error_id"`
Mode string `json:"mode"`
PinnedAccountID *int64 `json:"pinned_account_id"`
Status string `json:"status"`
StartedAt *time.Time `json:"started_at"`
FinishedAt *time.Time `json:"finished_at"`
DurationMs *int64 `json:"duration_ms"`
ResultRequestID *string `json:"result_request_id"`
ResultErrorID *int64 `json:"result_error_id"`
ErrorMessage *string `json:"error_message"`
}
// OpsRetryResult is the synchronous outcome returned to the caller after a
// retry attempt completes, including the (possibly truncated) response preview.
type OpsRetryResult struct {
AttemptID int64 `json:"attempt_id"`
Mode string `json:"mode"`
Status string `json:"status"`
PinnedAccountID *int64 `json:"pinned_account_id"`
UsedAccountID *int64 `json:"used_account_id"`
HTTPStatusCode int `json:"http_status_code"`
UpstreamRequestID string `json:"upstream_request_id"`
ResponsePreview string `json:"response_preview"`
ResponseTruncated bool `json:"response_truncated"`
ErrorMessage string `json:"error_message"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
DurationMs int64 `json:"duration_ms"`
}

View File

@@ -0,0 +1,81 @@
package service
import "time"
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
// LoadPercentage relates CurrentInUse to MaxCapacity — presumably
// current/max*100; confirm against the producer.
type PlatformConcurrencyInfo struct {
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
// It mirrors the platform/group aggregates but at account granularity.
type AccountConcurrencyInfo struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// PlatformAvailability aggregates account availability by platform:
// how many accounts exist and how many are available / rate limited / erroring.
type PlatformAvailability struct {
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// GroupAvailability aggregates account availability by group.
// As with GroupConcurrencyInfo, accounts may belong to multiple groups,
// so counts are not additive across groups.
type GroupAvailability struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// AccountAvailability represents current availability for a single account.
// The boolean flags break Status down for the UI; *RemainingSec fields are
// countdowns derived from the corresponding *Until/*ResetAt timestamps
// (nil when not applicable).
type AccountAvailability struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Status string `json:"status"`
IsAvailable bool `json:"is_available"`
IsRateLimited bool `json:"is_rate_limited"`
IsOverloaded bool `json:"is_overloaded"`
HasError bool `json:"has_error"`
RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"`
OverloadUntil *time.Time `json:"overload_until"`
OverloadRemainingSec *int64 `json:"overload_remaining_sec"`
ErrorMessage string `json:"error_message"`
TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}

View File

@@ -0,0 +1,70 @@
package service
// Ops settings models stored in DB `settings` table (JSON blobs).
// OpsEmailNotificationConfig groups the alert and report email settings
// into a single persisted configuration document.
type OpsEmailNotificationConfig struct {
Alert OpsEmailAlertConfig `json:"alert"`
Report OpsEmailReportConfig `json:"report"`
}
// OpsEmailAlertConfig configures alert emails: recipients, minimum severity,
// an hourly rate limit, a batching window (seconds), and whether resolved
// alerts are included.
type OpsEmailAlertConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
MinSeverity string `json:"min_severity"`
RateLimitPerHour int `json:"rate_limit_per_hour"`
BatchingWindowSeconds int `json:"batching_window_seconds"`
IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
}
// OpsEmailReportConfig configures scheduled report emails (daily/weekly
// summaries, error digests, account health). The *Schedule fields hold
// schedule expressions — format (e.g. cron) not defined here; confirm
// against the scheduler that consumes them.
type OpsEmailReportConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
DailySummaryEnabled bool `json:"daily_summary_enabled"`
DailySummarySchedule string `json:"daily_summary_schedule"`
WeeklySummaryEnabled bool `json:"weekly_summary_enabled"`
WeeklySummarySchedule string `json:"weekly_summary_schedule"`
ErrorDigestEnabled bool `json:"error_digest_enabled"`
ErrorDigestSchedule string `json:"error_digest_schedule"`
ErrorDigestMinCount int `json:"error_digest_min_count"`
AccountHealthEnabled bool `json:"account_health_enabled"`
AccountHealthSchedule string `json:"account_health_schedule"`
AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
}
// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
// frontend can still send the full config shape. A nil section means
// "leave that section unchanged".
type OpsEmailNotificationConfigUpdateRequest struct {
Alert *OpsEmailAlertConfig `json:"alert"`
Report *OpsEmailReportConfig `json:"report"`
}
// OpsDistributedLockSettings configures the distributed lock used by alert
// evaluation (see OpsAlertRuntimeSettings): lock key name and TTL in seconds.
type OpsDistributedLockSettings struct {
Enabled bool `json:"enabled"`
Key string `json:"key"`
TTLSeconds int `json:"ttl_seconds"`
}
// OpsAlertSilenceEntry silences alerts until UntilRFC3339 (an RFC 3339
// timestamp string), scoped to a specific rule (RuleID) and/or a set of
// severities; nil/empty scope fields widen the match.
type OpsAlertSilenceEntry struct {
RuleID *int64 `json:"rule_id,omitempty"`
Severities []string `json:"severities,omitempty"`
UntilRFC3339 string `json:"until_rfc3339"`
Reason string `json:"reason"`
}
// OpsAlertSilencingSettings holds a global silence window (GlobalUntilRFC3339)
// plus per-rule/per-severity silence Entries.
type OpsAlertSilencingSettings struct {
Enabled bool `json:"enabled"`
GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
GlobalReason string `json:"global_reason"`
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}
// OpsAlertRuntimeSettings controls how the alert evaluator runs: the
// evaluation interval, the distributed lock (for multi-instance safety),
// and silencing configuration.
type OpsAlertRuntimeSettings struct {
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
Silencing OpsAlertSilencingSettings `json:"silencing"`
}

View File

@@ -0,0 +1,65 @@
package service
import "time"
// OpsThroughputTrendPoint is one time bucket of the throughput trend:
// request/token totals plus the derived QPS/TPS rates for that bucket.
type OpsThroughputTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
QPS float64 `json:"qps"`
TPS float64 `json:"tps"`
}
// OpsThroughputPlatformBreakdownItem is a per-platform throughput total
// used for drilldown in OpsThroughputTrendResponse.
type OpsThroughputPlatformBreakdownItem struct {
Platform string `json:"platform"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
// OpsThroughputGroupBreakdownItem is a per-group throughput total used for
// drilldown in OpsThroughputTrendResponse.
type OpsThroughputGroupBreakdownItem struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
// OpsThroughputTrendResponse is the throughput trend for the selected window.
// Bucket names the bucket granularity; Points are ordered time buckets.
type OpsThroughputTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsThroughputTrendPoint `json:"points"`
// Optional drilldown helpers:
// - When no platform/group is selected: returns totals by platform.
// - When platform is selected but group is not: returns top groups in that platform.
ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"`
}
// OpsErrorTrendPoint is one time bucket of the error trend, with vNext
// breakdowns: total vs SLA-scope errors, business-limited requests, and
// upstream 429/529 vs other upstream errors.
type OpsErrorTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
}
// OpsErrorTrendResponse is the error trend: bucket granularity name plus
// ordered time-bucket points.
type OpsErrorTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsErrorTrendPoint `json:"points"`
}
// OpsErrorDistributionItem is the error count for one HTTP status code,
// split into SLA-scope and business-limited portions.
type OpsErrorDistributionItem struct {
StatusCode int `json:"status_code"`
Total int64 `json:"total"`
SLA int64 `json:"sla"`
BusinessLimited int64 `json:"business_limited"`
}
// OpsErrorDistributionResponse is the per-status-code error distribution;
// Total is the overall error count across Items.
type OpsErrorDistributionResponse struct {
Total int64 `json:"total"`
Items []*OpsErrorDistributionItem `json:"items"`
}

View File

@@ -0,0 +1,707 @@
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
-- NOTE(review): SET LOCAL only takes effect inside a transaction; ensure the
-- migration runner wraps each migration in one, otherwise these timeouts are no-ops.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
CREATE TABLE IF NOT EXISTS ops_error_logs (
id BIGSERIAL PRIMARY KEY,
-- Correlation / identities
request_id VARCHAR(64),
client_request_id VARCHAR(64),
user_id BIGINT,
api_key_id BIGINT,
account_id BIGINT,
group_id BIGINT,
client_ip inet,
-- Dimensions for global filtering
platform VARCHAR(32),
-- Request metadata
model VARCHAR(100),
request_path VARCHAR(256),
stream BOOLEAN NOT NULL DEFAULT false,
user_agent TEXT,
-- Core error classification
error_phase VARCHAR(32) NOT NULL,
error_type VARCHAR(64) NOT NULL,
severity VARCHAR(8) NOT NULL DEFAULT 'P2',
status_code INT,
-- vNext metric semantics
is_business_limited BOOLEAN NOT NULL DEFAULT false,
-- Error details (sanitized/truncated at ingest time)
error_message TEXT,
error_body TEXT,
-- Provider/upstream details (optional; useful for trends & account health)
error_source VARCHAR(64),
error_owner VARCHAR(32),
account_status VARCHAR(50),
upstream_status_code INT,
upstream_error_message TEXT,
upstream_error_detail TEXT,
provider_error_code VARCHAR(64),
provider_error_type VARCHAR(64),
network_error_type VARCHAR(50),
retry_after_seconds INT,
-- Timings (ms) - optional
-- NOTE(review): duration_ms is INT while the other latency columns are BIGINT;
-- confirm this asymmetry is intentional (INT caps at ~24.8 days in ms, so it is
-- likely fine, but it differs from the Go DTO's *int64 timing fields).
duration_ms INT,
time_to_first_token_ms BIGINT,
auth_latency_ms BIGINT,
routing_latency_ms BIGINT,
upstream_latency_ms BIGINT,
response_latency_ms BIGINT,
-- Retry context (only stored for error requests)
request_body JSONB,
request_headers JSONB,
request_body_truncated BOOLEAN NOT NULL DEFAULT false,
request_body_bytes INT,
-- Retryability flags (best-effort classification)
is_retryable BOOLEAN NOT NULL DEFAULT false,
retry_count INT NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
-- One row per retry of an ops_error_logs entry; see section 033 for the
-- partial unique index that prevents concurrent active retries per error.
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
requested_by_user_id BIGINT,
source_error_id BIGINT,
-- client|upstream
mode VARCHAR(16) NOT NULL,
pinned_account_id BIGINT,
-- queued|running|succeeded|failed
status VARCHAR(16) NOT NULL DEFAULT 'queued',
started_at TIMESTAMPTZ,
finished_at TIMESTAMPTZ,
duration_ms BIGINT,
-- Optional result correlation
result_request_id VARCHAR(64),
result_error_id BIGINT,
result_usage_request_id VARCHAR(64),
error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
-- Periodic snapshots over a window_minutes window; platform/group_id are NULL
-- for global snapshots and set only for per-dimension rows.
CREATE TABLE IF NOT EXISTS ops_system_metrics (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
window_minutes INT NOT NULL DEFAULT 1,
-- Optional dimensions (only if collector chooses to write per-dimension snapshots)
platform VARCHAR(32),
group_id BIGINT,
-- Core counts
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Rates
qps DOUBLE PRECISION,
tps DOUBLE PRECISION,
-- Duration percentiles (ms) - success requests
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
duration_avg_ms DOUBLE PRECISION,
duration_max_ms INT,
-- TTFT percentiles (ms) - success requests (streaming)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
ttft_avg_ms DOUBLE PRECISION,
ttft_max_ms INT,
-- System resources
cpu_usage_percent DOUBLE PRECISION,
memory_used_mb BIGINT,
memory_total_mb BIGINT,
memory_usage_percent DOUBLE PRECISION,
-- Dependency health (best-effort)
db_ok BOOLEAN,
redis_ok BOOLEAN,
-- DB pool & runtime
db_conn_active INT,
db_conn_idle INT,
db_conn_waiting INT,
goroutine_count INT,
-- Queue / concurrency
concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per background job (job_name is the primary key), tracking its
-- last run/success/error and duration.
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
job_name VARCHAR(64) PRIMARY KEY,
last_run_at TIMESTAMPTZ,
last_success_at TIMESTAMPTZ,
last_error_at TIMESTAMPTZ,
last_error TEXT,
last_duration_ms BIGINT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
CREATE TABLE IF NOT EXISTS ops_alert_rules (
id BIGSERIAL PRIMARY KEY,
name VARCHAR(128) NOT NULL,
description TEXT,
enabled BOOLEAN NOT NULL DEFAULT true,
severity VARCHAR(16) NOT NULL DEFAULT 'warning',
-- Metric definition
metric_type VARCHAR(64) NOT NULL,
operator VARCHAR(8) NOT NULL,
threshold DOUBLE PRECISION NOT NULL,
window_minutes INT NOT NULL DEFAULT 5,
sustained_minutes INT NOT NULL DEFAULT 5,
cooldown_minutes INT NOT NULL DEFAULT 10,
-- Optional scoping: platform/group filters etc.
filters JSONB,
last_triggered_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Rule names are unique so the 036 seed inserts can rely on ON CONFLICT (name).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
ON ops_alert_rules (enabled);
-- Alert events fired from ops_alert_rules. rule_id is intentionally not a
-- foreign key (ops_* tables avoid strict FKs to reduce write amplification/locks).
CREATE TABLE IF NOT EXISTS ops_alert_events (
id BIGSERIAL PRIMARY KEY,
rule_id BIGINT,
severity VARCHAR(16) NOT NULL,
status VARCHAR(16) NOT NULL DEFAULT 'firing',
title VARCHAR(200),
description TEXT,
metric_value DOUBLE PRECISION,
threshold_value DOUBLE PRECISION,
dimensions JSONB,
fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
resolved_at TIMESTAMPTZ,
email_sent BOOLEAN NOT NULL DEFAULT false,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1-24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
id BIGSERIAL PRIMARY KEY,
bucket_start TIMESTAMPTZ NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Duration percentiles (ms)
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
-- TTFT percentiles (ms)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three "dimension modes" (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
-- NOTE(review): upserts into this table must use
-- ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
-- to match this expression index.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
ON ops_metrics_hourly (
bucket_start,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
ON ops_metrics_hourly (bucket_start DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
ON ops_metrics_hourly (platform, bucket_start DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
ON ops_metrics_hourly (group_id, bucket_start DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same shape as ops_metrics_hourly but bucketed by date; see the hourly
-- table's notes on the COALESCE-based unique index.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
id BIGSERIAL PRIMARY KEY,
bucket_date DATE NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
ON ops_metrics_daily (
bucket_date,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
ON ops_metrics_daily (platform, bucket_date DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
ON ops_metrics_daily (group_id, bucket_date DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- NOTE(review): plain CREATE INDEX takes a lock that blocks writes on these
-- high-write tables; acceptable here because the tables are created empty in
-- this same squashed migration.
-- ops_error_logs
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
ON ops_error_logs (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
ON ops_error_logs (account_id, created_at DESC)
WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
ON ops_error_logs (error_type, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
ON ops_system_metrics (platform, created_at DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
ON ops_system_metrics (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
ON ops_retry_attempts (source_error_id, created_at DESC)
WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
ON ops_retry_attempts (source_error_id)
WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Best-effort: CREATE EXTENSION requires privileges; failures are swallowed
-- (RAISE NOTICE) and the trigram indexes are only built when the extension
-- actually exists, so this never fails the migration chain.
DO $$
BEGIN
BEGIN
CREATE EXTENSION IF NOT EXISTS pg_trgm;
EXCEPTION WHEN OTHERS THEN
-- Missing privileges or extension package should not block migrations.
RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
END;
IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
-- request_id / client_request_id fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
ON ops_error_logs USING gin (request_id gin_trgm_ops)';
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
-- error_message fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
ON ops_error_logs USING gin (error_message gin_trgm_ops)';
END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
--   it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
--   approximate long-window summaries.
-- Bound lock waits / runtime so a busy table cannot stall the deploy indefinitely.
-- NOTE(review): SET LOCAL only takes effect inside a transaction block; assumes the
-- migration runner wraps each migration in a transaction — confirm, otherwise these
-- are no-ops with a warning.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table
-- New columns are nullable on purpose: historical preagg rows simply have NULL
-- avg/max until (re)aggregation fills them in.
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table (same shape as the hourly additions above)
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent (ADD COLUMN IF NOT EXISTS).
-- Defaults to true so pre-existing rules keep emailing without a backfill step;
-- matches the NotifyEmail field on the OpsAlertRule DTO.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--   (Requires a unique constraint/index on ops_alert_rules.name — defined with the table.)
--   Operator-tuned rules are therefore never overwritten on re-run: a name match
--   means "already seeded / already customized", and the insert is skipped.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
--   - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
--   - upstream_error_rate excludes 429/529.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- All eight seeds share the same column list; per-rule VALUES map to:
--   (name, description, enabled, metric_type, operator, threshold,
--    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
--    created_at, updated_at)
-- Severity ladder: P0 = page-worthy outage, P1 = urgent degradation, P2 = latency/capacity warning.
-- Cooldown scales with severity (P0 15min < P1 20min < P2 30min) to avoid alert storms
-- while keeping the most critical rules the quickest to re-fire.
-- 1) High error rate (P1): >5% over a 5-minute window, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率过高',
    '当错误率超过 5% 且持续 5 分钟时触发告警',
    true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 2) Low success rate (P0): availability drop below 95%, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '成功率过低',
    '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
    true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 3) P99 latency too high (P2): >3000ms, 5-minute window sustained for 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P99延迟过高',
    '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
    true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 4) P95 latency too high (P2): lower threshold (2000ms) than the P99 rule above.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P95延迟过高',
    '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
    true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 5) CPU usage too high (P2): >85%, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'CPU使用率过高',
    '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
    true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 6) Memory usage too high (P1): >90% sustained 10 minutes; ranked above CPU
--    because sustained memory pressure risks OOM kills.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '内存使用率过高',
    '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM',
    true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 7) Concurrency queue buildup (P1): queue depth > 100, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '并发队列积压',
    '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
    true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 8) Extremely high error rate (P0): fast-trip escalation of rule 1 — 1-minute
--    window and 1-minute sustain so severe outages (>20% errors) fire immediately.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率极高',
    '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;