From d55866d3755fb6c4b109e225fd52d62414a4c0e5 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:52:17 +0800 Subject: [PATCH] =?UTF-8?q?feat(=E6=95=B0=E6=8D=AE=E5=BA=93):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=92=8C=E6=95=B0=E6=8D=AE=E5=BA=93=E8=BF=81?= =?UTF-8?q?=E7=A7=BB=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops 监控数据库迁移脚本(表结构定义) - 定义核心数据模型(ops_models.go) - 定义告警相关模型(ops_alert_models.go) - 定义仪表板数据模型(ops_dashboard_models.go) - 定义实时监控数据模型(ops_realtime_models.go) - 定义配置相关模型(ops_settings_models.go) - 定义趋势分析数据模型(ops_trend_models.go) --- backend/internal/service/ops_alert_models.go | 75 ++ .../internal/service/ops_dashboard_models.go | 83 ++ backend/internal/service/ops_models.go | 118 +++ .../internal/service/ops_realtime_models.go | 81 ++ .../internal/service/ops_settings_models.go | 70 ++ backend/internal/service/ops_trend_models.go | 65 ++ .../migrations/030_ops_monitoring_vnext.sql | 707 ++++++++++++++++++ 7 files changed, 1199 insertions(+) create mode 100644 backend/internal/service/ops_alert_models.go create mode 100644 backend/internal/service/ops_dashboard_models.go create mode 100644 backend/internal/service/ops_models.go create mode 100644 backend/internal/service/ops_realtime_models.go create mode 100644 backend/internal/service/ops_settings_models.go create mode 100644 backend/internal/service/ops_trend_models.go create mode 100644 backend/migrations/030_ops_monitoring_vnext.sql diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go new file mode 100644 index 00000000..783a3d1e --- /dev/null +++ b/backend/internal/service/ops_alert_models.go @@ -0,0 +1,75 @@ +package service + +import "time" + +// Ops alert rule/event models. 
+// +// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned +// with the existing ops dashboard frontend (backup style). + +const ( + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" +) + +type OpsAlertRule struct { + ID int64 `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + + Enabled bool `json:"enabled"` + Severity string `json:"severity"` + + MetricType string `json:"metric_type"` + Operator string `json:"operator"` + Threshold float64 `json:"threshold"` + + WindowMinutes int `json:"window_minutes"` + SustainedMinutes int `json:"sustained_minutes"` + CooldownMinutes int `json:"cooldown_minutes"` + + NotifyEmail bool `json:"notify_email"` + + Filters map[string]any `json:"filters,omitempty"` + + LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsAlertEvent struct { + ID int64 `json:"id"` + RuleID int64 `json:"rule_id"` + Severity string `json:"severity"` + Status string `json:"status"` + + Title string `json:"title"` + Description string `json:"description"` + + MetricValue *float64 `json:"metric_value,omitempty"` + ThresholdValue *float64 `json:"threshold_value,omitempty"` + + Dimensions map[string]any `json:"dimensions,omitempty"` + + FiredAt time.Time `json:"fired_at"` + ResolvedAt *time.Time `json:"resolved_at,omitempty"` + + EmailSent bool `json:"email_sent"` + CreatedAt time.Time `json:"created_at"` +} + +type OpsAlertEventFilter struct { + Limit int + + // Optional filters. + Status string + Severity string + + StartTime *time.Time + EndTime *time.Time + + // Dimensions filters (best-effort). 
+ Platform string + GroupID *int64 +} + diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go new file mode 100644 index 00000000..51a0b1fb --- /dev/null +++ b/backend/internal/service/ops_dashboard_models.go @@ -0,0 +1,83 @@ +package service + +import "time" + +type OpsDashboardFilter struct { + StartTime time.Time + EndTime time.Time + + Platform string + GroupID *int64 + + // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables. + // Expected values: auto/raw/preagg (see OpsQueryMode). + QueryMode OpsQueryMode +} + +type OpsRateSummary struct { + Current float64 `json:"current"` + Peak float64 `json:"peak"` + Avg float64 `json:"avg"` +} + +type OpsPercentiles struct { + P50 *int `json:"p50_ms"` + P90 *int `json:"p90_ms"` + P95 *int `json:"p95_ms"` + P99 *int `json:"p99_ms"` + Avg *int `json:"avg_ms"` + Max *int `json:"max_ms"` +} + +type OpsDashboardOverview struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + // Latest system-level snapshot (window=1m, global). + SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` + + // Background jobs health (heartbeats). 
+ JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + + ErrorCountSLA int64 `json:"error_count_sla"` + RequestCountTotal int64 `json:"request_count_total"` + RequestCountSLA int64 `json:"request_count_sla"` + + TokenConsumed int64 `json:"token_consumed"` + + SLA float64 `json:"sla"` + ErrorRate float64 `json:"error_rate"` + UpstreamErrorRate float64 `json:"upstream_error_rate"` + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` + + QPS OpsRateSummary `json:"qps"` + TPS OpsRateSummary `json:"tps"` + + Duration OpsPercentiles `json:"duration"` + TTFT OpsPercentiles `json:"ttft"` +} + +type OpsLatencyHistogramBucket struct { + Range string `json:"range"` + Count int64 `json:"count"` +} + +// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only). +// It is used by the Ops dashboard to quickly identify tail latency regressions. 
+type OpsLatencyHistogramResponse struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + TotalRequests int64 `json:"total_requests"` + Buckets []*OpsLatencyHistogramBucket `json:"buckets"` +} diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go new file mode 100644 index 00000000..90b2dc47 --- /dev/null +++ b/backend/internal/service/ops_models.go @@ -0,0 +1,118 @@ +package service + +import "time" + +type OpsErrorLog struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + Phase string `json:"phase"` + Type string `json:"type"` + Severity string `json:"severity"` + + StatusCode int `json:"status_code"` + Platform string `json:"platform"` + Model string `json:"model"` + + LatencyMs *int `json:"latency_ms"` + + ClientRequestID string `json:"client_request_id"` + RequestID string `json:"request_id"` + Message string `json:"message"` + + UserID *int64 `json:"user_id"` + APIKeyID *int64 `json:"api_key_id"` + AccountID *int64 `json:"account_id"` + GroupID *int64 `json:"group_id"` + + ClientIP *string `json:"client_ip"` + RequestPath string `json:"request_path"` + Stream bool `json:"stream"` +} + +type OpsErrorLogDetail struct { + OpsErrorLog + + ErrorBody string `json:"error_body"` + UserAgent string `json:"user_agent"` + + // Timings (optional) + AuthLatencyMs *int64 `json:"auth_latency_ms"` + RoutingLatencyMs *int64 `json:"routing_latency_ms"` + UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` + ResponseLatencyMs *int64 `json:"response_latency_ms"` + TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` + + // Retry context + RequestBody string `json:"request_body"` + RequestBodyTruncated bool `json:"request_body_truncated"` + RequestBodyBytes *int `json:"request_body_bytes"` + RequestHeaders string `json:"request_headers,omitempty"` + + // vNext metric semantics + IsBusinessLimited bool 
`json:"is_business_limited"` +} + +type OpsErrorLogFilter struct { + StartTime *time.Time + EndTime *time.Time + + Platform string + GroupID *int64 + AccountID *int64 + + StatusCodes []int + Phase string + Query string + + Page int + PageSize int +} + +type OpsErrorLogList struct { + Errors []*OpsErrorLog `json:"errors"` + Total int `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +type OpsRetryAttempt struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + RequestedByUserID int64 `json:"requested_by_user_id"` + SourceErrorID int64 `json:"source_error_id"` + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` + + Status string `json:"status"` + StartedAt *time.Time `json:"started_at"` + FinishedAt *time.Time `json:"finished_at"` + DurationMs *int64 `json:"duration_ms"` + + ResultRequestID *string `json:"result_request_id"` + ResultErrorID *int64 `json:"result_error_id"` + + ErrorMessage *string `json:"error_message"` +} + +type OpsRetryResult struct { + AttemptID int64 `json:"attempt_id"` + Mode string `json:"mode"` + Status string `json:"status"` + + PinnedAccountID *int64 `json:"pinned_account_id"` + UsedAccountID *int64 `json:"used_account_id"` + + HTTPStatusCode int `json:"http_status_code"` + UpstreamRequestID string `json:"upstream_request_id"` + + ResponsePreview string `json:"response_preview"` + ResponseTruncated bool `json:"response_truncated"` + + ErrorMessage string `json:"error_message"` + + StartedAt time.Time `json:"started_at"` + FinishedAt time.Time `json:"finished_at"` + DurationMs int64 `json:"duration_ms"` +} diff --git a/backend/internal/service/ops_realtime_models.go b/backend/internal/service/ops_realtime_models.go new file mode 100644 index 00000000..f7514a24 --- /dev/null +++ b/backend/internal/service/ops_realtime_models.go @@ -0,0 +1,81 @@ +package service + +import "time" + +// PlatformConcurrencyInfo aggregates concurrency usage by platform. 
+type PlatformConcurrencyInfo struct { + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// GroupConcurrencyInfo aggregates concurrency usage by group. +// +// Note: one account can belong to multiple groups; group totals are therefore not additive across groups. +type GroupConcurrencyInfo struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// AccountConcurrencyInfo represents real-time concurrency usage for a single account. +type AccountConcurrencyInfo struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// PlatformAvailability aggregates account availability by platform. +type PlatformAvailability struct { + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// GroupAvailability aggregates account availability by group. 
+type GroupAvailability struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// AccountAvailability represents current availability for a single account. +type AccountAvailability struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + + Status string `json:"status"` + + IsAvailable bool `json:"is_available"` + IsRateLimited bool `json:"is_rate_limited"` + IsOverloaded bool `json:"is_overloaded"` + HasError bool `json:"has_error"` + + RateLimitResetAt *time.Time `json:"rate_limit_reset_at"` + RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"` + OverloadUntil *time.Time `json:"overload_until"` + OverloadRemainingSec *int64 `json:"overload_remaining_sec"` + ErrorMessage string `json:"error_message"` + TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"` +} diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go new file mode 100644 index 00000000..78399c49 --- /dev/null +++ b/backend/internal/service/ops_settings_models.go @@ -0,0 +1,70 @@ +package service + +// Ops settings models stored in DB `settings` table (JSON blobs). 
+ +type OpsEmailNotificationConfig struct { + Alert OpsEmailAlertConfig `json:"alert"` + Report OpsEmailReportConfig `json:"report"` +} + +type OpsEmailAlertConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + MinSeverity string `json:"min_severity"` + RateLimitPerHour int `json:"rate_limit_per_hour"` + BatchingWindowSeconds int `json:"batching_window_seconds"` + IncludeResolvedAlerts bool `json:"include_resolved_alerts"` +} + +type OpsEmailReportConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + DailySummaryEnabled bool `json:"daily_summary_enabled"` + DailySummarySchedule string `json:"daily_summary_schedule"` + WeeklySummaryEnabled bool `json:"weekly_summary_enabled"` + WeeklySummarySchedule string `json:"weekly_summary_schedule"` + ErrorDigestEnabled bool `json:"error_digest_enabled"` + ErrorDigestSchedule string `json:"error_digest_schedule"` + ErrorDigestMinCount int `json:"error_digest_min_count"` + AccountHealthEnabled bool `json:"account_health_enabled"` + AccountHealthSchedule string `json:"account_health_schedule"` + AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"` +} + +// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the +// frontend can still send the full config shape. 
+type OpsEmailNotificationConfigUpdateRequest struct { + Alert *OpsEmailAlertConfig `json:"alert"` + Report *OpsEmailReportConfig `json:"report"` +} + +type OpsDistributedLockSettings struct { + Enabled bool `json:"enabled"` + Key string `json:"key"` + TTLSeconds int `json:"ttl_seconds"` +} + +type OpsAlertSilenceEntry struct { + RuleID *int64 `json:"rule_id,omitempty"` + Severities []string `json:"severities,omitempty"` + + UntilRFC3339 string `json:"until_rfc3339"` + Reason string `json:"reason"` +} + +type OpsAlertSilencingSettings struct { + Enabled bool `json:"enabled"` + + GlobalUntilRFC3339 string `json:"global_until_rfc3339"` + GlobalReason string `json:"global_reason"` + + Entries []OpsAlertSilenceEntry `json:"entries,omitempty"` +} + +type OpsAlertRuntimeSettings struct { + EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"` + + DistributedLock OpsDistributedLockSettings `json:"distributed_lock"` + Silencing OpsAlertSilencingSettings `json:"silencing"` +} + diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go new file mode 100644 index 00000000..f6d07c14 --- /dev/null +++ b/backend/internal/service/ops_trend_models.go @@ -0,0 +1,65 @@ +package service + +import "time" + +type OpsThroughputTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` + QPS float64 `json:"qps"` + TPS float64 `json:"tps"` +} + +type OpsThroughputPlatformBreakdownItem struct { + Platform string `json:"platform"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputGroupBreakdownItem struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputTrendResponse struct { + Bucket string `json:"bucket"` + + Points 
[]*OpsThroughputTrendPoint `json:"points"` + + // Optional drilldown helpers: + // - When no platform/group is selected: returns totals by platform. + // - When platform is selected but group is not: returns top groups in that platform. + ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"` + TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"` +} + +type OpsErrorTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + ErrorCountSLA int64 `json:"error_count_sla"` + + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` +} + +type OpsErrorTrendResponse struct { + Bucket string `json:"bucket"` + Points []*OpsErrorTrendPoint `json:"points"` +} + +type OpsErrorDistributionItem struct { + StatusCode int `json:"status_code"` + Total int64 `json:"total"` + SLA int64 `json:"sla"` + BusinessLimited int64 `json:"business_limited"` +} + +type OpsErrorDistributionResponse struct { + Total int64 `json:"total"` + Items []*OpsErrorDistributionItem `json:"items"` +} diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/030_ops_monitoring_vnext.sql new file mode 100644 index 00000000..39b19e5d --- /dev/null +++ b/backend/migrations/030_ops_monitoring_vnext.sql @@ -0,0 +1,707 @@ +-- Ops Monitoring (vNext): squashed migration (030) +-- +-- This repository originally planned Ops vNext as migrations 030-036: +-- 030 drop legacy ops tables +-- 031 core schema +-- 032 pre-aggregation tables +-- 033 indexes + optional extensions +-- 034 add avg/max to preagg +-- 035 add notify_email to alert rules +-- 036 seed default alert rules +-- +-- Since these migrations have NOT been applied to any environment yet, we squash them +-- into a single 030 migration for easier 
review and a cleaner migration history. +-- +-- Notes: +-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts). +-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run. + +-- ===================================================================== +-- 030_ops_drop_legacy_ops_tables.sql +-- ===================================================================== + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Legacy pre-aggregation tables (from 026 and/or previous branches) +DROP TABLE IF EXISTS ops_metrics_daily CASCADE; +DROP TABLE IF EXISTS ops_metrics_hourly CASCADE; + +-- Core ops tables that may exist in some deployments / branches +DROP TABLE IF EXISTS ops_system_metrics CASCADE; +DROP TABLE IF EXISTS ops_error_logs CASCADE; +DROP TABLE IF EXISTS ops_alert_events CASCADE; +DROP TABLE IF EXISTS ops_alert_rules CASCADE; +DROP TABLE IF EXISTS ops_job_heartbeats CASCADE; +DROP TABLE IF EXISTS ops_retry_attempts CASCADE; + +-- Optional legacy tables (best-effort cleanup) +DROP TABLE IF EXISTS ops_scheduled_reports CASCADE; +DROP TABLE IF EXISTS ops_group_availability_configs CASCADE; +DROP TABLE IF EXISTS ops_group_availability_events CASCADE; + +-- Optional legacy views/indexes +DROP VIEW IF EXISTS ops_latest_metrics CASCADE; + +-- ===================================================================== +-- 031_ops_core_schema.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts) +-- +-- Design goals: +-- - Support global filtering (time/platform/group) across all ops modules. +-- - Persist enough context for two retry modes (client retry / pinned upstream retry). +-- - Make ops background jobs observable via job heartbeats. +-- - Keep schema stable and indexes targeted (high-write tables). +-- +-- Notes: +-- - This migration is idempotent. 
+-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: error log details (high-write) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_error_logs ( + id BIGSERIAL PRIMARY KEY, + + -- Correlation / identities + request_id VARCHAR(64), + client_request_id VARCHAR(64), + user_id BIGINT, + api_key_id BIGINT, + account_id BIGINT, + group_id BIGINT, + client_ip inet, + + -- Dimensions for global filtering + platform VARCHAR(32), + + -- Request metadata + model VARCHAR(100), + request_path VARCHAR(256), + stream BOOLEAN NOT NULL DEFAULT false, + user_agent TEXT, + + -- Core error classification + error_phase VARCHAR(32) NOT NULL, + error_type VARCHAR(64) NOT NULL, + severity VARCHAR(8) NOT NULL DEFAULT 'P2', + status_code INT, + + -- vNext metric semantics + is_business_limited BOOLEAN NOT NULL DEFAULT false, + + -- Error details (sanitized/truncated at ingest time) + error_message TEXT, + error_body TEXT, + + -- Provider/upstream details (optional; useful for trends & account health) + error_source VARCHAR(64), + error_owner VARCHAR(32), + account_status VARCHAR(50), + upstream_status_code INT, + upstream_error_message TEXT, + upstream_error_detail TEXT, + provider_error_code VARCHAR(64), + provider_error_type VARCHAR(64), + network_error_type VARCHAR(50), + retry_after_seconds INT, + + -- Timings (ms) - optional + duration_ms INT, + time_to_first_token_ms BIGINT, + auth_latency_ms BIGINT, + routing_latency_ms BIGINT, + upstream_latency_ms BIGINT, + response_latency_ms BIGINT, + + -- Retry context (only stored for error requests) + request_body JSONB, + request_headers JSONB, + request_body_truncated BOOLEAN NOT NULL DEFAULT false, + request_body_bytes INT, + + -- Retryability flags (best-effort classification) + is_retryable BOOLEAN NOT NULL DEFAULT 
false, + retry_count INT NOT NULL DEFAULT 0, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).'; + +-- ============================================ +-- 2) ops_retry_attempts: audit log for retries +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_retry_attempts ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + requested_by_user_id BIGINT, + source_error_id BIGINT, + + -- client|upstream + mode VARCHAR(16) NOT NULL, + pinned_account_id BIGINT, + + -- queued|running|succeeded|failed + status VARCHAR(16) NOT NULL DEFAULT 'queued', + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + duration_ms BIGINT, + + -- Optional result correlation + result_request_id VARCHAR(64), + result_error_id BIGINT, + result_usage_request_id VARCHAR(64), + + error_message TEXT +); + +COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).'; + +-- ============================================ +-- 3) ops_system_metrics: system + request window snapshots +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_system_metrics ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + window_minutes INT NOT NULL DEFAULT 1, + + -- Optional dimensions (only if collector chooses to write per-dimension snapshots) + platform VARCHAR(32), + group_id BIGINT, + + -- Core counts + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Rates + qps DOUBLE 
PRECISION, + tps DOUBLE PRECISION, + + -- Duration percentiles (ms) - success requests + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + duration_avg_ms DOUBLE PRECISION, + duration_max_ms INT, + + -- TTFT percentiles (ms) - success requests (streaming) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + ttft_avg_ms DOUBLE PRECISION, + ttft_max_ms INT, + + -- System resources + cpu_usage_percent DOUBLE PRECISION, + memory_used_mb BIGINT, + memory_total_mb BIGINT, + memory_usage_percent DOUBLE PRECISION, + + -- Dependency health (best-effort) + db_ok BOOLEAN, + redis_ok BOOLEAN, + + -- DB pool & runtime + db_conn_active INT, + db_conn_idle INT, + db_conn_waiting INT, + goroutine_count INT, + + -- Queue / concurrency + concurrency_queue_depth INT +); + +COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.'; + +-- ============================================ +-- 4) ops_job_heartbeats: background jobs health +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_job_heartbeats ( + job_name VARCHAR(64) PRIMARY KEY, + + last_run_at TIMESTAMPTZ, + last_success_at TIMESTAMPTZ, + last_error_at TIMESTAMPTZ, + last_error TEXT, + last_duration_ms BIGINT, + + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).'; + +-- ============================================ +-- 5) ops_alert_rules / ops_alert_events +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_alert_rules ( + id BIGSERIAL PRIMARY KEY, + + name VARCHAR(128) NOT NULL, + description TEXT, + enabled BOOLEAN NOT NULL DEFAULT true, + + severity VARCHAR(16) NOT NULL DEFAULT 'warning', + + -- Metric definition + -- (metric_type, operator, threshold; evaluated over window_minutes) + metric_type VARCHAR(64) NOT NULL, + operator VARCHAR(8) NOT NULL, + threshold DOUBLE PRECISION NOT NULL, + 
+ window_minutes INT NOT NULL DEFAULT 5, + sustained_minutes INT NOT NULL DEFAULT 5, + cooldown_minutes INT NOT NULL DEFAULT 10, + + -- Optional scoping: platform/group filters etc. + filters JSONB, + + last_triggered_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique + ON ops_alert_rules (name); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled + ON ops_alert_rules (enabled); + +CREATE TABLE IF NOT EXISTS ops_alert_events ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT, + severity VARCHAR(16) NOT NULL, + status VARCHAR(16) NOT NULL DEFAULT 'firing', + + title VARCHAR(200), + description TEXT, + + metric_value DOUBLE PRECISION, + threshold_value DOUBLE PRECISION, + dimensions JSONB, + + fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + resolved_at TIMESTAMPTZ, + + email_sent BOOLEAN NOT NULL DEFAULT false, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status + ON ops_alert_events (rule_id, status); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at + ON ops_alert_events (fired_at DESC); + +-- ===================================================================== +-- 032_ops_preaggregation_tables.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): pre-aggregation tables +-- +-- Purpose: +-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive +-- percentile_cont scans on raw logs for every dashboard refresh. +-- - Support global filter dimensions: overall / platform / group. +-- +-- Design note: +-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a +-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres). 
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_metrics_hourly +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_hourly ( + id BIGSERIAL PRIMARY KEY, + + bucket_start TIMESTAMPTZ NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Duration percentiles (ms) + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + -- TTFT percentiles (ms) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Uniqueness across three “dimension modes” (overall / platform / group). +-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE. 
-- Unique dimension key for hourly pre-aggregation rows.
-- COALESCE folds NULL platform / NULL group_id into sentinel values ('' / 0)
-- so each (bucket, platform, group) combination maps to exactly one row;
-- a plain UNIQUE index would treat NULLs as distinct and allow duplicates.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
    ON ops_metrics_hourly (
        bucket_start,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );

-- Time-ordered scans for recent-first dashboard queries.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
    ON ops_metrics_hourly (bucket_start DESC);

-- Partial index for platform-level aggregates only: group_id IS NULL
-- distinguishes platform rows from per-group rows in the same table.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
    ON ops_metrics_hourly (platform, bucket_start DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

-- Partial index for per-group aggregates (0 is the "no group" sentinel).
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
    ON ops_metrics_hourly (group_id, bucket_start DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;

COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';

-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================

-- Daily rollup; mirrors ops_metrics_hourly but keyed by bucket_date.
-- Counter columns default to 0 so partial upserts stay additive; latency
-- percentile columns are nullable because small buckets may lack samples.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
    id BIGSERIAL PRIMARY KEY,

    bucket_date DATE NOT NULL,
    platform VARCHAR(32),
    group_id BIGINT,

    -- SLA-scope counters (see vNext metric semantics in migration 036 notes).
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,

    -- Upstream error breakdown; 429/529 are tracked separately because they
    -- are excluded from upstream_error_rate.
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,

    token_consumed BIGINT NOT NULL DEFAULT 0,

    -- Total request duration percentiles (milliseconds).
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,

    -- Time-to-first-token percentiles (milliseconds).
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,

    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Same COALESCE-based dimension key as the hourly table (see above).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
    ON ops_metrics_daily (
        bucket_date,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
    ON ops_metrics_daily (bucket_date DESC);

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
    ON ops_metrics_daily (platform, bucket_date DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
    ON ops_metrics_daily (group_id, bucket_date DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;

COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';

-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================

-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.

-- NOTE(review): SET LOCAL only takes effect inside a transaction block;
-- assumes the migration runner wraps each migration in a transaction — verify.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================

-- ops_error_logs: one (dimension, created_at DESC) index per common filter
-- so the error-log browser can filter + sort recent-first without a re-sort.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
    ON ops_error_logs (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
    ON ops_error_logs (platform, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
    ON ops_error_logs (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
    ON ops_error_logs (account_id, created_at DESC)
    WHERE account_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
    ON ops_error_logs (status_code, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
    ON ops_error_logs (error_phase, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
    ON ops_error_logs (error_type, created_at DESC);

-- Exact-match lookups by request correlation IDs (fuzzy search uses the
-- optional trigram indexes below when pg_trgm is available).
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
    ON ops_error_logs (request_id);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
    ON ops_error_logs (client_request_id);

-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
    ON ops_system_metrics (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
    ON ops_system_metrics (platform, created_at DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
    ON ops_system_metrics (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;

-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
    ON ops_retry_attempts (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
    ON ops_retry_attempts (source_error_id, created_at DESC)
    WHERE source_error_id IS NOT NULL;

-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
-- The partial predicate means at most one 'queued'/'running' attempt per source
-- error may exist; finished attempts ('succeeded'/'failed'/etc.) are not constrained.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
    ON ops_retry_attempts (source_error_id)
    WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');

-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================

DO $$
BEGIN
    -- The inner BEGIN/EXCEPTION block runs in a subtransaction, so a failed
    -- CREATE EXTENSION (missing privileges / package) is rolled back without
    -- aborting the outer migration.
    BEGIN
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
    EXCEPTION WHEN OTHERS THEN
        -- Missing privileges or extension package should not block migrations.
        RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
    END;

    -- Only build trigram indexes when the extension actually exists;
    -- EXECUTE is used because gin_trgm_ops is unknown at parse time otherwise.
    IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
        -- request_id / client_request_id fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
            ON ops_error_logs USING gin (request_id gin_trgm_ops)';
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
            ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';

        -- error_message fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
            ON ops_error_logs USING gin (error_message gin_trgm_ops)';
    END IF;
END $$;

-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================

-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
--   it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- Hourly table: add avg/max latency columns (nullable so existing preagg rows
-- remain valid; ADD COLUMN IF NOT EXISTS keeps the migration idempotent).
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;

-- Daily table: same avg/max extension as the hourly table.
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;

-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================

-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- Default true: existing rules keep emailing, matching prior behavior.
ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;

-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================

-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
--   - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
--   - upstream_error_rate excludes 429/529.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- NOTE(review): ON CONFLICT (name) requires a unique constraint/index on
-- ops_alert_rules.name — assumed to be defined in an earlier migration; verify.
-- Seed rule names/descriptions are stored data and must stay byte-stable, since
-- the conflict target (name) is what makes re-running these inserts a no-op.

-- 1) High error rate (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率过高',
    '当错误率超过 5% 且持续 5 分钟时触发告警',
    true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 2) Low success rate (P0)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '成功率过低',
    '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
    true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 3) P99 latency too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P99延迟过高',
    '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
    true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 4) P95 latency too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P95延迟过高',
    '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
    true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 5) CPU usage too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'CPU使用率过高',
    '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
    true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 6) Memory usage too high (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '内存使用率过高',
    '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
    true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 7) Concurrency queue buildup (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '并发队列积压',
    '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
    true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 8) Extremely high error rate (P0)
-- Tighter window (1 min) and lower cooldown than rule 1 so severe outages
-- page quickly; coexists with rule 1 because names (conflict keys) differ.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率极高',
    '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;