feat(database): add ops monitoring data models and database migration script

- Add the ops monitoring database migration script (table schema definitions)
- Define core data models (ops_models.go)
- Define alert-related models (ops_alert_models.go)
- Define dashboard data models (ops_dashboard_models.go)
- Define realtime monitoring data models (ops_realtime_models.go)
- Define configuration-related models (ops_settings_models.go)
- Define trend analysis data models (ops_trend_models.go)
backend/internal/service/ops_alert_models.go (new file, 75 lines)
@@ -0,0 +1,75 @@
package service

import "time"

// Ops alert rule/event models.
//
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
// with the existing ops dashboard frontend (backup style).

const (
    OpsAlertStatusFiring   = "firing"
    OpsAlertStatusResolved = "resolved"
)

type OpsAlertRule struct {
    ID          int64  `json:"id"`
    Name        string `json:"name"`
    Description string `json:"description"`

    Enabled  bool   `json:"enabled"`
    Severity string `json:"severity"`

    MetricType string  `json:"metric_type"`
    Operator   string  `json:"operator"`
    Threshold  float64 `json:"threshold"`

    WindowMinutes    int `json:"window_minutes"`
    SustainedMinutes int `json:"sustained_minutes"`
    CooldownMinutes  int `json:"cooldown_minutes"`

    NotifyEmail bool `json:"notify_email"`

    Filters map[string]any `json:"filters,omitempty"`

    LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
    CreatedAt       time.Time  `json:"created_at"`
    UpdatedAt       time.Time  `json:"updated_at"`
}

type OpsAlertEvent struct {
    ID       int64  `json:"id"`
    RuleID   int64  `json:"rule_id"`
    Severity string `json:"severity"`
    Status   string `json:"status"`

    Title       string `json:"title"`
    Description string `json:"description"`

    MetricValue    *float64 `json:"metric_value,omitempty"`
    ThresholdValue *float64 `json:"threshold_value,omitempty"`

    Dimensions map[string]any `json:"dimensions,omitempty"`

    FiredAt    time.Time  `json:"fired_at"`
    ResolvedAt *time.Time `json:"resolved_at,omitempty"`

    EmailSent bool      `json:"email_sent"`
    CreatedAt time.Time `json:"created_at"`
}

type OpsAlertEventFilter struct {
    Limit int

    // Optional filters.
    Status   string
    Severity string

    StartTime *time.Time
    EndTime   *time.Time

    // Dimensions filters (best-effort).
    Platform string
    GroupID  *int64
}
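Reviewer note: the rule fields above are enough to drive a simple threshold check — MetricType picks the measured value, Operator/Threshold decide firing, and CooldownMinutes suppresses re-firing. A minimal sketch of such a comparison, assuming a sampled metric value is already available; the evaluateRule helper is illustrative and not part of this commit.

package service

import "time"

// evaluateRule is a hypothetical helper sketching how Operator/Threshold and
// CooldownMinutes from OpsAlertRule could be applied to a sampled metric value.
func evaluateRule(rule *OpsAlertRule, value float64, now time.Time) bool {
    if !rule.Enabled {
        return false
    }
    // Suppress re-firing while the cooldown window is still active.
    if rule.LastTriggeredAt != nil &&
        now.Sub(*rule.LastTriggeredAt) < time.Duration(rule.CooldownMinutes)*time.Minute {
        return false
    }
    switch rule.Operator {
    case ">":
        return value > rule.Threshold
    case ">=":
        return value >= rule.Threshold
    case "<":
        return value < rule.Threshold
    case "<=":
        return value <= rule.Threshold
    default:
        return false
    }
}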
backend/internal/service/ops_dashboard_models.go (new file, 83 lines)
@@ -0,0 +1,83 @@
package service

import "time"

type OpsDashboardFilter struct {
    StartTime time.Time
    EndTime   time.Time

    Platform string
    GroupID  *int64

    // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
    // Expected values: auto/raw/preagg (see OpsQueryMode).
    QueryMode OpsQueryMode
}

type OpsRateSummary struct {
    Current float64 `json:"current"`
    Peak    float64 `json:"peak"`
    Avg     float64 `json:"avg"`
}

type OpsPercentiles struct {
    P50 *int `json:"p50_ms"`
    P90 *int `json:"p90_ms"`
    P95 *int `json:"p95_ms"`
    P99 *int `json:"p99_ms"`
    Avg *int `json:"avg_ms"`
    Max *int `json:"max_ms"`
}

type OpsDashboardOverview struct {
    StartTime time.Time `json:"start_time"`
    EndTime   time.Time `json:"end_time"`
    Platform  string    `json:"platform"`
    GroupID   *int64    `json:"group_id"`

    // Latest system-level snapshot (window=1m, global).
    SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`

    // Background jobs health (heartbeats).
    JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`

    SuccessCount         int64 `json:"success_count"`
    ErrorCountTotal      int64 `json:"error_count_total"`
    BusinessLimitedCount int64 `json:"business_limited_count"`

    ErrorCountSLA     int64 `json:"error_count_sla"`
    RequestCountTotal int64 `json:"request_count_total"`
    RequestCountSLA   int64 `json:"request_count_sla"`

    TokenConsumed int64 `json:"token_consumed"`

    SLA                          float64 `json:"sla"`
    ErrorRate                    float64 `json:"error_rate"`
    UpstreamErrorRate            float64 `json:"upstream_error_rate"`
    UpstreamErrorCountExcl429529 int64   `json:"upstream_error_count_excl_429_529"`
    Upstream429Count             int64   `json:"upstream_429_count"`
    Upstream529Count             int64   `json:"upstream_529_count"`

    QPS OpsRateSummary `json:"qps"`
    TPS OpsRateSummary `json:"tps"`

    Duration OpsPercentiles `json:"duration"`
    TTFT     OpsPercentiles `json:"ttft"`
}
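Reviewer note: the ratio fields (SLA, ErrorRate, UpstreamErrorRate) are derived from the raw counters above; per the vNext semantics documented in the migration below, SLA-scope counts exclude business-limited requests and the upstream rate excludes 429/529. A minimal sketch of that derivation, assuming the rates are expressed as percentages (as the seeded alert thresholds suggest); the helper is illustrative, not part of this commit.

package service

// fillDerivedRates is a hypothetical helper showing one plausible derivation of
// the ratio fields from the raw counters (percentage scale assumed).
func fillDerivedRates(o *OpsDashboardOverview) {
    if o.RequestCountSLA > 0 {
        o.SLA = float64(o.RequestCountSLA-o.ErrorCountSLA) / float64(o.RequestCountSLA) * 100
        o.ErrorRate = float64(o.ErrorCountSLA) / float64(o.RequestCountSLA) * 100
    }
    if o.RequestCountTotal > 0 {
        // The upstream counter already excludes 429/529 by construction.
        o.UpstreamErrorRate = float64(o.UpstreamErrorCountExcl429529) / float64(o.RequestCountTotal) * 100
    }
}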
type OpsLatencyHistogramBucket struct {
    Range string `json:"range"`
    Count int64  `json:"count"`
}

// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
// It is used by the Ops dashboard to quickly identify tail latency regressions.
type OpsLatencyHistogramResponse struct {
    StartTime time.Time `json:"start_time"`
    EndTime   time.Time `json:"end_time"`
    Platform  string    `json:"platform"`
    GroupID   *int64    `json:"group_id"`

    TotalRequests int64                        `json:"total_requests"`
    Buckets       []*OpsLatencyHistogramBucket `json:"buckets"`
}
backend/internal/service/ops_models.go (new file, 118 lines)
@@ -0,0 +1,118 @@
package service

import "time"

type OpsErrorLog struct {
    ID        int64     `json:"id"`
    CreatedAt time.Time `json:"created_at"`

    Phase    string `json:"phase"`
    Type     string `json:"type"`
    Severity string `json:"severity"`

    StatusCode int    `json:"status_code"`
    Platform   string `json:"platform"`
    Model      string `json:"model"`

    LatencyMs *int `json:"latency_ms"`

    ClientRequestID string `json:"client_request_id"`
    RequestID       string `json:"request_id"`
    Message         string `json:"message"`

    UserID    *int64 `json:"user_id"`
    APIKeyID  *int64 `json:"api_key_id"`
    AccountID *int64 `json:"account_id"`
    GroupID   *int64 `json:"group_id"`

    ClientIP    *string `json:"client_ip"`
    RequestPath string  `json:"request_path"`
    Stream      bool    `json:"stream"`
}

type OpsErrorLogDetail struct {
    OpsErrorLog

    ErrorBody string `json:"error_body"`
    UserAgent string `json:"user_agent"`

    // Timings (optional)
    AuthLatencyMs      *int64 `json:"auth_latency_ms"`
    RoutingLatencyMs   *int64 `json:"routing_latency_ms"`
    UpstreamLatencyMs  *int64 `json:"upstream_latency_ms"`
    ResponseLatencyMs  *int64 `json:"response_latency_ms"`
    TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`

    // Retry context
    RequestBody          string `json:"request_body"`
    RequestBodyTruncated bool   `json:"request_body_truncated"`
    RequestBodyBytes     *int   `json:"request_body_bytes"`
    RequestHeaders       string `json:"request_headers,omitempty"`

    // vNext metric semantics
    IsBusinessLimited bool `json:"is_business_limited"`
}

type OpsErrorLogFilter struct {
    StartTime *time.Time
    EndTime   *time.Time

    Platform  string
    GroupID   *int64
    AccountID *int64

    StatusCodes []int
    Phase       string
    Query       string

    Page     int
    PageSize int
}

type OpsErrorLogList struct {
    Errors   []*OpsErrorLog `json:"errors"`
    Total    int            `json:"total"`
    Page     int            `json:"page"`
    PageSize int            `json:"page_size"`
}

type OpsRetryAttempt struct {
    ID        int64     `json:"id"`
    CreatedAt time.Time `json:"created_at"`

    RequestedByUserID int64  `json:"requested_by_user_id"`
    SourceErrorID     int64  `json:"source_error_id"`
    Mode              string `json:"mode"`
    PinnedAccountID   *int64 `json:"pinned_account_id"`

    Status     string     `json:"status"`
    StartedAt  *time.Time `json:"started_at"`
    FinishedAt *time.Time `json:"finished_at"`
    DurationMs *int64     `json:"duration_ms"`

    ResultRequestID *string `json:"result_request_id"`
    ResultErrorID   *int64  `json:"result_error_id"`

    ErrorMessage *string `json:"error_message"`
}

type OpsRetryResult struct {
    AttemptID int64  `json:"attempt_id"`
    Mode      string `json:"mode"`
    Status    string `json:"status"`

    PinnedAccountID *int64 `json:"pinned_account_id"`
    UsedAccountID   *int64 `json:"used_account_id"`

    HTTPStatusCode    int    `json:"http_status_code"`
    UpstreamRequestID string `json:"upstream_request_id"`

    ResponsePreview   string `json:"response_preview"`
    ResponseTruncated bool   `json:"response_truncated"`

    ErrorMessage string `json:"error_message"`

    StartedAt  time.Time `json:"started_at"`
    FinishedAt time.Time `json:"finished_at"`
    DurationMs int64     `json:"duration_ms"`
}
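Reviewer note: the OpsErrorLogFilter/OpsErrorLogList pair is a conventional page/page-size contract. A small sketch of how a query layer might normalize the filter before building a query; the default and maximum page sizes are assumptions, not values defined in this commit.

package service

// normalizeErrorLogPagination is a hypothetical helper showing how a query layer
// might clamp Page/PageSize from OpsErrorLogFilter and derive a SQL LIMIT/OFFSET.
func normalizeErrorLogPagination(f *OpsErrorLogFilter) (limit, offset int) {
    if f.Page < 1 {
        f.Page = 1
    }
    if f.PageSize < 1 {
        f.PageSize = 20 // assumed default page size
    }
    if f.PageSize > 200 {
        f.PageSize = 200 // assumed upper bound to protect the database
    }
    return f.PageSize, (f.Page - 1) * f.PageSize
}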
backend/internal/service/ops_realtime_models.go (new file, 81 lines)
@@ -0,0 +1,81 @@
package service

import "time"

// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
    Platform       string  `json:"platform"`
    CurrentInUse   int64   `json:"current_in_use"`
    MaxCapacity    int64   `json:"max_capacity"`
    LoadPercentage float64 `json:"load_percentage"`
    WaitingInQueue int64   `json:"waiting_in_queue"`
}

// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
    GroupID        int64   `json:"group_id"`
    GroupName      string  `json:"group_name"`
    Platform       string  `json:"platform"`
    CurrentInUse   int64   `json:"current_in_use"`
    MaxCapacity    int64   `json:"max_capacity"`
    LoadPercentage float64 `json:"load_percentage"`
    WaitingInQueue int64   `json:"waiting_in_queue"`
}

// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
    AccountID      int64   `json:"account_id"`
    AccountName    string  `json:"account_name"`
    Platform       string  `json:"platform"`
    GroupID        int64   `json:"group_id"`
    GroupName      string  `json:"group_name"`
    CurrentInUse   int64   `json:"current_in_use"`
    MaxCapacity    int64   `json:"max_capacity"`
    LoadPercentage float64 `json:"load_percentage"`
    WaitingInQueue int64   `json:"waiting_in_queue"`
}

// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
    Platform       string `json:"platform"`
    TotalAccounts  int64  `json:"total_accounts"`
    AvailableCount int64  `json:"available_count"`
    RateLimitCount int64  `json:"rate_limit_count"`
    ErrorCount     int64  `json:"error_count"`
}

// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
    GroupID        int64  `json:"group_id"`
    GroupName      string `json:"group_name"`
    Platform       string `json:"platform"`
    TotalAccounts  int64  `json:"total_accounts"`
    AvailableCount int64  `json:"available_count"`
    RateLimitCount int64  `json:"rate_limit_count"`
    ErrorCount     int64  `json:"error_count"`
}

// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
    AccountID   int64  `json:"account_id"`
    AccountName string `json:"account_name"`
    Platform    string `json:"platform"`
    GroupID     int64  `json:"group_id"`
    GroupName   string `json:"group_name"`

    Status string `json:"status"`

    IsAvailable   bool `json:"is_available"`
    IsRateLimited bool `json:"is_rate_limited"`
    IsOverloaded  bool `json:"is_overloaded"`
    HasError      bool `json:"has_error"`

    RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
    RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
    OverloadUntil          *time.Time `json:"overload_until"`
    OverloadRemainingSec   *int64     `json:"overload_remaining_sec"`
    ErrorMessage           string     `json:"error_message"`
    TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
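Reviewer note: all three concurrency DTOs carry CurrentInUse/MaxCapacity together with a precomputed LoadPercentage. A minimal sketch of the obvious derivation with a zero-capacity guard; the helper name is illustrative and not part of this commit.

package service

// loadPercentage is a hypothetical helper showing the natural derivation of the
// LoadPercentage fields from CurrentInUse/MaxCapacity, guarding against zero capacity.
func loadPercentage(currentInUse, maxCapacity int64) float64 {
    if maxCapacity <= 0 {
        return 0
    }
    return float64(currentInUse) / float64(maxCapacity) * 100
}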
backend/internal/service/ops_settings_models.go (new file, 70 lines)
@@ -0,0 +1,70 @@
package service

// Ops settings models stored in DB `settings` table (JSON blobs).

type OpsEmailNotificationConfig struct {
    Alert  OpsEmailAlertConfig  `json:"alert"`
    Report OpsEmailReportConfig `json:"report"`
}

type OpsEmailAlertConfig struct {
    Enabled               bool     `json:"enabled"`
    Recipients            []string `json:"recipients"`
    MinSeverity           string   `json:"min_severity"`
    RateLimitPerHour      int      `json:"rate_limit_per_hour"`
    BatchingWindowSeconds int      `json:"batching_window_seconds"`
    IncludeResolvedAlerts bool     `json:"include_resolved_alerts"`
}

type OpsEmailReportConfig struct {
    Enabled                         bool     `json:"enabled"`
    Recipients                      []string `json:"recipients"`
    DailySummaryEnabled             bool     `json:"daily_summary_enabled"`
    DailySummarySchedule            string   `json:"daily_summary_schedule"`
    WeeklySummaryEnabled            bool     `json:"weekly_summary_enabled"`
    WeeklySummarySchedule           string   `json:"weekly_summary_schedule"`
    ErrorDigestEnabled              bool     `json:"error_digest_enabled"`
    ErrorDigestSchedule             string   `json:"error_digest_schedule"`
    ErrorDigestMinCount             int      `json:"error_digest_min_count"`
    AccountHealthEnabled            bool     `json:"account_health_enabled"`
    AccountHealthSchedule           string   `json:"account_health_schedule"`
    AccountHealthErrorRateThreshold float64  `json:"account_health_error_rate_threshold"`
}

// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
// frontend can still send the full config shape.
type OpsEmailNotificationConfigUpdateRequest struct {
    Alert  *OpsEmailAlertConfig  `json:"alert"`
    Report *OpsEmailReportConfig `json:"report"`
}

type OpsDistributedLockSettings struct {
    Enabled    bool   `json:"enabled"`
    Key        string `json:"key"`
    TTLSeconds int    `json:"ttl_seconds"`
}

type OpsAlertSilenceEntry struct {
    RuleID     *int64   `json:"rule_id,omitempty"`
    Severities []string `json:"severities,omitempty"`

    UntilRFC3339 string `json:"until_rfc3339"`
    Reason       string `json:"reason"`
}

type OpsAlertSilencingSettings struct {
    Enabled bool `json:"enabled"`

    GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
    GlobalReason       string `json:"global_reason"`

    Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}

type OpsAlertRuntimeSettings struct {
    EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`

    DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
    Silencing       OpsAlertSilencingSettings  `json:"silencing"`
}
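Reviewer note: since OpsEmailNotificationConfigUpdateRequest uses pointers to allow partial updates, applying it reduces to a nil-check merge. A minimal sketch, assuming the stored config is loaded first; the function name is illustrative and not part of this commit.

package service

// applyEmailConfigUpdate is a hypothetical helper showing how the pointer-based
// update request could be merged into the stored config: sections left nil keep
// their current values, non-nil sections replace them wholesale.
func applyEmailConfigUpdate(cfg *OpsEmailNotificationConfig, req *OpsEmailNotificationConfigUpdateRequest) {
    if req == nil {
        return
    }
    if req.Alert != nil {
        cfg.Alert = *req.Alert
    }
    if req.Report != nil {
        cfg.Report = *req.Report
    }
}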
backend/internal/service/ops_trend_models.go (new file, 65 lines)
@@ -0,0 +1,65 @@
package service

import "time"

type OpsThroughputTrendPoint struct {
    BucketStart   time.Time `json:"bucket_start"`
    RequestCount  int64     `json:"request_count"`
    TokenConsumed int64     `json:"token_consumed"`
    QPS           float64   `json:"qps"`
    TPS           float64   `json:"tps"`
}

type OpsThroughputPlatformBreakdownItem struct {
    Platform      string `json:"platform"`
    RequestCount  int64  `json:"request_count"`
    TokenConsumed int64  `json:"token_consumed"`
}

type OpsThroughputGroupBreakdownItem struct {
    GroupID       int64  `json:"group_id"`
    GroupName     string `json:"group_name"`
    RequestCount  int64  `json:"request_count"`
    TokenConsumed int64  `json:"token_consumed"`
}

type OpsThroughputTrendResponse struct {
    Bucket string `json:"bucket"`

    Points []*OpsThroughputTrendPoint `json:"points"`

    // Optional drilldown helpers:
    // - When no platform/group is selected: returns totals by platform.
    // - When platform is selected but group is not: returns top groups in that platform.
    ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
    TopGroups  []*OpsThroughputGroupBreakdownItem    `json:"top_groups,omitempty"`
}

type OpsErrorTrendPoint struct {
    BucketStart time.Time `json:"bucket_start"`

    ErrorCountTotal      int64 `json:"error_count_total"`
    BusinessLimitedCount int64 `json:"business_limited_count"`
    ErrorCountSLA        int64 `json:"error_count_sla"`

    UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
    Upstream429Count             int64 `json:"upstream_429_count"`
    Upstream529Count             int64 `json:"upstream_529_count"`
}

type OpsErrorTrendResponse struct {
    Bucket string                `json:"bucket"`
    Points []*OpsErrorTrendPoint `json:"points"`
}

type OpsErrorDistributionItem struct {
    StatusCode      int   `json:"status_code"`
    Total           int64 `json:"total"`
    SLA             int64 `json:"sla"`
    BusinessLimited int64 `json:"business_limited"`
}

type OpsErrorDistributionResponse struct {
    Total int64                       `json:"total"`
    Items []*OpsErrorDistributionItem `json:"items"`
}
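Reviewer note: QPS/TPS on each trend point are simple per-bucket averages of the request/token counts. A short sketch assuming the bucket width is known to the caller; the helper name and signature are illustrative, not part of this commit.

package service

import "time"

// fillBucketRates is a hypothetical helper showing how per-point QPS/TPS could be
// derived from the bucket's request/token counts and the bucket width chosen by the caller.
func fillBucketRates(p *OpsThroughputTrendPoint, bucketWidth time.Duration) {
    seconds := bucketWidth.Seconds()
    if seconds <= 0 {
        return
    }
    p.QPS = float64(p.RequestCount) / seconds
    p.TPS = float64(p.TokenConsumed) / seconds
}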
backend/migrations/030_ops_monitoring_vnext.sql (new file, 707 lines)
@@ -0,0 +1,707 @@
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.

-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;

-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;

-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;

-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;

-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================

-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================

CREATE TABLE IF NOT EXISTS ops_error_logs (
    id BIGSERIAL PRIMARY KEY,

    -- Correlation / identities
    request_id VARCHAR(64),
    client_request_id VARCHAR(64),
    user_id BIGINT,
    api_key_id BIGINT,
    account_id BIGINT,
    group_id BIGINT,
    client_ip inet,

    -- Dimensions for global filtering
    platform VARCHAR(32),

    -- Request metadata
    model VARCHAR(100),
    request_path VARCHAR(256),
    stream BOOLEAN NOT NULL DEFAULT false,
    user_agent TEXT,

    -- Core error classification
    error_phase VARCHAR(32) NOT NULL,
    error_type VARCHAR(64) NOT NULL,
    severity VARCHAR(8) NOT NULL DEFAULT 'P2',
    status_code INT,

    -- vNext metric semantics
    is_business_limited BOOLEAN NOT NULL DEFAULT false,

    -- Error details (sanitized/truncated at ingest time)
    error_message TEXT,
    error_body TEXT,

    -- Provider/upstream details (optional; useful for trends & account health)
    error_source VARCHAR(64),
    error_owner VARCHAR(32),
    account_status VARCHAR(50),
    upstream_status_code INT,
    upstream_error_message TEXT,
    upstream_error_detail TEXT,
    provider_error_code VARCHAR(64),
    provider_error_type VARCHAR(64),
    network_error_type VARCHAR(50),
    retry_after_seconds INT,

    -- Timings (ms) - optional
    duration_ms INT,
    time_to_first_token_ms BIGINT,
    auth_latency_ms BIGINT,
    routing_latency_ms BIGINT,
    upstream_latency_ms BIGINT,
    response_latency_ms BIGINT,

    -- Retry context (only stored for error requests)
    request_body JSONB,
    request_headers JSONB,
    request_body_truncated BOOLEAN NOT NULL DEFAULT false,
    request_body_bytes INT,

    -- Retryability flags (best-effort classification)
    is_retryable BOOLEAN NOT NULL DEFAULT false,
    retry_count INT NOT NULL DEFAULT 0,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';

-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================

CREATE TABLE IF NOT EXISTS ops_retry_attempts (
    id BIGSERIAL PRIMARY KEY,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    requested_by_user_id BIGINT,
    source_error_id BIGINT,

    -- client|upstream
    mode VARCHAR(16) NOT NULL,
    pinned_account_id BIGINT,

    -- queued|running|succeeded|failed
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    started_at TIMESTAMPTZ,
    finished_at TIMESTAMPTZ,
    duration_ms BIGINT,

    -- Optional result correlation
    result_request_id VARCHAR(64),
    result_error_id BIGINT,
    result_usage_request_id VARCHAR(64),

    error_message TEXT
);

COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';

-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================

CREATE TABLE IF NOT EXISTS ops_system_metrics (
    id BIGSERIAL PRIMARY KEY,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    window_minutes INT NOT NULL DEFAULT 1,

    -- Optional dimensions (only if collector chooses to write per-dimension snapshots)
    platform VARCHAR(32),
    group_id BIGINT,

    -- Core counts
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,

    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,

    token_consumed BIGINT NOT NULL DEFAULT 0,

    -- Rates
    qps DOUBLE PRECISION,
    tps DOUBLE PRECISION,

    -- Duration percentiles (ms) - success requests
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    duration_avg_ms DOUBLE PRECISION,
    duration_max_ms INT,

    -- TTFT percentiles (ms) - success requests (streaming)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    ttft_avg_ms DOUBLE PRECISION,
    ttft_max_ms INT,

    -- System resources
    cpu_usage_percent DOUBLE PRECISION,
    memory_used_mb BIGINT,
    memory_total_mb BIGINT,
    memory_usage_percent DOUBLE PRECISION,

    -- Dependency health (best-effort)
    db_ok BOOLEAN,
    redis_ok BOOLEAN,

    -- DB pool & runtime
    db_conn_active INT,
    db_conn_idle INT,
    db_conn_waiting INT,
    goroutine_count INT,

    -- Queue / concurrency
    concurrency_queue_depth INT
);

COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';

-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================

CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
    job_name VARCHAR(64) PRIMARY KEY,

    last_run_at TIMESTAMPTZ,
    last_success_at TIMESTAMPTZ,
    last_error_at TIMESTAMPTZ,
    last_error TEXT,
    last_duration_ms BIGINT,

    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';

-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================

CREATE TABLE IF NOT EXISTS ops_alert_rules (
    id BIGSERIAL PRIMARY KEY,

    name VARCHAR(128) NOT NULL,
    description TEXT,
    enabled BOOLEAN NOT NULL DEFAULT true,

    severity VARCHAR(16) NOT NULL DEFAULT 'warning',

    -- Metric definition
    metric_type VARCHAR(64) NOT NULL,
    operator VARCHAR(8) NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,

    window_minutes INT NOT NULL DEFAULT 5,
    sustained_minutes INT NOT NULL DEFAULT 5,
    cooldown_minutes INT NOT NULL DEFAULT 10,

    -- Optional scoping: platform/group filters etc.
    filters JSONB,

    last_triggered_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
    ON ops_alert_rules (name);

CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
    ON ops_alert_rules (enabled);

CREATE TABLE IF NOT EXISTS ops_alert_events (
    id BIGSERIAL PRIMARY KEY,

    rule_id BIGINT,
    severity VARCHAR(16) NOT NULL,
    status VARCHAR(16) NOT NULL DEFAULT 'firing',

    title VARCHAR(200),
    description TEXT,

    metric_value DOUBLE PRECISION,
    threshold_value DOUBLE PRECISION,
    dimensions JSONB,

    fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    resolved_at TIMESTAMPTZ,

    email_sent BOOLEAN NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
    ON ops_alert_events (rule_id, status);

CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
    ON ops_alert_events (fired_at DESC);

-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================

-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
--   percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
--   COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- ============================================
-- 1) ops_metrics_hourly
-- ============================================

CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
    id BIGSERIAL PRIMARY KEY,

    bucket_start TIMESTAMPTZ NOT NULL,
    platform VARCHAR(32),
    group_id BIGINT,

    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,

    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,

    token_consumed BIGINT NOT NULL DEFAULT 0,

    -- Duration percentiles (ms)
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,

    -- TTFT percentiles (ms)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,

    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Uniqueness across three "dimension modes" (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
    ON ops_metrics_hourly (
        bucket_start,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );
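-- Reviewer note: because uniqueness is enforced through a COALESCE-based expression
-- index rather than a plain UNIQUE constraint, an upsert has to name the same
-- expressions in its ON CONFLICT target. A hedged sketch of what an aggregation
-- job's write might look like, using Go's database/sql with a Postgres driver and
-- only a subset of the columns; the job code itself is not part of this migration.

package service

import (
    "context"
    "database/sql"
    "time"
)

// upsertHourlyBucket is a hypothetical sketch of how an aggregation job could write
// one ops_metrics_hourly row while matching the COALESCE-based unique index above.
func upsertHourlyBucket(ctx context.Context, db *sql.DB, bucket time.Time,
    platform sql.NullString, groupID sql.NullInt64, success, errTotal int64) error {
    _, err := db.ExecContext(ctx, `
        INSERT INTO ops_metrics_hourly (bucket_start, platform, group_id, success_count, error_count_total)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
        DO UPDATE SET
            success_count     = EXCLUDED.success_count,
            error_count_total = EXCLUDED.error_count_total,
            computed_at       = NOW()`,
        bucket, platform, groupID, success, errTotal)
    return err
}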
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
    ON ops_metrics_hourly (bucket_start DESC);

CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
    ON ops_metrics_hourly (platform, bucket_start DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
    ON ops_metrics_hourly (group_id, bucket_start DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;

COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';

-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================

CREATE TABLE IF NOT EXISTS ops_metrics_daily (
    id BIGSERIAL PRIMARY KEY,

    bucket_date DATE NOT NULL,
    platform VARCHAR(32),
    group_id BIGINT,

    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,

    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,

    token_consumed BIGINT NOT NULL DEFAULT 0,

    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,

    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,

    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
    ON ops_metrics_daily (
        bucket_date,
        COALESCE(platform, ''),
        COALESCE(group_id, 0)
    );

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
    ON ops_metrics_daily (bucket_date DESC);

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
    ON ops_metrics_daily (platform, bucket_date DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
    ON ops_metrics_daily (group_id, bucket_date DESC)
    WHERE group_id IS NOT NULL AND group_id <> 0;

COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';

-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================

-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================

-- ops_error_logs
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
    ON ops_error_logs (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
    ON ops_error_logs (platform, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
    ON ops_error_logs (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
    ON ops_error_logs (account_id, created_at DESC)
    WHERE account_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
    ON ops_error_logs (status_code, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
    ON ops_error_logs (error_phase, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
    ON ops_error_logs (error_type, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
    ON ops_error_logs (request_id);

CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
    ON ops_error_logs (client_request_id);

-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
    ON ops_system_metrics (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
    ON ops_system_metrics (platform, created_at DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;

CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
    ON ops_system_metrics (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;

-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
    ON ops_retry_attempts (created_at DESC);

CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
    ON ops_retry_attempts (source_error_id, created_at DESC)
    WHERE source_error_id IS NOT NULL;

-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
    ON ops_retry_attempts (source_error_id)
    WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
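-- Reviewer note: the partial unique index above turns "only one active retry per
-- error" into a database guarantee. A hedged Go sketch of how an API layer might
-- rely on it, mapping a unique violation (SQLSTATE 23505) to a "retry already in
-- progress" result; the function and sentinel error are illustrative, not part of
-- this commit.

package service

import (
    "context"
    "database/sql"
    "errors"
    "strings"
)

// ErrRetryAlreadyActive is an assumed sentinel for "a retry is already queued/running".
var ErrRetryAlreadyActive = errors.New("retry already in progress for this error")

// startRetry is a hypothetical sketch: with idx_ops_retry_attempts_unique_active in
// place, a second concurrent insert for the same source error fails with a unique
// violation, which the caller maps to ErrRetryAlreadyActive.
func startRetry(ctx context.Context, db *sql.DB, sourceErrorID, userID int64, mode string) (int64, error) {
    var id int64
    err := db.QueryRowContext(ctx, `
        INSERT INTO ops_retry_attempts (source_error_id, requested_by_user_id, mode, status)
        VALUES ($1, $2, $3, 'queued')
        RETURNING id`,
        sourceErrorID, userID, mode).Scan(&id)
    if err != nil {
        // Crude, driver-agnostic check; a real implementation would inspect the
        // driver's typed error for SQLSTATE 23505 instead of matching strings.
        if strings.Contains(err.Error(), "23505") || strings.Contains(err.Error(), "duplicate key") {
            return 0, ErrRetryAlreadyActive
        }
        return 0, err
    }
    return id, nil
}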
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================

DO $$
BEGIN
    BEGIN
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
    EXCEPTION WHEN OTHERS THEN
        -- Missing privileges or extension package should not block migrations.
        RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
    END;

    IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
        -- request_id / client_request_id fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
                     ON ops_error_logs USING gin (request_id gin_trgm_ops)';
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
                     ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';

        -- error_message fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
                     ON ops_error_logs USING gin (error_message gin_trgm_ops)';
    END IF;
END $$;
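-- Reviewer note: the trigram indexes only pay off if the query layer actually issues
-- unanchored pattern matches. An illustrative Go constant showing the kind of
-- predicate they are meant to accelerate, assuming the Query field from
-- OpsErrorLogFilter is bound as the single search parameter; not part of this commit.

package service

// errorLogSearchClause is an illustrative example of the predicate shape the
// pg_trgm GIN indexes above are intended to speed up (parameter $1 is the search term).
const errorLogSearchClause = `
    (error_message ILIKE '%' || $1 || '%'
     OR request_id ILIKE '%' || $1 || '%'
     OR client_request_id ILIKE '%' || $1 || '%')`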
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================

-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
--   it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- Hourly table
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;

-- Daily table
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;

-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================

-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;

-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================

-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
--   - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
--   - upstream_error_rate excludes 429/529.

SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';

-- 1) High error rate (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率过高',
    '当错误率超过 5% 且持续 5 分钟时触发告警',
    true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 2) Low success rate (P0)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '成功率过低',
    '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
    true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 3) P99 latency too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P99延迟过高',
    '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
    true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 4) P95 latency too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P95延迟过高',
    '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
    true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 5) CPU usage too high (P2)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'CPU使用率过高',
    '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
    true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 6) Memory usage too high (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '内存使用率过高',
    '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
    true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 7) Concurrency queue buildup (P1)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '并发队列积压',
    '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
    true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;

-- 8) Extremely high error rate (P0)
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率极高',
    '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;