feat(数据库): 添加运维监控数据模型和数据库迁移脚本
- 新增 ops 监控数据库迁移脚本(表结构定义) - 定义核心数据模型(ops_models.go) - 定义告警相关模型(ops_alert_models.go) - 定义仪表板数据模型(ops_dashboard_models.go) - 定义实时监控数据模型(ops_realtime_models.go) - 定义配置相关模型(ops_settings_models.go) - 定义趋势分析数据模型(ops_trend_models.go)
This commit is contained in:
75
backend/internal/service/ops_alert_models.go
Normal file
75
backend/internal/service/ops_alert_models.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package service
|
||||
|
||||
import "time"
|
||||
|
||||
// Ops alert rule/event models.
//
// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
// with the existing ops dashboard frontend (backup style).
|
||||
|
||||
// Status string values for ops alerts.
// NOTE(review): presumably the values stored in OpsAlertEvent.Status and
// accepted by OpsAlertEventFilter.Status — confirm against callers.
const (
	OpsAlertStatusFiring   = "firing"
	OpsAlertStatusResolved = "resolved"
)
|
||||
|
||||
// OpsAlertRule is the JSON model of an ops alert rule.
//
// Filters and LastTriggeredAt are omitted from the JSON output when empty/nil;
// every other field is always emitted under its snake_case key.
type OpsAlertRule struct {
	ID          int64  `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`

	Enabled  bool   `json:"enabled"`
	Severity string `json:"severity"`

	// Condition definition.
	// NOTE(review): the accepted MetricType/Operator values are not visible
	// here — confirm against the rule evaluator.
	MetricType string  `json:"metric_type"`
	Operator   string  `json:"operator"`
	Threshold  float64 `json:"threshold"`

	// Timing parameters, expressed in minutes.
	WindowMinutes    int `json:"window_minutes"`
	SustainedMinutes int `json:"sustained_minutes"`
	CooldownMinutes  int `json:"cooldown_minutes"`

	NotifyEmail bool `json:"notify_email"`

	// Filters is a free-form key/value map; its schema is not defined here.
	Filters map[string]any `json:"filters,omitempty"`

	LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
	CreatedAt       time.Time  `json:"created_at"`
	UpdatedAt       time.Time  `json:"updated_at"`
}
|
||||
|
||||
// OpsAlertEvent is the JSON model of a single ops alert event.
//
// MetricValue, ThresholdValue, Dimensions and ResolvedAt are optional and are
// omitted from the JSON output when nil/empty.
type OpsAlertEvent struct {
	ID       int64  `json:"id"`
	RuleID   int64  `json:"rule_id"`
	Severity string `json:"severity"`
	Status   string `json:"status"`

	Title       string `json:"title"`
	Description string `json:"description"`

	MetricValue    *float64 `json:"metric_value,omitempty"`
	ThresholdValue *float64 `json:"threshold_value,omitempty"`

	// Dimensions is a free-form key/value map; its schema is not defined here.
	Dimensions map[string]any `json:"dimensions,omitempty"`

	FiredAt    time.Time  `json:"fired_at"`
	ResolvedAt *time.Time `json:"resolved_at,omitempty"`

	EmailSent bool      `json:"email_sent"`
	CreatedAt time.Time `json:"created_at"`
}
|
||||
|
||||
// OpsAlertEventFilter holds the query parameters for listing alert events.
// It carries no JSON tags.
type OpsAlertEventFilter struct {
	Limit int

	// Optional filters.
	Status   string
	Severity string

	StartTime *time.Time
	EndTime   *time.Time

	// Dimensions filters (best-effort).
	Platform string
	GroupID  *int64
}
|
||||
|
||||
83
backend/internal/service/ops_dashboard_models.go
Normal file
83
backend/internal/service/ops_dashboard_models.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package service
|
||||
|
||||
import "time"
|
||||
|
||||
type OpsDashboardFilter struct {
|
||||
StartTime time.Time
|
||||
EndTime time.Time
|
||||
|
||||
Platform string
|
||||
GroupID *int64
|
||||
|
||||
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
|
||||
// Expected values: auto/raw/preagg (see OpsQueryMode).
|
||||
QueryMode OpsQueryMode
|
||||
}
|
||||
|
||||
// OpsRateSummary groups the current, peak and average value of a rate metric.
type OpsRateSummary struct {
	Current float64 `json:"current"`
	Peak    float64 `json:"peak"`
	Avg     float64 `json:"avg"`
}
|
||||
|
||||
// OpsPercentiles holds percentile, average and maximum values of a metric.
//
// Every field is a pointer without omitempty, so a nil value is emitted as
// JSON null instead of being dropped. The JSON keys carry an "_ms" suffix
// (presumably milliseconds — confirm against the producer).
type OpsPercentiles struct {
	P50 *int `json:"p50_ms"`
	P90 *int `json:"p90_ms"`
	P95 *int `json:"p95_ms"`
	P99 *int `json:"p99_ms"`
	Avg *int `json:"avg_ms"`
	Max *int `json:"max_ms"`
}
|
||||
|
||||
type OpsDashboardOverview struct {
|
||||
StartTime time.Time `json:"start_time"`
|
||||
EndTime time.Time `json:"end_time"`
|
||||
Platform string `json:"platform"`
|
||||
GroupID *int64 `json:"group_id"`
|
||||
|
||||
// Latest system-level snapshot (window=1m, global).
|
||||
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
|
||||
|
||||
// Background jobs health (heartbeats).
|
||||
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
|
||||
|
||||
SuccessCount int64 `json:"success_count"`
|
||||
ErrorCountTotal int64 `json:"error_count_total"`
|
||||
BusinessLimitedCount int64 `json:"business_limited_count"`
|
||||
|
||||
ErrorCountSLA int64 `json:"error_count_sla"`
|
||||
RequestCountTotal int64 `json:"request_count_total"`
|
||||
RequestCountSLA int64 `json:"request_count_sla"`
|
||||
|
||||
TokenConsumed int64 `json:"token_consumed"`
|
||||
|
||||
SLA float64 `json:"sla"`
|
||||
ErrorRate float64 `json:"error_rate"`
|
||||
UpstreamErrorRate float64 `json:"upstream_error_rate"`
|
||||
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
|
||||
Upstream429Count int64 `json:"upstream_429_count"`
|
||||
Upstream529Count int64 `json:"upstream_529_count"`
|
||||
|
||||
QPS OpsRateSummary `json:"qps"`
|
||||
TPS OpsRateSummary `json:"tps"`
|
||||
|
||||
Duration OpsPercentiles `json:"duration"`
|
||||
TTFT OpsPercentiles `json:"ttft"`
|
||||
}
|
||||
|
||||
// OpsLatencyHistogramBucket is one bucket of a latency histogram: a range
// label and the number of requests counted in it.
// NOTE(review): the format of Range is not defined here — confirm with the producer.
type OpsLatencyHistogramBucket struct {
	Range string `json:"range"`
	Count int64  `json:"count"`
}
|
||||
|
||||
// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
|
||||
// It is used by the Ops dashboard to quickly identify tail latency regressions.
|
||||
type OpsLatencyHistogramResponse struct {
|
||||
StartTime time.Time `json:"start_time"`
|
||||
EndTime time.Time `json:"end_time"`
|
||||
Platform string `json:"platform"`
|
||||
GroupID *int64 `json:"group_id"`
|
||||
|
||||
TotalRequests int64 `json:"total_requests"`
|
||||
Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
|
||||
}
|
||||
118
backend/internal/service/ops_models.go
Normal file
118
backend/internal/service/ops_models.go
Normal file
@@ -0,0 +1,118 @@
|
||||
package service
|
||||
|
||||
import "time"
|
||||
|
||||
// OpsErrorLog is the JSON model of one ops error log entry in list form.
//
// Pointer fields (LatencyMs, UserID, APIKeyID, AccountID, GroupID, ClientIP)
// are optional; without omitempty they are emitted as JSON null when nil.
type OpsErrorLog struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	Phase    string `json:"phase"`
	Type     string `json:"type"`
	Severity string `json:"severity"`

	StatusCode int    `json:"status_code"`
	Platform   string `json:"platform"`
	Model      string `json:"model"`

	LatencyMs *int `json:"latency_ms"`

	ClientRequestID string `json:"client_request_id"`
	RequestID       string `json:"request_id"`
	Message         string `json:"message"`

	UserID    *int64 `json:"user_id"`
	APIKeyID  *int64 `json:"api_key_id"`
	AccountID *int64 `json:"account_id"`
	GroupID   *int64 `json:"group_id"`

	ClientIP    *string `json:"client_ip"`
	RequestPath string  `json:"request_path"`
	Stream      bool    `json:"stream"`
}
|
||||
|
||||
type OpsErrorLogDetail struct {
|
||||
OpsErrorLog
|
||||
|
||||
ErrorBody string `json:"error_body"`
|
||||
UserAgent string `json:"user_agent"`
|
||||
|
||||
// Timings (optional)
|
||||
AuthLatencyMs *int64 `json:"auth_latency_ms"`
|
||||
RoutingLatencyMs *int64 `json:"routing_latency_ms"`
|
||||
UpstreamLatencyMs *int64 `json:"upstream_latency_ms"`
|
||||
ResponseLatencyMs *int64 `json:"response_latency_ms"`
|
||||
TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
|
||||
|
||||
// Retry context
|
||||
RequestBody string `json:"request_body"`
|
||||
RequestBodyTruncated bool `json:"request_body_truncated"`
|
||||
RequestBodyBytes *int `json:"request_body_bytes"`
|
||||
RequestHeaders string `json:"request_headers,omitempty"`
|
||||
|
||||
// vNext metric semantics
|
||||
IsBusinessLimited bool `json:"is_business_limited"`
|
||||
}
|
||||
|
||||
// OpsErrorLogFilter holds the query parameters for listing error logs: an
// optional time range, scoping filters, matching criteria and pagination.
// It carries no JSON tags.
type OpsErrorLogFilter struct {
	StartTime *time.Time
	EndTime   *time.Time

	Platform  string
	GroupID   *int64
	AccountID *int64

	StatusCodes []int
	Phase       string
	// NOTE(review): which columns Query is matched against is not visible here.
	Query string

	// Pagination.
	// NOTE(review): whether Page is 0- or 1-based is not visible here.
	Page     int
	PageSize int
}
|
||||
|
||||
type OpsErrorLogList struct {
|
||||
Errors []*OpsErrorLog `json:"errors"`
|
||||
Total int `json:"total"`
|
||||
Page int `json:"page"`
|
||||
PageSize int `json:"page_size"`
|
||||
}
|
||||
|
||||
// OpsRetryAttempt is the JSON model of a retry attempt for a logged error.
//
// Pointer fields are optional; without omitempty they are emitted as JSON
// null when nil.
type OpsRetryAttempt struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`

	RequestedByUserID int64  `json:"requested_by_user_id"`
	SourceErrorID     int64  `json:"source_error_id"`
	Mode              string `json:"mode"`
	PinnedAccountID   *int64 `json:"pinned_account_id"`

	Status     string     `json:"status"`
	StartedAt  *time.Time `json:"started_at"`
	FinishedAt *time.Time `json:"finished_at"`
	DurationMs *int64     `json:"duration_ms"`

	ResultRequestID *string `json:"result_request_id"`
	ResultErrorID   *int64  `json:"result_error_id"`

	ErrorMessage *string `json:"error_message"`
}
|
||||
|
||||
// OpsRetryResult is the JSON model of the outcome of a retry: which account
// was used, the HTTP status, a response preview and timing information.
type OpsRetryResult struct {
	AttemptID int64  `json:"attempt_id"`
	Mode      string `json:"mode"`
	Status    string `json:"status"`

	PinnedAccountID *int64 `json:"pinned_account_id"`
	UsedAccountID   *int64 `json:"used_account_id"`

	HTTPStatusCode    int    `json:"http_status_code"`
	UpstreamRequestID string `json:"upstream_request_id"`

	ResponsePreview   string `json:"response_preview"`
	ResponseTruncated bool   `json:"response_truncated"`

	ErrorMessage string `json:"error_message"`

	StartedAt  time.Time `json:"started_at"`
	FinishedAt time.Time `json:"finished_at"`
	DurationMs int64     `json:"duration_ms"`
}
|
||||
81
backend/internal/service/ops_realtime_models.go
Normal file
81
backend/internal/service/ops_realtime_models.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package service
|
||||
|
||||
import "time"
|
||||
|
||||
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
//
// NOTE(review): whether LoadPercentage is on a 0-100 or 0-1 scale is not
// visible here — confirm against the producer.
type PlatformConcurrencyInfo struct {
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||
|
||||
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	Platform       string  `json:"platform"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||
|
||||
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
//
// GroupID and GroupName are plain (non-pointer) values here.
// NOTE(review): how an account belonging to several groups is represented is
// not visible here — confirm against the producer.
type AccountConcurrencyInfo struct {
	AccountID      int64   `json:"account_id"`
	AccountName    string  `json:"account_name"`
	Platform       string  `json:"platform"`
	GroupID        int64   `json:"group_id"`
	GroupName      string  `json:"group_name"`
	CurrentInUse   int64   `json:"current_in_use"`
	MaxCapacity    int64   `json:"max_capacity"`
	LoadPercentage float64 `json:"load_percentage"`
	WaitingInQueue int64   `json:"waiting_in_queue"`
}
|
||||
|
||||
// PlatformAvailability aggregates account availability by platform.
// It reports the total number of accounts alongside available, rate-limited
// and error counts.
type PlatformAvailability struct {
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||
|
||||
// GroupAvailability aggregates account availability by group.
// It reports the total number of accounts alongside available, rate-limited
// and error counts.
type GroupAvailability struct {
	GroupID        int64  `json:"group_id"`
	GroupName      string `json:"group_name"`
	Platform       string `json:"platform"`
	TotalAccounts  int64  `json:"total_accounts"`
	AvailableCount int64  `json:"available_count"`
	RateLimitCount int64  `json:"rate_limit_count"`
	ErrorCount     int64  `json:"error_count"`
}
|
||||
|
||||
// AccountAvailability represents current availability for a single account.
//
// The time/remaining pointer fields are optional. Only TempUnschedulableUntil
// uses omitempty; the other nil pointers are emitted as JSON null.
type AccountAvailability struct {
	AccountID   int64  `json:"account_id"`
	AccountName string `json:"account_name"`
	Platform    string `json:"platform"`
	GroupID     int64  `json:"group_id"`
	GroupName   string `json:"group_name"`

	Status string `json:"status"`

	IsAvailable   bool `json:"is_available"`
	IsRateLimited bool `json:"is_rate_limited"`
	IsOverloaded  bool `json:"is_overloaded"`
	HasError      bool `json:"has_error"`

	RateLimitResetAt       *time.Time `json:"rate_limit_reset_at"`
	RateLimitRemainingSec  *int64     `json:"rate_limit_remaining_sec"`
	OverloadUntil          *time.Time `json:"overload_until"`
	OverloadRemainingSec   *int64     `json:"overload_remaining_sec"`
	ErrorMessage           string     `json:"error_message"`
	TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
|
||||
70
backend/internal/service/ops_settings_models.go
Normal file
70
backend/internal/service/ops_settings_models.go
Normal file
@@ -0,0 +1,70 @@
|
||||
package service
|
||||
|
||||
// Ops settings models stored in DB `settings` table (JSON blobs).
|
||||
|
||||
type OpsEmailNotificationConfig struct {
|
||||
Alert OpsEmailAlertConfig `json:"alert"`
|
||||
Report OpsEmailReportConfig `json:"report"`
|
||||
}
|
||||
|
||||
// OpsEmailAlertConfig is the alert section of the email notification config:
// on/off switch, recipients, minimum severity, hourly rate limit, batching
// window and whether resolved alerts are included.
//
// A nil Recipients slice is emitted as JSON null (no omitempty).
type OpsEmailAlertConfig struct {
	Enabled               bool     `json:"enabled"`
	Recipients            []string `json:"recipients"`
	MinSeverity           string   `json:"min_severity"`
	RateLimitPerHour      int      `json:"rate_limit_per_hour"`
	BatchingWindowSeconds int      `json:"batching_window_seconds"`
	IncludeResolvedAlerts bool     `json:"include_resolved_alerts"`
}
|
||||
|
||||
// OpsEmailReportConfig is the report section of the email notification
// config. Each report kind (daily summary, weekly summary, error digest,
// account health) has an enable flag and a schedule string.
//
// NOTE(review): the schedule string format is not defined here — confirm
// against the scheduler that consumes it.
type OpsEmailReportConfig struct {
	Enabled                         bool     `json:"enabled"`
	Recipients                      []string `json:"recipients"`
	DailySummaryEnabled             bool     `json:"daily_summary_enabled"`
	DailySummarySchedule            string   `json:"daily_summary_schedule"`
	WeeklySummaryEnabled            bool     `json:"weekly_summary_enabled"`
	WeeklySummarySchedule           string   `json:"weekly_summary_schedule"`
	ErrorDigestEnabled              bool     `json:"error_digest_enabled"`
	ErrorDigestSchedule             string   `json:"error_digest_schedule"`
	ErrorDigestMinCount             int      `json:"error_digest_min_count"`
	AccountHealthEnabled            bool     `json:"account_health_enabled"`
	AccountHealthSchedule           string   `json:"account_health_schedule"`
	AccountHealthErrorRateThreshold float64  `json:"account_health_error_rate_threshold"`
}
|
||||
|
||||
// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
|
||||
// frontend can still send the full config shape.
|
||||
type OpsEmailNotificationConfigUpdateRequest struct {
|
||||
Alert *OpsEmailAlertConfig `json:"alert"`
|
||||
Report *OpsEmailReportConfig `json:"report"`
|
||||
}
|
||||
|
||||
// OpsDistributedLockSettings configures a distributed lock: whether it is
// enabled, the lock key and the TTL in seconds.
type OpsDistributedLockSettings struct {
	Enabled    bool   `json:"enabled"`
	Key        string `json:"key"`
	TTLSeconds int    `json:"ttl_seconds"`
}
|
||||
|
||||
// OpsAlertSilenceEntry is one alert silencing entry.
//
// RuleID and Severities are optional and omitted from JSON when nil/empty.
// NOTE(review): presumably a missing value means "match any" — confirm
// against the code that evaluates silences.
type OpsAlertSilenceEntry struct {
	RuleID     *int64   `json:"rule_id,omitempty"`
	Severities []string `json:"severities,omitempty"`

	// UntilRFC3339 is kept as a string; going by the field name it is expected
	// to hold an RFC 3339 timestamp (no parsing or validation happens here).
	UntilRFC3339 string `json:"until_rfc3339"`
	Reason       string `json:"reason"`
}
|
||||
|
||||
type OpsAlertSilencingSettings struct {
|
||||
Enabled bool `json:"enabled"`
|
||||
|
||||
GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
|
||||
GlobalReason string `json:"global_reason"`
|
||||
|
||||
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
|
||||
}
|
||||
|
||||
type OpsAlertRuntimeSettings struct {
|
||||
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
|
||||
|
||||
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
|
||||
Silencing OpsAlertSilencingSettings `json:"silencing"`
|
||||
}
|
||||
|
||||
65
backend/internal/service/ops_trend_models.go
Normal file
65
backend/internal/service/ops_trend_models.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package service
|
||||
|
||||
import "time"
|
||||
|
||||
// OpsThroughputTrendPoint is one time bucket of the throughput trend: the
// bucket start time, request and token counts, and QPS/TPS values.
type OpsThroughputTrendPoint struct {
	BucketStart   time.Time `json:"bucket_start"`
	RequestCount  int64     `json:"request_count"`
	TokenConsumed int64     `json:"token_consumed"`
	QPS           float64   `json:"qps"`
	TPS           float64   `json:"tps"`
}
|
||||
|
||||
// OpsThroughputPlatformBreakdownItem holds the request and token totals for
// one platform.
type OpsThroughputPlatformBreakdownItem struct {
	Platform      string `json:"platform"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}
|
||||
|
||||
// OpsThroughputGroupBreakdownItem holds the request and token totals for one
// group, identified by ID and name.
type OpsThroughputGroupBreakdownItem struct {
	GroupID       int64  `json:"group_id"`
	GroupName     string `json:"group_name"`
	RequestCount  int64  `json:"request_count"`
	TokenConsumed int64  `json:"token_consumed"`
}
|
||||
|
||||
type OpsThroughputTrendResponse struct {
|
||||
Bucket string `json:"bucket"`
|
||||
|
||||
Points []*OpsThroughputTrendPoint `json:"points"`
|
||||
|
||||
// Optional drilldown helpers:
|
||||
// - When no platform/group is selected: returns totals by platform.
|
||||
// - When platform is selected but group is not: returns top groups in that platform.
|
||||
ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
|
||||
TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"`
|
||||
}
|
||||
|
||||
// OpsErrorTrendPoint is one time bucket of the error trend, with separate
// counters for total, business-limited and SLA errors, and for upstream
// errors split into 429, 529 and everything else.
type OpsErrorTrendPoint struct {
	BucketStart time.Time `json:"bucket_start"`

	ErrorCountTotal      int64 `json:"error_count_total"`
	BusinessLimitedCount int64 `json:"business_limited_count"`
	ErrorCountSLA        int64 `json:"error_count_sla"`

	UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
	Upstream429Count             int64 `json:"upstream_429_count"`
	Upstream529Count             int64 `json:"upstream_529_count"`
}
|
||||
|
||||
type OpsErrorTrendResponse struct {
|
||||
Bucket string `json:"bucket"`
|
||||
Points []*OpsErrorTrendPoint `json:"points"`
|
||||
}
|
||||
|
||||
// OpsErrorDistributionItem holds the error counters for one status code:
// total, SLA and business-limited.
type OpsErrorDistributionItem struct {
	StatusCode      int   `json:"status_code"`
	Total           int64 `json:"total"`
	SLA             int64 `json:"sla"`
	BusinessLimited int64 `json:"business_limited"`
}
|
||||
|
||||
type OpsErrorDistributionResponse struct {
|
||||
Total int64 `json:"total"`
|
||||
Items []*OpsErrorDistributionItem `json:"items"`
|
||||
}
|
||||
Reference in New Issue
Block a user