From 4b9e47cec915f4ca1709e206a31dd43937d2a4af Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:51:41 +0800 Subject: [PATCH 01/53] =?UTF-8?q?feat(=E5=9F=BA=E7=A1=80=E8=AE=BE=E6=96=BD?= =?UTF-8?q?):=20=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E7=9A=84=E5=9F=BA=E7=A1=80=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E5=92=8C=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新 .gitignore 排除临时文件 - 添加 ops 监控相关配置项到 config.yaml - 更新 Go 依赖包(go.mod/go.sum) - 扩展 config.go 支持 ops 监控配置 - 新增上下文键定义(ClientRequestID) --- .gitignore | 2 - backend/go.mod | 4 +- backend/go.sum | 4 ++ backend/internal/config/config.go | 71 +++++++++++++++++++++++++++ backend/internal/pkg/ctxkey/ctxkey.go | 6 +++ config.yaml | 35 +++++++++++++ 6 files changed, 119 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 93ae19f3..ec218bfa 100644 --- a/.gitignore +++ b/.gitignore @@ -123,6 +123,4 @@ backend/cmd/server/server deploy/docker-compose.override.yml .gocache/ vite.config.js -!docs/ docs/* -!docs/dependency-security.md diff --git a/backend/go.mod b/backend/go.mod index 9ac48305..97f599f8 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -8,9 +8,11 @@ require ( github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 github.com/google/wire v0.7.0 + github.com/gorilla/websocket v1.5.3 github.com/imroc/req/v3 v3.57.0 github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.17.2 + github.com/shirou/gopsutil/v4 v4.25.6 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0 @@ -104,9 +106,9 @@ require ( github.com/quic-go/quic-go v0.57.1 // indirect github.com/refraction-networking/utls v1.8.1 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/shirou/gopsutil/v4 v4.25.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 38e2b53e..0adfa4de 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -113,6 +113,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4= github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -220,6 +222,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 
+github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index e49c188b..6e66b22c 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -42,6 +42,7 @@ type Config struct { Turnstile TurnstileConfig `mapstructure:"turnstile"` Database DatabaseConfig `mapstructure:"database"` Redis RedisConfig `mapstructure:"redis"` + Ops OpsConfig `mapstructure:"ops"` JWT JWTConfig `mapstructure:"jwt"` Default DefaultConfig `mapstructure:"default"` RateLimit RateLimitConfig `mapstructure:"rate_limit"` @@ -304,6 +305,47 @@ func (r *RedisConfig) Address() string { return fmt.Sprintf("%s:%d", r.Host, r.Port) } +type OpsConfig struct { + // Enabled controls whether ops features should run. + // + // NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off. + // This config flag is the "hard switch" for deployments that want to disable ops completely. + Enabled bool `mapstructure:"enabled"` + + // UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries. + UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"` + + // Cleanup controls periodic deletion of old ops data to prevent unbounded growth. + Cleanup OpsCleanupConfig `mapstructure:"cleanup"` + + // MetricsCollectorCache controls Redis caching for expensive per-window collector queries. + MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"` + + // Pre-aggregation configuration. + Aggregation OpsAggregationConfig `mapstructure:"aggregation"` +} + +type OpsCleanupConfig struct { + Enabled bool `mapstructure:"enabled"` + Schedule string `mapstructure:"schedule"` + + // Retention days (0 disables that cleanup target). + // + // vNext requirement: default 30 days across ops datasets. + ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"` + MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"` +} + +type OpsAggregationConfig struct { + Enabled bool `mapstructure:"enabled"` +} + +type OpsMetricsCollectorCacheConfig struct { + Enabled bool `mapstructure:"enabled"` + TTL time.Duration `mapstructure:"ttl"` +} + type JWTConfig struct { Secret string `mapstructure:"secret"` ExpireHour int `mapstructure:"expire_hour"` @@ -489,6 +531,20 @@ func setDefaults() { viper.SetDefault("redis.pool_size", 128) viper.SetDefault("redis.min_idle_conns", 10) + // Ops (vNext) + viper.SetDefault("ops.enabled", true) + viper.SetDefault("ops.use_preaggregated_tables", false) + viper.SetDefault("ops.cleanup.enabled", true) + viper.SetDefault("ops.cleanup.schedule", "0 2 * * *") + // Retention days: vNext defaults to 30 days across ops datasets. 
+ viper.SetDefault("ops.cleanup.error_log_retention_days", 30) + viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30) + viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30) + viper.SetDefault("ops.aggregation.enabled", true) + viper.SetDefault("ops.metrics_collector_cache.enabled", true) + // TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits. + viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second) + // JWT viper.SetDefault("jwt.secret", "") viper.SetDefault("jwt.expire_hour", 24) @@ -687,6 +743,21 @@ func (c *Config) Validate() error { if c.Gateway.Scheduling.SlotCleanupInterval < 0 { return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative") } + if c.Ops.MetricsCollectorCache.TTL < 0 { + return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative") + } + if c.Ops.Cleanup.ErrorLogRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative") + } + if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" { + return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true") + } if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 { return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds") } diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go index 8920ea69..61d98cc2 100644 --- a/backend/internal/pkg/ctxkey/ctxkey.go +++ b/backend/internal/pkg/ctxkey/ctxkey.go @@ -7,4 +7,10 @@ type Key string const ( // ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置 ForcePlatform Key = "ctx_force_platform" + + // ClientRequestID 客户端请求的唯一标识,用于追踪请求全生命周期(用于 Ops 监控与排障)。 + ClientRequestID Key = "ctx_client_request_id" + + // RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。 + RetryCount Key = "ctx_retry_count" ) diff --git a/config.yaml b/config.yaml index f43c9c19..0ce796e7 100644 --- a/config.yaml +++ b/config.yaml @@ -221,6 +221,41 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Hard switch: disable all ops background jobs and APIs when false + # 硬开关:为 false 时禁用所有 Ops 后台任务与接口 + enabled: true + + # Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries. + # 优先使用预聚合表(用于长时间窗口查询性能) + use_preaggregated_tables: false + + # Data cleanup configuration + # 数据清理配置(vNext 默认统一保留 30 天) + cleanup: + enabled: true + # Cron expression (minute hour dom month dow), e.g. 
"0 2 * * *" = daily at 2 AM + # Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点 + schedule: "0 2 * * *" + error_log_retention_days: 30 + minute_metrics_retention_days: 30 + hourly_metrics_retention_days: 30 + + # Pre-aggregation configuration + # 预聚合任务配置 + aggregation: + enabled: true + + # OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments) + # 指标采集 Redis 缓存(多副本部署时减少重复计算) + metrics_collector_cache: + enabled: true + ttl: 65s + # ============================================================================= # JWT Configuration # JWT 配置 From d55866d3755fb6c4b109e225fd52d62414a4c0e5 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:52:17 +0800 Subject: [PATCH 02/53] =?UTF-8?q?feat(=E6=95=B0=E6=8D=AE=E5=BA=93):=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E6=A8=A1=E5=9E=8B=E5=92=8C=E6=95=B0=E6=8D=AE=E5=BA=93?= =?UTF-8?q?=E8=BF=81=E7=A7=BB=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops 监控数据库迁移脚本(表结构定义) - 定义核心数据模型(ops_models.go) - 定义告警相关模型(ops_alert_models.go) - 定义仪表板数据模型(ops_dashboard_models.go) - 定义实时监控数据模型(ops_realtime_models.go) - 定义配置相关模型(ops_settings_models.go) - 定义趋势分析数据模型(ops_trend_models.go) --- backend/internal/service/ops_alert_models.go | 75 ++ .../internal/service/ops_dashboard_models.go | 83 ++ backend/internal/service/ops_models.go | 118 +++ .../internal/service/ops_realtime_models.go | 81 ++ .../internal/service/ops_settings_models.go | 70 ++ backend/internal/service/ops_trend_models.go | 65 ++ .../migrations/030_ops_monitoring_vnext.sql | 707 ++++++++++++++++++ 7 files changed, 1199 insertions(+) create mode 100644 backend/internal/service/ops_alert_models.go create mode 100644 backend/internal/service/ops_dashboard_models.go create mode 100644 backend/internal/service/ops_models.go create mode 100644 backend/internal/service/ops_realtime_models.go create mode 100644 backend/internal/service/ops_settings_models.go create mode 100644 backend/internal/service/ops_trend_models.go create mode 100644 backend/migrations/030_ops_monitoring_vnext.sql diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go new file mode 100644 index 00000000..783a3d1e --- /dev/null +++ b/backend/internal/service/ops_alert_models.go @@ -0,0 +1,75 @@ +package service + +import "time" + +// Ops alert rule/event models. +// +// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned +// with the existing ops dashboard frontend (backup style). 
+ +const ( + OpsAlertStatusFiring = "firing" + OpsAlertStatusResolved = "resolved" +) + +type OpsAlertRule struct { + ID int64 `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + + Enabled bool `json:"enabled"` + Severity string `json:"severity"` + + MetricType string `json:"metric_type"` + Operator string `json:"operator"` + Threshold float64 `json:"threshold"` + + WindowMinutes int `json:"window_minutes"` + SustainedMinutes int `json:"sustained_minutes"` + CooldownMinutes int `json:"cooldown_minutes"` + + NotifyEmail bool `json:"notify_email"` + + Filters map[string]any `json:"filters,omitempty"` + + LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsAlertEvent struct { + ID int64 `json:"id"` + RuleID int64 `json:"rule_id"` + Severity string `json:"severity"` + Status string `json:"status"` + + Title string `json:"title"` + Description string `json:"description"` + + MetricValue *float64 `json:"metric_value,omitempty"` + ThresholdValue *float64 `json:"threshold_value,omitempty"` + + Dimensions map[string]any `json:"dimensions,omitempty"` + + FiredAt time.Time `json:"fired_at"` + ResolvedAt *time.Time `json:"resolved_at,omitempty"` + + EmailSent bool `json:"email_sent"` + CreatedAt time.Time `json:"created_at"` +} + +type OpsAlertEventFilter struct { + Limit int + + // Optional filters. + Status string + Severity string + + StartTime *time.Time + EndTime *time.Time + + // Dimensions filters (best-effort). + Platform string + GroupID *int64 +} + diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go new file mode 100644 index 00000000..51a0b1fb --- /dev/null +++ b/backend/internal/service/ops_dashboard_models.go @@ -0,0 +1,83 @@ +package service + +import "time" + +type OpsDashboardFilter struct { + StartTime time.Time + EndTime time.Time + + Platform string + GroupID *int64 + + // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables. + // Expected values: auto/raw/preagg (see OpsQueryMode). + QueryMode OpsQueryMode +} + +type OpsRateSummary struct { + Current float64 `json:"current"` + Peak float64 `json:"peak"` + Avg float64 `json:"avg"` +} + +type OpsPercentiles struct { + P50 *int `json:"p50_ms"` + P90 *int `json:"p90_ms"` + P95 *int `json:"p95_ms"` + P99 *int `json:"p99_ms"` + Avg *int `json:"avg_ms"` + Max *int `json:"max_ms"` +} + +type OpsDashboardOverview struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + // Latest system-level snapshot (window=1m, global). + SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` + + // Background jobs health (heartbeats). 
+ JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + + ErrorCountSLA int64 `json:"error_count_sla"` + RequestCountTotal int64 `json:"request_count_total"` + RequestCountSLA int64 `json:"request_count_sla"` + + TokenConsumed int64 `json:"token_consumed"` + + SLA float64 `json:"sla"` + ErrorRate float64 `json:"error_rate"` + UpstreamErrorRate float64 `json:"upstream_error_rate"` + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` + + QPS OpsRateSummary `json:"qps"` + TPS OpsRateSummary `json:"tps"` + + Duration OpsPercentiles `json:"duration"` + TTFT OpsPercentiles `json:"ttft"` +} + +type OpsLatencyHistogramBucket struct { + Range string `json:"range"` + Count int64 `json:"count"` +} + +// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only). +// It is used by the Ops dashboard to quickly identify tail latency regressions. +type OpsLatencyHistogramResponse struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + TotalRequests int64 `json:"total_requests"` + Buckets []*OpsLatencyHistogramBucket `json:"buckets"` +} diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go new file mode 100644 index 00000000..90b2dc47 --- /dev/null +++ b/backend/internal/service/ops_models.go @@ -0,0 +1,118 @@ +package service + +import "time" + +type OpsErrorLog struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + Phase string `json:"phase"` + Type string `json:"type"` + Severity string `json:"severity"` + + StatusCode int `json:"status_code"` + Platform string `json:"platform"` + Model string `json:"model"` + + LatencyMs *int `json:"latency_ms"` + + ClientRequestID string `json:"client_request_id"` + RequestID string `json:"request_id"` + Message string `json:"message"` + + UserID *int64 `json:"user_id"` + APIKeyID *int64 `json:"api_key_id"` + AccountID *int64 `json:"account_id"` + GroupID *int64 `json:"group_id"` + + ClientIP *string `json:"client_ip"` + RequestPath string `json:"request_path"` + Stream bool `json:"stream"` +} + +type OpsErrorLogDetail struct { + OpsErrorLog + + ErrorBody string `json:"error_body"` + UserAgent string `json:"user_agent"` + + // Timings (optional) + AuthLatencyMs *int64 `json:"auth_latency_ms"` + RoutingLatencyMs *int64 `json:"routing_latency_ms"` + UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` + ResponseLatencyMs *int64 `json:"response_latency_ms"` + TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` + + // Retry context + RequestBody string `json:"request_body"` + RequestBodyTruncated bool `json:"request_body_truncated"` + RequestBodyBytes *int `json:"request_body_bytes"` + RequestHeaders string `json:"request_headers,omitempty"` + + // vNext metric semantics + IsBusinessLimited bool `json:"is_business_limited"` +} + +type OpsErrorLogFilter struct { + StartTime *time.Time + EndTime *time.Time + + Platform string + GroupID *int64 + AccountID *int64 + + StatusCodes []int + Phase string + Query string + + Page int + PageSize int +} + +type OpsErrorLogList struct { + Errors []*OpsErrorLog `json:"errors"` + Total int `json:"total"` + Page int `json:"page"` + 
PageSize int `json:"page_size"` +} + +type OpsRetryAttempt struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + + RequestedByUserID int64 `json:"requested_by_user_id"` + SourceErrorID int64 `json:"source_error_id"` + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` + + Status string `json:"status"` + StartedAt *time.Time `json:"started_at"` + FinishedAt *time.Time `json:"finished_at"` + DurationMs *int64 `json:"duration_ms"` + + ResultRequestID *string `json:"result_request_id"` + ResultErrorID *int64 `json:"result_error_id"` + + ErrorMessage *string `json:"error_message"` +} + +type OpsRetryResult struct { + AttemptID int64 `json:"attempt_id"` + Mode string `json:"mode"` + Status string `json:"status"` + + PinnedAccountID *int64 `json:"pinned_account_id"` + UsedAccountID *int64 `json:"used_account_id"` + + HTTPStatusCode int `json:"http_status_code"` + UpstreamRequestID string `json:"upstream_request_id"` + + ResponsePreview string `json:"response_preview"` + ResponseTruncated bool `json:"response_truncated"` + + ErrorMessage string `json:"error_message"` + + StartedAt time.Time `json:"started_at"` + FinishedAt time.Time `json:"finished_at"` + DurationMs int64 `json:"duration_ms"` +} diff --git a/backend/internal/service/ops_realtime_models.go b/backend/internal/service/ops_realtime_models.go new file mode 100644 index 00000000..f7514a24 --- /dev/null +++ b/backend/internal/service/ops_realtime_models.go @@ -0,0 +1,81 @@ +package service + +import "time" + +// PlatformConcurrencyInfo aggregates concurrency usage by platform. +type PlatformConcurrencyInfo struct { + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// GroupConcurrencyInfo aggregates concurrency usage by group. +// +// Note: one account can belong to multiple groups; group totals are therefore not additive across groups. +type GroupConcurrencyInfo struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// AccountConcurrencyInfo represents real-time concurrency usage for a single account. +type AccountConcurrencyInfo struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + CurrentInUse int64 `json:"current_in_use"` + MaxCapacity int64 `json:"max_capacity"` + LoadPercentage float64 `json:"load_percentage"` + WaitingInQueue int64 `json:"waiting_in_queue"` +} + +// PlatformAvailability aggregates account availability by platform. +type PlatformAvailability struct { + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// GroupAvailability aggregates account availability by group. 
+type GroupAvailability struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + Platform string `json:"platform"` + TotalAccounts int64 `json:"total_accounts"` + AvailableCount int64 `json:"available_count"` + RateLimitCount int64 `json:"rate_limit_count"` + ErrorCount int64 `json:"error_count"` +} + +// AccountAvailability represents current availability for a single account. +type AccountAvailability struct { + AccountID int64 `json:"account_id"` + AccountName string `json:"account_name"` + Platform string `json:"platform"` + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + + Status string `json:"status"` + + IsAvailable bool `json:"is_available"` + IsRateLimited bool `json:"is_rate_limited"` + IsOverloaded bool `json:"is_overloaded"` + HasError bool `json:"has_error"` + + RateLimitResetAt *time.Time `json:"rate_limit_reset_at"` + RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"` + OverloadUntil *time.Time `json:"overload_until"` + OverloadRemainingSec *int64 `json:"overload_remaining_sec"` + ErrorMessage string `json:"error_message"` + TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"` +} diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go new file mode 100644 index 00000000..78399c49 --- /dev/null +++ b/backend/internal/service/ops_settings_models.go @@ -0,0 +1,70 @@ +package service + +// Ops settings models stored in DB `settings` table (JSON blobs). + +type OpsEmailNotificationConfig struct { + Alert OpsEmailAlertConfig `json:"alert"` + Report OpsEmailReportConfig `json:"report"` +} + +type OpsEmailAlertConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + MinSeverity string `json:"min_severity"` + RateLimitPerHour int `json:"rate_limit_per_hour"` + BatchingWindowSeconds int `json:"batching_window_seconds"` + IncludeResolvedAlerts bool `json:"include_resolved_alerts"` +} + +type OpsEmailReportConfig struct { + Enabled bool `json:"enabled"` + Recipients []string `json:"recipients"` + DailySummaryEnabled bool `json:"daily_summary_enabled"` + DailySummarySchedule string `json:"daily_summary_schedule"` + WeeklySummaryEnabled bool `json:"weekly_summary_enabled"` + WeeklySummarySchedule string `json:"weekly_summary_schedule"` + ErrorDigestEnabled bool `json:"error_digest_enabled"` + ErrorDigestSchedule string `json:"error_digest_schedule"` + ErrorDigestMinCount int `json:"error_digest_min_count"` + AccountHealthEnabled bool `json:"account_health_enabled"` + AccountHealthSchedule string `json:"account_health_schedule"` + AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"` +} + +// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the +// frontend can still send the full config shape. 
+type OpsEmailNotificationConfigUpdateRequest struct { + Alert *OpsEmailAlertConfig `json:"alert"` + Report *OpsEmailReportConfig `json:"report"` +} + +type OpsDistributedLockSettings struct { + Enabled bool `json:"enabled"` + Key string `json:"key"` + TTLSeconds int `json:"ttl_seconds"` +} + +type OpsAlertSilenceEntry struct { + RuleID *int64 `json:"rule_id,omitempty"` + Severities []string `json:"severities,omitempty"` + + UntilRFC3339 string `json:"until_rfc3339"` + Reason string `json:"reason"` +} + +type OpsAlertSilencingSettings struct { + Enabled bool `json:"enabled"` + + GlobalUntilRFC3339 string `json:"global_until_rfc3339"` + GlobalReason string `json:"global_reason"` + + Entries []OpsAlertSilenceEntry `json:"entries,omitempty"` +} + +type OpsAlertRuntimeSettings struct { + EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"` + + DistributedLock OpsDistributedLockSettings `json:"distributed_lock"` + Silencing OpsAlertSilencingSettings `json:"silencing"` +} + diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go new file mode 100644 index 00000000..f6d07c14 --- /dev/null +++ b/backend/internal/service/ops_trend_models.go @@ -0,0 +1,65 @@ +package service + +import "time" + +type OpsThroughputTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` + QPS float64 `json:"qps"` + TPS float64 `json:"tps"` +} + +type OpsThroughputPlatformBreakdownItem struct { + Platform string `json:"platform"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputGroupBreakdownItem struct { + GroupID int64 `json:"group_id"` + GroupName string `json:"group_name"` + RequestCount int64 `json:"request_count"` + TokenConsumed int64 `json:"token_consumed"` +} + +type OpsThroughputTrendResponse struct { + Bucket string `json:"bucket"` + + Points []*OpsThroughputTrendPoint `json:"points"` + + // Optional drilldown helpers: + // - When no platform/group is selected: returns totals by platform. + // - When platform is selected but group is not: returns top groups in that platform. 
+ ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"` + TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"` +} + +type OpsErrorTrendPoint struct { + BucketStart time.Time `json:"bucket_start"` + + ErrorCountTotal int64 `json:"error_count_total"` + BusinessLimitedCount int64 `json:"business_limited_count"` + ErrorCountSLA int64 `json:"error_count_sla"` + + UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"` + Upstream429Count int64 `json:"upstream_429_count"` + Upstream529Count int64 `json:"upstream_529_count"` +} + +type OpsErrorTrendResponse struct { + Bucket string `json:"bucket"` + Points []*OpsErrorTrendPoint `json:"points"` +} + +type OpsErrorDistributionItem struct { + StatusCode int `json:"status_code"` + Total int64 `json:"total"` + SLA int64 `json:"sla"` + BusinessLimited int64 `json:"business_limited"` +} + +type OpsErrorDistributionResponse struct { + Total int64 `json:"total"` + Items []*OpsErrorDistributionItem `json:"items"` +} diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/030_ops_monitoring_vnext.sql new file mode 100644 index 00000000..39b19e5d --- /dev/null +++ b/backend/migrations/030_ops_monitoring_vnext.sql @@ -0,0 +1,707 @@ +-- Ops Monitoring (vNext): squashed migration (030) +-- +-- This repository originally planned Ops vNext as migrations 030-036: +-- 030 drop legacy ops tables +-- 031 core schema +-- 032 pre-aggregation tables +-- 033 indexes + optional extensions +-- 034 add avg/max to preagg +-- 035 add notify_email to alert rules +-- 036 seed default alert rules +-- +-- Since these migrations have NOT been applied to any environment yet, we squash them +-- into a single 030 migration for easier review and a cleaner migration history. +-- +-- Notes: +-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts). +-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run. + +-- ===================================================================== +-- 030_ops_drop_legacy_ops_tables.sql +-- ===================================================================== + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Legacy pre-aggregation tables (from 026 and/or previous branches) +DROP TABLE IF EXISTS ops_metrics_daily CASCADE; +DROP TABLE IF EXISTS ops_metrics_hourly CASCADE; + +-- Core ops tables that may exist in some deployments / branches +DROP TABLE IF EXISTS ops_system_metrics CASCADE; +DROP TABLE IF EXISTS ops_error_logs CASCADE; +DROP TABLE IF EXISTS ops_alert_events CASCADE; +DROP TABLE IF EXISTS ops_alert_rules CASCADE; +DROP TABLE IF EXISTS ops_job_heartbeats CASCADE; +DROP TABLE IF EXISTS ops_retry_attempts CASCADE; + +-- Optional legacy tables (best-effort cleanup) +DROP TABLE IF EXISTS ops_scheduled_reports CASCADE; +DROP TABLE IF EXISTS ops_group_availability_configs CASCADE; +DROP TABLE IF EXISTS ops_group_availability_events CASCADE; + +-- Optional legacy views/indexes +DROP VIEW IF EXISTS ops_latest_metrics CASCADE; + +-- ===================================================================== +-- 031_ops_core_schema.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts) +-- +-- Design goals: +-- - Support global filtering (time/platform/group) across all ops modules. 
+-- - Persist enough context for two retry modes (client retry / pinned upstream retry). +-- - Make ops background jobs observable via job heartbeats. +-- - Keep schema stable and indexes targeted (high-write tables). +-- +-- Notes: +-- - This migration is idempotent. +-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_error_logs: error log details (high-write) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_error_logs ( + id BIGSERIAL PRIMARY KEY, + + -- Correlation / identities + request_id VARCHAR(64), + client_request_id VARCHAR(64), + user_id BIGINT, + api_key_id BIGINT, + account_id BIGINT, + group_id BIGINT, + client_ip inet, + + -- Dimensions for global filtering + platform VARCHAR(32), + + -- Request metadata + model VARCHAR(100), + request_path VARCHAR(256), + stream BOOLEAN NOT NULL DEFAULT false, + user_agent TEXT, + + -- Core error classification + error_phase VARCHAR(32) NOT NULL, + error_type VARCHAR(64) NOT NULL, + severity VARCHAR(8) NOT NULL DEFAULT 'P2', + status_code INT, + + -- vNext metric semantics + is_business_limited BOOLEAN NOT NULL DEFAULT false, + + -- Error details (sanitized/truncated at ingest time) + error_message TEXT, + error_body TEXT, + + -- Provider/upstream details (optional; useful for trends & account health) + error_source VARCHAR(64), + error_owner VARCHAR(32), + account_status VARCHAR(50), + upstream_status_code INT, + upstream_error_message TEXT, + upstream_error_detail TEXT, + provider_error_code VARCHAR(64), + provider_error_type VARCHAR(64), + network_error_type VARCHAR(50), + retry_after_seconds INT, + + -- Timings (ms) - optional + duration_ms INT, + time_to_first_token_ms BIGINT, + auth_latency_ms BIGINT, + routing_latency_ms BIGINT, + upstream_latency_ms BIGINT, + response_latency_ms BIGINT, + + -- Retry context (only stored for error requests) + request_body JSONB, + request_headers JSONB, + request_body_truncated BOOLEAN NOT NULL DEFAULT false, + request_body_bytes INT, + + -- Retryability flags (best-effort classification) + is_retryable BOOLEAN NOT NULL DEFAULT false, + retry_count INT NOT NULL DEFAULT 0, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). 
Stores sanitized error details and request_body for retries (errors only).'; + +-- ============================================ +-- 2) ops_retry_attempts: audit log for retries +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_retry_attempts ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + requested_by_user_id BIGINT, + source_error_id BIGINT, + + -- client|upstream + mode VARCHAR(16) NOT NULL, + pinned_account_id BIGINT, + + -- queued|running|succeeded|failed + status VARCHAR(16) NOT NULL DEFAULT 'queued', + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + duration_ms BIGINT, + + -- Optional result correlation + result_request_id VARCHAR(64), + result_error_id BIGINT, + result_usage_request_id VARCHAR(64), + + error_message TEXT +); + +COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).'; + +-- ============================================ +-- 3) ops_system_metrics: system + request window snapshots +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_system_metrics ( + id BIGSERIAL PRIMARY KEY, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + window_minutes INT NOT NULL DEFAULT 1, + + -- Optional dimensions (only if collector chooses to write per-dimension snapshots) + platform VARCHAR(32), + group_id BIGINT, + + -- Core counts + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Rates + qps DOUBLE PRECISION, + tps DOUBLE PRECISION, + + -- Duration percentiles (ms) - success requests + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + duration_avg_ms DOUBLE PRECISION, + duration_max_ms INT, + + -- TTFT percentiles (ms) - success requests (streaming) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + ttft_avg_ms DOUBLE PRECISION, + ttft_max_ms INT, + + -- System resources + cpu_usage_percent DOUBLE PRECISION, + memory_used_mb BIGINT, + memory_total_mb BIGINT, + memory_usage_percent DOUBLE PRECISION, + + -- Dependency health (best-effort) + db_ok BOOLEAN, + redis_ok BOOLEAN, + + -- DB pool & runtime + db_conn_active INT, + db_conn_idle INT, + db_conn_waiting INT, + goroutine_count INT, + + -- Queue / concurrency + concurrency_queue_depth INT +); + +COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). 
Used for dashboard overview and realtime rates.'; + +-- ============================================ +-- 4) ops_job_heartbeats: background jobs health +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_job_heartbeats ( + job_name VARCHAR(64) PRIMARY KEY, + + last_run_at TIMESTAMPTZ, + last_success_at TIMESTAMPTZ, + last_error_at TIMESTAMPTZ, + last_error TEXT, + last_duration_ms BIGINT, + + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).'; + +-- ============================================ +-- 5) ops_alert_rules / ops_alert_events +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_alert_rules ( + id BIGSERIAL PRIMARY KEY, + + name VARCHAR(128) NOT NULL, + description TEXT, + enabled BOOLEAN NOT NULL DEFAULT true, + + severity VARCHAR(16) NOT NULL DEFAULT 'warning', + + -- Metric definition + -- Metric definition + metric_type VARCHAR(64) NOT NULL, + operator VARCHAR(8) NOT NULL, + threshold DOUBLE PRECISION NOT NULL, + + window_minutes INT NOT NULL DEFAULT 5, + sustained_minutes INT NOT NULL DEFAULT 5, + cooldown_minutes INT NOT NULL DEFAULT 10, + + -- Optional scoping: platform/group filters etc. + filters JSONB, + + last_triggered_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique + ON ops_alert_rules (name); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled + ON ops_alert_rules (enabled); + +CREATE TABLE IF NOT EXISTS ops_alert_events ( + id BIGSERIAL PRIMARY KEY, + + rule_id BIGINT, + severity VARCHAR(16) NOT NULL, + status VARCHAR(16) NOT NULL DEFAULT 'firing', + + title VARCHAR(200), + description TEXT, + + metric_value DOUBLE PRECISION, + threshold_value DOUBLE PRECISION, + dimensions JSONB, + + fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + resolved_at TIMESTAMPTZ, + + email_sent BOOLEAN NOT NULL DEFAULT false, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status + ON ops_alert_events (rule_id, status); + +CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at + ON ops_alert_events (fired_at DESC); + +-- ===================================================================== +-- 032_ops_preaggregation_tables.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): pre-aggregation tables +-- +-- Purpose: +-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive +-- percentile_cont scans on raw logs for every dashboard refresh. +-- - Support global filter dimensions: overall / platform / group. +-- +-- Design note: +-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a +-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres). 
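
Aside: because the unique index defined below is built on `COALESCE(platform, '')` and `COALESCE(group_id, 0)`, an aggregation writer can target those same expressions in `ON CONFLICT` to make re-aggregation idempotent. A sketch of what such a writer might look like; `upsertHourlyBucket` is illustrative only (the actual aggregation job lands in a later patch) and writes just two of the count columns for brevity:

```go
package repository

import (
	"context"
	"database/sql"
	"time"
)

// upsertHourlyBucket shows how the COALESCE-based unique index on ops_metrics_hourly
// lets a bucket be recomputed safely: re-running the same bucket/dimension overwrites
// the previous row instead of inserting a duplicate.
func upsertHourlyBucket(ctx context.Context, db *sql.DB, bucket time.Time,
	platform *string, groupID *int64, successCount, errorCountTotal int64) error {
	const q = `
INSERT INTO ops_metrics_hourly (bucket_start, platform, group_id, success_count, error_count_total, computed_at)
VALUES ($1, $2, $3, $4, $5, NOW())
ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
DO UPDATE SET
    success_count     = EXCLUDED.success_count,
    error_count_total = EXCLUDED.error_count_total,
    computed_at       = NOW()`
	// nil platform/groupID map to NULL, i.e. the "overall" dimension mode.
	_, err := db.ExecContext(ctx, q, bucket, platform, groupID, successCount, errorCountTotal)
	return err
}
```
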
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) ops_metrics_hourly +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_hourly ( + id BIGSERIAL PRIMARY KEY, + + bucket_start TIMESTAMPTZ NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + -- Duration percentiles (ms) + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + -- TTFT percentiles (ms) + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Uniqueness across three “dimension modes” (overall / platform / group). +-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE. +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim + ON ops_metrics_hourly ( + bucket_start, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket + ON ops_metrics_hourly (bucket_start DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket + ON ops_metrics_hourly (platform, bucket_start DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket + ON ops_metrics_hourly (group_id, bucket_start DESC) + WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).'; + +-- ============================================ +-- 2) ops_metrics_daily (optional; for longer windows) +-- ============================================ + +CREATE TABLE IF NOT EXISTS ops_metrics_daily ( + id BIGSERIAL PRIMARY KEY, + + bucket_date DATE NOT NULL, + platform VARCHAR(32), + group_id BIGINT, + + success_count BIGINT NOT NULL DEFAULT 0, + error_count_total BIGINT NOT NULL DEFAULT 0, + business_limited_count BIGINT NOT NULL DEFAULT 0, + error_count_sla BIGINT NOT NULL DEFAULT 0, + + upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0, + upstream_429_count BIGINT NOT NULL DEFAULT 0, + upstream_529_count BIGINT NOT NULL DEFAULT 0, + + token_consumed BIGINT NOT NULL DEFAULT 0, + + duration_p50_ms INT, + duration_p90_ms INT, + duration_p95_ms INT, + duration_p99_ms INT, + + ttft_p50_ms INT, + ttft_p90_ms INT, + ttft_p95_ms INT, + ttft_p99_ms INT, + + computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim + ON ops_metrics_daily ( + bucket_date, + COALESCE(platform, ''), + COALESCE(group_id, 0) + ); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket + ON ops_metrics_daily (bucket_date DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket + ON ops_metrics_daily (platform, bucket_date DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket + ON ops_metrics_daily (group_id, bucket_date DESC) + 
WHERE group_id IS NOT NULL AND group_id <> 0; + +COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).'; + +-- ===================================================================== +-- 033_ops_indexes_and_extensions.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): indexes and optional extensions +-- +-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort, +-- so environments without extension privileges won't fail the whole migration chain. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- ============================================ +-- 1) Core btree indexes (always safe) +-- ============================================ + +-- ops_error_logs +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at + ON ops_error_logs (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time + ON ops_error_logs (platform, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time + ON ops_error_logs (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time + ON ops_error_logs (account_id, created_at DESC) + WHERE account_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time + ON ops_error_logs (status_code, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time + ON ops_error_logs (error_phase, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time + ON ops_error_logs (error_type, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id + ON ops_error_logs (request_id); + +CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id + ON ops_error_logs (client_request_id); + +-- ops_system_metrics +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at + ON ops_system_metrics (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time + ON ops_system_metrics (window_minutes, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time + ON ops_system_metrics (platform, created_at DESC) + WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL; + +CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time + ON ops_system_metrics (group_id, created_at DESC) + WHERE group_id IS NOT NULL; + +-- ops_retry_attempts +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at + ON ops_retry_attempts (created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error + ON ops_retry_attempts (source_error_id, created_at DESC) + WHERE source_error_id IS NOT NULL; + +-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe). +CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active + ON ops_retry_attempts (source_error_id) + WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running'); + +-- ============================================ +-- 2) Optional: pg_trgm + trigram indexes for fuzzy search +-- ============================================ + +DO $$ +BEGIN + BEGIN + CREATE EXTENSION IF NOT EXISTS pg_trgm; + EXCEPTION WHEN OTHERS THEN + -- Missing privileges or extension package should not block migrations. 
+ RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM; + END; + + IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN + -- request_id / client_request_id fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm + ON ops_error_logs USING gin (request_id gin_trgm_ops)'; + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm + ON ops_error_logs USING gin (client_request_id gin_trgm_ops)'; + + -- error_message fuzzy search + EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm + ON ops_error_logs USING gin (error_message gin_trgm_ops)'; + END IF; +END $$; + +-- ===================================================================== +-- 034_ops_preaggregation_add_avg_max.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields +-- +-- Why: +-- - The dashboard overview returns avg/max for duration/TTFT. +-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes +-- it impossible to answer avg/max in preagg mode without falling back to raw scans. +-- +-- This migration is idempotent and safe to run multiple times. +-- +-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for +-- approximate long-window summaries. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- Hourly table +ALTER TABLE ops_metrics_hourly + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- Daily table +ALTER TABLE ops_metrics_daily + ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS duration_max_ms INT, + ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION, + ADD COLUMN IF NOT EXISTS ttft_max_ms INT; + +-- ===================================================================== +-- 035_ops_alert_rules_notify_email.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): alert rule notify settings +-- +-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard. +-- Migration is idempotent. + +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +ALTER TABLE ops_alert_rules + ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true; + +-- ===================================================================== +-- 036_ops_seed_default_alert_rules.sql +-- ===================================================================== + +-- Ops Monitoring (vNext): seed default alert rules (idempotent) +-- +-- Goal: +-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events. +-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING. +-- +-- Notes: +-- - Thresholds are intentionally conservative defaults and should be tuned per deployment. +-- - Metric semantics follow vNext: +-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited). +-- - upstream_error_rate excludes 429/529. 
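
Aside: for readers of the seeded rules below, the vNext semantics mean `error_rate` and `success_rate` are computed over SLA-scope traffic only. A rough sketch of those ratios, under the assumption (taken from the comments in this migration, not from code in this patch) that the SLA scope simply excludes business-limited requests and that rates are expressed in percent to match thresholds such as 5.0 and 95.0:

```go
package service

// opsRateSketch illustrates one plausible reading of the vNext metric semantics referenced
// by the seeded alert rules. The authoritative formulas live in the metrics collector added
// later in this series; treat the scoping below (SLA excludes business-limited requests,
// the upstream rate excludes HTTP 429/529 counts) as an assumption.
func opsRateSketch(success, errorTotal, businessLimited, upstreamErrExcl429529 int64) (sla, errorRate, upstreamErrorRate float64) {
	errorSLA := errorTotal - businessLimited // error_count_sla: errors that count against the SLA
	requestSLA := success + errorSLA         // request_count_sla: SLA-scope request volume
	if requestSLA <= 0 {
		return 100, 0, 0 // no traffic: treat as healthy
	}
	sla = float64(success) / float64(requestSLA) * 100
	errorRate = float64(errorSLA) / float64(requestSLA) * 100
	upstreamErrorRate = float64(upstreamErrExcl429529) / float64(requestSLA) * 100
	return sla, errorRate, upstreamErrorRate
}
```
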
+ +SET LOCAL lock_timeout = '5s'; +SET LOCAL statement_timeout = '10min'; + +-- 1) High error rate (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率过高', + '当错误率超过 5% 且持续 5 分钟时触发告警', + true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 2) Low success rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '成功率过低', + '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)', + true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 3) P99 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P99延迟过高', + '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警', + true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 4) P95 latency too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'P95延迟过高', + '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警', + true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 5) CPU usage too high (P2) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + 'CPU使用率过高', + '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警', + true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 6) Memory usage too high (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '内存使用率过高', + '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)', + true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 7) Concurrency queue buildup (P1) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '并发队列积压', + '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)', + true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; + +-- 8) Extremely high error rate (P0) +INSERT INTO ops_alert_rules ( + name, description, enabled, metric_type, operator, threshold, + window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes, + created_at, updated_at +) VALUES ( + '错误率极高', + '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)', + true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW() +) ON CONFLICT (name) DO NOTHING; From bb5303272b801611361fd317e6a28e2a052b4b9c Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:52:57 +0800 Subject: [PATCH 03/53] =?UTF-8?q?feat(repository):=20=E5=AE=9E=E7=8E=B0?= 
=?UTF-8?q?=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E6=95=B0=E6=8D=AE=E8=AE=BF?= =?UTF-8?q?=E9=97=AE=E5=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops 主仓库(ops_repo.go) - 实现告警数据访问(ops_repo_alerts.go) - 实现仪表板数据访问(ops_repo_dashboard.go) - 实现直方图数据访问(ops_repo_histograms.go) - 实现延迟直方图桶逻辑(ops_repo_latency_histogram_buckets.go) - 新增延迟直方图桶测试(ops_repo_latency_histogram_buckets_test.go) - 实现指标数据访问(ops_repo_metrics.go) - 实现预聚合数据访问(ops_repo_preagg.go) - 实现请求详情数据访问(ops_repo_request_details.go) - 实现趋势数据访问(ops_repo_trends.go) - 实现窗口统计数据访问(ops_repo_window_stats.go) - 更新并发缓存支持 ops 场景 - 注册 repository 依赖注入 --- .../internal/repository/concurrency_cache.go | 16 +- backend/internal/repository/ops_repo.go | 676 +++++++++++ .../internal/repository/ops_repo_alerts.go | 689 +++++++++++ .../internal/repository/ops_repo_dashboard.go | 1012 +++++++++++++++++ .../repository/ops_repo_histograms.go | 79 ++ .../ops_repo_latency_histogram_buckets.go | 64 ++ ...ops_repo_latency_histogram_buckets_test.go | 14 + .../internal/repository/ops_repo_metrics.go | 401 +++++++ .../internal/repository/ops_repo_preagg.go | 359 ++++++ .../repository/ops_repo_request_details.go | 285 +++++ .../internal/repository/ops_repo_trends.go | 567 +++++++++ .../repository/ops_repo_window_stats.go | 50 + backend/internal/repository/wire.go | 1 + 13 files changed, 4203 insertions(+), 10 deletions(-) create mode 100644 backend/internal/repository/ops_repo.go create mode 100644 backend/internal/repository/ops_repo_alerts.go create mode 100644 backend/internal/repository/ops_repo_dashboard.go create mode 100644 backend/internal/repository/ops_repo_histograms.go create mode 100644 backend/internal/repository/ops_repo_latency_histogram_buckets.go create mode 100644 backend/internal/repository/ops_repo_latency_histogram_buckets_test.go create mode 100644 backend/internal/repository/ops_repo_metrics.go create mode 100644 backend/internal/repository/ops_repo_preagg.go create mode 100644 backend/internal/repository/ops_repo_request_details.go create mode 100644 backend/internal/repository/ops_repo_trends.go create mode 100644 backend/internal/repository/ops_repo_window_stats.go diff --git a/backend/internal/repository/concurrency_cache.go b/backend/internal/repository/concurrency_cache.go index 0831f5eb..b34961e1 100644 --- a/backend/internal/repository/concurrency_cache.go +++ b/backend/internal/repository/concurrency_cache.go @@ -93,7 +93,7 @@ var ( return redis.call('ZCARD', key) `) - // incrementWaitScript - only sets TTL on first creation to avoid refreshing + // incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate // KEYS[1] = wait queue key // ARGV[1] = maxWait // ARGV[2] = TTL in seconds @@ -111,15 +111,13 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. 
+ redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) - // incrementAccountWaitScript - account-level wait queue count + // incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment) incrementAccountWaitScript = redis.NewScript(` local current = redis.call('GET', KEYS[1]) if current == false then @@ -134,10 +132,8 @@ var ( local newVal = redis.call('INCR', KEYS[1]) - -- Only set TTL on first creation to avoid refreshing zombie data - if newVal == 1 then - redis.call('EXPIRE', KEYS[1], ARGV[2]) - end + -- Refresh TTL so long-running traffic doesn't expire active queue counters. + redis.call('EXPIRE', KEYS[1], ARGV[2]) return 1 `) diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go new file mode 100644 index 00000000..b27a9ea0 --- /dev/null +++ b/backend/internal/repository/ops_repo.go @@ -0,0 +1,676 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/lib/pq" +) + +type opsRepository struct { + db *sql.DB +} + +func NewOpsRepository(db *sql.DB) service.OpsRepository { + return &opsRepository{db: db} +} + +func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + + q := ` +INSERT INTO ops_error_logs ( + request_id, + client_request_id, + user_id, + api_key_id, + account_id, + group_id, + client_ip, + platform, + model, + request_path, + stream, + user_agent, + error_phase, + error_type, + severity, + status_code, + is_business_limited, + error_message, + error_body, + error_source, + error_owner, + upstream_status_code, + upstream_error_message, + upstream_error_detail, + duration_ms, + time_to_first_token_ms, + request_body, + request_body_truncated, + request_body_bytes, + request_headers, + is_retryable, + retry_count, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullString(input.RequestID), + opsNullString(input.ClientRequestID), + opsNullInt64(input.UserID), + opsNullInt64(input.APIKeyID), + opsNullInt64(input.AccountID), + opsNullInt64(input.GroupID), + opsNullString(input.ClientIP), + opsNullString(input.Platform), + opsNullString(input.Model), + opsNullString(input.RequestPath), + input.Stream, + opsNullString(input.UserAgent), + input.ErrorPhase, + input.ErrorType, + opsNullString(input.Severity), + opsNullInt(input.StatusCode), + input.IsBusinessLimited, + opsNullString(input.ErrorMessage), + opsNullString(input.ErrorBody), + opsNullString(input.ErrorSource), + opsNullString(input.ErrorOwner), + opsNullInt(input.UpstreamStatusCode), + opsNullString(input.UpstreamErrorMessage), + opsNullString(input.UpstreamErrorDetail), + opsNullInt(input.DurationMs), + opsNullInt64(input.TimeToFirstTokenMs), + opsNullString(input.RequestBodyJSON), + input.RequestBodyTruncated, + opsNullInt(input.RequestBodyBytes), + opsNullString(input.RequestHeadersJSON), + input.IsRetryable, + input.RetryCount, + input.CreatedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) { + if r == nil || r.db == 
nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsErrorLogFilter{} + } + + page := filter.Page + if page <= 0 { + page = 1 + } + pageSize := filter.PageSize + if pageSize <= 0 { + pageSize = 20 + } + if pageSize > 500 { + pageSize = 500 + } + + where, args := buildOpsErrorLogsWhere(filter) + countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where + + var total int + if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil { + return nil, err + } + + offset := (page - 1) * pageSize + argsWithLimit := append(args, pageSize, offset) + selectSQL := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream +FROM ops_error_logs +` + where + ` +ORDER BY created_at DESC +LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2) + + rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...) + if err != nil { + return nil, err + } + defer rows.Close() + + out := make([]*service.OpsErrorLog, 0, pageSize) + for rows.Next() { + var item service.OpsErrorLog + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID sql.NullInt64 + var groupID sql.NullInt64 + if err := rows.Scan( + &item.ID, + &item.CreatedAt, + &item.Phase, + &item.Type, + &item.Severity, + &statusCode, + &item.Platform, + &item.Model, + &latency, + &item.ClientRequestID, + &item.RequestID, + &item.Message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &item.RequestPath, + &item.Stream, + ); err != nil { + return nil, err + } + if latency.Valid { + v := int(latency.Int64) + item.LatencyMs = &v + } + item.StatusCode = int(statusCode.Int64) + if clientIP.Valid { + s := clientIP.String + item.ClientIP = &s + } + if userID.Valid { + v := userID.Int64 + item.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + item.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + item.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + item.GroupID = &v + } + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorLogList{ + Errors: out, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} + +func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if id <= 0 { + return nil, fmt.Errorf("invalid id") + } + + q := ` +SELECT + id, + created_at, + error_phase, + error_type, + severity, + COALESCE(status_code, 0), + COALESCE(platform, ''), + COALESCE(model, ''), + duration_ms, + COALESCE(client_request_id, ''), + COALESCE(request_id, ''), + COALESCE(error_message, ''), + COALESCE(error_body, ''), + is_business_limited, + user_id, + api_key_id, + account_id, + group_id, + CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END, + COALESCE(request_path, ''), + stream, + COALESCE(user_agent, ''), + auth_latency_ms, + routing_latency_ms, + upstream_latency_ms, + response_latency_ms, + time_to_first_token_ms, + COALESCE(request_body::text, ''), + 
request_body_truncated, + request_body_bytes, + COALESCE(request_headers::text, '') +FROM ops_error_logs +WHERE id = $1 +LIMIT 1` + + var out service.OpsErrorLogDetail + var latency sql.NullInt64 + var statusCode sql.NullInt64 + var clientIP sql.NullString + var userID sql.NullInt64 + var apiKeyID sql.NullInt64 + var accountID sql.NullInt64 + var groupID sql.NullInt64 + var authLatency sql.NullInt64 + var routingLatency sql.NullInt64 + var upstreamLatency sql.NullInt64 + var responseLatency sql.NullInt64 + var ttft sql.NullInt64 + var requestBodyBytes sql.NullInt64 + + err := r.db.QueryRowContext(ctx, q, id).Scan( + &out.ID, + &out.CreatedAt, + &out.Phase, + &out.Type, + &out.Severity, + &statusCode, + &out.Platform, + &out.Model, + &latency, + &out.ClientRequestID, + &out.RequestID, + &out.Message, + &out.ErrorBody, + &out.IsBusinessLimited, + &userID, + &apiKeyID, + &accountID, + &groupID, + &clientIP, + &out.RequestPath, + &out.Stream, + &out.UserAgent, + &authLatency, + &routingLatency, + &upstreamLatency, + &responseLatency, + &ttft, + &out.RequestBody, + &out.RequestBodyTruncated, + &requestBodyBytes, + &out.RequestHeaders, + ) + if err != nil { + return nil, err + } + + out.StatusCode = int(statusCode.Int64) + if latency.Valid { + v := int(latency.Int64) + out.LatencyMs = &v + } + if clientIP.Valid { + s := clientIP.String + out.ClientIP = &s + } + if userID.Valid { + v := userID.Int64 + out.UserID = &v + } + if apiKeyID.Valid { + v := apiKeyID.Int64 + out.APIKeyID = &v + } + if accountID.Valid { + v := accountID.Int64 + out.AccountID = &v + } + if groupID.Valid { + v := groupID.Int64 + out.GroupID = &v + } + if authLatency.Valid { + v := authLatency.Int64 + out.AuthLatencyMs = &v + } + if routingLatency.Valid { + v := routingLatency.Int64 + out.RoutingLatencyMs = &v + } + if upstreamLatency.Valid { + v := upstreamLatency.Int64 + out.UpstreamLatencyMs = &v + } + if responseLatency.Valid { + v := responseLatency.Int64 + out.ResponseLatencyMs = &v + } + if ttft.Valid { + v := ttft.Int64 + out.TimeToFirstTokenMs = &v + } + if requestBodyBytes.Valid { + v := int(requestBodyBytes.Int64) + out.RequestBodyBytes = &v + } + + // Normalize request_body to empty string when stored as JSON null. + out.RequestBody = strings.TrimSpace(out.RequestBody) + if out.RequestBody == "null" { + out.RequestBody = "" + } + // Normalize request_headers to empty string when stored as JSON null. 
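+	// (COALESCE in the SELECT only covers SQL NULL; a stored JSONB null casts to the
+	// literal string "null", which is why the trimmed value is compared against "null" here.)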
+ out.RequestHeaders = strings.TrimSpace(out.RequestHeaders) + if out.RequestHeaders == "null" { + out.RequestHeaders = "" + } + + return &out, nil +} + +func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) { + if r == nil || r.db == nil { + return 0, fmt.Errorf("nil ops repository") + } + if input == nil { + return 0, fmt.Errorf("nil input") + } + if input.SourceErrorID <= 0 { + return 0, fmt.Errorf("invalid source_error_id") + } + if strings.TrimSpace(input.Mode) == "" { + return 0, fmt.Errorf("invalid mode") + } + + q := ` +INSERT INTO ops_retry_attempts ( + requested_by_user_id, + source_error_id, + mode, + pinned_account_id, + status, + started_at +) VALUES ( + $1,$2,$3,$4,$5,$6 +) RETURNING id` + + var id int64 + err := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&input.RequestedByUserID), + input.SourceErrorID, + strings.TrimSpace(input.Mode), + opsNullInt64(input.PinnedAccountID), + strings.TrimSpace(input.Status), + input.StartedAt, + ).Scan(&id) + if err != nil { + return 0, err + } + return id, nil +} + +func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.ID <= 0 { + return fmt.Errorf("invalid id") + } + + q := ` +UPDATE ops_retry_attempts +SET + status = $2, + finished_at = $3, + duration_ms = $4, + result_request_id = $5, + result_error_id = $6, + error_message = $7 +WHERE id = $1` + + _, err := r.db.ExecContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Status), + nullTime(input.FinishedAt), + input.DurationMs, + opsNullString(input.ResultRequestID), + opsNullInt64(input.ResultErrorID), + opsNullString(input.ErrorMessage), + ) + return err +} + +func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if sourceErrorID <= 0 { + return nil, fmt.Errorf("invalid source_error_id") + } + + q := ` +SELECT + id, + created_at, + COALESCE(requested_by_user_id, 0), + source_error_id, + COALESCE(mode, ''), + pinned_account_id, + COALESCE(status, ''), + started_at, + finished_at, + duration_ms, + result_request_id, + result_error_id, + error_message +FROM ops_retry_attempts +WHERE source_error_id = $1 +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsRetryAttempt + var pinnedAccountID sql.NullInt64 + var requestedBy sql.NullInt64 + var startedAt sql.NullTime + var finishedAt sql.NullTime + var durationMs sql.NullInt64 + var resultRequestID sql.NullString + var resultErrorID sql.NullInt64 + var errorMessage sql.NullString + + err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan( + &out.ID, + &out.CreatedAt, + &requestedBy, + &out.SourceErrorID, + &out.Mode, + &pinnedAccountID, + &out.Status, + &startedAt, + &finishedAt, + &durationMs, + &resultRequestID, + &resultErrorID, + &errorMessage, + ) + if err != nil { + return nil, err + } + out.RequestedByUserID = requestedBy.Int64 + if pinnedAccountID.Valid { + v := pinnedAccountID.Int64 + out.PinnedAccountID = &v + } + if startedAt.Valid { + t := startedAt.Time + out.StartedAt = &t + } + if finishedAt.Valid { + t := finishedAt.Time + out.FinishedAt = &t + } + if durationMs.Valid { + v := durationMs.Int64 + out.DurationMs = &v + } + if resultRequestID.Valid { + s := resultRequestID.String + 
out.ResultRequestID = &s + } + if resultErrorID.Valid { + v := resultErrorID.Int64 + out.ResultErrorID = &v + } + if errorMessage.Valid { + s := errorMessage.String + out.ErrorMessage = &s + } + + return &out, nil +} + +func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} + +func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) { + clauses := make([]string, 0, 8) + args := make([]any, 0, 8) + clauses = append(clauses, "1=1") + + if filter.StartTime != nil && !filter.StartTime.IsZero() { + args = append(args, filter.StartTime.UTC()) + clauses = append(clauses, "created_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, filter.EndTime.UTC()) + // Keep time-window semantics consistent with other ops queries: [start, end) + clauses = append(clauses, "created_at < $"+itoa(len(args))) + } + if p := strings.TrimSpace(filter.Platform); p != "" { + args = append(args, p) + clauses = append(clauses, "platform = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, *filter.GroupID) + clauses = append(clauses, "group_id = $"+itoa(len(args))) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + args = append(args, *filter.AccountID) + clauses = append(clauses, "account_id = $"+itoa(len(args))) + } + if phase := strings.TrimSpace(filter.Phase); phase != "" { + args = append(args, phase) + clauses = append(clauses, "error_phase = $"+itoa(len(args))) + } + if len(filter.StatusCodes) > 0 { + args = append(args, pq.Array(filter.StatusCodes)) + clauses = append(clauses, "status_code = ANY($"+itoa(len(args))+")") + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + q + "%" + args = append(args, like) + n := itoa(len(args)) + clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")") + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +// Helpers for nullable args +func opsNullString(v any) any { + switch s := v.(type) { + case nil: + return sql.NullString{} + case *string: + if s == nil || strings.TrimSpace(*s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(*s), Valid: true} + case string: + if strings.TrimSpace(s) == "" { + return sql.NullString{} + } + return sql.NullString{String: strings.TrimSpace(s), Valid: true} + default: + return sql.NullString{} + } +} + +func opsNullInt64(v *int64) any { + if v == nil || *v == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *v, Valid: true} +} + +func opsNullInt(v any) any { + switch n := v.(type) { + case nil: + return sql.NullInt64{} + case *int: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(*n), Valid: true} + case *int64: + if n == nil || *n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: *n, Valid: true} + case int: + if n == 0 { + return sql.NullInt64{} + } + return sql.NullInt64{Int64: int64(n), Valid: true} + default: + return sql.NullInt64{} + } +} diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go new file mode 100644 index 00000000..ce99e6f7 --- /dev/null +++ b/backend/internal/repository/ops_repo_alerts.go @@ -0,0 +1,689 @@ +package repository + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + 
"github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at +FROM ops_alert_rules +ORDER BY id DESC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer rows.Close() + + out := []*service.OpsAlertRule{} + for rows.Next() { + var rule service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + if err := rows.Scan( + &rule.ID, + &rule.Name, + &rule.Description, + &rule.Enabled, + &rule.Severity, + &rule.MetricType, + &rule.Operator, + &rule.Threshold, + &rule.WindowMinutes, + &rule.SustainedMinutes, + &rule.CooldownMinutes, + &rule.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &rule.CreatedAt, + &rule.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + rule.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + rule.Filters = decoded + } + } + out = append(out, &rule) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_rules ( + name, + description, + enabled, + severity, + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + notify_email, + filters, + created_at, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW() +) +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + &out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, 
&decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if input == nil { + return nil, fmt.Errorf("nil input") + } + if input.ID <= 0 { + return nil, fmt.Errorf("invalid id") + } + + filtersArg, err := opsNullJSONMap(input.Filters) + if err != nil { + return nil, err + } + + q := ` +UPDATE ops_alert_rules +SET + name = $2, + description = $3, + enabled = $4, + severity = $5, + metric_type = $6, + operator = $7, + threshold = $8, + window_minutes = $9, + sustained_minutes = $10, + cooldown_minutes = $11, + notify_email = $12, + filters = $13, + updated_at = NOW() +WHERE id = $1 +RETURNING + id, + name, + COALESCE(description, ''), + enabled, + COALESCE(severity, ''), + metric_type, + operator, + threshold, + window_minutes, + sustained_minutes, + cooldown_minutes, + COALESCE(notify_email, true), + filters, + last_triggered_at, + created_at, + updated_at` + + var out service.OpsAlertRule + var filtersRaw []byte + var lastTriggeredAt sql.NullTime + + if err := r.db.QueryRowContext( + ctx, + q, + input.ID, + strings.TrimSpace(input.Name), + strings.TrimSpace(input.Description), + input.Enabled, + strings.TrimSpace(input.Severity), + strings.TrimSpace(input.MetricType), + strings.TrimSpace(input.Operator), + input.Threshold, + input.WindowMinutes, + input.SustainedMinutes, + input.CooldownMinutes, + input.NotifyEmail, + filtersArg, + ).Scan( + &out.ID, + &out.Name, + &out.Description, + &out.Enabled, + &out.Severity, + &out.MetricType, + &out.Operator, + &out.Threshold, + &out.WindowMinutes, + &out.SustainedMinutes, + &out.CooldownMinutes, + &out.NotifyEmail, + &filtersRaw, + &lastTriggeredAt, + &out.CreatedAt, + &out.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastTriggeredAt.Valid { + v := lastTriggeredAt.Time + out.LastTriggeredAt = &v + } + if len(filtersRaw) > 0 && string(filtersRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(filtersRaw, &decoded); err == nil { + out.Filters = decoded + } + } + + return &out, nil +} + +func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if id <= 0 { + return fmt.Errorf("invalid id") + } + + res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id) + if err != nil { + return err + } + affected, err := res.RowsAffected() + if err != nil { + return err + } + if affected == 0 { + return sql.ErrNoRows + } + return nil +} + +func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + filter = &service.OpsAlertEventFilter{} + } + + limit := filter.Limit + if limit <= 0 { + limit = 100 + } + if limit > 500 { + limit = 500 + } + + where, args := buildOpsAlertEventsWhere(filter) + args = append(args, limit) + limitArg := "$" + itoa(len(args)) + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +` + where + ` +ORDER BY fired_at DESC +LIMIT ` + limitArg + + rows, err := 
r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + out := []*service.OpsAlertEvent{} + for rows.Next() { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + if err := rows.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + out = append(out, &ev) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 AND status = $2 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if ruleID <= 0 { + return nil, fmt.Errorf("invalid rule id") + } + + q := ` +SELECT + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +FROM ops_alert_events +WHERE rule_id = $1 +ORDER BY fired_at DESC +LIMIT 1` + + row := r.db.QueryRowContext(ctx, q, ruleID) + ev, err := scanOpsAlertEvent(row) + if err != nil { + if err == sql.ErrNoRows { + return nil, nil + } + return nil, err + } + return ev, nil +} + +func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if event == nil { + return nil, fmt.Errorf("nil event") + } + + dimensionsArg, err := opsNullJSONMap(event.Dimensions) + if err != nil { + return nil, err + } + + q := ` +INSERT INTO ops_alert_events ( + rule_id, + severity, + status, + title, + description, + metric_value, + threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at +) VALUES ( + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW() +) +RETURNING + id, + COALESCE(rule_id, 0), + COALESCE(severity, ''), + COALESCE(status, ''), + COALESCE(title, ''), + COALESCE(description, ''), + metric_value, + 
threshold_value, + dimensions, + fired_at, + resolved_at, + email_sent, + created_at` + + row := r.db.QueryRowContext( + ctx, + q, + opsNullInt64(&event.RuleID), + opsNullString(event.Severity), + opsNullString(event.Status), + opsNullString(event.Title), + opsNullString(event.Description), + opsNullFloat64(event.MetricValue), + opsNullFloat64(event.ThresholdValue), + dimensionsArg, + event.FiredAt, + opsNullTime(event.ResolvedAt), + event.EmailSent, + ) + return scanOpsAlertEvent(row) +} + +func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + if strings.TrimSpace(status) == "" { + return fmt.Errorf("invalid status") + } + + q := ` +UPDATE ops_alert_events +SET status = $2, + resolved_at = $3 +WHERE id = $1` + + _, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt)) + return err +} + +func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if eventID <= 0 { + return fmt.Errorf("invalid event id") + } + + _, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent) + return err +} + +type opsAlertEventRow interface { + Scan(dest ...any) error +} + +func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) { + var ev service.OpsAlertEvent + var metricValue sql.NullFloat64 + var thresholdValue sql.NullFloat64 + var dimensionsRaw []byte + var resolvedAt sql.NullTime + + if err := row.Scan( + &ev.ID, + &ev.RuleID, + &ev.Severity, + &ev.Status, + &ev.Title, + &ev.Description, + &metricValue, + &thresholdValue, + &dimensionsRaw, + &ev.FiredAt, + &resolvedAt, + &ev.EmailSent, + &ev.CreatedAt, + ); err != nil { + return nil, err + } + if metricValue.Valid { + v := metricValue.Float64 + ev.MetricValue = &v + } + if thresholdValue.Valid { + v := thresholdValue.Float64 + ev.ThresholdValue = &v + } + if resolvedAt.Valid { + v := resolvedAt.Time + ev.ResolvedAt = &v + } + if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" { + var decoded map[string]any + if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil { + ev.Dimensions = decoded + } + } + return &ev, nil +} + +func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) { + clauses := []string{"1=1"} + args := []any{} + + if filter == nil { + return "WHERE " + strings.Join(clauses, " AND "), args + } + + if status := strings.TrimSpace(filter.Status); status != "" { + args = append(args, status) + clauses = append(clauses, "status = $"+itoa(len(args))) + } + if severity := strings.TrimSpace(filter.Severity); severity != "" { + args = append(args, severity) + clauses = append(clauses, "severity = $"+itoa(len(args))) + } + if filter.StartTime != nil && !filter.StartTime.IsZero() { + args = append(args, *filter.StartTime) + clauses = append(clauses, "fired_at >= $"+itoa(len(args))) + } + if filter.EndTime != nil && !filter.EndTime.IsZero() { + args = append(args, *filter.EndTime) + clauses = append(clauses, "fired_at < $"+itoa(len(args))) + } + + // Dimensions are stored in JSONB. We filter best-effort without requiring GIN indexes. 
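+	// For example (values assumed), platform "claude" and group_id 42 yield the predicate
+	// (dimensions->>'platform') = 'claude' AND (dimensions->>'group_id') = '42';
+	// ->> returns text, so the group id is matched as a string.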
+ if platform := strings.TrimSpace(filter.Platform); platform != "" { + args = append(args, platform) + clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args))) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + args = append(args, fmt.Sprintf("%d", *filter.GroupID)) + clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args))) + } + + return "WHERE " + strings.Join(clauses, " AND "), args +} + +func opsNullJSONMap(v map[string]any) (any, error) { + if v == nil { + return sql.NullString{}, nil + } + b, err := json.Marshal(v) + if err != nil { + return nil, err + } + if len(b) == 0 { + return sql.NullString{}, nil + } + return sql.NullString{String: string(b), Valid: true}, nil +} diff --git a/backend/internal/repository/ops_repo_dashboard.go b/backend/internal/repository/ops_repo_dashboard.go new file mode 100644 index 00000000..d96efd48 --- /dev/null +++ b/backend/internal/repository/ops_repo_dashboard.go @@ -0,0 +1,1012 @@ +package repository + +import ( + "context" + "database/sql" + "errors" + "fmt" + "math" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetDashboardOverview(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + mode := filter.QueryMode + if !mode.IsValid() { + mode = service.OpsQueryModeRaw + } + + switch mode { + case service.OpsQueryModePreagg: + return r.getDashboardOverviewPreaggregated(ctx, filter) + case service.OpsQueryModeAuto: + out, err := r.getDashboardOverviewPreaggregated(ctx, filter) + if err != nil && errors.Is(err, service.ErrOpsPreaggregatedNotPopulated) { + return r.getDashboardOverviewRaw(ctx, filter) + } + return out, err + default: + return r.getDashboardOverviewRaw(ctx, filter) + } +} + +func (r *opsRepository) getDashboardOverviewRaw(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := 
roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsDashboardPartial struct { + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + duration service.OpsPercentiles + ttft service.OpsPercentiles +} + +func (r *opsRepository) getDashboardOverviewPreaggregated(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) { + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + // Stable full-hour range covered by pre-aggregation. + aggSafeEnd := preaggSafeEnd(end) + aggFullStart := utcCeilToHour(start) + aggFullEnd := utcFloorToHour(aggSafeEnd) + + // If there are no stable full-hour buckets, use raw directly (short windows). + if !aggFullStart.Before(aggFullEnd) { + return r.getDashboardOverviewRaw(ctx, filter) + } + + // 1) Pre-aggregated stable segment. + preaggRows, err := r.listHourlyMetricsRows(ctx, filter, aggFullStart, aggFullEnd) + if err != nil { + return nil, err + } + if len(preaggRows) == 0 { + // Distinguish "no data" vs "preagg not populated yet". + if exists, err := r.rawOpsDataExists(ctx, filter, aggFullStart, aggFullEnd); err == nil && exists { + return nil, service.ErrOpsPreaggregatedNotPopulated + } + } + preagg := aggregateHourlyRows(preaggRows) + + // 2) Raw head/tail fragments (at most ~1 hour each). + head := opsDashboardPartial{} + tail := opsDashboardPartial{} + + if start.Before(aggFullStart) { + part, err := r.queryRawPartial(ctx, filter, start, minTime(end, aggFullStart)) + if err != nil { + return nil, err + } + head = *part + } + if aggFullEnd.Before(end) { + part, err := r.queryRawPartial(ctx, filter, maxTime(start, aggFullEnd), end) + if err != nil { + return nil, err + } + tail = *part + } + + // Merge counts. 
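+	// Example with assumed times: start=09:25Z, end=12:40Z and the safety cutoff past
+	// 12:00Z give head=[09:25,10:00) from raw logs, preagg=[10:00,12:00) from
+	// ops_metrics_hourly, and tail=[12:00,12:40) from raw logs; each counter below is
+	// the sum of those three segments.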
+ successCount := preagg.successCount + head.successCount + tail.successCount + errorTotal := preagg.errorCountTotal + head.errorCountTotal + tail.errorCountTotal + businessLimited := preagg.businessLimitedCount + head.businessLimitedCount + tail.businessLimitedCount + errorCountSLA := preagg.errorCountSLA + head.errorCountSLA + tail.errorCountSLA + + upstreamExcl := preagg.upstreamErrorCountExcl429529 + head.upstreamErrorCountExcl429529 + tail.upstreamErrorCountExcl429529 + upstream429 := preagg.upstream429Count + head.upstream429Count + tail.upstream429Count + upstream529 := preagg.upstream529Count + head.upstream529Count + tail.upstream529Count + + tokenConsumed := preagg.tokenConsumed + head.tokenConsumed + tail.tokenConsumed + + // Approximate percentiles across segments: + // - p50/p90/avg: weighted average by success_count + // - p95/p99/max: max (conservative tail) + duration := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.duration}, + {weight: head.successCount, p: head.duration}, + {weight: tail.successCount, p: tail.duration}, + }) + ttft := combineApproxPercentiles([]opsPercentileSegment{ + {weight: preagg.successCount, p: preagg.ttft}, + {weight: head.successCount, p: head.ttft}, + {weight: tail.successCount, p: tail.ttft}, + }) + + windowSeconds := end.Sub(start).Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + requestCountSLA := successCount + errorCountSLA + + sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA)) + errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA)) + upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA)) + + // Keep "current" rates as raw, to preserve realtime semantics. + qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + // NOTE: peak still uses raw logs (minute granularity). This is typically cheaper than percentile_cont + // and keeps semantics consistent across modes. 
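+	// Because the peak is computed from per-minute buckets over raw usage/error logs,
+	// a short burst remains visible even when most of the window is served from the
+	// hourly rollups.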
+ qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end) + if err != nil { + return nil, err + } + + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + return &service.OpsDashboardOverview{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorCountSLA, + RequestCountTotal: requestCountTotal, + RequestCountSLA: requestCountSLA, + TokenConsumed: tokenConsumed, + + SLA: roundTo4DP(sla), + ErrorRate: roundTo4DP(errorRate), + UpstreamErrorRate: roundTo4DP(upstreamErrorRate), + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + + Duration: duration, + TTFT: ttft, + }, nil +} + +type opsHourlyMetricsRow struct { + bucketStart time.Time + + successCount int64 + errorCountTotal int64 + businessLimitedCount int64 + errorCountSLA int64 + + upstreamErrorCountExcl429529 int64 + upstream429Count int64 + upstream529Count int64 + + tokenConsumed int64 + + durationP50 sql.NullInt64 + durationP90 sql.NullInt64 + durationP95 sql.NullInt64 + durationP99 sql.NullInt64 + durationAvg sql.NullFloat64 + durationMax sql.NullInt64 + + ttftP50 sql.NullInt64 + ttftP90 sql.NullInt64 + ttftP95 sql.NullInt64 + ttftP99 sql.NullInt64 + ttftAvg sql.NullFloat64 + ttftMax sql.NullInt64 +} + +func (r *opsRepository) listHourlyMetricsRows(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) ([]opsHourlyMetricsRow, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if start.IsZero() || end.IsZero() || !start.Before(end) { + return []opsHourlyMetricsRow{}, nil + } + + where := "bucket_start >= $1 AND bucket_start < $2" + args := []any{start.UTC(), end.UTC()} + idx := 3 + + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + switch { + case groupID != nil && *groupID > 0: + where += fmt.Sprintf(" AND group_id = $%d", idx) + args = append(args, *groupID) + idx++ + if platform != "" { + where += fmt.Sprintf(" AND platform = $%d", idx) + args = append(args, platform) + idx++ + } + case platform != "": + where += fmt.Sprintf(" AND platform = $%d AND group_id IS NULL", idx) + args = append(args, platform) + idx++ + default: + where += " AND platform IS NULL AND group_id IS NULL" + } + + q := ` +SELECT + bucket_start, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms +FROM ops_metrics_hourly +WHERE ` + where + ` +ORDER BY bucket_start ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer rows.Close() + + out := make([]opsHourlyMetricsRow, 0, 64) + for rows.Next() { + var row opsHourlyMetricsRow + if err := rows.Scan( + &row.bucketStart, + &row.successCount, + &row.errorCountTotal, + &row.businessLimitedCount, + &row.errorCountSLA, + &row.upstreamErrorCountExcl429529, + &row.upstream429Count, + &row.upstream529Count, + &row.tokenConsumed, + &row.durationP50, + &row.durationP90, + &row.durationP95, + &row.durationP99, + &row.durationAvg, + &row.durationMax, + &row.ttftP50, + &row.ttftP90, + &row.ttftP95, + &row.ttftP99, + &row.ttftAvg, + &row.ttftMax, + ); err != nil { + return nil, err + } + out = append(out, row) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func aggregateHourlyRows(rows []opsHourlyMetricsRow) opsDashboardPartial { + out := opsDashboardPartial{} + if len(rows) == 0 { + return out + } + + var ( + p50Sum float64 + p50W int64 + p90Sum float64 + p90W int64 + avgSum float64 + avgW int64 + ) + var ( + ttftP50Sum float64 + ttftP50W int64 + ttftP90Sum float64 + ttftP90W int64 + ttftAvgSum float64 + ttftAvgW int64 + ) + + var ( + p95Max *int + p99Max *int + maxMax *int + + ttftP95Max *int + ttftP99Max *int + ttftMaxMax *int + ) + + for _, row := range rows { + out.successCount += row.successCount + out.errorCountTotal += row.errorCountTotal + out.businessLimitedCount += row.businessLimitedCount + out.errorCountSLA += row.errorCountSLA + + out.upstreamErrorCountExcl429529 += row.upstreamErrorCountExcl429529 + out.upstream429Count += row.upstream429Count + out.upstream529Count += row.upstream529Count + + out.tokenConsumed += row.tokenConsumed + + if row.successCount > 0 { + if row.durationP50.Valid { + p50Sum += float64(row.durationP50.Int64) * float64(row.successCount) + p50W += row.successCount + } + if row.durationP90.Valid { + p90Sum += float64(row.durationP90.Int64) * float64(row.successCount) + p90W += row.successCount + } + if row.durationAvg.Valid { + avgSum += row.durationAvg.Float64 * float64(row.successCount) + avgW += row.successCount + } + if row.ttftP50.Valid { + ttftP50Sum += float64(row.ttftP50.Int64) * float64(row.successCount) + ttftP50W += row.successCount + } + if row.ttftP90.Valid { + ttftP90Sum += float64(row.ttftP90.Int64) * float64(row.successCount) + ttftP90W += row.successCount + } + if row.ttftAvg.Valid { + ttftAvgSum += row.ttftAvg.Float64 * float64(row.successCount) + ttftAvgW += row.successCount + } + } + + if row.durationP95.Valid { + v := int(row.durationP95.Int64) + if p95Max == nil || v > *p95Max { + p95Max = &v + } + } + if row.durationP99.Valid { + v := int(row.durationP99.Int64) + if p99Max == nil || v > *p99Max { + p99Max = &v + } + } + if row.durationMax.Valid { + v := int(row.durationMax.Int64) + if maxMax == nil || v > *maxMax { + maxMax = &v + } + } + + if row.ttftP95.Valid { + v := int(row.ttftP95.Int64) + if ttftP95Max == nil || v > *ttftP95Max { + ttftP95Max = &v + } + } + if row.ttftP99.Valid { + v := int(row.ttftP99.Int64) + if ttftP99Max == nil || v > *ttftP99Max { + ttftP99Max = &v + } + } + if row.ttftMax.Valid { + v := int(row.ttftMax.Int64) + if ttftMaxMax == nil || v > *ttftMaxMax { + ttftMaxMax = &v + } + } + } + + // duration + if p50W > 0 { + v := int(math.Round(p50Sum / float64(p50W))) + out.duration.P50 = &v + } + if p90W > 0 { + v := int(math.Round(p90Sum / float64(p90W))) + out.duration.P90 = &v + } + out.duration.P95 = p95Max + out.duration.P99 = p99Max + if avgW > 0 { + v := int(math.Round(avgSum / float64(avgW))) + 
out.duration.Avg = &v + } + out.duration.Max = maxMax + + // ttft + if ttftP50W > 0 { + v := int(math.Round(ttftP50Sum / float64(ttftP50W))) + out.ttft.P50 = &v + } + if ttftP90W > 0 { + v := int(math.Round(ttftP90Sum / float64(ttftP90W))) + out.ttft.P90 = &v + } + out.ttft.P95 = ttftP95Max + out.ttft.P99 = ttftP99Max + if ttftAvgW > 0 { + v := int(math.Round(ttftAvgSum / float64(ttftAvgW))) + out.ttft.Avg = &v + } + out.ttft.Max = ttftMaxMax + + return out +} + +func (r *opsRepository) queryRawPartial(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (*opsDashboardPartial, error) { + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &opsDashboardPartial{ + successCount: successCount, + errorCountTotal: errorTotal, + businessLimitedCount: businessLimited, + errorCountSLA: errorCountSLA, + upstreamErrorCountExcl429529: upstreamExcl, + upstream429Count: upstream429, + upstream529Count: upstream529, + tokenConsumed: tokenConsumed, + duration: duration, + ttft: ttft, + }, nil +} + +func (r *opsRepository) rawOpsDataExists(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (bool, error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM usage_logs ul ` + join + ` ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + if exists { + return true, nil + } + } + + { + where, args, _ := buildErrorWhere(filter, start, end, 1) + q := `SELECT EXISTS(SELECT 1 FROM ops_error_logs ` + where + ` LIMIT 1)` + var exists bool + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil { + return false, err + } + return exists, nil + } +} + +type opsPercentileSegment struct { + weight int64 + p service.OpsPercentiles +} + +func combineApproxPercentiles(segments []opsPercentileSegment) service.OpsPercentiles { + weightedInt := func(get func(service.OpsPercentiles) *int) *int { + var sum float64 + var w int64 + for _, seg := range segments { + if seg.weight <= 0 { + continue + } + v := get(seg.p) + if v == nil { + continue + } + sum += float64(*v) * float64(seg.weight) + w += seg.weight + } + if w <= 0 { + return nil + } + out := int(math.Round(sum / float64(w))) + return &out + } + + maxInt := func(get func(service.OpsPercentiles) *int) *int { + var max *int + for _, seg := range segments { + v := get(seg.p) + if v == nil { + continue + } + if max == nil || *v > *max { + c := *v + max = &c + } + } + return max + } + + return service.OpsPercentiles{ + P50: weightedInt(func(p service.OpsPercentiles) *int { return p.P50 }), + P90: weightedInt(func(p service.OpsPercentiles) *int { return p.P90 }), + P95: maxInt(func(p service.OpsPercentiles) *int { return p.P95 }), + P99: maxInt(func(p service.OpsPercentiles) *int { return p.P99 }), + Avg: weightedInt(func(p service.OpsPercentiles) *int { return p.Avg }), + Max: maxInt(func(p service.OpsPercentiles) *int { return p.Max }), + } +} + +func preaggSafeEnd(endTime time.Time) time.Time { + now := time.Now().UTC() + cutoff := now.Add(-5 * time.Minute) + if endTime.After(cutoff) { + return 
cutoff + } + return endTime +} + +func utcCeilToHour(t time.Time) time.Time { + u := t.UTC() + f := u.Truncate(time.Hour) + if f.Equal(u) { + return f + } + return f.Add(time.Hour) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} + +func maxTime(a, b time.Time) time.Time { + if a.After(b) { + return a + } + return b +} + +func (r *opsRepository) queryUsageCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs ul +` + join + ` +` + where + + var tokens sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (r *opsRepository) queryUsageLatency(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (duration service.OpsPercentiles, ttft service.OpsPercentiles, err error) { + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + duration.P50 = floatToIntPtr(p50) + duration.P90 = floatToIntPtr(p90) + duration.P95 = floatToIntPtr(p95) + duration.P99 = floatToIntPtr(p99) + duration.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + duration.Max = &v + } + } + + { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs ul +` + join + ` +` + where + ` +AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return service.OpsPercentiles{}, service.OpsPercentiles{}, err + } + ttft.P50 = floatToIntPtr(p50) + ttft.P90 = floatToIntPtr(p90) + ttft.P95 = floatToIntPtr(p95) + ttft.P99 = floatToIntPtr(p99) + ttft.Avg = floatToIntPtr(avg) + if max.Valid { + v := int(max.Int64) + ttft.Max = &v + } + } + + return duration, ttft, nil +} + +func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) ( + errorTotal 
int64, + businessLimited int64, + errorCountSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(COUNT(*), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +` + where + + if err := r.db.QueryRowContext(ctx, q, args...).Scan( + &errorTotal, + &businessLimited, + &errorCountSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorCountSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +func (r *opsRepository) queryCurrentRates(ctx context.Context, filter *service.OpsDashboardFilter, end time.Time) (qpsCurrent float64, tpsCurrent float64, err error) { + windowStart := end.Add(-1 * time.Minute) + + successCount1m, token1m, err := r.queryUsageCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + errorCount1m, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, windowStart, end) + if err != nil { + return 0, 0, err + } + + qpsCurrent = roundTo1DP(float64(successCount1m+errorCount1m) / 60.0) + tpsCurrent = roundTo1DP(float64(token1m) / 60.0) + return qpsCurrent, tpsCurrent, nil +} + +func (r *opsRepository) queryPeakQPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + q := ` +WITH usage_buckets AS ( + SELECT date_trunc('minute', ul.created_at) AS bucket, COUNT(*) AS cnt + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt + FROM ops_error_logs + ` + errorWhere + ` + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.cnt, 0) + COALESCE(e.cnt, 0) AS total + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT COALESCE(MAX(total), 0) FROM combined` + + args := append(usageArgs, errorArgs...) 
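+	// buildUsageWhere numbers its placeholders from $1 and returns the next index,
+	// which buildErrorWhere continues from, so usageArgs followed by errorArgs lines
+	// up with the $1..$N placeholders in the combined query.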
+ + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func (r *opsRepository) queryPeakTPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) { + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + + q := ` +SELECT COALESCE(MAX(tokens_per_min), 0) +FROM ( + SELECT + date_trunc('minute', ul.created_at) AS bucket, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS tokens_per_min + FROM usage_logs ul + ` + join + ` + ` + where + ` + GROUP BY 1 +) t` + + var maxPerMinute sql.NullInt64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil { + return 0, err + } + if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 { + return 0, nil + } + return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil +} + +func buildUsageWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (join string, where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("ul.created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("ul.created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("ul.group_id = $%d", idx)) + idx++ + } + if platform != "" { + // Prefer group.platform when available; fall back to account.platform so we don't + // drop rows where group_id is NULL. 
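+		// NULLIF(g.platform, '') below also treats an empty group platform as missing,
+		// so those rows fall back to the account's platform as well.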
+ join = "LEFT JOIN groups g ON g.id = ul.group_id LEFT JOIN accounts a ON a.id = ul.account_id" + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("COALESCE(NULLIF(g.platform,''), a.platform) = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return join, where, args, idx +} + +func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (where string, args []any, nextIndex int) { + platform := "" + groupID := (*int64)(nil) + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + groupID = filter.GroupID + } + + idx := startIndex + clauses := make([]string, 0, 4) + args = make([]any, 0, 4) + + args = append(args, start) + clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx)) + idx++ + args = append(args, end) + clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx)) + idx++ + + if groupID != nil && *groupID > 0 { + args = append(args, *groupID) + clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx)) + idx++ + } + if platform != "" { + args = append(args, platform) + clauses = append(clauses, fmt.Sprintf("platform = $%d", idx)) + idx++ + } + + where = "WHERE " + strings.Join(clauses, " AND ") + return where, args, idx +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func safeDivideFloat64(numerator float64, denominator float64) float64 { + if denominator == 0 { + return 0 + } + return numerator / denominator +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func roundTo4DP(v float64) float64 { + return math.Round(v*10000) / 10000 +} diff --git a/backend/internal/repository/ops_repo_histograms.go b/backend/internal/repository/ops_repo_histograms.go new file mode 100644 index 00000000..143c7e83 --- /dev/null +++ b/backend/internal/repository/ops_repo_histograms.go @@ -0,0 +1,79 @@ +package repository + +import ( + "context" + "fmt" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + join, where, args, _ := buildUsageWhere(filter, start, end, 1) + rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms") + orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms") + + q := ` +SELECT + ` + rangeExpr + ` AS range, + COALESCE(COUNT(*), 0) AS count, + ` + orderExpr + ` AS ord +FROM usage_logs ul +` + join + ` +` + where + ` +AND ul.duration_ms IS NOT NULL +GROUP BY 1, 3 +ORDER BY 3 ASC` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer rows.Close() + + counts := make(map[string]int64, len(latencyHistogramOrderedRanges)) + var total int64 + for rows.Next() { + var label string + var count int64 + var _ord int + if err := rows.Scan(&label, &count, &_ord); err != nil { + return nil, err + } + counts[label] = count + total += count + } + if err := rows.Err(); err != nil { + return nil, err + } + + buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges)) + for _, label := range latencyHistogramOrderedRanges { + buckets = append(buckets, &service.OpsLatencyHistogramBucket{ + Range: label, + Count: counts[label], + }) + } + + return &service.OpsLatencyHistogramResponse{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + TotalRequests: total, + Buckets: buckets, + }, nil +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets.go b/backend/internal/repository/ops_repo_latency_histogram_buckets.go new file mode 100644 index 00000000..fc085fc6 --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets.go @@ -0,0 +1,64 @@ +package repository + +import ( + "fmt" + "strings" +) + +type latencyHistogramBucket struct { + upperMs int + label string +} + +var latencyHistogramBuckets = []latencyHistogramBucket{ + {upperMs: 100, label: "0-100ms"}, + {upperMs: 200, label: "100-200ms"}, + {upperMs: 500, label: "200-500ms"}, + {upperMs: 1000, label: "500-1000ms"}, + {upperMs: 2000, label: "1000-2000ms"}, + {upperMs: 0, label: "2000ms+"}, // default bucket +} + +var latencyHistogramOrderedRanges = func() []string { + out := make([]string, 0, len(latencyHistogramBuckets)) + for _, b := range latencyHistogramBuckets { + out = append(out, b.label) + } + return out +}() + +func latencyHistogramRangeCaseExpr(column string) string { + var sb strings.Builder + sb.WriteString("CASE\n") + + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label)) + } + + // Default bucket. 
+ last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1] + sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label)) + sb.WriteString("END") + return sb.String() +} + +func latencyHistogramRangeOrderCaseExpr(column string) string { + var sb strings.Builder + sb.WriteString("CASE\n") + + order := 1 + for _, b := range latencyHistogramBuckets { + if b.upperMs <= 0 { + continue + } + sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order)) + order++ + } + + sb.WriteString(fmt.Sprintf("\tELSE %d\n", order)) + sb.WriteString("END") + return sb.String() +} diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go new file mode 100644 index 00000000..dc79f6cc --- /dev/null +++ b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go @@ -0,0 +1,14 @@ +package repository + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) { + require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges)) + for i, b := range latencyHistogramBuckets { + require.Equal(t, b.label, latencyHistogramOrderedRanges[i]) + } +} diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go new file mode 100644 index 00000000..96bad88a --- /dev/null +++ b/backend/internal/repository/ops_repo_metrics.go @@ -0,0 +1,401 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + + window := input.WindowMinutes + if window <= 0 { + window = 1 + } + createdAt := input.CreatedAt + if createdAt.IsZero() { + createdAt = time.Now().UTC() + } + + q := ` +INSERT INTO ops_system_metrics ( + created_at, + window_minutes, + platform, + group_id, + + success_count, + error_count_total, + business_limited_count, + error_count_sla, + + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + + token_consumed, + qps, + tps, + + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +) VALUES ( + $1,$2,$3,$4, + $5,$6,$7,$8, + $9,$10,$11, + $12,$13,$14, + $15,$16,$17,$18,$19,$20, + $21,$22,$23,$24,$25,$26, + $27,$28,$29,$30, + $31,$32, + $33,$34,$35, + $36,$37 +)` + + _, err := r.db.ExecContext( + ctx, + q, + createdAt, + window, + opsNullString(input.Platform), + opsNullInt64(input.GroupID), + + input.SuccessCount, + input.ErrorCountTotal, + input.BusinessLimitedCount, + input.ErrorCountSLA, + + input.UpstreamErrorCountExcl429529, + input.Upstream429Count, + input.Upstream529Count, + + input.TokenConsumed, + opsNullFloat64(input.QPS), + opsNullFloat64(input.TPS), + + opsNullInt(input.DurationP50Ms), + opsNullInt(input.DurationP90Ms), + opsNullInt(input.DurationP95Ms), + opsNullInt(input.DurationP99Ms), + opsNullFloat64(input.DurationAvgMs), + 
opsNullInt(input.DurationMaxMs), + + opsNullInt(input.TTFTP50Ms), + opsNullInt(input.TTFTP90Ms), + opsNullInt(input.TTFTP95Ms), + opsNullInt(input.TTFTP99Ms), + opsNullFloat64(input.TTFTAvgMs), + opsNullInt(input.TTFTMaxMs), + + opsNullFloat64(input.CPUUsagePercent), + opsNullInt(input.MemoryUsedMB), + opsNullInt(input.MemoryTotalMB), + opsNullFloat64(input.MemoryUsagePercent), + + opsNullBool(input.DBOK), + opsNullBool(input.RedisOK), + + opsNullInt(input.DBConnActive), + opsNullInt(input.DBConnIdle), + opsNullInt(input.DBConnWaiting), + + opsNullInt(input.GoroutineCount), + opsNullInt(input.ConcurrencyQueueDepth), + ) + return err +} + +func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + + q := ` +SELECT + id, + created_at, + window_minutes, + + cpu_usage_percent, + memory_used_mb, + memory_total_mb, + memory_usage_percent, + + db_ok, + redis_ok, + + db_conn_active, + db_conn_idle, + db_conn_waiting, + + goroutine_count, + concurrency_queue_depth +FROM ops_system_metrics +WHERE window_minutes = $1 + AND platform IS NULL + AND group_id IS NULL +ORDER BY created_at DESC +LIMIT 1` + + var out service.OpsSystemMetricsSnapshot + var cpu sql.NullFloat64 + var memUsed sql.NullInt64 + var memTotal sql.NullInt64 + var memPct sql.NullFloat64 + var dbOK sql.NullBool + var redisOK sql.NullBool + var dbActive sql.NullInt64 + var dbIdle sql.NullInt64 + var dbWaiting sql.NullInt64 + var goroutines sql.NullInt64 + var queueDepth sql.NullInt64 + + if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan( + &out.ID, + &out.CreatedAt, + &out.WindowMinutes, + &cpu, + &memUsed, + &memTotal, + &memPct, + &dbOK, + &redisOK, + &dbActive, + &dbIdle, + &dbWaiting, + &goroutines, + &queueDepth, + ); err != nil { + return nil, err + } + + if cpu.Valid { + v := cpu.Float64 + out.CPUUsagePercent = &v + } + if memUsed.Valid { + v := memUsed.Int64 + out.MemoryUsedMB = &v + } + if memTotal.Valid { + v := memTotal.Int64 + out.MemoryTotalMB = &v + } + if memPct.Valid { + v := memPct.Float64 + out.MemoryUsagePercent = &v + } + if dbOK.Valid { + v := dbOK.Bool + out.DBOK = &v + } + if redisOK.Valid { + v := redisOK.Bool + out.RedisOK = &v + } + if dbActive.Valid { + v := int(dbActive.Int64) + out.DBConnActive = &v + } + if dbIdle.Valid { + v := int(dbIdle.Int64) + out.DBConnIdle = &v + } + if dbWaiting.Valid { + v := int(dbWaiting.Int64) + out.DBConnWaiting = &v + } + if goroutines.Valid { + v := int(goroutines.Int64) + out.GoroutineCount = &v + } + if queueDepth.Valid { + v := int(queueDepth.Int64) + out.ConcurrencyQueueDepth = &v + } + + return &out, nil +} + +func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if input == nil { + return fmt.Errorf("nil input") + } + if input.JobName == "" { + return fmt.Errorf("job_name required") + } + + q := ` +INSERT INTO ops_job_heartbeats ( + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +) VALUES ( + $1,$2,$3,$4,$5,$6,NOW() +) +ON CONFLICT (job_name) DO UPDATE SET + last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at), + last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at), + last_error_at = CASE + WHEN 
EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at) + END, + last_error = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL + ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error) + END, + last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms), + updated_at = NOW()` + + _, err := r.db.ExecContext( + ctx, + q, + input.JobName, + opsNullTime(input.LastRunAt), + opsNullTime(input.LastSuccessAt), + opsNullTime(input.LastErrorAt), + opsNullString(input.LastError), + opsNullInt(input.LastDurationMs), + ) + return err +} + +func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + + q := ` +SELECT + job_name, + last_run_at, + last_success_at, + last_error_at, + last_error, + last_duration_ms, + updated_at +FROM ops_job_heartbeats +ORDER BY job_name ASC` + + rows, err := r.db.QueryContext(ctx, q) + if err != nil { + return nil, err + } + defer rows.Close() + + out := make([]*service.OpsJobHeartbeat, 0, 8) + for rows.Next() { + var item service.OpsJobHeartbeat + var lastRun sql.NullTime + var lastSuccess sql.NullTime + var lastErrorAt sql.NullTime + var lastError sql.NullString + var lastDuration sql.NullInt64 + + if err := rows.Scan( + &item.JobName, + &lastRun, + &lastSuccess, + &lastErrorAt, + &lastError, + &lastDuration, + &item.UpdatedAt, + ); err != nil { + return nil, err + } + + if lastRun.Valid { + v := lastRun.Time + item.LastRunAt = &v + } + if lastSuccess.Valid { + v := lastSuccess.Time + item.LastSuccessAt = &v + } + if lastErrorAt.Valid { + v := lastErrorAt.Time + item.LastErrorAt = &v + } + if lastError.Valid { + v := lastError.String + item.LastError = &v + } + if lastDuration.Valid { + v := lastDuration.Int64 + item.LastDurationMs = &v + } + + out = append(out, &item) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} + +func opsNullBool(v *bool) any { + if v == nil { + return sql.NullBool{} + } + return sql.NullBool{Bool: *v, Valid: true} +} + +func opsNullFloat64(v *float64) any { + if v == nil { + return sql.NullFloat64{} + } + return sql.NullFloat64{Float64: *v, Valid: true} +} + +func opsNullTime(v *time.Time) any { + if v == nil || v.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: *v, Valid: true} +} + diff --git a/backend/internal/repository/ops_repo_preagg.go b/backend/internal/repository/ops_repo_preagg.go new file mode 100644 index 00000000..6a8b9184 --- /dev/null +++ b/backend/internal/repository/ops_repo_preagg.go @@ -0,0 +1,359 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start := startTime.UTC() + end := endTime.UTC() + + // NOTE: + // - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly. + // - We emit three dimension granularities via GROUPING SETS: + // 1) overall: (bucket_start) + // 2) platform: (bucket_start, platform) + // 3) group: (bucket_start, platform, group_id) + // + // IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based + // unique index; our ON CONFLICT target must match that expression set. 
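+	//
+	// Illustration only (the real index lives in the migrations, not here): the unique index is
+	// assumed to look roughly like
+	//
+	//   CREATE UNIQUE INDEX ux_ops_metrics_hourly_bucket
+	//       ON ops_metrics_hourly (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0));
+	//
+	// which is the expression set named in the ON CONFLICT clause below. With a plain
+	// UNIQUE (bucket_start, platform, group_id), rows whose platform/group_id are NULL would
+	// never conflict, and each aggregation run would insert duplicates instead of updating.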
+ q := ` +WITH usage_base AS ( + SELECT + date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + g.platform AS platform, + ul.group_id AS group_id, + ul.duration_ms AS duration_ms, + ul.first_token_ms AS first_token_ms, + (ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens + FROM usage_logs ul + JOIN groups g ON g.id = ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 +), +usage_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) AS success_count, + COALESCE(SUM(tokens), 0) AS token_consumed, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms, + AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms, + MAX(duration_ms) AS duration_max_ms, + + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms, + AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms, + MAX(first_token_ms) AS ttft_max_ms + FROM usage_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) +), +error_base AS ( + SELECT + date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start, + platform AS platform, + group_id AS group_id, + is_business_limited AS is_business_limited, + error_owner AS error_owner, + status_code AS status_code + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 +), +error_agg AS ( + SELECT + bucket_start, + CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform, + CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id, + COUNT(*) AS error_count_total, + COUNT(*) FILTER (WHERE is_business_limited) AS business_limited_count, + COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_count_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429_count, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529_count + FROM error_base + GROUP BY GROUPING SETS ( + (bucket_start), + (bucket_start, platform), + (bucket_start, platform, group_id) + ) + HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL +), +combined AS ( + SELECT + COALESCE(u.bucket_start, e.bucket_start) AS bucket_start, + COALESCE(u.platform, e.platform) AS platform, + 
COALESCE(u.group_id, e.group_id) AS group_id, + + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count_total, 0) AS error_count_total, + COALESCE(e.business_limited_count, 0) AS business_limited_count, + COALESCE(e.error_count_sla, 0) AS error_count_sla, + COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529, + COALESCE(e.upstream_429_count, 0) AS upstream_429_count, + COALESCE(e.upstream_529_count, 0) AS upstream_529_count, + + COALESCE(u.token_consumed, 0) AS token_consumed, + + u.duration_p50_ms, + u.duration_p90_ms, + u.duration_p95_ms, + u.duration_p99_ms, + u.duration_avg_ms, + u.duration_max_ms, + + u.ttft_p50_ms, + u.ttft_p90_ms, + u.ttft_p95_ms, + u.ttft_p99_ms, + u.ttft_avg_ms, + u.ttft_max_ms + FROM usage_agg u + FULL OUTER JOIN error_agg e + ON u.bucket_start = e.bucket_start + AND COALESCE(u.platform, '') = COALESCE(e.platform, '') + AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0) +) +INSERT INTO ops_metrics_hourly ( + bucket_start, + platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + bucket_start, + NULLIF(platform, '') AS platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms::int, + duration_p90_ms::int, + duration_p95_ms::int, + duration_p99_ms::int, + duration_avg_ms, + duration_max_ms::int, + ttft_p50_ms::int, + ttft_p90_ms::int, + ttft_p95_ms::int, + ttft_p99_ms::int, + ttft_avg_ms, + ttft_max_ms::int, + NOW() +FROM combined +WHERE bucket_start IS NOT NULL + AND (platform IS NULL OR platform <> '') +ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error { + if r == nil || r.db == nil { + return fmt.Errorf("nil ops repository") + } + if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) { + return nil + } + + start := startTime.UTC() + end := endTime.UTC() + + q := ` +INSERT INTO ops_metrics_daily ( + bucket_date, 
+ platform, + group_id, + success_count, + error_count_total, + business_limited_count, + error_count_sla, + upstream_error_count_excl_429_529, + upstream_429_count, + upstream_529_count, + token_consumed, + duration_p50_ms, + duration_p90_ms, + duration_p95_ms, + duration_p99_ms, + duration_avg_ms, + duration_max_ms, + ttft_p50_ms, + ttft_p90_ms, + ttft_p95_ms, + ttft_p99_ms, + ttft_avg_ms, + ttft_max_ms, + computed_at +) +SELECT + (bucket_start AT TIME ZONE 'UTC')::date AS bucket_date, + platform, + group_id, + + COALESCE(SUM(success_count), 0) AS success_count, + COALESCE(SUM(error_count_total), 0) AS error_count_total, + COALESCE(SUM(business_limited_count), 0) AS business_limited_count, + COALESCE(SUM(error_count_sla), 0) AS error_count_sla, + COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529, + COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count, + COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count, + COALESCE(SUM(token_consumed), 0) AS token_consumed, + + -- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail). + ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms, + ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms, + MAX(duration_p95_ms) AS duration_p95_ms, + MAX(duration_p99_ms) AS duration_p99_ms, + SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms, + MAX(duration_max_ms) AS duration_max_ms, + + ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms, + ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms, + MAX(ttft_p95_ms) AS ttft_p95_ms, + MAX(ttft_p99_ms) AS ttft_p99_ms, + SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL) + / NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms, + MAX(ttft_max_ms) AS ttft_max_ms, + + NOW() +FROM ops_metrics_hourly +WHERE bucket_start >= $1 AND bucket_start < $2 +GROUP BY 1, 2, 3 +ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET + success_count = EXCLUDED.success_count, + error_count_total = EXCLUDED.error_count_total, + business_limited_count = EXCLUDED.business_limited_count, + error_count_sla = EXCLUDED.error_count_sla, + upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529, + upstream_429_count = EXCLUDED.upstream_429_count, + upstream_529_count = EXCLUDED.upstream_529_count, + token_consumed = EXCLUDED.token_consumed, + + duration_p50_ms = EXCLUDED.duration_p50_ms, + duration_p90_ms = EXCLUDED.duration_p90_ms, + duration_p95_ms = EXCLUDED.duration_p95_ms, + duration_p99_ms = EXCLUDED.duration_p99_ms, + duration_avg_ms = EXCLUDED.duration_avg_ms, + duration_max_ms = EXCLUDED.duration_max_ms, + + ttft_p50_ms = EXCLUDED.ttft_p50_ms, + ttft_p90_ms = EXCLUDED.ttft_p90_ms, + ttft_p95_ms = EXCLUDED.ttft_p95_ms, + ttft_p99_ms = EXCLUDED.ttft_p99_ms, + 
ttft_avg_ms = EXCLUDED.ttft_avg_ms, + ttft_max_ms = EXCLUDED.ttft_max_ms, + + computed_at = NOW() +` + + _, err := r.db.ExecContext(ctx, q, start, end) + return err +} + +func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + return value.Time.UTC(), true, nil +} + +func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) { + if r == nil || r.db == nil { + return time.Time{}, false, fmt.Errorf("nil ops repository") + } + + var value sql.NullTime + if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil { + return time.Time{}, false, err + } + if !value.Valid { + return time.Time{}, false, nil + } + t := value.Time.UTC() + return time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil +} + diff --git a/backend/internal/repository/ops_repo_request_details.go b/backend/internal/repository/ops_repo_request_details.go new file mode 100644 index 00000000..57b93b21 --- /dev/null +++ b/backend/internal/repository/ops_repo_request_details.go @@ -0,0 +1,285 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) { + if r == nil || r.db == nil { + return nil, 0, fmt.Errorf("nil ops repository") + } + + page, pageSize, startTime, endTime := filter.Normalize() + offset := (page - 1) * pageSize + + conditions := make([]string, 0, 16) + args := make([]any, 0, 24) + + // Placeholders $1/$2 reserved for time window inside the CTE. + args = append(args, startTime.UTC(), endTime.UTC()) + + addCondition := func(condition string, values ...any) { + conditions = append(conditions, condition) + args = append(args, values...) 
+ } + + if filter != nil { + if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" { + if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) { + return nil, 0, fmt.Errorf("invalid kind") + } + addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind) + } + + if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" { + addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform) + } + if filter.GroupID != nil && *filter.GroupID > 0 { + addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID) + } + + if filter.UserID != nil && *filter.UserID > 0 { + addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID) + } + if filter.APIKeyID != nil && *filter.APIKeyID > 0 { + addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID) + } + if filter.AccountID != nil && *filter.AccountID > 0 { + addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID) + } + + if model := strings.TrimSpace(filter.Model); model != "" { + addCondition(fmt.Sprintf("model = $%d", len(args)+1), model) + } + if requestID := strings.TrimSpace(filter.RequestID); requestID != "" { + addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID) + } + if q := strings.TrimSpace(filter.Query); q != "" { + like := "%" + strings.ToLower(q) + "%" + startIdx := len(args) + 1 + addCondition( + fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)", + startIdx, startIdx+1, startIdx+2, + ), + like, like, like, + ) + } + + if filter.MinDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs) + } + if filter.MaxDurationMs != nil { + addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs) + } + } + + where := "" + if len(conditions) > 0 { + where = "WHERE " + strings.Join(conditions, " AND ") + } + + cte := ` +WITH combined AS ( + SELECT + 'success'::TEXT AS kind, + ul.created_at AS created_at, + ul.request_id AS request_id, + COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + ul.model AS model, + ul.duration_ms AS duration_ms, + NULL::INT AS status_code, + NULL::BIGINT AS error_id, + NULL::TEXT AS phase, + NULL::TEXT AS severity, + NULL::TEXT AS message, + ul.user_id AS user_id, + ul.api_key_id AS api_key_id, + ul.account_id AS account_id, + ul.group_id AS group_id, + ul.stream AS stream + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + + UNION ALL + + SELECT + 'error'::TEXT AS kind, + o.created_at AS created_at, + COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id, + COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform, + o.model AS model, + o.duration_ms AS duration_ms, + o.status_code AS status_code, + o.id AS error_id, + o.error_phase AS phase, + o.severity AS severity, + o.error_message AS message, + o.user_id AS user_id, + o.api_key_id AS api_key_id, + o.account_id AS account_id, + o.group_id AS group_id, + o.stream AS stream + FROM ops_error_logs o + LEFT JOIN groups g ON g.id = o.group_id + LEFT JOIN accounts a ON a.id = o.account_id + WHERE o.created_at >= $1 AND o.created_at < $2 +) +` + + countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where) + var total int64 + 
if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil { + if err == sql.ErrNoRows { + total = 0 + } else { + return nil, 0, err + } + } + + sort := "ORDER BY created_at DESC" + if filter != nil { + switch strings.TrimSpace(strings.ToLower(filter.Sort)) { + case "", "created_at_desc": + // default + case "duration_desc": + sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC" + default: + return nil, 0, fmt.Errorf("invalid sort") + } + } + + listQuery := fmt.Sprintf(` +%s +SELECT + kind, + created_at, + request_id, + platform, + model, + duration_ms, + status_code, + error_id, + phase, + severity, + message, + user_id, + api_key_id, + account_id, + group_id, + stream +FROM combined +%s +%s +LIMIT $%d OFFSET $%d +`, cte, where, sort, len(args)+1, len(args)+2) + + listArgs := append(append([]any{}, args...), pageSize, offset) + rows, err := r.db.QueryContext(ctx, listQuery, listArgs...) + if err != nil { + return nil, 0, err + } + defer rows.Close() + + toIntPtr := func(v sql.NullInt64) *int { + if !v.Valid { + return nil + } + i := int(v.Int64) + return &i + } + toInt64Ptr := func(v sql.NullInt64) *int64 { + if !v.Valid { + return nil + } + i := v.Int64 + return &i + } + + out := make([]*service.OpsRequestDetail, 0, pageSize) + for rows.Next() { + var ( + kind string + createdAt time.Time + requestID sql.NullString + platform sql.NullString + model sql.NullString + + durationMs sql.NullInt64 + statusCode sql.NullInt64 + errorID sql.NullInt64 + + phase sql.NullString + severity sql.NullString + message sql.NullString + + userID sql.NullInt64 + apiKeyID sql.NullInt64 + accountID sql.NullInt64 + groupID sql.NullInt64 + + stream bool + ) + + if err := rows.Scan( + &kind, + &createdAt, + &requestID, + &platform, + &model, + &durationMs, + &statusCode, + &errorID, + &phase, + &severity, + &message, + &userID, + &apiKeyID, + &accountID, + &groupID, + &stream, + ); err != nil { + return nil, 0, err + } + + item := &service.OpsRequestDetail{ + Kind: service.OpsRequestKind(kind), + CreatedAt: createdAt, + RequestID: strings.TrimSpace(requestID.String), + Platform: strings.TrimSpace(platform.String), + Model: strings.TrimSpace(model.String), + + DurationMs: toIntPtr(durationMs), + StatusCode: toIntPtr(statusCode), + ErrorID: toInt64Ptr(errorID), + Phase: phase.String, + Severity: severity.String, + Message: message.String, + + UserID: toInt64Ptr(userID), + APIKeyID: toInt64Ptr(apiKeyID), + AccountID: toInt64Ptr(accountID), + GroupID: toInt64Ptr(groupID), + + Stream: stream, + } + + if item.Platform == "" { + item.Platform = "unknown" + } + + out = append(out, item) + } + if err := rows.Err(); err != nil { + return nil, 0, err + } + + return out, total, nil +} diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go new file mode 100644 index 00000000..5f32c5d1 --- /dev/null +++ b/backend/internal/repository/ops_repo_trends.go @@ -0,0 +1,567 @@ +package repository + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + 
+ if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + // Keep a small, predictable set of supported buckets for now. + bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + usageBucketExpr := opsBucketExprForUsage(bucketSeconds) + errorBucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +WITH usage_buckets AS ( + SELECT ` + usageBucketExpr + ` AS bucket, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT ` + errorBucketExpr + ` AS bucket, + COUNT(*) AS error_count + FROM ops_error_logs + ` + errorWhere + ` + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket +) +SELECT + bucket, + (success_count + error_count) AS request_count, + token_consumed +FROM combined +ORDER BY bucket ASC` + + args := append(usageArgs, errorArgs...) + + rows, err := r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + points := make([]*service.OpsThroughputTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&bucket, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + + denom := float64(bucketSeconds) + if denom <= 0 { + denom = 60 + } + qps := roundTo1DP(float64(requests) / denom) + tps := roundTo1DP(float64(tokenConsumed) / denom) + + points = append(points, &service.OpsThroughputTrendPoint{ + BucketStart: bucket.UTC(), + RequestCount: requests, + TokenConsumed: tokenConsumed, + QPS: qps, + TPS: tps, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Fill missing buckets with zeros so charts render continuous timelines. 
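+	//
+	// Example (assumed inputs): for start=10:00:00Z, end=10:05:00Z and bucketSeconds=60, the
+	// filler below emits the five buckets 10:00..10:04 even if the SQL above only returned rows
+	// for 10:01 and 10:03; the missing buckets carry zero RequestCount/TokenConsumed/QPS/TPS.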
+ points = fillOpsThroughputBuckets(start, end, bucketSeconds, points) + + var byPlatform []*service.OpsThroughputPlatformBreakdownItem + var topGroups []*service.OpsThroughputGroupBreakdownItem + + platform := "" + if filter != nil { + platform = strings.TrimSpace(strings.ToLower(filter.Platform)) + } + groupID := (*int64)(nil) + if filter != nil { + groupID = filter.GroupID + } + + // Drilldown helpers: + // - No platform/group: totals by platform + // - Platform selected but no group: top groups in that platform + if platform == "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputBreakdownByPlatform(ctx, start, end) + if err != nil { + return nil, err + } + byPlatform = items + } else if platform != "" && (groupID == nil || *groupID <= 0) { + items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10) + if err != nil { + return nil, err + } + topGroups = items + } + + return &service.OpsThroughputTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + + ByPlatform: byPlatform, + TopGroups: topGroups, + }, nil +} + +func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) { + q := ` +WITH usage_totals AS ( + SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + LEFT JOIN groups g ON g.id = ul.group_id + LEFT JOIN accounts a ON a.id = ul.account_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + GROUP BY 1 +), +error_totals AS ( + SELECT platform, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.platform, e.platform) AS platform, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.platform = e.platform +) +SELECT platform, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE platform IS NOT NULL AND platform <> '' +ORDER BY request_count DESC` + + rows, err := r.db.QueryContext(ctx, q, start, end) + if err != nil { + return nil, err + } + defer rows.Close() + + items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8) + for rows.Next() { + var platform string + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&platform, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + items = append(items, &service.OpsThroughputPlatformBreakdownItem{ + Platform: platform, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) { + if strings.TrimSpace(platform) == "" { + return nil, nil + } + if limit <= 0 || limit > 100 { + limit = 10 + } + + q := ` +WITH usage_totals AS ( + SELECT ul.group_id AS group_id, + g.name AS group_name, + COUNT(*) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed + FROM usage_logs ul + JOIN groups g ON g.id 
= ul.group_id + WHERE ul.created_at >= $1 AND ul.created_at < $2 + AND g.platform = $3 + GROUP BY 1, 2 +), +error_totals AS ( + SELECT group_id, + COUNT(*) AS error_count + FROM ops_error_logs + WHERE created_at >= $1 AND created_at < $2 + AND platform = $3 + AND group_id IS NOT NULL + GROUP BY 1 +), +combined AS ( + SELECT COALESCE(u.group_id, e.group_id) AS group_id, + COALESCE(u.group_name, g2.name, '') AS group_name, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.token_consumed, 0) AS token_consumed + FROM usage_totals u + FULL OUTER JOIN error_totals e ON u.group_id = e.group_id + LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id) +) +SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed +FROM combined +WHERE group_id IS NOT NULL +ORDER BY request_count DESC +LIMIT $4` + + rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit) + for rows.Next() { + var groupID int64 + var groupName sql.NullString + var requests int64 + var tokens sql.NullInt64 + if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil { + return nil, err + } + tokenConsumed := int64(0) + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + name := "" + if groupName.Valid { + name = groupName.String + } + items = append(items, &service.OpsThroughputGroupBreakdownItem{ + GroupID: groupID, + GroupName: name, + RequestCount: requests, + TokenConsumed: tokenConsumed, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +func opsBucketExprForUsage(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', ul.created_at)" + case 300: + // 5-minute buckets in UTC. 
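+		// Worked example (illustrative): a usage row created at 12:07:45Z has its epoch seconds
+		// floored to a multiple of 300, which maps it to the 12:05:00Z bucket; to_timestamp()
+		// keeps the result timezone-aware, matching the date_trunc() branches.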
+ return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)" + default: + return "date_trunc('minute', ul.created_at)" + } +} + +func opsBucketExprForError(bucketSeconds int) string { + switch bucketSeconds { + case 3600: + return "date_trunc('hour', created_at)" + case 300: + return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)" + default: + return "date_trunc('minute', created_at)" + } +} + +func opsBucketLabel(bucketSeconds int) string { + if bucketSeconds <= 0 { + return "1m" + } + if bucketSeconds%3600 == 0 { + h := bucketSeconds / 3600 + if h <= 0 { + h = 1 + } + return fmt.Sprintf("%dh", h) + } + m := bucketSeconds / 60 + if m <= 0 { + m = 1 + } + return fmt.Sprintf("%dm", m) +} + +func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time { + t = t.UTC() + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + secs := t.Unix() + floored := secs - (secs % int64(bucketSeconds)) + return time.Unix(floored, 0).UTC() +} + +func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsThroughputTrendPoint{ + BucketStart: cursor, + RequestCount: 0, + TokenConsumed: 0, + QPS: 0, + TPS: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 { + bucketSeconds = 60 + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + bucketExpr := opsBucketExprForError(bucketSeconds) + + q := ` +SELECT + ` + bucketExpr + ` AS bucket, + COUNT(*) AS error_total, + COUNT(*) FILTER (WHERE is_business_limited) AS business_limited, + COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_sla, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_excl, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429, + COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529 +FROM ops_error_logs +` + where + ` +GROUP BY 1 +ORDER BY 1 ASC` + + rows, err := 
r.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + points := make([]*service.OpsErrorTrendPoint, 0, 256) + for rows.Next() { + var bucket time.Time + var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64 + if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil { + return nil, err + } + points = append(points, &service.OpsErrorTrendPoint{ + BucketStart: bucket.UTC(), + + ErrorCountTotal: total, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: sla, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points) + + return &service.OpsErrorTrendResponse{ + Bucket: opsBucketLabel(bucketSeconds), + Points: points, + }, nil +} + +func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint { + if bucketSeconds <= 0 { + bucketSeconds = 60 + } + if !start.Before(end) { + return points + } + + endMinus := end.Add(-time.Nanosecond) + if endMinus.Before(start) { + return points + } + + first := opsFloorToBucketStart(start, bucketSeconds) + last := opsFloorToBucketStart(endMinus, bucketSeconds) + step := time.Duration(bucketSeconds) * time.Second + + existing := make(map[int64]*service.OpsErrorTrendPoint, len(points)) + for _, p := range points { + if p == nil { + continue + } + existing[p.BucketStart.UTC().Unix()] = p + } + + out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1) + for cursor := first; !cursor.After(last); cursor = cursor.Add(step) { + if p, ok := existing[cursor.Unix()]; ok && p != nil { + out = append(out, p) + continue + } + out = append(out, &service.OpsErrorTrendPoint{ + BucketStart: cursor, + + ErrorCountTotal: 0, + BusinessLimitedCount: 0, + ErrorCountSLA: 0, + + UpstreamErrorCountExcl429529: 0, + Upstream429Count: 0, + Upstream529Count: 0, + }) + } + return out +} + +func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + where, args, _ := buildErrorWhere(filter, start, end, 1) + + q := ` +SELECT + COALESCE(status_code, 0) AS status_code, + COUNT(*) AS total, + COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla, + COUNT(*) FILTER (WHERE is_business_limited) AS business_limited +FROM ops_error_logs +` + where + ` +GROUP BY 1 +ORDER BY total DESC +LIMIT 20` + + rows, err := r.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + defer rows.Close() + + items := make([]*service.OpsErrorDistributionItem, 0, 16) + var total int64 + for rows.Next() { + var statusCode int + var cntTotal, cntSLA, cntBiz int64 + if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil { + return nil, err + } + total += cntTotal + items = append(items, &service.OpsErrorDistributionItem{ + StatusCode: statusCode, + Total: cntTotal, + SLA: cntSLA, + BusinessLimited: cntBiz, + }) + } + if err := rows.Err(); err != nil { + return nil, err + } + + return &service.OpsErrorDistributionResponse{ + Total: total, + Items: items, + }, nil +} diff --git a/backend/internal/repository/ops_repo_window_stats.go b/backend/internal/repository/ops_repo_window_stats.go new file mode 100644 index 00000000..8221c473 --- /dev/null +++ b/backend/internal/repository/ops_repo_window_stats.go @@ -0,0 +1,50 @@ +package repository + +import ( + "context" + "fmt" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + if start.After(end) { + return nil, fmt.Errorf("start_time must be <= end_time") + } + // Bound excessively large windows to prevent accidental heavy queries. + if end.Sub(start) > 24*time.Hour { + return nil, fmt.Errorf("window too large") + } + + successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end) + if err != nil { + return nil, err + } + + return &service.OpsWindowStats{ + StartTime: start, + EndTime: end, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + TokenConsumed: tokenConsumed, + }, nil +} diff --git a/backend/internal/repository/wire.go b/backend/internal/repository/wire.go index f7574563..315bc1b6 100644 --- a/backend/internal/repository/wire.go +++ b/backend/internal/repository/wire.go @@ -35,6 +35,7 @@ var ProviderSet = wire.NewSet( NewRedeemCodeRepository, NewUsageLogRepository, NewSettingRepository, + NewOpsRepository, NewUserSubscriptionRepository, NewUserAttributeDefinitionRepository, NewUserAttributeValueRepository, From 5baa8b5673e77a5ebb3453094c8b13bef83e71f6 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:53:44 +0800 Subject: [PATCH 04/53] =?UTF-8?q?feat(service):=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E4=B8=9A=E5=8A=A1=E9=80=BB?= =?UTF-8?q?=E8=BE=91=E5=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops 主服务(ops_service.go)和端口定义(ops_port.go) - 实现账号可用性检查服务(ops_account_availability.go) - 实现数据聚合服务(ops_aggregation_service.go) - 实现告警评估服务(ops_alert_evaluator_service.go) - 实现告警管理服务(ops_alerts.go) - 实现数据清理服务(ops_cleanup_service.go) - 实现并发控制服务(ops_concurrency.go) - 实现仪表板服务(ops_dashboard.go) - 实现错误处理服务(ops_errors.go) - 实现直方图服务(ops_histograms.go) - 实现指标采集服务(ops_metrics_collector.go) - 实现查询模式服务(ops_query_mode.go) - 实现实时监控服务(ops_realtime.go) - 实现请求详情服务(ops_request_details.go) - 实现重试机制服务(ops_retry.go) - 
实现配置管理服务(ops_settings.go) - 实现趋势分析服务(ops_trends.go) - 实现窗口统计服务(ops_window_stats.go) - 添加 ops 相关领域常量 - 注册 service 依赖注入 --- backend/internal/service/domain_constants.go | 22 + .../service/ops_account_availability.go | 157 ++++ .../service/ops_aggregation_service.go | 434 +++++++++ .../service/ops_alert_evaluator_service.go | 839 +++++++++++++++++ backend/internal/service/ops_alerts.go | 162 ++++ .../internal/service/ops_cleanup_service.go | 361 ++++++++ backend/internal/service/ops_concurrency.go | 257 ++++++ backend/internal/service/ops_dashboard.go | 77 ++ backend/internal/service/ops_errors.go | 45 + backend/internal/service/ops_histograms.go | 26 + .../internal/service/ops_metrics_collector.go | 861 ++++++++++++++++++ backend/internal/service/ops_port.go | 226 +++++ backend/internal/service/ops_query_mode.go | 40 + backend/internal/service/ops_realtime.go | 36 + .../internal/service/ops_request_details.go | 152 ++++ backend/internal/service/ops_retry.go | 635 +++++++++++++ backend/internal/service/ops_service.go | 451 +++++++++ backend/internal/service/ops_settings.go | 354 +++++++ backend/internal/service/ops_trends.go | 27 + backend/internal/service/ops_window_stats.go | 24 + backend/internal/service/wire.go | 58 ++ 21 files changed, 5244 insertions(+) create mode 100644 backend/internal/service/ops_account_availability.go create mode 100644 backend/internal/service/ops_aggregation_service.go create mode 100644 backend/internal/service/ops_alert_evaluator_service.go create mode 100644 backend/internal/service/ops_alerts.go create mode 100644 backend/internal/service/ops_cleanup_service.go create mode 100644 backend/internal/service/ops_concurrency.go create mode 100644 backend/internal/service/ops_dashboard.go create mode 100644 backend/internal/service/ops_errors.go create mode 100644 backend/internal/service/ops_histograms.go create mode 100644 backend/internal/service/ops_metrics_collector.go create mode 100644 backend/internal/service/ops_port.go create mode 100644 backend/internal/service/ops_query_mode.go create mode 100644 backend/internal/service/ops_realtime.go create mode 100644 backend/internal/service/ops_request_details.go create mode 100644 backend/internal/service/ops_retry.go create mode 100644 backend/internal/service/ops_service.go create mode 100644 backend/internal/service/ops_settings.go create mode 100644 backend/internal/service/ops_trends.go create mode 100644 backend/internal/service/ops_window_stats.go diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go index 9c61ea2e..04f80dbe 100644 --- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -105,6 +105,28 @@ const ( // Request identity patch (Claude -> Gemini systemInstruction injection) SettingKeyEnableIdentityPatch = "enable_identity_patch" SettingKeyIdentityPatchPrompt = "identity_patch_prompt" + + // ========================= + // Ops Monitoring (vNext) + // ========================= + + // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime. + SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled" + + // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push). + SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled" + + // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg). 
+ SettingKeyOpsQueryModeDefault = "ops_query_mode_default" + + // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications. + SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config" + + // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings. + SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings" + + // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60). + SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds" ) // AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys). diff --git a/backend/internal/service/ops_account_availability.go b/backend/internal/service/ops_account_availability.go new file mode 100644 index 00000000..d0cbbe5c --- /dev/null +++ b/backend/internal/service/ops_account_availability.go @@ -0,0 +1,157 @@ +package service + +import ( + "context" + "time" +) + +// GetAccountAvailabilityStats returns current account availability stats. +// +// Query-level filtering is intentionally limited to platform/group to match the dashboard scope. +func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) ( + map[string]*PlatformAvailability, + map[int64]*GroupAvailability, + map[int64]*AccountAvailability, + *time.Time, + error, +) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + if groupIDFilter != nil && *groupIDFilter > 0 { + filtered := make([]Account, 0, len(accounts)) + for _, acc := range accounts { + for _, grp := range acc.Groups { + if grp != nil && grp.ID == *groupIDFilter { + filtered = append(filtered, acc) + break + } + } + } + accounts = filtered + } + + now := time.Now() + collectedAt := now + + platform := make(map[string]*PlatformAvailability) + group := make(map[int64]*GroupAvailability) + account := make(map[int64]*AccountAvailability) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + isTempUnsched := false + if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) { + isTempUnsched = true + } + + isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt) + isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil) + hasError := acc.Status == StatusError + + // Normalize exclusive status flags so the UI doesn't show conflicting badges. 
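+		//
+		// Precedence: an account in StatusError that also has a future RateLimitResetAt or
+		// OverloadUntil is reported only as an error; the rate-limit/overload flags are cleared
+		// so each account increments at most one of the error/rate-limit counters below.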
+ if hasError { + isRateLimited = false + isOverloaded = false + } + + isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched + + if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformAvailability{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.TotalAccounts++ + if isAvailable { + p.AvailableCount++ + } + if isRateLimited { + p.RateLimitCount++ + } + if hasError { + p.ErrorCount++ + } + } + + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupAvailability{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + g.TotalAccounts++ + if isAvailable { + g.AvailableCount++ + } + if isRateLimited { + g.RateLimitCount++ + } + if hasError { + g.ErrorCount++ + } + } + + displayGroupID := int64(0) + displayGroupName := "" + if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + item := &AccountAvailability{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + Status: acc.Status, + + IsAvailable: isAvailable, + IsRateLimited: isRateLimited, + IsOverloaded: isOverloaded, + HasError: hasError, + + ErrorMessage: acc.ErrorMessage, + } + + if isRateLimited && acc.RateLimitResetAt != nil { + item.RateLimitResetAt = acc.RateLimitResetAt + remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds()) + if remainingSec > 0 { + item.RateLimitRemainingSec = &remainingSec + } + } + if isOverloaded && acc.OverloadUntil != nil { + item.OverloadUntil = acc.OverloadUntil + remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds()) + if remainingSec > 0 { + item.OverloadRemainingSec = &remainingSec + } + } + if isTempUnsched && acc.TempUnschedulableUntil != nil { + item.TempUnschedulableUntil = acc.TempUnschedulableUntil + } + + account[acc.ID] = item + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go new file mode 100644 index 00000000..04dbb11b --- /dev/null +++ b/backend/internal/service/ops_aggregation_service.go @@ -0,0 +1,434 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAggHourlyJobName = "ops_preaggregation_hourly" + opsAggDailyJobName = "ops_preaggregation_daily" + + opsAggHourlyInterval = 10 * time.Minute + opsAggDailyInterval = 1 * time.Hour + + // Keep in sync with ops retention target (vNext default 30d). + opsAggBackfillWindow = 30 * 24 * time.Hour + + // Recompute overlap to absorb late-arriving rows near boundaries. + opsAggHourlyOverlap = 2 * time.Hour + opsAggDailyOverlap = 48 * time.Hour + + opsAggHourlyChunk = 24 * time.Hour + opsAggDailyChunk = 7 * 24 * time.Hour + + // Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets + // that may still receive late inserts. 
+ opsAggSafeDelay = 5 * time.Minute + + opsAggMaxQueryTimeout = 3 * time.Second + opsAggHourlyTimeout = 5 * time.Minute + opsAggDailyTimeout = 2 * time.Minute + + opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader" + opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader" + + opsAggHourlyLeaderLockTTL = 15 * time.Minute + opsAggDailyLeaderLockTTL = 10 * time.Minute +) + +// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily +// for stable long-window dashboard queries. +// +// It is safe to run in multi-replica deployments when Redis is available (leader lock). +type OpsAggregationService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + hourlyMu sync.Mutex + dailyMu sync.Mutex + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + return &OpsAggregationService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (s *OpsAggregationService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.hourlyLoop() + go s.dailyLoop() + }) +} + +func (s *OpsAggregationService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) +} + +func (s *OpsAggregationService) hourlyLoop() { + // First run immediately. + s.aggregateHourly() + + ticker := time.NewTicker(opsAggHourlyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateHourly() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) dailyLoop() { + // First run immediately. + s.aggregateDaily() + + ticker := time.NewTicker(opsAggDailyInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + s.aggregateDaily() + case <-s.stopCh: + return + } + } +} + +func (s *OpsAggregationService) aggregateHourly() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.hourlyMu.Lock() + defer s.hourlyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + // Aggregate stable full hours only. + end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay)) + start := end.Add(-opsAggBackfillWindow) + + // Resume from the latest bucket with overlap. 
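+	// e.g. if the newest hourly bucket is 08:00, the next run restarts from 06:00
+	// (2h overlap), so recent buckets that may have received late rows are
+	// recomputed and upserted again.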
+ { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggHourlyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToHour(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) { + chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end) + if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggHourlyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) aggregateDaily() { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil { + if !s.cfg.Ops.Enabled { + return + } + if !s.cfg.Ops.Aggregation.Enabled { + return + } + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout) + defer cancel() + + if !s.isMonitoringEnabled(ctx) { + return + } + + release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]") + if !ok { + return + } + if release != nil { + defer release() + } + + s.dailyMu.Lock() + defer s.dailyMu.Unlock() + + startedAt := time.Now().UTC() + runAt := startedAt + + end := utcFloorToDay(time.Now().UTC()) + start := end.Add(-opsAggBackfillWindow) + + { + ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout) + latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax) + cancelMax() + if err != nil { + log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err) + } else if ok { + candidate := latest.Add(-opsAggDailyOverlap) + if candidate.After(start) { + start = candidate + } + } + } + + start = utcFloorToDay(start) + if !start.Before(end) { + return + } + + var aggErr error + for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) { + chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end) + if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil { + aggErr = err + log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err) + break + } + } + + finishedAt := time.Now().UTC() + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + + if aggErr != nil { + msg := truncateString(aggErr.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := 
context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer hbCancel() + _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsAggDailyJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool { + if s == nil { + return false + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +var opsAggReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { + if s == nil || s.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Fail-open: do not block single-instance deployments. + return nil, true + } + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true +} + +func (s *OpsAggregationService) maybeLogSkip(prefix string) { + s.skipLogMu.Lock() + defer s.skipLogMu.Unlock() + + now := time.Now() + if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute { + return + } + s.skipLogAt = now + if prefix == "" { + prefix = "[OpsAggregation]" + } + log.Printf("%s leader lock held by another instance; skipping", prefix) +} + +func utcFloorToHour(t time.Time) time.Time { + return t.UTC().Truncate(time.Hour) +} + +func utcFloorToDay(t time.Time) time.Time { + u := t.UTC() + y, m, d := u.Date() + return time.Date(y, m, d, 0, 0, 0, 0, time.UTC) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go new file mode 100644 index 00000000..b970c720 --- /dev/null +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -0,0 +1,839 @@ +package service + +import ( + "context" + "fmt" + "log" + "math" + "strconv" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" +) + +const ( + opsAlertEvaluatorJobName = "ops_alert_evaluator" + + opsAlertEvaluatorTimeout = 45 * time.Second + opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTL = 90 * time.Second + opsAlertEvaluatorSkipLogInterval = 1 * time.Minute +) + +var opsAlertEvaluatorReleaseScript 
= redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +type OpsAlertEvaluatorService struct { + opsService *OpsService + opsRepo OpsRepository + emailService *EmailService + + redisClient *redis.Client + cfg *config.Config + instanceID string + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + wg sync.WaitGroup + + mu sync.Mutex + ruleStates map[int64]*opsAlertRuleState + + emailLimiter *slidingWindowLimiter + + skipLogMu sync.Mutex + skipLogAt time.Time + + warnNoRedisOnce sync.Once +} + +type opsAlertRuleState struct { + LastEvaluatedAt time.Time + ConsecutiveBreaches int +} + +func NewOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + return &OpsAlertEvaluatorService{ + opsService: opsService, + opsRepo: opsRepo, + emailService: emailService, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + ruleStates: map[int64]*opsAlertRuleState{}, + emailLimiter: newSlidingWindowLimiter(0, time.Hour), + } +} + +func (s *OpsAlertEvaluatorService) Start() { + if s == nil { + return + } + s.startOnce.Do(func() { + if s.stopCh == nil { + s.stopCh = make(chan struct{}) + } + go s.run() + }) +} + +func (s *OpsAlertEvaluatorService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.stopCh != nil { + close(s.stopCh) + } + }) + s.wg.Wait() +} + +func (s *OpsAlertEvaluatorService) run() { + s.wg.Add(1) + defer s.wg.Done() + + // Start immediately to produce early feedback in ops dashboard. + timer := time.NewTimer(0) + defer timer.Stop() + + for { + select { + case <-timer.C: + interval := s.getInterval() + s.evaluateOnce(interval) + timer.Reset(interval) + case <-s.stopCh: + return + } + } +} + +func (s *OpsAlertEvaluatorService) getInterval() time.Duration { + // Default. 
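+	// Values outside the accepted range (below 1s or above 24h) fall back to this default.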
+ interval := 60 * time.Second + + if s == nil || s.opsService == nil { + return interval + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx) + if err != nil || cfg == nil { + return interval + } + if cfg.EvaluationIntervalSeconds <= 0 { + return interval + } + if cfg.EvaluationIntervalSeconds < 1 { + return interval + } + if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) { + return interval + } + return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second +} + +func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout) + defer cancel() + + if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) { + return + } + + runtimeCfg := defaultOpsAlertRuntimeSettings() + if s.opsService != nil { + if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil { + runtimeCfg = loaded + } + } + + release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + rules, err := s.opsRepo.ListAlertRules(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsAlertEvaluator] list rules failed: %v", err) + return + } + + now := time.Now().UTC() + safeEnd := now.Truncate(time.Minute) + if safeEnd.IsZero() { + safeEnd = now + } + + systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1) + + // Cleanup stale state for removed rules. 
+ s.pruneRuleStates(rules) + + for _, rule := range rules { + if rule == nil || !rule.Enabled || rule.ID <= 0 { + continue + } + + scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters) + + windowMinutes := rule.WindowMinutes + if windowMinutes <= 0 { + windowMinutes = 1 + } + windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute) + windowEnd := safeEnd + + metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID) + if !ok { + s.resetRuleState(rule.ID, now) + continue + } + + breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold) + required := requiredSustainedBreaches(rule.SustainedMinutes, interval) + consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow) + + activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err) + continue + } + + if breachedNow && consecutive >= required { + if activeEvent != nil { + continue + } + + latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID) + if err != nil { + log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err) + continue + } + if latestEvent != nil && rule.CooldownMinutes > 0 { + cooldown := time.Duration(rule.CooldownMinutes) * time.Minute + if now.Sub(latestEvent.FiredAt) < cooldown { + continue + } + } + + firedEvent := &OpsAlertEvent{ + RuleID: rule.ID, + Severity: strings.TrimSpace(rule.Severity), + Status: OpsAlertStatusFiring, + Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)), + Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID), + MetricValue: float64Ptr(metricValue), + ThresholdValue: float64Ptr(rule.Threshold), + Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID), + FiredAt: now, + CreatedAt: now, + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent) + if err != nil { + log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err) + continue + } + + if created != nil && created.ID > 0 { + s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) + } + continue + } + + // Not breached: resolve active event if present. 
+ if activeEvent != nil { + resolvedAt := now + if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { + log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err) + } + } + } + + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) +} + +func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) { + s.mu.Lock() + defer s.mu.Unlock() + + live := map[int64]struct{}{} + for _, r := range rules { + if r != nil && r.ID > 0 { + live[r.ID] = struct{}{} + } + } + for id := range s.ruleStates { + if _, ok := live[id]; !ok { + delete(s.ruleStates, id) + } + } +} + +func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) { + if ruleID <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + state.LastEvaluatedAt = now + state.ConsecutiveBreaches = 0 +} + +func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int { + if ruleID <= 0 { + return 0 + } + s.mu.Lock() + defer s.mu.Unlock() + + state, ok := s.ruleStates[ruleID] + if !ok { + state = &opsAlertRuleState{} + s.ruleStates[ruleID] = state + } + + if !state.LastEvaluatedAt.IsZero() && interval > 0 { + if now.Sub(state.LastEvaluatedAt) > interval*2 { + state.ConsecutiveBreaches = 0 + } + } + + state.LastEvaluatedAt = now + if breached { + state.ConsecutiveBreaches++ + } else { + state.ConsecutiveBreaches = 0 + } + return state.ConsecutiveBreaches +} + +func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int { + if sustainedMinutes <= 0 { + return 1 + } + if interval <= 0 { + return sustainedMinutes + } + required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds())) + if required < 1 { + return 1 + } + return required +} + +func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) { + if filters == nil { + return "", nil + } + if v, ok := filters["platform"]; ok { + if s, ok := v.(string); ok { + platform = strings.TrimSpace(s) + } + } + if v, ok := filters["group_id"]; ok { + switch t := v.(type) { + case float64: + if t > 0 { + id := int64(t) + groupID = &id + } + case int64: + if t > 0 { + id := t + groupID = &id + } + case int: + if t > 0 { + id := int64(t) + groupID = &id + } + case string: + n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64) + if err == nil && n > 0 { + groupID = &n + } + } + } + return platform, groupID +} + +func (s *OpsAlertEvaluatorService) computeRuleMetric( + ctx context.Context, + rule *OpsAlertRule, + systemMetrics *OpsSystemMetricsSnapshot, + start time.Time, + end time.Time, + platform string, + groupID *int64, +) (float64, bool) { + if rule == nil { + return 0, false + } + switch strings.TrimSpace(rule.MetricType) { + case "cpu_usage_percent": + if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil { + return *systemMetrics.CPUUsagePercent, true + } + return 0, false + case "memory_usage_percent": + if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil { + return *systemMetrics.MemoryUsagePercent, true + } + return 0, false + case "concurrency_queue_depth": + if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil { + return float64(*systemMetrics.ConcurrencyQueueDepth), true + } + return 0, false + } + + overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{ + StartTime: start, 
+ EndTime: end, + Platform: platform, + GroupID: groupID, + QueryMode: OpsQueryModeRaw, + }) + if err != nil { + return 0, false + } + if overview == nil { + return 0, false + } + + switch strings.TrimSpace(rule.MetricType) { + case "success_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.SLA * 100, true + case "error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.ErrorRate * 100, true + case "upstream_error_rate": + if overview.RequestCountSLA <= 0 { + return 0, false + } + return overview.UpstreamErrorRate * 100, true + case "p95_latency_ms": + if overview.Duration.P95 == nil { + return 0, false + } + return float64(*overview.Duration.P95), true + case "p99_latency_ms": + if overview.Duration.P99 == nil { + return 0, false + } + return float64(*overview.Duration.P99), true + default: + return 0, false + } +} + +func compareMetric(value float64, operator string, threshold float64) bool { + switch strings.TrimSpace(operator) { + case ">": + return value > threshold + case ">=": + return value >= threshold + case "<": + return value < threshold + case "<=": + return value <= threshold + case "==": + return value == threshold + case "!=": + return value != threshold + default: + return false + } +} + +func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any { + dims := map[string]any{} + if strings.TrimSpace(platform) != "" { + dims["platform"] = strings.TrimSpace(platform) + } + if groupID != nil && *groupID > 0 { + dims["group_id"] = *groupID + } + if len(dims) == 0 { + return nil + } + return dims +} + +func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string { + if rule == nil { + return "" + } + scope := "overall" + if strings.TrimSpace(platform) != "" { + scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform)) + } + if groupID != nil && *groupID > 0 { + scope = fmt.Sprintf("%s group_id=%d", scope, *groupID) + } + if windowMinutes <= 0 { + windowMinutes = 1 + } + return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)", + strings.TrimSpace(rule.MetricType), + strings.TrimSpace(rule.Operator), + rule.Threshold, + value, + windowMinutes, + strings.TrimSpace(scope), + ) +} + +func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) { + if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil { + return + } + if event.EmailSent { + return + } + if !rule.NotifyEmail { + return + } + + emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) + if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled { + return + } + + if len(emailCfg.Alert.Recipients) == 0 { + return + } + if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) { + return + } + + if runtimeCfg != nil && runtimeCfg.Silencing.Enabled { + if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) { + return + } + } + + // Apply/update rate limiter. 
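+	// Note: a single limiter instance is shared across recipients, so
+	// Alert.RateLimitPerHour caps the total number of alert emails per hour,
+	// not the number of emails per recipient.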
+ s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour) + + subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)) + body := buildOpsAlertEmailBody(rule, event) + + anySent := false + for _, to := range emailCfg.Alert.Recipients { + addr := strings.TrimSpace(to) + if addr == "" { + continue + } + if !s.emailLimiter.Allow(time.Now().UTC()) { + continue + } + if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil { + // Ignore per-recipient failures; continue best-effort. + continue + } + anySent = true + } + + if anySent { + _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true) + } +} + +func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string { + if rule == nil || event == nil { + return "" + } + metric := strings.TrimSpace(rule.MetricType) + value := "-" + threshold := fmt.Sprintf("%.2f", rule.Threshold) + if event.MetricValue != nil { + value = fmt.Sprintf("%.2f", *event.MetricValue) + } + if event.ThresholdValue != nil { + threshold = fmt.Sprintf("%.2f", *event.ThresholdValue) + } + return fmt.Sprintf(` +

+<p>Ops Alert</p>
+<p>Rule: %s</p>
+<p>Severity: %s</p>
+<p>Status: %s</p>
+<p>Metric: %s %s %s</p>
+<p>Fired at: %s</p>
+<p>Description: %s</p>
+`, + htmlEscape(rule.Name), + htmlEscape(rule.Severity), + htmlEscape(event.Status), + htmlEscape(metric), + htmlEscape(rule.Operator), + htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)), + event.FiredAt.Format(time.RFC3339), + htmlEscape(event.Description), + ) +} + +func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool { + minSeverity = strings.ToLower(strings.TrimSpace(minSeverity)) + if minSeverity == "" { + return true + } + + eventLevel := opsEmailSeverityForOps(ruleSeverity) + minLevel := strings.ToLower(minSeverity) + + rank := func(level string) int { + switch level { + case "critical": + return 3 + case "warning": + return 2 + case "info": + return 1 + default: + return 0 + } + } + return rank(eventLevel) >= rank(minLevel) +} + +func opsEmailSeverityForOps(severity string) string { + switch strings.ToUpper(strings.TrimSpace(severity)) { + case "P0": + return "critical" + case "P1": + return "warning" + default: + return "info" + } +} + +func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool { + if !silencing.Enabled { + return false + } + if now.IsZero() { + now = time.Now().UTC() + } + if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" { + if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil { + if now.Before(t) { + return true + } + } + } + + for _, entry := range silencing.Entries { + untilRaw := strings.TrimSpace(entry.UntilRFC3339) + if untilRaw == "" { + continue + } + until, err := time.Parse(time.RFC3339, untilRaw) + if err != nil { + continue + } + if now.After(until) { + continue + } + if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID { + continue + } + if len(entry.Severities) > 0 { + match := false + for _, s := range entry.Severities { + if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) { + match = true + break + } + } + if !match { + continue + } + } + return true + } + + return false +} + +func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) { + if !lock.Enabled { + return nil, true + } + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock") + }) + return nil, true + } + key := strings.TrimSpace(lock.Key) + if key == "" { + key = opsAlertEvaluatorLeaderLockKey + } + ttl := time.Duration(lock.TTLSeconds) * time.Second + if ttl <= 0 { + ttl = opsAlertEvaluatorLeaderLockTTL + } + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + // Fail-open for single-node environments, but warn. 
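+		// If Redis errors here, evaluation proceeds without the lock; in multi-replica
+		// deployments this can briefly produce duplicate evaluations until Redis recovers.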
+		s.warnNoRedisOnce.Do(func() {
+			log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
+		})
+		return nil, true
+	}
+	if !ok {
+		s.maybeLogSkip(key)
+		return nil, false
+	}
+	return func() {
+		_, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
+	}, true
+}
+
+func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
+	s.skipLogMu.Lock()
+	defer s.skipLogMu.Unlock()
+
+	now := time.Now()
+	if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
+		return
+	}
+	s.skipLogAt = now
+	log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
+	if s == nil || s.opsRepo == nil {
+		return
+	}
+	now := time.Now().UTC()
+	durMs := duration.Milliseconds()
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+		JobName:        opsAlertEvaluatorJobName,
+		LastRunAt:      &runAt,
+		LastSuccessAt:  &now,
+		LastDurationMs: &durMs,
+	})
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
+	if s == nil || s.opsRepo == nil || err == nil {
+		return
+	}
+	now := time.Now().UTC()
+	durMs := duration.Milliseconds()
+	msg := truncateString(err.Error(), 2048)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+		JobName:        opsAlertEvaluatorJobName,
+		LastRunAt:      &runAt,
+		LastErrorAt:    &now,
+		LastError:      &msg,
+		LastDurationMs: &durMs,
+	})
+}
+
+func htmlEscape(s string) string {
+	replacer := strings.NewReplacer(
+		"&", "&amp;",
+		"<", "&lt;",
+		">", "&gt;",
+		`"`, "&quot;",
+		"'", "&#39;",
+	)
+	return replacer.Replace(s)
+}
+
+type slidingWindowLimiter struct {
+	mu     sync.Mutex
+	limit  int
+	window time.Duration
+	sent   []time.Time
+}
+
+func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
+	if window <= 0 {
+		window = time.Hour
+	}
+	return &slidingWindowLimiter{
+		limit:  limit,
+		window: window,
+		sent:   []time.Time{},
+	}
+}
+
+func (l *slidingWindowLimiter) SetLimit(limit int) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	l.limit = limit
+}
+
+func (l *slidingWindowLimiter) Allow(now time.Time) bool {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	if l.limit <= 0 {
+		return true
+	}
+	cutoff := now.Add(-l.window)
+	keep := l.sent[:0]
+	for _, t := range l.sent {
+		if t.After(cutoff) {
+			keep = append(keep, t)
+		}
+	}
+	l.sent = keep
+	if len(l.sent) >= l.limit {
+		return false
+	}
+	l.sent = append(l.sent, now)
+	return true
+}
diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go
new file mode 100644
index 00000000..b6c3d1c3
--- /dev/null
+++ b/backend/internal/service/ops_alerts.go
@@ -0,0 +1,162 @@
+package service
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"strings"
+	"time"
+
+	infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return nil, err
+	}
+	if s.opsRepo == nil {
+		return []*OpsAlertRule{}, nil
+	}
+	return s.opsRepo.ListAlertRules(ctx)
+}
+
+func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
+	if err :=
s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + created, err := s.opsRepo.CreateAlertRule(ctx, rule) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if rule == nil || rule.ID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule") + } + + updated, err := s.opsRepo.UpdateAlertRule(ctx, rule) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return nil, err + } + return updated, nil +} + +func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if id <= 0 { + return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found") + } + return err + } + return nil +} + +func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return []*OpsAlertEvent{}, nil + } + return s.opsRepo.ListAlertEvents(ctx, filter) +} + +func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetActiveAlertEvent(ctx, ruleID) +} + +func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if ruleID <= 0 { + return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id") + } + return s.opsRepo.GetLatestAlertEvent(ctx, ruleID) +} + +func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if event == nil { + return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event") + } + + created, err := s.opsRepo.CreateAlertEvent(ctx, event) + if err != nil { + return nil, err + } + return created, nil +} + +func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, 
resolvedAt *time.Time) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + if strings.TrimSpace(status) == "" { + return infraerrors.BadRequest("INVALID_STATUS", "invalid status") + } + return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) +} + +func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return err + } + if s.opsRepo == nil { + return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if eventID <= 0 { + return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id") + } + return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent) +} diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go new file mode 100644 index 00000000..ef825c04 --- /dev/null +++ b/backend/internal/service/ops_cleanup_service.go @@ -0,0 +1,361 @@ +package service + +import ( + "context" + "database/sql" + "fmt" + "log" + "strings" + "sync" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/robfig/cron/v3" +) + +const ( + opsCleanupJobName = "ops_cleanup" + + opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader" + opsCleanupLeaderLockTTLDefault = 30 * time.Minute +) + +var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) + +var opsCleanupReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth. +// +// - Scheduling: 5-field cron spec (minute hour dom month dow). +// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup. +// - Safety: deletes in batches to avoid long transactions. 
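+//
+// For example, with the defaults
+//
+//	ops:
+//	  cleanup:
+//	    enabled: true
+//	    schedule: "0 2 * * *"
+//
+// the cleanup runs once per day at 02:00 in the configured timezone.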
+type OpsCleanupService struct { + opsRepo OpsRepository + db *sql.DB + redisClient *redis.Client + cfg *config.Config + + instanceID string + + cron *cron.Cron + entryID cron.EntryID + + startOnce sync.Once + stopOnce sync.Once + + warnNoRedisOnce sync.Once +} + +func NewOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + return &OpsCleanupService{ + opsRepo: opsRepo, + db: db, + redisClient: redisClient, + cfg: cfg, + instanceID: uuid.NewString(), + } +} + +func (s *OpsCleanupService) Start() { + if s == nil { + return + } + if s.cfg != nil && !s.cfg.Ops.Enabled { + return + } + if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled { + log.Printf("[OpsCleanup] not started (disabled)") + return + } + if s.opsRepo == nil || s.db == nil { + log.Printf("[OpsCleanup] not started (missing deps)") + return + } + + s.startOnce.Do(func() { + schedule := "0 2 * * *" + if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" { + schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) + } + + loc := time.Local + if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" { + if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil { + loc = parsed + } + } + + c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc)) + id, err := c.AddFunc(schedule, func() { s.runScheduled() }) + if err != nil { + log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err) + return + } + s.cron = c + s.entryID = id + s.cron.Start() + log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String()) + }) +} + +func (s *OpsCleanupService) Stop() { + if s == nil { + return + } + s.stopOnce.Do(func() { + if s.cron != nil { + ctx := s.cron.Stop() + select { + case <-ctx.Done(): + case <-time.After(3 * time.Second): + log.Printf("[OpsCleanup] cron stop timed out") + } + } + }) +} + +func (s *OpsCleanupService) runScheduled() { + if s == nil || s.db == nil || s.opsRepo == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + + release, ok := s.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + runAt := startedAt + + counts, err := s.runCleanupOnce(ctx) + if err != nil { + s.recordHeartbeatError(runAt, time.Since(startedAt), err) + log.Printf("[OpsCleanup] cleanup failed: %v", err) + return + } + s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) + log.Printf("[OpsCleanup] cleanup complete: %s", counts) +} + +type opsCleanupDeletedCounts struct { + errorLogs int64 + retryAttempts int64 + alertEvents int64 + systemMetrics int64 + hourlyPreagg int64 + dailyPreagg int64 +} + +func (c opsCleanupDeletedCounts) String() string { + return fmt.Sprintf( + "error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d", + c.errorLogs, + c.retryAttempts, + c.alertEvents, + c.systemMetrics, + c.hourlyPreagg, + c.dailyPreagg, + ) +} + +func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) { + out := opsCleanupDeletedCounts{} + if s == nil || s.db == nil || s.cfg == nil { + return out, nil + } + + batchSize := 5000 + + now := time.Now().UTC() + + // Error-like tables: error logs / retry attempts / alert events. 
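+	// Each block below is guarded by days > 0, so a retention of 0 disables that target.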
+ if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.errorLogs = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.retryAttempts = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.alertEvents = n + } + + // Minute-level metrics snapshots. + if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.systemMetrics = n + } + + // Pre-aggregation tables (hourly/daily). + if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 { + cutoff := now.AddDate(0, 0, -days) + n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false) + if err != nil { + return out, err + } + out.hourlyPreagg = n + + n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true) + if err != nil { + return out, err + } + out.dailyPreagg = n + } + + return out, nil +} + +func deleteOldRowsByID( + ctx context.Context, + db *sql.DB, + table string, + timeColumn string, + cutoff time.Time, + batchSize int, + castCutoffToDate bool, +) (int64, error) { + if db == nil { + return 0, nil + } + if batchSize <= 0 { + batchSize = 5000 + } + + where := fmt.Sprintf("%s < $1", timeColumn) + if castCutoffToDate { + where = fmt.Sprintf("%s < $1::date", timeColumn) + } + + q := fmt.Sprintf(` +WITH batch AS ( + SELECT id FROM %s + WHERE %s + ORDER BY id + LIMIT $2 +) +DELETE FROM %s +WHERE id IN (SELECT id FROM batch) +`, table, where, table) + + var total int64 + for { + res, err := db.ExecContext(ctx, q, cutoff, batchSize) + if err != nil { + // If ops tables aren't present yet (partial deployments), treat as no-op. + if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") { + return total, nil + } + return total, err + } + affected, err := res.RowsAffected() + if err != nil { + return total, err + } + total += affected + if affected == 0 { + break + } + } + return total, nil +} + +func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if s == nil { + return nil, false + } + // In simple run mode, assume single instance. 
+ if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple { + return nil, true + } + + if s.redisClient == nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; running without distributed lock") + }) + return nil, true + } + + key := opsCleanupLeaderLockKeyDefault + ttl := opsCleanupLeaderLockTTLDefault + + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err != nil { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) + }) + return nil, true + } + if !ok { + return nil, false + } + + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true +} + +func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { + if s == nil || s.opsRepo == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastSuccessAt: &now, + LastDurationMs: &durMs, + }) +} + +func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) { + if s == nil || s.opsRepo == nil || err == nil { + return + } + now := time.Now().UTC() + durMs := duration.Milliseconds() + msg := truncateString(err.Error(), 2048) + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ + JobName: opsCleanupJobName, + LastRunAt: &runAt, + LastErrorAt: &now, + LastError: &msg, + LastDurationMs: &durMs, + }) +} diff --git a/backend/internal/service/ops_concurrency.go b/backend/internal/service/ops_concurrency.go new file mode 100644 index 00000000..c3b7b853 --- /dev/null +++ b/backend/internal/service/ops_concurrency.go @@ -0,0 +1,257 @@ +package service + +import ( + "context" + "log" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/pagination" +) + +const ( + opsAccountsPageSize = 100 + opsConcurrencyBatchChunkSize = 200 +) + +func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) { + if s == nil || s.accountRepo == nil { + return []Account{}, nil + } + + out := make([]Account, 0, 128) + page := 1 + for { + accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{ + Page: page, + PageSize: opsAccountsPageSize, + }, platformFilter, "", "", "") + if err != nil { + return nil, err + } + if len(accounts) == 0 { + break + } + + out = append(out, accounts...) + if pageInfo != nil && int64(len(out)) >= pageInfo.Total { + break + } + if len(accounts) < opsAccountsPageSize { + break + } + + page++ + if page > 10_000 { + log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter) + break + } + } + + return out, nil +} + +func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo { + if s == nil || s.concurrencyService == nil { + return map[int64]*AccountLoadInfo{} + } + if len(accounts) == 0 { + return map[int64]*AccountLoadInfo{} + } + + // De-duplicate IDs (and keep the max concurrency to avoid under-reporting). 
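+	// The paginated listing can return the same account more than once (e.g. when the
+	// underlying set shifts between pages), so duplicates are collapsed here.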
+ unique := make(map[int64]int, len(accounts)) + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev { + unique[acc.ID] = acc.Concurrency + } + } + + batch := make([]AccountWithConcurrency, 0, len(unique)) + for id, maxConc := range unique { + batch = append(batch, AccountWithConcurrency{ + ID: id, + MaxConcurrency: maxConc, + }) + } + + out := make(map[int64]*AccountLoadInfo, len(batch)) + for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize { + end := i + opsConcurrencyBatchChunkSize + if end > len(batch) { + end = len(batch) + } + part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end]) + if err != nil { + // Best-effort: return zeros rather than failing the ops UI. + log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err) + continue + } + for k, v := range part { + out[k] = v + } + } + + return out +} + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// +// Optional filters: +// - platformFilter: only include accounts in that platform (best-effort reduces DB load) +// - groupIDFilter: only include accounts that belong to that group +func (s *OpsService) GetConcurrencyStats( + ctx context.Context, + platformFilter string, + groupIDFilter *int64, +) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, nil, nil, nil, err + } + + accounts, err := s.listAllAccountsForOps(ctx, platformFilter) + if err != nil { + return nil, nil, nil, nil, err + } + + collectedAt := time.Now() + loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts) + + platform := make(map[string]*PlatformConcurrencyInfo) + group := make(map[int64]*GroupConcurrencyInfo) + account := make(map[int64]*AccountConcurrencyInfo) + + for _, acc := range accounts { + if acc.ID <= 0 { + continue + } + + var matchedGroup *Group + if groupIDFilter != nil && *groupIDFilter > 0 { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if grp.ID == *groupIDFilter { + matchedGroup = grp + break + } + } + // Group filter provided: skip accounts not in that group. + if matchedGroup == nil { + continue + } + } + + load := loadMap[acc.ID] + currentInUse := int64(0) + waiting := int64(0) + if load != nil { + currentInUse = int64(load.CurrentConcurrency) + waiting = int64(load.WaitingCount) + } + + // Account-level view picks one display group (the first group). + displayGroupID := int64(0) + displayGroupName := "" + if matchedGroup != nil { + displayGroupID = matchedGroup.ID + displayGroupName = matchedGroup.Name + } else if len(acc.Groups) > 0 && acc.Groups[0] != nil { + displayGroupID = acc.Groups[0].ID + displayGroupName = acc.Groups[0].Name + } + + if _, ok := account[acc.ID]; !ok { + info := &AccountConcurrencyInfo{ + AccountID: acc.ID, + AccountName: acc.Name, + Platform: acc.Platform, + GroupID: displayGroupID, + GroupName: displayGroupName, + CurrentInUse: currentInUse, + MaxCapacity: int64(acc.Concurrency), + WaitingInQueue: waiting, + } + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + account[acc.ID] = info + } + + // Platform aggregation. 
+ if acc.Platform != "" { + if _, ok := platform[acc.Platform]; !ok { + platform[acc.Platform] = &PlatformConcurrencyInfo{ + Platform: acc.Platform, + } + } + p := platform[acc.Platform] + p.MaxCapacity += int64(acc.Concurrency) + p.CurrentInUse += currentInUse + p.WaitingInQueue += waiting + } + + // Group aggregation (one account may contribute to multiple groups). + if matchedGroup != nil { + grp := matchedGroup + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } else { + for _, grp := range acc.Groups { + if grp == nil || grp.ID <= 0 { + continue + } + if _, ok := group[grp.ID]; !ok { + group[grp.ID] = &GroupConcurrencyInfo{ + GroupID: grp.ID, + GroupName: grp.Name, + Platform: grp.Platform, + } + } + g := group[grp.ID] + if g.GroupName == "" && grp.Name != "" { + g.GroupName = grp.Name + } + if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform { + // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels. + g.Platform = "" + } + g.MaxCapacity += int64(acc.Concurrency) + g.CurrentInUse += currentInUse + g.WaitingInQueue += waiting + } + } + } + + for _, info := range platform { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + for _, info := range group { + if info.MaxCapacity > 0 { + info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100 + } + } + + return platform, group, account, &collectedAt, nil +} diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go new file mode 100644 index 00000000..23d6d82f --- /dev/null +++ b/backend/internal/service/ops_dashboard.go @@ -0,0 +1,77 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "log" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + + // Resolve query mode (requested via query param, or DB default). 
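+	// e.g. an explicitly requested preagg mode is honored even when
+	// ops.use_preaggregated_tables is false, while auto degrades to raw in that case.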
+ filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode) + + overview, err := s.opsRepo.GetDashboardOverview(ctx, filter) + if err != nil { + if errors.Is(err, ErrOpsPreaggregatedNotPopulated) { + return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet") + } + return nil, err + } + + // Best-effort system health + jobs; dashboard metrics should still render if these are missing. + if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + overview.SystemMetrics = metrics + } else if err != nil && !errors.Is(err, sql.ErrNoRows) { + log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) + } + + if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil { + overview.JobHeartbeats = heartbeats + } else { + log.Printf("[Ops] ListJobHeartbeats failed: %v", err) + } + + return overview, nil +} + +func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode { + if requested.IsValid() { + // Allow "auto" to be disabled via config until preagg is proven stable in production. + // Forced `preagg` via query param still works. + if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return requested + } + + mode := OpsQueryModeAuto + if s != nil && s.settingRepo != nil { + if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil { + mode = ParseOpsQueryMode(raw) + } + } + + if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables { + return OpsQueryModeRaw + } + return mode +} diff --git a/backend/internal/service/ops_errors.go b/backend/internal/service/ops_errors.go new file mode 100644 index 00000000..76b5ce8b --- /dev/null +++ b/backend/internal/service/ops_errors.go @@ -0,0 +1,45 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds) +} + +func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be 
<= end_time") + } + return s.opsRepo.GetErrorDistribution(ctx, filter) +} diff --git a/backend/internal/service/ops_histograms.go b/backend/internal/service/ops_histograms.go new file mode 100644 index 00000000..9f5b514f --- /dev/null +++ b/backend/internal/service/ops_histograms.go @@ -0,0 +1,26 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetLatencyHistogram(ctx, filter) +} diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go new file mode 100644 index 00000000..cd90e1bd --- /dev/null +++ b/backend/internal/service/ops_metrics_collector.go @@ -0,0 +1,861 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "fmt" + "hash/fnv" + "log" + "math" + "os" + "runtime" + "strconv" + "strings" + "sync" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/config" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" +) + +const ( + opsMetricsCollectorJobName = "ops_metrics_collector" + opsMetricsCollectorMinInterval = 60 * time.Second + opsMetricsCollectorMaxInterval = 1 * time.Hour + + opsMetricsCollectorTimeout = 10 * time.Second + + opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader" + opsMetricsCollectorLeaderLockTTL = 90 * time.Second + + opsMetricsCollectorHeartbeatTimeout = 2 * time.Second + + bytesPerMB = 1024 * 1024 +) + +var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey) + +type OpsMetricsCollector struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + db *sql.DB + redisClient *redis.Client + instanceID string + + lastCgroupCPUUsageNanos uint64 + lastCgroupCPUSampleAt time.Time + + stopCh chan struct{} + startOnce sync.Once + stopOnce sync.Once + + skipLogMu sync.Mutex + skipLogAt time.Time +} + +func NewOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + return &OpsMetricsCollector{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + db: db, + redisClient: redisClient, + instanceID: uuid.NewString(), + } +} + +func (c *OpsMetricsCollector) Start() { + if c == nil { + return + } + c.startOnce.Do(func() { + if c.stopCh == nil { + c.stopCh = make(chan struct{}) + } + go c.run() + }) +} + +func (c *OpsMetricsCollector) Stop() { + if c == nil { + return + } + c.stopOnce.Do(func() { + if c.stopCh != nil { + close(c.stopCh) + } + }) +} + +func (c *OpsMetricsCollector) run() { + // First run immediately so the dashboard has data soon after startup. 
+ c.collectOnce() + + for { + interval := c.getInterval() + timer := time.NewTimer(interval) + select { + case <-timer.C: + c.collectOnce() + case <-c.stopCh: + timer.Stop() + return + } + } +} + +func (c *OpsMetricsCollector) getInterval() time.Duration { + interval := opsMetricsCollectorMinInterval + + if c.settingRepo == nil { + return interval + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds) + if err != nil { + return interval + } + raw = strings.TrimSpace(raw) + if raw == "" { + return interval + } + + seconds, err := strconv.Atoi(raw) + if err != nil { + return interval + } + if seconds < int(opsMetricsCollectorMinInterval.Seconds()) { + seconds = int(opsMetricsCollectorMinInterval.Seconds()) + } + if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) { + seconds = int(opsMetricsCollectorMaxInterval.Seconds()) + } + return time.Duration(seconds) * time.Second +} + +func (c *OpsMetricsCollector) collectOnce() { + if c == nil { + return + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return + } + if c.opsRepo == nil { + return + } + if c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout) + defer cancel() + + if !c.isMonitoringEnabled(ctx) { + return + } + + release, ok := c.tryAcquireLeaderLock(ctx) + if !ok { + return + } + if release != nil { + defer release() + } + + startedAt := time.Now().UTC() + err := c.collectAndPersist(ctx) + finishedAt := time.Now().UTC() + + durationMs := finishedAt.Sub(startedAt).Milliseconds() + dur := durationMs + runAt := startedAt + + if err != nil { + msg := truncateString(err.Error(), 2048) + errAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastErrorAt: &errAt, + LastError: &msg, + LastDurationMs: &dur, + }) + log.Printf("[OpsMetricsCollector] collect failed: %v", err) + return + } + + successAt := finishedAt + hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout) + defer hbCancel() + _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ + JobName: opsMetricsCollectorJobName, + LastRunAt: &runAt, + LastSuccessAt: &successAt, + LastDurationMs: &dur, + }) +} + +func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool { + if c == nil { + return false + } + if c.cfg != nil && !c.cfg.Ops.Enabled { + return false + } + if c.settingRepo == nil { + return true + } + if ctx == nil { + ctx = context.Background() + } + + value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + return true + } + // Fail-open: collector should not become a hard dependency. + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { + if ctx == nil { + ctx = context.Background() + } + + // Align to stable minute boundaries to avoid partial buckets and to maximize cache hits. 
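+ // Illustrative example: a run at 12:03:27Z computes windowEnd = 12:03:00Z and
+ // windowStart = 12:02:00Z, so any run within the same wall-clock minute measures the
+ // exact same [windowStart, windowEnd) bucket.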
+ now := time.Now().UTC() + windowEnd := now.Truncate(time.Minute) + windowStart := windowEnd.Add(-1 * time.Minute) + + sys, err := c.collectSystemStats(ctx) + if err != nil { + // Continue; system stats are best-effort. + log.Printf("[OpsMetricsCollector] system stats error: %v", err) + } + + dbOK := c.checkDB(ctx) + redisOK := c.checkRedis(ctx) + active, idle := c.dbPoolStats() + + successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage counts: %w", err) + } + + duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query usage latency: %w", err) + } + + errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd) + if err != nil { + return fmt.Errorf("query error counts: %w", err) + } + + windowSeconds := windowEnd.Sub(windowStart).Seconds() + if windowSeconds <= 0 { + windowSeconds = 60 + } + requestTotal := successCount + errorTotal + qps := float64(requestTotal) / windowSeconds + tps := float64(tokenConsumed) / windowSeconds + + goroutines := runtime.NumGoroutine() + + input := &OpsInsertSystemMetricsInput{ + CreatedAt: windowEnd, + WindowMinutes: 1, + + SuccessCount: successCount, + ErrorCountTotal: errorTotal, + BusinessLimitedCount: businessLimited, + ErrorCountSLA: errorSLA, + + UpstreamErrorCountExcl429529: upstreamExcl, + Upstream429Count: upstream429, + Upstream529Count: upstream529, + + TokenConsumed: tokenConsumed, + QPS: float64Ptr(roundTo1DP(qps)), + TPS: float64Ptr(roundTo1DP(tps)), + + DurationP50Ms: duration.p50, + DurationP90Ms: duration.p90, + DurationP95Ms: duration.p95, + DurationP99Ms: duration.p99, + DurationAvgMs: duration.avg, + DurationMaxMs: duration.max, + + TTFTP50Ms: ttft.p50, + TTFTP90Ms: ttft.p90, + TTFTP95Ms: ttft.p95, + TTFTP99Ms: ttft.p99, + TTFTAvgMs: ttft.avg, + TTFTMaxMs: ttft.max, + + CPUUsagePercent: sys.cpuUsagePercent, + MemoryUsedMB: sys.memoryUsedMB, + MemoryTotalMB: sys.memoryTotalMB, + MemoryUsagePercent: sys.memoryUsagePercent, + + DBOK: boolPtr(dbOK), + RedisOK: boolPtr(redisOK), + + DBConnActive: intPtr(active), + DBConnIdle: intPtr(idle), + GoroutineCount: intPtr(goroutines), + } + + return c.opsRepo.InsertSystemMetrics(ctx, input) +} + +type opsCollectedPercentiles struct { + p50 *int + p90 *int + p95 *int + p99 *int + avg *float64 + max *int +} + +func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2` + + var tokens sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil { + return 0, 0, err + } + if tokens.Valid { + tokenConsumed = tokens.Int64 + } + return successCount, tokenConsumed, nil +} + +func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) { + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99, + 
AVG(duration_ms) AS avg_ms, + MAX(duration_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND duration_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + duration.p50 = floatToIntPtr(p50) + duration.p90 = floatToIntPtr(p90) + duration.p95 = floatToIntPtr(p95) + duration.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + duration.avg = &v + } + if max.Valid { + v := int(max.Int64) + duration.max = &v + } + } + + { + q := ` +SELECT + percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50, + percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90, + percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95, + percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99, + AVG(first_token_ms) AS avg_ms, + MAX(first_token_ms) AS max_ms +FROM usage_logs +WHERE created_at >= $1 AND created_at < $2 + AND first_token_ms IS NOT NULL` + + var p50, p90, p95, p99 sql.NullFloat64 + var avg sql.NullFloat64 + var max sql.NullInt64 + if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil { + return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err + } + ttft.p50 = floatToIntPtr(p50) + ttft.p90 = floatToIntPtr(p90) + ttft.p95 = floatToIntPtr(p95) + ttft.p99 = floatToIntPtr(p99) + if avg.Valid { + v := roundTo1DP(avg.Float64) + ttft.avg = &v + } + if max.Valid { + v := int(max.Int64) + ttft.max = &v + } + } + + return duration, ttft, nil +} + +func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) ( + errorTotal int64, + businessLimited int64, + errorSLA int64, + upstreamExcl429529 int64, + upstream429 int64, + upstream529 int64, + err error, +) { + q := ` +SELECT + COALESCE(COUNT(*), 0) AS error_total, + COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited, + COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429, + COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529 +FROM ops_error_logs +WHERE created_at >= $1 AND created_at < $2` + + if err := c.db.QueryRowContext(ctx, q, start, end).Scan( + &errorTotal, + &businessLimited, + &errorSLA, + &upstreamExcl429529, + &upstream429, + &upstream529, + ); err != nil { + return 0, 0, 0, 0, 0, 0, err + } + return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil +} + +type opsCollectedSystemStats struct { + cpuUsagePercent *float64 + memoryUsedMB *int64 + memoryTotalMB *int64 + memoryUsagePercent *float64 +} + +func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) { + out := &opsCollectedSystemStats{} + if ctx == nil { + ctx = context.Background() + } + + sampleAt := time.Now().UTC() + + // Prefer cgroup (container) metrics when available. 
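+ // tryCgroupCPUPercent works on deltas between two consecutive cumulative samples:
+ // pct = delta_usage_seconds / (delta_wall_seconds * limit_cores) * 100, clamped to [0, 100].
+ // The first call only records a baseline and reports nothing.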
+ if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil { + out.cpuUsagePercent = cpuPct + } + + cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes() + if cgroupOK { + usedMB := int64(cgroupUsed / bytesPerMB) + out.memoryUsedMB = &usedMB + if cgroupTotal > 0 { + totalMB := int64(cgroupTotal / bytesPerMB) + out.memoryTotalMB = &totalMB + pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100) + out.memoryUsagePercent = &pct + } + } + + // Fallback to host metrics if cgroup metrics are unavailable (or incomplete). + if out.cpuUsagePercent == nil { + if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 { + v := roundTo1DP(cpuPercents[0]) + out.cpuUsagePercent = &v + } + } + + // If total memory isn't available from cgroup (e.g. memory.max = "max"), fill total from host. + if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil { + if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil { + if out.memoryUsedMB == nil { + usedMB := int64(vm.Used / bytesPerMB) + out.memoryUsedMB = &usedMB + } + if out.memoryTotalMB == nil { + totalMB := int64(vm.Total / bytesPerMB) + out.memoryTotalMB = &totalMB + } + if out.memoryUsagePercent == nil { + if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 { + pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100) + out.memoryUsagePercent = &pct + } else { + pct := roundTo1DP(vm.UsedPercent) + out.memoryUsagePercent = &pct + } + } + } + } + + return out, nil +} + +func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 { + usageNanos, ok := readCgroupCPUUsageNanos() + if !ok { + return nil + } + + // Initialize baseline sample. + if c.lastCgroupCPUSampleAt.IsZero() { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + elapsed := now.Sub(c.lastCgroupCPUSampleAt) + if elapsed <= 0 { + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + return nil + } + + prev := c.lastCgroupCPUUsageNanos + c.lastCgroupCPUUsageNanos = usageNanos + c.lastCgroupCPUSampleAt = now + + if usageNanos < prev { + // Counter reset (container restarted). + return nil + } + + deltaUsageSec := float64(usageNanos-prev) / 1e9 + elapsedSec := elapsed.Seconds() + if elapsedSec <= 0 { + return nil + } + + cores := readCgroupCPULimitCores() + if cores <= 0 { + // Can't reliably normalize; skip and fall back to gopsutil. + return nil + } + + pct := (deltaUsageSec / (elapsedSec * cores)) * 100 + if pct < 0 { + pct = 0 + } + // Clamp to avoid noise/jitter showing impossible values. + if pct > 100 { + pct = 100 + } + v := roundTo1DP(pct) + return &v +} + +func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) { + // cgroup v2 (most common in modern containers) + if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 { + usedBytes = used + rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max") + if err == nil { + s := strings.TrimSpace(string(rawMax)) + if s != "" && s != "max" { + if v, err := strconv.ParseUint(s, 10, 64); err == nil { + totalBytes = v + } + } + } + return usedBytes, totalBytes, true + } + + // cgroup v1 fallback + if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 { + usedBytes = used + if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 { + // Some environments report a very large number when unlimited. 
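+ // (Typically a value close to math.MaxInt64 rounded down to the page size, e.g.
+ // 9223372036854771712 on 4 KiB pages; the 1<<60 guard below treats anything that large as
+ // "no limit" and leaves totalBytes at 0.)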
+ if limit > 0 && limit < (1<<60) { + totalBytes = limit + } + } + return usedBytes, totalBytes, true + } + + return 0, 0, false +} + +func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) { + // cgroup v2: cpu.stat has usage_usec + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil { + lines := strings.Split(string(raw), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) != 2 { + continue + } + if fields[0] != "usage_usec" { + continue + } + v, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + continue + } + return v * 1000, true + } + } + + // cgroup v1: cpuacct.usage is in nanoseconds + if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok { + return v, true + } + + return 0, false +} + +func readCgroupCPULimitCores() float64 { + // cgroup v2: cpu.max => " " or "max " + if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil { + fields := strings.Fields(string(raw)) + if len(fields) >= 2 && fields[0] != "max" { + quota, err1 := strconv.ParseFloat(fields[0], 64) + period, err2 := strconv.ParseFloat(fields[1], 64) + if err1 == nil && err2 == nil && quota > 0 && period > 0 { + return quota / period + } + } + } + + // cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us + quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + if okQuota && okPeriod && quota > 0 && period > 0 { + return float64(quota) / float64(period) + } + + return 0 +} + +func readUintFile(path string) (uint64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func readIntFile(path string) (int64, bool) { + raw, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(raw)) + if s == "" { + return 0, false + } + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, false + } + return v, true +} + +func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool { + if c == nil || c.db == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + var one int + if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil { + return false + } + return one == 1 +} + +func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { + if c == nil || c.redisClient == nil { + return false + } + if ctx == nil { + ctx = context.Background() + } + return c.redisClient.Ping(ctx).Err() == nil +} + +func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { + if c == nil || c.db == nil { + return 0, 0 + } + stats := c.db.Stats() + return stats.InUse, stats.Idle +} + +var opsMetricsCollectorReleaseScript = redis.NewScript(` +if redis.call("GET", KEYS[1]) == ARGV[1] then + return redis.call("DEL", KEYS[1]) +end +return 0 +`) + +func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) { + if c == nil || c.redisClient == nil { + return nil, true + } + if ctx == nil { + ctx = context.Background() + } + + ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result() + if err != nil { + // Prefer fail-closed to avoid stampeding the database when Redis is flaky. + // Fallback to a DB advisory lock when Redis is present but unavailable. 
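+ // The advisory lock ID is derived from the same leader-lock key via a stable FNV-64a hash
+ // (see hashAdvisoryLockID), so the Redis lock and the Postgres advisory lock elect a leader
+ // for the same logical job.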
+ release, ok := c.tryAcquireDBAdvisoryLock(ctx) + if !ok { + c.maybeLogSkip() + return nil, false + } + return release, true + } + if !ok { + c.maybeLogSkip() + return nil, false + } + + release := func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result() + } + return release, true +} + +func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) { + if c == nil || c.db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := c.db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID) + _ = conn.Close() + } + return release, true +} + +func (c *OpsMetricsCollector) maybeLogSkip() { + c.skipLogMu.Lock() + defer c.skipLogMu.Unlock() + + now := time.Now() + if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute { + return + } + c.skipLogAt = now + log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping") +} + +func floatToIntPtr(v sql.NullFloat64) *int { + if !v.Valid { + return nil + } + n := int(math.Round(v.Float64)) + return &n +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func boolPtr(v bool) *bool { + out := v + return &out +} + +func intPtr(v int) *int { + out := v + return &out +} + +func float64Ptr(v float64) *float64 { + out := v + return &out +} + +func hashAdvisoryLockID(s string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(s)) + return int64(h.Sum64()) +} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go new file mode 100644 index 00000000..a3d847e0 --- /dev/null +++ b/backend/internal/service/ops_port.go @@ -0,0 +1,226 @@ +package service + +import ( + "context" + "time" +) + +type OpsRepository interface { + InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error) + ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) + GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) + ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error) + + InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error) + UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error + GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error) + + // Lightweight window stats (for realtime WS / quick sampling). 
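+ // OpsWindowStats (defined below) only carries aggregate counters for the requested range
+ // (success, error and token totals), which is what makes it suitable for frequent sampling.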
+ GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) + + GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) + GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) + GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) + GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) + GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) + + InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error + GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error) + + UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error + ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error) + + // Alerts (rules + events) + ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) + CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error) + DeleteAlertRule(ctx context.Context, id int64) error + + ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) + GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) + CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) + UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error + UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error + + // Pre-aggregation (hourly/daily) used for long-window dashboard performance. + UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error + UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error + GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) + GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) +} + +type OpsInsertErrorLogInput struct { + RequestID string + ClientRequestID string + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + GroupID *int64 + ClientIP *string + + Platform string + Model string + RequestPath string + Stream bool + UserAgent string + + ErrorPhase string + ErrorType string + Severity string + StatusCode int + IsBusinessLimited bool + + ErrorMessage string + ErrorBody string + + ErrorSource string + ErrorOwner string + + UpstreamStatusCode *int + UpstreamErrorMessage *string + UpstreamErrorDetail *string + + DurationMs *int + TimeToFirstTokenMs *int64 + + RequestBodyJSON *string // sanitized json string (not raw bytes) + RequestBodyTruncated bool + RequestBodyBytes *int + RequestHeadersJSON *string // optional json string + + IsRetryable bool + RetryCount int + + CreatedAt time.Time +} + +type OpsInsertRetryAttemptInput struct { + RequestedByUserID int64 + SourceErrorID int64 + Mode string + PinnedAccountID *int64 + + // running|queued etc. 
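+ // The retry flow in this patch only ever writes "running", "succeeded" or "failed"
+ // (the opsRetryStatus* constants); "queued" is merely tolerated when checking whether an
+ // attempt is already in flight.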
+ Status string + StartedAt time.Time +} + +type OpsUpdateRetryAttemptInput struct { + ID int64 + + // succeeded|failed + Status string + FinishedAt time.Time + DurationMs int64 + + // Optional correlation + ResultRequestID *string + ResultErrorID *int64 + + ErrorMessage *string +} + +type OpsInsertSystemMetricsInput struct { + CreatedAt time.Time + WindowMinutes int + + Platform *string + GroupID *int64 + + SuccessCount int64 + ErrorCountTotal int64 + BusinessLimitedCount int64 + ErrorCountSLA int64 + + UpstreamErrorCountExcl429529 int64 + Upstream429Count int64 + Upstream529Count int64 + + TokenConsumed int64 + + QPS *float64 + TPS *float64 + + DurationP50Ms *int + DurationP90Ms *int + DurationP95Ms *int + DurationP99Ms *int + DurationAvgMs *float64 + DurationMaxMs *int + + TTFTP50Ms *int + TTFTP90Ms *int + TTFTP95Ms *int + TTFTP99Ms *int + TTFTAvgMs *float64 + TTFTMaxMs *int + + CPUUsagePercent *float64 + MemoryUsedMB *int64 + MemoryTotalMB *int64 + MemoryUsagePercent *float64 + + DBOK *bool + RedisOK *bool + + DBConnActive *int + DBConnIdle *int + DBConnWaiting *int + + GoroutineCount *int + ConcurrencyQueueDepth *int +} + +type OpsSystemMetricsSnapshot struct { + ID int64 `json:"id"` + CreatedAt time.Time `json:"created_at"` + WindowMinutes int `json:"window_minutes"` + + CPUUsagePercent *float64 `json:"cpu_usage_percent"` + MemoryUsedMB *int64 `json:"memory_used_mb"` + MemoryTotalMB *int64 `json:"memory_total_mb"` + MemoryUsagePercent *float64 `json:"memory_usage_percent"` + + DBOK *bool `json:"db_ok"` + RedisOK *bool `json:"redis_ok"` + + DBConnActive *int `json:"db_conn_active"` + DBConnIdle *int `json:"db_conn_idle"` + DBConnWaiting *int `json:"db_conn_waiting"` + + GoroutineCount *int `json:"goroutine_count"` + ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"` +} + +type OpsUpsertJobHeartbeatInput struct { + JobName string + + LastRunAt *time.Time + LastSuccessAt *time.Time + LastErrorAt *time.Time + LastError *string + LastDurationMs *int64 +} + +type OpsJobHeartbeat struct { + JobName string `json:"job_name"` + + LastRunAt *time.Time `json:"last_run_at"` + LastSuccessAt *time.Time `json:"last_success_at"` + LastErrorAt *time.Time `json:"last_error_at"` + LastError *string `json:"last_error"` + LastDurationMs *int64 `json:"last_duration_ms"` + + UpdatedAt time.Time `json:"updated_at"` +} + +type OpsWindowStats struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + + SuccessCount int64 `json:"success_count"` + ErrorCountTotal int64 `json:"error_count_total"` + TokenConsumed int64 `json:"token_consumed"` +} diff --git a/backend/internal/service/ops_query_mode.go b/backend/internal/service/ops_query_mode.go new file mode 100644 index 00000000..e6fa9c1e --- /dev/null +++ b/backend/internal/service/ops_query_mode.go @@ -0,0 +1,40 @@ +package service + +import ( + "errors" + "strings" +) + +type OpsQueryMode string + +const ( + OpsQueryModeAuto OpsQueryMode = "auto" + OpsQueryModeRaw OpsQueryMode = "raw" + OpsQueryModePreagg OpsQueryMode = "preagg" +) + +// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the +// pre-aggregation tables are not populated yet. This is primarily used to implement +// the forced `preagg` mode UX. 
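+// For example, OpsService.GetDashboardOverview maps this sentinel to
+// infraerrors.Conflict("OPS_PREAGG_NOT_READY", ...) so callers can distinguish "not populated
+// yet" from a generic failure.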
+var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated") + +func ParseOpsQueryMode(raw string) OpsQueryMode { + v := strings.ToLower(strings.TrimSpace(raw)) + switch v { + case string(OpsQueryModeRaw): + return OpsQueryModeRaw + case string(OpsQueryModePreagg): + return OpsQueryModePreagg + default: + return OpsQueryModeAuto + } +} + +func (m OpsQueryMode) IsValid() bool { + switch m { + case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg: + return true + default: + return false + } +} diff --git a/backend/internal/service/ops_realtime.go b/backend/internal/service/ops_realtime.go new file mode 100644 index 00000000..479b9482 --- /dev/null +++ b/backend/internal/service/ops_realtime.go @@ -0,0 +1,36 @@ +package service + +import ( + "context" + "errors" + "strings" +) + +// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled. +// +// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`, +// and it is also gated by the hard switch/soft switch of overall ops monitoring. +func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool { + if !s.IsMonitoringEnabled(ctx) { + return false + } + if s.settingRepo == nil { + return true + } + + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled) + if err != nil { + // Default enabled when key is missing; fail-open on transient errors. + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} diff --git a/backend/internal/service/ops_request_details.go b/backend/internal/service/ops_request_details.go new file mode 100644 index 00000000..e33e6f38 --- /dev/null +++ b/backend/internal/service/ops_request_details.go @@ -0,0 +1,152 @@ +package service + +import ( + "context" + "time" +) + +type OpsRequestKind string + +const ( + OpsRequestKindSuccess OpsRequestKind = "success" + OpsRequestKindError OpsRequestKind = "error" +) + +// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs). +// It powers "request drilldown" UIs without exposing full request bodies for successful requests. +type OpsRequestDetail struct { + Kind OpsRequestKind `json:"kind"` + CreatedAt time.Time `json:"created_at"` + RequestID string `json:"request_id"` + + Platform string `json:"platform,omitempty"` + Model string `json:"model,omitempty"` + + DurationMs *int `json:"duration_ms,omitempty"` + StatusCode *int `json:"status_code,omitempty"` + + // When Kind == "error", ErrorID links to /admin/ops/errors/:id. + ErrorID *int64 `json:"error_id,omitempty"` + + Phase string `json:"phase,omitempty"` + Severity string `json:"severity,omitempty"` + Message string `json:"message,omitempty"` + + UserID *int64 `json:"user_id,omitempty"` + APIKeyID *int64 `json:"api_key_id,omitempty"` + AccountID *int64 `json:"account_id,omitempty"` + GroupID *int64 `json:"group_id,omitempty"` + + Stream bool `json:"stream"` +} + +type OpsRequestDetailFilter struct { + StartTime *time.Time + EndTime *time.Time + + // kind: success|error|all + Kind string + + Platform string + GroupID *int64 + + UserID *int64 + APIKeyID *int64 + AccountID *int64 + + Model string + RequestID string + Query string + + MinDurationMs *int + MaxDurationMs *int + + // Sort: created_at_desc (default) or duration_desc. 
+ Sort string + + Page int + PageSize int +} + +func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) { + page = 1 + pageSize = 50 + endTime = time.Now() + startTime = endTime.Add(-1 * time.Hour) + + if f == nil { + return page, pageSize, startTime, endTime + } + + if f.Page > 0 { + page = f.Page + } + if f.PageSize > 0 { + pageSize = f.PageSize + } + if pageSize > 100 { + pageSize = 100 + } + + if f.EndTime != nil { + endTime = *f.EndTime + } + if f.StartTime != nil { + startTime = *f.StartTime + } else if f.EndTime != nil { + startTime = endTime.Add(-1 * time.Hour) + } + + if startTime.After(endTime) { + startTime, endTime = endTime, startTime + } + + return page, pageSize, startTime, endTime +} + +type OpsRequestDetailList struct { + Items []*OpsRequestDetail `json:"items"` + Total int64 `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` +} + +func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsRequestDetailList{ + Items: []*OpsRequestDetail{}, + Total: 0, + Page: 1, + PageSize: 50, + }, nil + } + + page, pageSize, startTime, endTime := filter.Normalize() + filterCopy := &OpsRequestDetailFilter{} + if filter != nil { + *filterCopy = *filter + } + filterCopy.Page = page + filterCopy.PageSize = pageSize + filterCopy.StartTime = &startTime + filterCopy.EndTime = &endTime + + items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy) + if err != nil { + return nil, err + } + if items == nil { + items = []*OpsRequestDetail{} + } + + return &OpsRequestDetailList{ + Items: items, + Total: total, + Page: page, + PageSize: pageSize, + }, nil +} + diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go new file mode 100644 index 00000000..3232e708 --- /dev/null +++ b/backend/internal/service/ops_retry.go @@ -0,0 +1,635 @@ +package service + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "strings" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" + "github.com/gin-gonic/gin" + "github.com/lib/pq" +) + +const ( + OpsRetryModeClient = "client" + OpsRetryModeUpstream = "upstream" +) + +const ( + opsRetryStatusRunning = "running" + opsRetryStatusSucceeded = "succeeded" + opsRetryStatusFailed = "failed" +) + +const ( + opsRetryTimeout = 60 * time.Second + opsRetryCaptureBytesLimit = 64 * 1024 + opsRetryResponsePreviewMax = 8 * 1024 + opsRetryMinIntervalPerError = 10 * time.Second + opsRetryMaxAccountSwitches = 3 +) + +var opsRetryRequestHeaderAllowlist = map[string]bool{ + "anthropic-beta": true, + "anthropic-version": true, +} + +type opsRetryRequestType string + +const ( + opsRetryTypeMessages opsRetryRequestType = "messages" + opsRetryTypeOpenAI opsRetryRequestType = "openai_responses" + opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta" +) + +type limitedResponseWriter struct { + header http.Header + status int + wroteHeader bool + + limit int + totalWritten int64 + buf bytes.Buffer +} + +func newLimitedResponseWriter(limit int) *limitedResponseWriter { + if limit <= 0 { + limit = 1 + } + return &limitedResponseWriter{ + header: make(http.Header), + status: http.StatusOK, + limit: limit, + } +} + +func (w *limitedResponseWriter) Header() http.Header { + return w.header +} + +func (w *limitedResponseWriter) 
WriteHeader(statusCode int) { + if w.wroteHeader { + return + } + w.wroteHeader = true + w.status = statusCode +} + +func (w *limitedResponseWriter) Write(p []byte) (int, error) { + if !w.wroteHeader { + w.WriteHeader(http.StatusOK) + } + w.totalWritten += int64(len(p)) + + if w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(p) > remaining { + _, _ = w.buf.Write(p[:remaining]) + } else { + _, _ = w.buf.Write(p) + } + } + + // Pretend we wrote everything to avoid upstream/client code treating it as an error. + return len(p), nil +} + +func (w *limitedResponseWriter) Flush() {} + +func (w *limitedResponseWriter) bodyBytes() []byte { + return w.buf.Bytes() +} + +func (w *limitedResponseWriter) truncated() bool { + return w.totalWritten > int64(w.limit) +} + +func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + + mode = strings.ToLower(strings.TrimSpace(mode)) + switch mode { + case OpsRetryModeClient, OpsRetryModeUpstream: + default: + return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") + } + + latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) + } + if latest != nil { + if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + + lastAttemptAt := latest.CreatedAt + if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() { + lastAttemptAt = *latest.FinishedAt + } else if latest.StartedAt != nil && !latest.StartedAt.IsZero() { + lastAttemptAt = *latest.StartedAt + } + + if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError { + return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again") + } + } + + errorLog, err := s.GetErrorLogByID(ctx, errorID) + if err != nil { + return nil, err + } + if strings.TrimSpace(errorLog.RequestBody) == "" { + return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") + } + + var pinned *int64 + if mode == OpsRetryModeUpstream { + if pinnedAccountID != nil && *pinnedAccountID > 0 { + pinned = pinnedAccountID + } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { + pinned = errorLog.AccountID + } else { + return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") + } + } + + startedAt := time.Now() + attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{ + RequestedByUserID: requestedByUserID, + SourceErrorID: errorID, + Mode: mode, + PinnedAccountID: pinned, + Status: opsRetryStatusRunning, + StartedAt: startedAt, + }) + if err != nil { + var pqErr *pq.Error + if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" { + return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error") + } + return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err) 
+ } + + result := &OpsRetryResult{ + AttemptID: attemptID, + Mode: mode, + Status: opsRetryStatusFailed, + PinnedAccountID: pinned, + HTTPStatusCode: 0, + UpstreamRequestID: "", + ResponsePreview: "", + ResponseTruncated: false, + ErrorMessage: "", + StartedAt: startedAt, + } + + execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) + defer cancel() + + execRes := s.executeRetry(execCtx, errorLog, mode, pinned) + + finishedAt := time.Now() + result.FinishedAt = finishedAt + result.DurationMs = finishedAt.Sub(startedAt).Milliseconds() + + if execRes != nil { + result.Status = execRes.status + result.UsedAccountID = execRes.usedAccountID + result.HTTPStatusCode = execRes.httpStatusCode + result.UpstreamRequestID = execRes.upstreamRequestID + result.ResponsePreview = execRes.responsePreview + result.ResponseTruncated = execRes.responseTruncated + result.ErrorMessage = execRes.errorMessage + } + + updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second) + defer updateCancel() + + var updateErrMsg *string + if strings.TrimSpace(result.ErrorMessage) != "" { + msg := result.ErrorMessage + updateErrMsg = &msg + } + var resultRequestID *string + if strings.TrimSpace(result.UpstreamRequestID) != "" { + v := result.UpstreamRequestID + resultRequestID = &v + } + + finalStatus := result.Status + if strings.TrimSpace(finalStatus) == "" { + finalStatus = opsRetryStatusFailed + } + + if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ + ID: attemptID, + Status: finalStatus, + FinishedAt: finishedAt, + DurationMs: result.DurationMs, + ResultRequestID: resultRequestID, + ErrorMessage: updateErrMsg, + }); err != nil { + // Best-effort: retry itself already executed; do not fail the API response. + log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) + } + + return result, nil +} + +type opsRetryExecution struct { + status string + + usedAccountID *int64 + httpStatusCode int + upstreamRequestID string + + responsePreview string + responseTruncated bool + + errorMessage string +} + +func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution { + if errorLog == nil { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "missing error log", + } + } + + reqType := detectOpsRetryType(errorLog.RequestPath) + bodyBytes := []byte(errorLog.RequestBody) + + switch reqType { + case opsRetryTypeMessages: + bodyBytes = FilterThinkingBlocksForRetry(bodyBytes) + case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B: + // No-op + } + + switch strings.ToLower(strings.TrimSpace(mode)) { + case OpsRetryModeUpstream: + if pinnedAccountID == nil || *pinnedAccountID <= 0 { + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "pinned_account_id required for upstream retry", + } + } + return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID) + case OpsRetryModeClient: + return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes) + default: + return &opsRetryExecution{ + status: opsRetryStatusFailed, + errorMessage: "invalid retry mode", + } + } +} + +func detectOpsRetryType(path string) opsRetryRequestType { + p := strings.ToLower(strings.TrimSpace(path)) + switch { + case strings.Contains(p, "/responses"): + return opsRetryTypeOpenAI + case strings.Contains(p, "/v1beta/"): + return opsRetryTypeGeminiV1B + default: + return opsRetryTypeMessages + } +} + +func (s *OpsService) executePinnedRetry(ctx context.Context, reqType 
opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution { + if s.accountRepo == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"} + } + + account, err := s.accountRepo.GetByID(ctx, pinnedAccountID) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)} + } + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"} + } + if !account.IsSchedulable() { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"} + } + if errorLog.GroupID != nil && *errorLog.GroupID > 0 { + if !containsInt64(account.GroupIDs, *errorLog.GroupID) { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"} + } + } + + var release func() + if s.concurrencyService != nil { + acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency) + if err != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)} + } + if acq == nil || !acq.Acquired { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"} + } + release = acq.ReleaseFunc + } + if release != nil { + defer release() + } + + usedID := account.ID + exec := s.executeWithAccount(ctx, reqType, errorLog, body, account) + exec.usedAccountID = &usedID + if exec.status == "" { + exec.status = opsRetryStatusFailed + } + return exec +} + +func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution { + groupID := errorLog.GroupID + if groupID == nil || *groupID <= 0 { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"} + } + + model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body) + if parsedErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()} + } + _ = stream + + excluded := make(map[int64]struct{}) + switches := 0 + + for { + if switches >= opsRetryMaxAccountSwitches { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"} + } + + selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded) + if selErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()} + } + if selection == nil || selection.Account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"} + } + + account := selection.Account + if !selection.Acquired || selection.ReleaseFunc == nil { + excluded[account.ID] = struct{}{} + switches++ + continue + } + + exec := func() *opsRetryExecution { + defer selection.ReleaseFunc() + return s.executeWithAccount(ctx, reqType, errorLog, body, account) + }() + + if exec != nil { + if exec.status == opsRetryStatusSucceeded { + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + // If the gateway services ask for failover, try another account. 
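+ // isFailoverError is a deliberately narrow string match: the message must contain both
+ // "upstream error:" and "failover". Anything else ends the loop and the result is returned
+ // to the caller as-is.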
+ if s.isFailoverError(exec.errorMessage) { + excluded[account.ID] = struct{}{} + switches++ + continue + } + usedID := account.ID + exec.usedAccountID = &usedID + return exec + } + + excluded[account.ID] = struct{}{} + switches++ + } +} + +func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) { + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return nil, fmt.Errorf("openai gateway service not available") + } + return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + case opsRetryTypeGeminiV1B, opsRetryTypeMessages: + if s.gatewayService == nil { + return nil, fmt.Errorf("gateway service not available") + } + return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs) + default: + return nil, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) { + switch reqType { + case opsRetryTypeMessages: + parsed, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr) + } + return parsed.Model, parsed.Stream, nil + case opsRetryTypeOpenAI: + var v struct { + Model string `json:"model"` + Stream bool `json:"stream"` + } + if err := json.Unmarshal(body, &v); err != nil { + return "", false, fmt.Errorf("failed to parse openai request body: %w", err) + } + return strings.TrimSpace(v.Model), v.Stream, nil + case opsRetryTypeGeminiV1B: + if strings.TrimSpace(errorLog.Model) == "" { + return "", false, fmt.Errorf("missing model for gemini v1beta retry") + } + return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil + default: + return "", false, fmt.Errorf("unsupported retry type: %s", reqType) + } +} + +func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution { + if account == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"} + } + + c, w := newOpsRetryContext(ctx, errorLog) + + var err error + switch reqType { + case opsRetryTypeOpenAI: + if s.openAIGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"} + } + _, err = s.openAIGatewayService.Forward(ctx, c, account, body) + case opsRetryTypeGeminiV1B: + if s.geminiCompatService == nil || s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"} + } + modelName := strings.TrimSpace(errorLog.Model) + action := "generateContent" + if errorLog.Stream { + action = "streamGenerateContent" + } + if account.Platform == PlatformAntigravity { + _, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body) + } else { + _, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body) + } + case opsRetryTypeMessages: + switch account.Platform { + case PlatformAntigravity: + if s.antigravityGatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"} + } + _, err = 
s.antigravityGatewayService.Forward(ctx, c, account, body) + case PlatformGemini: + if s.geminiCompatService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"} + } + _, err = s.geminiCompatService.Forward(ctx, c, account, body) + default: + if s.gatewayService == nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"} + } + parsedReq, parseErr := ParseGatewayRequest(body) + if parseErr != nil { + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"} + } + _, err = s.gatewayService.Forward(ctx, c, account, parsedReq) + } + default: + return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"} + } + + statusCode := http.StatusOK + if c != nil && c.Writer != nil { + statusCode = c.Writer.Status() + } + + upstreamReqID := extractUpstreamRequestID(c) + preview, truncated := extractResponsePreview(w) + + exec := &opsRetryExecution{ + status: opsRetryStatusFailed, + httpStatusCode: statusCode, + upstreamRequestID: upstreamReqID, + responsePreview: preview, + responseTruncated: truncated, + errorMessage: "", + } + + if err == nil && statusCode < 400 { + exec.status = opsRetryStatusSucceeded + return exec + } + + if err != nil { + exec.errorMessage = err.Error() + } else { + exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode) + } + + return exec +} + +func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) { + w := newLimitedResponseWriter(opsRetryCaptureBytesLimit) + c, _ := gin.CreateTestContext(w) + + path := "/" + if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" { + path = errorLog.RequestPath + } + + req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil)) + req.Header.Set("content-type", "application/json") + if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" { + req.Header.Set("user-agent", errorLog.UserAgent) + } + // Restore a minimal, whitelisted subset of request headers to improve retry fidelity + // (e.g. anthropic-beta / anthropic-version). Never replay auth credentials. 
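+ // As of this patch the allowlist (opsRetryRequestHeaderAllowlist) contains only
+ // "anthropic-beta" and "anthropic-version"; every other stored header is ignored.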
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" { + var stored map[string]string + if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil { + for k, v := range stored { + key := strings.TrimSpace(k) + if key == "" { + continue + } + if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] { + continue + } + val := strings.TrimSpace(v) + if val == "" { + continue + } + req.Header.Set(key, val) + } + } + } + + c.Request = req + return c, w +} + +func extractUpstreamRequestID(c *gin.Context) string { + if c == nil || c.Writer == nil { + return "" + } + h := c.Writer.Header() + if h == nil { + return "" + } + for _, key := range []string{"x-request-id", "X-Request-Id", "X-Request-ID"} { + if v := strings.TrimSpace(h.Get(key)); v != "" { + return v + } + } + return "" +} + +func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) { + if w == nil { + return "", false + } + b := bytes.TrimSpace(w.bodyBytes()) + if len(b) == 0 { + return "", w.truncated() + } + if len(b) > opsRetryResponsePreviewMax { + return string(b[:opsRetryResponsePreviewMax]), true + } + return string(b), w.truncated() +} + +func containsInt64(items []int64, needle int64) bool { + for _, v := range items { + if v == needle { + return true + } + } + return false +} + +func (s *OpsService) isFailoverError(message string) bool { + msg := strings.ToLower(strings.TrimSpace(message)) + if msg == "" { + return false + } + return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover") +} diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go new file mode 100644 index 00000000..169c523a --- /dev/null +++ b/backend/internal/service/ops_service.go @@ -0,0 +1,451 @@ +package service + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "log" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/config" + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled") + +const ( + opsMaxStoredRequestBodyBytes = 10 * 1024 + opsMaxStoredErrorBodyBytes = 20 * 1024 +) + +// OpsService provides ingestion and query APIs for the Ops monitoring module. 
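+// Collaborators are treated as optional: query methods nil-check opsRepo (returning empty
+// lists or NotFound), monitoring checks fail open when settingRepo is nil, and the retry path
+// nil-checks whichever gateway service it needs.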
+type OpsService struct { + opsRepo OpsRepository + settingRepo SettingRepository + cfg *config.Config + + accountRepo AccountRepository + + concurrencyService *ConcurrencyService + gatewayService *GatewayService + openAIGatewayService *OpenAIGatewayService + geminiCompatService *GeminiMessagesCompatService + antigravityGatewayService *AntigravityGatewayService +} + +func NewOpsService( + opsRepo OpsRepository, + settingRepo SettingRepository, + cfg *config.Config, + accountRepo AccountRepository, + concurrencyService *ConcurrencyService, + gatewayService *GatewayService, + openAIGatewayService *OpenAIGatewayService, + geminiCompatService *GeminiMessagesCompatService, + antigravityGatewayService *AntigravityGatewayService, +) *OpsService { + return &OpsService{ + opsRepo: opsRepo, + settingRepo: settingRepo, + cfg: cfg, + + accountRepo: accountRepo, + + concurrencyService: concurrencyService, + gatewayService: gatewayService, + openAIGatewayService: openAIGatewayService, + geminiCompatService: geminiCompatService, + antigravityGatewayService: antigravityGatewayService, + } +} + +func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error { + if s.IsMonitoringEnabled(ctx) { + return nil + } + return ErrOpsDisabled +} + +func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool { + // Hard switch: disable ops entirely. + if s.cfg != nil && !s.cfg.Ops.Enabled { + return false + } + if s.settingRepo == nil { + return true + } + value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled) + if err != nil { + // Default enabled when key is missing, and fail-open on transient errors + // (ops should never block gateway traffic). + if errors.Is(err, ErrSettingNotFound) { + return true + } + return true + } + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return false + default: + return true + } +} + +func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error { + if entry == nil { + return nil + } + if !s.IsMonitoringEnabled(ctx) { + return nil + } + if s.opsRepo == nil { + return nil + } + + // Ensure timestamps are always populated. + if entry.CreatedAt.IsZero() { + entry.CreatedAt = time.Now() + } + + // Ensure required fields exist (DB has NOT NULL constraints). + entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase) + entry.ErrorType = strings.TrimSpace(entry.ErrorType) + if entry.ErrorPhase == "" { + entry.ErrorPhase = "internal" + } + if entry.ErrorType == "" { + entry.ErrorType = "api_error" + } + + // Sanitize + trim request body (errors only). + if len(rawRequestBody) > 0 { + sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes) + if sanitized != "" { + entry.RequestBodyJSON = &sanitized + } + entry.RequestBodyTruncated = truncated + entry.RequestBodyBytes = &bytesLen + } + + // Sanitize + truncate error_body to avoid storing sensitive data. + if strings.TrimSpace(entry.ErrorBody) != "" { + sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes) + entry.ErrorBody = sanitized + } + + if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil { + // Never bubble up to gateway; best-effort logging. 
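+ // (The error is still returned here for observability; callers in the gateway path are
+ // assumed to ignore it so that recording failures never affect the client response.)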
+ log.Printf("[Ops] RecordError failed: %v", err) + return err + } + return nil +} + +func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil + } + return s.opsRepo.ListErrorLogs(ctx, filter) +} + +func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + detail, err := s.opsRepo.GetErrorLogByID(ctx, id) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found") + } + return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err) + } + return detail, nil +} + +func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { + bytesLen = len(raw) + if len(raw) == 0 { + return "", false, 0 + } + + var decoded any + if err := json.Unmarshal(raw, &decoded); err != nil { + // If it's not valid JSON, don't store (retry would not be reliable anyway). + return "", false, bytesLen + } + + decoded = redactSensitiveJSON(decoded) + + encoded, err := json.Marshal(decoded) + if err != nil { + return "", false, bytesLen + } + if len(encoded) <= maxBytes { + return string(encoded), false, bytesLen + } + + // Trim conversation history to keep the most recent context. + if root, ok := decoded.(map[string]any); ok { + if trimmed, ok := trimConversationArrays(root, maxBytes); ok { + encoded2, err2 := json.Marshal(trimmed) + if err2 == nil && len(encoded2) <= maxBytes { + return string(encoded2), true, bytesLen + } + // Fallthrough: keep shrinking. + decoded = trimmed + } + + essential := shrinkToEssentials(root) + encoded3, err3 := json.Marshal(essential) + if err3 == nil && len(encoded3) <= maxBytes { + return string(encoded3), true, bytesLen + } + } + + // Last resort: store a minimal placeholder (still valid JSON). + placeholder := map[string]any{ + "request_body_truncated": true, + } + if model := extractString(decoded, "model"); model != "" { + placeholder["model"] = model + } + encoded4, err4 := json.Marshal(placeholder) + if err4 != nil { + return "", true, bytesLen + } + return string(encoded4), true, bytesLen +} + +func redactSensitiveJSON(v any) any { + switch t := v.(type) { + case map[string]any: + out := make(map[string]any, len(t)) + for k, vv := range t { + if isSensitiveKey(k) { + out[k] = "[REDACTED]" + continue + } + out[k] = redactSensitiveJSON(vv) + } + return out + case []any: + out := make([]any, 0, len(t)) + for _, vv := range t { + out = append(out, redactSensitiveJSON(vv)) + } + return out + default: + return v + } +} + +func isSensitiveKey(key string) bool { + k := strings.ToLower(strings.TrimSpace(key)) + if k == "" { + return false + } + + // Exact matches (common credential fields). 
+ switch k { + case "authorization", + "proxy-authorization", + "x-api-key", + "api_key", + "apikey", + "access_token", + "refresh_token", + "id_token", + "session_token", + "token", + "password", + "passwd", + "passphrase", + "secret", + "client_secret", + "private_key", + "jwt", + "signature", + "accesskeyid", + "secretaccesskey": + return true + } + + // Suffix matches. + for _, suffix := range []string{ + "_secret", + "_token", + "_id_token", + "_session_token", + "_password", + "_passwd", + "_passphrase", + "_key", + "secret_key", + "private_key", + } { + if strings.HasSuffix(k, suffix) { + return true + } + } + + // Substring matches (conservative, but errs on the side of privacy). + for _, sub := range []string{ + "secret", + "token", + "password", + "passwd", + "passphrase", + "privatekey", + "private_key", + "apikey", + "api_key", + "accesskeyid", + "secretaccesskey", + "bearer", + "cookie", + "credential", + "session", + "jwt", + "signature", + } { + if strings.Contains(k, sub) { + return true + } + } + + return false +} + +func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) { + // Supported: anthropic/openai: messages; gemini: contents. + if out, ok := trimArrayField(root, "messages", maxBytes); ok { + return out, true + } + if out, ok := trimArrayField(root, "contents", maxBytes); ok { + return out, true + } + return root, false +} + +func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) { + raw, ok := root[field] + if !ok { + return nil, false + } + arr, ok := raw.([]any) + if !ok || len(arr) == 0 { + return nil, false + } + + // Keep at least the last message/content. Use binary search so we don't marshal O(n) times. + // We are dropping from the *front* of the array (oldest context first). + lo := 0 + hi := len(arr) - 1 // inclusive; hi ensures at least one item remains + + var best map[string]any + found := false + + for lo <= hi { + mid := (lo + hi) / 2 + candidateArr := arr[mid:] + if len(candidateArr) == 0 { + lo = mid + 1 + continue + } + + next := shallowCopyMap(root) + next[field] = candidateArr + encoded, err := json.Marshal(next) + if err != nil { + // If marshal fails, try dropping more. + lo = mid + 1 + continue + } + + if len(encoded) <= maxBytes { + best = next + found = true + // Try to keep more context by dropping fewer items. + hi = mid - 1 + continue + } + + // Need to drop more. + lo = mid + 1 + } + + if found { + return best, true + } + + // Nothing fit (even with only one element); return the smallest slice and let the + // caller fall back to shrinkToEssentials(). + next := shallowCopyMap(root) + next[field] = arr[len(arr)-1:] + return next, true +} + +func shrinkToEssentials(root map[string]any) map[string]any { + out := make(map[string]any) + for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} { + if v, ok := root[key]; ok { + out[key] = v + } + } + + // Keep only the last element of the conversation array. 
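+	// (Illustrative outcome, sizes made up: an Anthropic-style request that is
+	// still over the cap after history trimming collapses to just
+	// {"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} plus
+	// the final "messages"/"contents" entry.)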
+ if v, ok := root["messages"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["messages"] = []any{arr[len(arr)-1]} + } + } + if v, ok := root["contents"]; ok { + if arr, ok := v.([]any); ok && len(arr) > 0 { + out["contents"] = []any{arr[len(arr)-1]} + } + } + return out +} + +func shallowCopyMap(m map[string]any) map[string]any { + out := make(map[string]any, len(m)) + for k, v := range m { + out[k] = v + } + return out +} + +func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", false + } + + // Prefer JSON-safe sanitization when possible. + if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" { + return out, trunc + } + + // Non-JSON: best-effort truncate. + if maxBytes > 0 && len(raw) > maxBytes { + return truncateString(raw, maxBytes), true + } + return raw, false +} + +func extractString(v any, key string) string { + root, ok := v.(map[string]any) + if !ok { + return "" + } + s, _ := root[key].(string) + return strings.TrimSpace(s) +} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go new file mode 100644 index 00000000..2f15bc79 --- /dev/null +++ b/backend/internal/service/ops_settings.go @@ -0,0 +1,354 @@ +package service + +import ( + "context" + "encoding/json" + "errors" + "strings" + "time" +) + +const ( + opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader" + opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second +) + +// ========================= +// Email notification config +// ========================= + +func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) { + defaultCfg := defaultOpsEmailNotificationConfig() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + // Initialize defaults on first read (best-effort). + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsEmailNotificationConfig{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + // Corrupted JSON should not break ops UI; fall back to defaults. 
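+		// (The corrupted value is left in place; the next successful
+		// UpdateEmailNotificationConfig call overwrites it with valid JSON.)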
+ return defaultCfg, nil + } + normalizeOpsEmailNotificationConfig(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if req == nil { + return nil, errors.New("invalid request") + } + + cfg, err := s.GetEmailNotificationConfig(ctx) + if err != nil { + return nil, err + } + + if req.Alert != nil { + cfg.Alert.Enabled = req.Alert.Enabled + if req.Alert.Recipients != nil { + cfg.Alert.Recipients = req.Alert.Recipients + } + cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity) + cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour + cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds + cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts + } + + if req.Report != nil { + cfg.Report.Enabled = req.Report.Enabled + if req.Report.Recipients != nil { + cfg.Report.Recipients = req.Report.Recipients + } + cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled + cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule) + cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled + cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule) + cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount + cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled + cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule) + cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold + } + + if err := validateOpsEmailNotificationConfig(cfg); err != nil { + return nil, err + } + + normalizeOpsEmailNotificationConfig(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil { + return nil, err + } + return cfg, nil +} + +func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig { + return &OpsEmailNotificationConfig{ + Alert: OpsEmailAlertConfig{ + Enabled: true, + Recipients: []string{}, + MinSeverity: "", + RateLimitPerHour: 0, + BatchingWindowSeconds: 0, + IncludeResolvedAlerts: false, + }, + Report: OpsEmailReportConfig{ + Enabled: false, + Recipients: []string{}, + DailySummaryEnabled: false, + DailySummarySchedule: "0 9 * * *", + WeeklySummaryEnabled: false, + WeeklySummarySchedule: "0 9 * * 1", + ErrorDigestEnabled: false, + ErrorDigestSchedule: "0 9 * * *", + ErrorDigestMinCount: 10, + AccountHealthEnabled: false, + AccountHealthSchedule: "0 9 * * *", + AccountHealthErrorRateThreshold: 10.0, + }, + } +} + +func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) { + if cfg == nil { + return + } + if cfg.Alert.Recipients == nil { + cfg.Alert.Recipients = []string{} + } + if cfg.Report.Recipients == nil { + cfg.Report.Recipients = []string{} + } + + cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity) + cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule) + cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule) + cfg.Report.ErrorDigestSchedule = 
strings.TrimSpace(cfg.Report.ErrorDigestSchedule) + cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule) + + // Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings. + if cfg.Report.DailySummarySchedule == "" { + cfg.Report.DailySummarySchedule = "0 9 * * *" + } + if cfg.Report.WeeklySummarySchedule == "" { + cfg.Report.WeeklySummarySchedule = "0 9 * * 1" + } + if cfg.Report.ErrorDigestSchedule == "" { + cfg.Report.ErrorDigestSchedule = "0 9 * * *" + } + if cfg.Report.AccountHealthSchedule == "" { + cfg.Report.AccountHealthSchedule = "0 9 * * *" + } +} + +func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error { + if cfg == nil { + return errors.New("invalid config") + } + + if cfg.Alert.RateLimitPerHour < 0 { + return errors.New("alert.rate_limit_per_hour must be >= 0") + } + if cfg.Alert.BatchingWindowSeconds < 0 { + return errors.New("alert.batching_window_seconds must be >= 0") + } + switch strings.TrimSpace(cfg.Alert.MinSeverity) { + case "", "critical", "warning", "info": + default: + return errors.New("alert.min_severity must be one of: critical, warning, info, or empty") + } + + if cfg.Report.ErrorDigestMinCount < 0 { + return errors.New("report.error_digest_min_count must be >= 0") + } + if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 { + return errors.New("report.account_health_error_rate_threshold must be between 0 and 100") + } + return nil +} + +// ========================= +// Alert runtime settings +// ========================= + +func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings { + return &OpsAlertRuntimeSettings{ + EvaluationIntervalSeconds: 60, + DistributedLock: OpsDistributedLockSettings{ + Enabled: true, + Key: opsAlertEvaluatorLeaderLockKeyDefault, + TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()), + }, + Silencing: OpsAlertSilencingSettings{ + Enabled: false, + GlobalUntilRFC3339: "", + GlobalReason: "", + Entries: []OpsAlertSilenceEntry{}, + }, + } +} + +func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) { + if s == nil { + return + } + s.Key = strings.TrimSpace(s.Key) + if s.Key == "" { + s.Key = defaultKey + } + if s.TTLSeconds <= 0 { + s.TTLSeconds = defaultTTLSeconds + } +} + +func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) { + if s == nil { + return + } + s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339) + s.GlobalReason = strings.TrimSpace(s.GlobalReason) + if s.Entries == nil { + s.Entries = []OpsAlertSilenceEntry{} + } + for i := range s.Entries { + s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339) + s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason) + } +} + +func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error { + if strings.TrimSpace(s.Key) == "" { + return errors.New("distributed_lock.key is required") + } + if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) { + return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400") + } + return nil +} + +func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error { + parse := func(raw string) error { + if strings.TrimSpace(raw) == "" { + return nil + } + if _, err := time.Parse(time.RFC3339, raw); err != nil { + return errors.New("silencing time must be RFC3339") + } + return nil + } + + if err := parse(s.GlobalUntilRFC3339); err != 
nil { + return err + } + for _, entry := range s.Entries { + if strings.TrimSpace(entry.UntilRFC3339) == "" { + return errors.New("silencing.entries.until_rfc3339 is required") + } + if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil { + return errors.New("silencing.entries.until_rfc3339 must be RFC3339") + } + } + return nil +} + +func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) { + defaultCfg := defaultOpsAlertRuntimeSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAlertRuntimeSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + if cfg.EvaluationIntervalSeconds <= 0 { + cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds + } + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + return cfg, nil +} + +func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) { + return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400") + } + if cfg.DistributedLock.Enabled { + if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil { + return nil, err + } + } + if cfg.Silencing.Enabled { + if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil { + return nil, err + } + } + + defaultCfg := defaultOpsAlertRuntimeSettings() + normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds) + normalizeOpsAlertSilencingSettings(&cfg.Silencing) + + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil { + return nil, err + } + + // Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated). 
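+	// (A JSON round-trip serves as a simple deep copy here: the runtime-settings
+	// structs are plain data (strings, numbers, booleans, slices), so
+	// unmarshalling the just-marshalled bytes reproduces the stored value.)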
+ updated := &OpsAlertRuntimeSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} + diff --git a/backend/internal/service/ops_trends.go b/backend/internal/service/ops_trends.go new file mode 100644 index 00000000..9237544c --- /dev/null +++ b/backend/internal/service/ops_trends.go @@ -0,0 +1,27 @@ +package service + +import ( + "context" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds) +} + diff --git a/backend/internal/service/ops_window_stats.go b/backend/internal/service/ops_window_stats.go new file mode 100644 index 00000000..71021d15 --- /dev/null +++ b/backend/internal/service/ops_window_stats.go @@ -0,0 +1,24 @@ +package service + +import ( + "context" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +// GetWindowStats returns lightweight request/token counts for the provided window. +// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks. +func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + filter := &OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + } + return s.opsRepo.GetWindowStats(ctx, filter) +} diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go index d4b984d6..bf78601f 100644 --- a/backend/internal/service/wire.go +++ b/backend/internal/service/wire.go @@ -1,10 +1,12 @@ package service import ( + "database/sql" "time" "github.com/Wei-Shaw/sub2api/internal/config" "github.com/google/wire" + "github.com/redis/go-redis/v9" ) // BuildInfo contains build information @@ -70,6 +72,57 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi return svc } +// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector. +func ProvideOpsMetricsCollector( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsMetricsCollector { + collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg) + collector.Start() + return collector +} + +// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation). 
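+// Like ProvideOpsMetricsCollector above (and the evaluator/cleanup providers
+// below), it follows a construct-then-Start pattern: Wire only needs to build
+// the value and the background loop comes up on its own; shutdown is assumed to
+// be handled by the corresponding Stop methods elsewhere.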
+func ProvideOpsAggregationService( + opsRepo OpsRepository, + settingRepo SettingRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAggregationService { + svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService. +func ProvideOpsAlertEvaluatorService( + opsService *OpsService, + opsRepo OpsRepository, + emailService *EmailService, + redisClient *redis.Client, + cfg *config.Config, +) *OpsAlertEvaluatorService { + svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg) + svc.Start() + return svc +} + +// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled). +func ProvideOpsCleanupService( + opsRepo OpsRepository, + db *sql.DB, + redisClient *redis.Client, + cfg *config.Config, +) *OpsCleanupService { + svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg) + svc.Start() + return svc +} + // ProviderSet is the Wire provider set for all services var ProviderSet = wire.NewSet( // Core services @@ -101,6 +154,11 @@ var ProviderSet = wire.NewSet( NewAccountUsageService, NewAccountTestService, NewSettingService, + NewOpsService, + ProvideOpsMetricsCollector, + ProvideOpsAggregationService, + ProvideOpsAlertEvaluatorService, + ProvideOpsCleanupService, NewEmailService, ProvideEmailQueueService, NewTurnstileService, From f3ed95d4dea643e54417d0b4e6b8ccd318e0631d Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:54:26 +0800 Subject: [PATCH 05/53] =?UTF-8?q?feat(handler):=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=20API=20=E5=A4=84=E7=90=86?= =?UTF-8?q?=E5=99=A8=E5=92=8C=E4=B8=AD=E9=97=B4=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops 错误日志记录器(ops_error_logger.go) - 新增 ops 主处理器(ops_handler.go) - 新增告警管理处理器(ops_alerts_handler.go) - 新增仪表板处理器(ops_dashboard_handler.go) - 新增实时监控处理器(ops_realtime_handler.go) - 新增配置管理处理器(ops_settings_handler.go) - 新增 WebSocket 处理器(ops_ws_handler.go) - 扩展设置 DTO 支持 ops 配置 - 新增客户端请求 ID 中间件(client_request_id.go) - 新增 WebSocket 查询令牌认证中间件(ws_query_token_auth.go) - 更新管理员认证中间件支持 ops 路由 - 注册 handler 依赖注入 --- .../handler/admin/ops_alerts_handler.go | 433 ++++++++++ .../handler/admin/ops_dashboard_handler.go | 243 ++++++ backend/internal/handler/admin/ops_handler.go | 364 +++++++++ .../handler/admin/ops_realtime_handler.go | 120 +++ .../handler/admin/ops_settings_handler.go | 103 +++ .../internal/handler/admin/ops_ws_handler.go | 765 ++++++++++++++++++ backend/internal/handler/dto/settings.go | 5 + backend/internal/handler/ops_error_logger.go | 681 ++++++++++++++++ backend/internal/handler/wire.go | 3 + .../internal/server/middleware/admin_auth.go | 52 ++ .../server/middleware/client_request_id.go | 31 + .../server/middleware/ws_query_token_auth.go | 54 ++ 12 files changed, 2854 insertions(+) create mode 100644 backend/internal/handler/admin/ops_alerts_handler.go create mode 100644 backend/internal/handler/admin/ops_dashboard_handler.go create mode 100644 backend/internal/handler/admin/ops_handler.go create mode 100644 backend/internal/handler/admin/ops_realtime_handler.go create mode 100644 backend/internal/handler/admin/ops_settings_handler.go create mode 100644 backend/internal/handler/admin/ops_ws_handler.go create mode 100644 backend/internal/handler/ops_error_logger.go create mode 100644 
backend/internal/server/middleware/client_request_id.go create mode 100644 backend/internal/server/middleware/ws_query_token_auth.go diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go new file mode 100644 index 00000000..19d9d870 --- /dev/null +++ b/backend/internal/handler/admin/ops_alerts_handler.go @@ -0,0 +1,433 @@ +package admin + +import ( + "encoding/json" + "fmt" + "math" + "net/http" + "strconv" + "strings" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gin-gonic/gin/binding" +) + +var validOpsAlertMetricTypes = []string{ + "success_rate", + "error_rate", + "upstream_error_rate", + "p95_latency_ms", + "p99_latency_ms", + "cpu_usage_percent", + "memory_usage_percent", + "concurrency_queue_depth", +} + +var validOpsAlertMetricTypeSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertMetricTypes)) + for _, v := range validOpsAlertMetricTypes { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="} + +var validOpsAlertOperatorSet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertOperators)) + for _, v := range validOpsAlertOperators { + set[v] = struct{}{} + } + return set +}() + +var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"} + +var validOpsAlertSeveritySet = func() map[string]struct{} { + set := make(map[string]struct{}, len(validOpsAlertSeverities)) + for _, v := range validOpsAlertSeverities { + set[v] = struct{}{} + } + return set +}() + +type opsAlertRuleValidatedInput struct { + Name string + MetricType string + Operator string + Threshold float64 + + Severity string + + WindowMinutes int + SustainedMinutes int + CooldownMinutes int + + Enabled bool + NotifyEmail bool + + WindowProvided bool + SustainedProvided bool + CooldownProvided bool + SeverityProvided bool + EnabledProvided bool + NotifyProvided bool +} + +func isPercentOrRateMetric(metricType string) bool { + switch metricType { + case "success_rate", + "error_rate", + "upstream_error_rate", + "cpu_usage_percent", + "memory_usage_percent": + return true + default: + return false + } +} + +func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) { + if raw == nil { + return nil, fmt.Errorf("invalid request body") + } + + requiredFields := []string{"name", "metric_type", "operator", "threshold"} + for _, field := range requiredFields { + if _, ok := raw[field]; !ok { + return nil, fmt.Errorf("%s is required", field) + } + } + + var name string + if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" { + return nil, fmt.Errorf("name is required") + } + name = strings.TrimSpace(name) + + var metricType string + if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" { + return nil, fmt.Errorf("metric_type is required") + } + metricType = strings.TrimSpace(metricType) + if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok { + return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", ")) + } + + var operator string + if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" { + return nil, fmt.Errorf("operator is required") + } + operator = strings.TrimSpace(operator) + if _, ok := 
validOpsAlertOperatorSet[operator]; !ok { + return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", ")) + } + + var threshold float64 + if err := json.Unmarshal(raw["threshold"], &threshold); err != nil { + return nil, fmt.Errorf("threshold must be a number") + } + if math.IsNaN(threshold) || math.IsInf(threshold, 0) { + return nil, fmt.Errorf("threshold must be a finite number") + } + if isPercentOrRateMetric(metricType) { + if threshold < 0 || threshold > 100 { + return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType) + } + } else if threshold < 0 { + return nil, fmt.Errorf("threshold must be >= 0") + } + + validated := &opsAlertRuleValidatedInput{ + Name: name, + MetricType: metricType, + Operator: operator, + Threshold: threshold, + } + + if v, ok := raw["severity"]; ok { + validated.SeverityProvided = true + var sev string + if err := json.Unmarshal(v, &sev); err != nil { + return nil, fmt.Errorf("severity must be a string") + } + sev = strings.ToUpper(strings.TrimSpace(sev)) + if sev != "" { + if _, ok := validOpsAlertSeveritySet[sev]; !ok { + return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", ")) + } + validated.Severity = sev + } + } + if validated.Severity == "" { + validated.Severity = "P2" + } + + if v, ok := raw["enabled"]; ok { + validated.EnabledProvided = true + if err := json.Unmarshal(v, &validated.Enabled); err != nil { + return nil, fmt.Errorf("enabled must be a boolean") + } + } else { + validated.Enabled = true + } + + if v, ok := raw["notify_email"]; ok { + validated.NotifyProvided = true + if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil { + return nil, fmt.Errorf("notify_email must be a boolean") + } + } else { + validated.NotifyEmail = true + } + + if v, ok := raw["window_minutes"]; ok { + validated.WindowProvided = true + if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil { + return nil, fmt.Errorf("window_minutes must be an integer") + } + switch validated.WindowMinutes { + case 1, 5, 60: + default: + return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60") + } + } else { + validated.WindowMinutes = 1 + } + + if v, ok := raw["sustained_minutes"]; ok { + validated.SustainedProvided = true + if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil { + return nil, fmt.Errorf("sustained_minutes must be an integer") + } + if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 { + return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440") + } + } else { + validated.SustainedMinutes = 1 + } + + if v, ok := raw["cooldown_minutes"]; ok { + validated.CooldownProvided = true + if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil { + return nil, fmt.Errorf("cooldown_minutes must be an integer") + } + if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 { + return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440") + } + } else { + validated.CooldownMinutes = 0 + } + + return validated, nil +} + +// ListAlertRules returns all ops alert rules. 
+// GET /api/v1/admin/ops/alert-rules +func (h *OpsHandler) ListAlertRules(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + rules, err := h.opsService.ListAlertRules(c.Request.Context()) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, rules) +} + +// CreateAlertRule creates an ops alert rule. +// POST /api/v1/admin/ops/alert-rules +func (h *OpsHandler) CreateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, created) +} + +// UpdateAlertRule updates an existing ops alert rule. 
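+// The body shares the create payload shape validated by
+// validateOpsAlertRulePayload; an illustrative payload (values made up) is:
+//
+//	{"name": "High error rate", "metric_type": "error_rate", "operator": ">=",
+//	 "threshold": 5, "window_minutes": 5, "sustained_minutes": 3,
+//	 "cooldown_minutes": 30, "severity": "P1", "notify_email": true}
+//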
+// PUT /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) UpdateAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + var raw map[string]json.RawMessage + if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + validated, err := validateOpsAlertRulePayload(raw) + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + var rule service.OpsAlertRule + if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + rule.ID = id + rule.Name = validated.Name + rule.MetricType = validated.MetricType + rule.Operator = validated.Operator + rule.Threshold = validated.Threshold + rule.WindowMinutes = validated.WindowMinutes + rule.SustainedMinutes = validated.SustainedMinutes + rule.CooldownMinutes = validated.CooldownMinutes + rule.Severity = validated.Severity + rule.Enabled = validated.Enabled + rule.NotifyEmail = validated.NotifyEmail + + updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, updated) +} + +// DeleteAlertRule deletes an ops alert rule. +// DELETE /api/v1/admin/ops/alert-rules/:id +func (h *OpsHandler) DeleteAlertRule(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + id, err := strconv.ParseInt(c.Param("id"), 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid rule ID") + return + } + + if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, gin.H{"deleted": true}) +} + +// ListAlertEvents lists recent ops alert events. +// GET /api/v1/admin/ops/alert-events +func (h *OpsHandler) ListAlertEvents(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + limit := 100 + if raw := strings.TrimSpace(c.Query("limit")); raw != "" { + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + response.BadRequest(c, "Invalid limit") + return + } + limit = n + } + + filter := &service.OpsAlertEventFilter{ + Limit: limit, + Status: strings.TrimSpace(c.Query("status")), + Severity: strings.TrimSpace(c.Query("severity")), + } + + // Optional global filter support (platform/group/time range). 
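+	// Illustrative query (values made up; platform/group IDs are deployment-specific):
+	//	GET /api/v1/admin/ops/alert-events?limit=50&severity=P1&time_range=24h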
+ if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil { + // Only apply when explicitly provided to avoid surprising default narrowing. + if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" { + filter.StartTime = &startTime + filter.EndTime = &endTime + } + } else { + response.BadRequest(c, err.Error()) + return + } + + events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, events) +} + diff --git a/backend/internal/handler/admin/ops_dashboard_handler.go b/backend/internal/handler/admin/ops_dashboard_handler.go new file mode 100644 index 00000000..2c87f734 --- /dev/null +++ b/backend/internal/handler/admin/ops_dashboard_handler.go @@ -0,0 +1,243 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetDashboardOverview returns vNext ops dashboard overview (raw path). +// GET /api/v1/admin/ops/dashboard/overview +func (h *OpsHandler) GetDashboardOverview(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardThroughputTrend returns throughput time series (raw path). 
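+// Bucket width is derived from the requested window by
+// pickThroughputBucketSeconds below: 60s for windows up to 2h, 300s up to 24h,
+// and 3600s beyond that.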
+// GET /api/v1/admin/ops/dashboard/throughput-trend +func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests). +// GET /api/v1/admin/ops/dashboard/latency-histogram +func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorTrend returns error counts time series (raw path). 
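+// Buckets follow the same sizing policy as the throughput trend
+// (pickThroughputBucketSeconds).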
+// GET /api/v1/admin/ops/dashboard/error-trend +func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime)) + data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +// GetDashboardErrorDistribution returns error distribution by status code (raw path). +// GET /api/v1/admin/ops/dashboard/error-distribution +func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: strings.TrimSpace(c.Query("platform")), + QueryMode: parseOpsQueryMode(c), + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + response.Success(c, data) +} + +func pickThroughputBucketSeconds(window time.Duration) int { + // Keep buckets predictable and avoid huge responses. + switch { + case window <= 2*time.Hour: + return 60 + case window <= 24*time.Hour: + return 300 + default: + return 3600 + } +} + +func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.Query("mode")) + if raw == "" { + // Empty means "use server default" (DB setting ops_query_mode_default). 
+ return "" + } + return service.ParseOpsQueryMode(raw) +} diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go new file mode 100644 index 00000000..bff7426a --- /dev/null +++ b/backend/internal/handler/admin/ops_handler.go @@ -0,0 +1,364 @@ +package admin + +import ( + "errors" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +type OpsHandler struct { + opsService *service.OpsService +} + +func NewOpsHandler(opsService *service.OpsService) *OpsHandler { + return &OpsHandler{opsService: opsService} +} + +// GetErrorLogs lists ops error logs. +// GET /api/v1/admin/ops/errors +func (h *OpsHandler) GetErrorLogs(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + // Ops list can be larger than standard admin tables. + if pageSize > 500 { + pageSize = 500 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsErrorLogFilter{ + Page: page, + PageSize: pageSize, + } + if !startTime.IsZero() { + filter.StartTime = &startTime + } + if !endTime.IsZero() { + filter.EndTime = &endTime + } + + if platform := strings.TrimSpace(c.Query("platform")); platform != "" { + filter.Platform = platform + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if phase := strings.TrimSpace(c.Query("phase")); phase != "" { + filter.Phase = phase + } + if q := strings.TrimSpace(c.Query("q")); q != "" { + filter.Query = q + } + if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" { + parts := strings.Split(statusCodesStr, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts { + p := strings.TrimSpace(part) + if p == "" { + continue + } + n, err := strconv.Atoi(p) + if err != nil || n < 0 { + response.BadRequest(c, "Invalid status_codes") + return + } + out = append(out, n) + } + filter.StatusCodes = out + } + + result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize) +} + +// GetErrorLogByID returns a single error log detail. 
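+// Unknown IDs surface as 404 with code OPS_ERROR_NOT_FOUND (mapped in the
+// service layer).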
+// GET /api/v1/admin/ops/errors/:id +func (h *OpsHandler) GetErrorLogByID(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, detail) +} + +// ListRequestDetails returns a request-level list (success + error) for drill-down. +// GET /api/v1/admin/ops/requests +func (h *OpsHandler) ListRequestDetails(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + page, pageSize := response.ParsePagination(c) + if pageSize > 100 { + pageSize = 100 + } + + startTime, endTime, err := parseOpsTimeRange(c, "1h") + if err != nil { + response.BadRequest(c, err.Error()) + return + } + + filter := &service.OpsRequestDetailFilter{ + Page: page, + PageSize: pageSize, + StartTime: &startTime, + EndTime: &endTime, + } + + filter.Kind = strings.TrimSpace(c.Query("kind")) + filter.Platform = strings.TrimSpace(c.Query("platform")) + filter.Model = strings.TrimSpace(c.Query("model")) + filter.RequestID = strings.TrimSpace(c.Query("request_id")) + filter.Query = strings.TrimSpace(c.Query("q")) + filter.Sort = strings.TrimSpace(c.Query("sort")) + + if v := strings.TrimSpace(c.Query("user_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid user_id") + return + } + filter.UserID = &id + } + if v := strings.TrimSpace(c.Query("api_key_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid api_key_id") + return + } + filter.APIKeyID = &id + } + if v := strings.TrimSpace(c.Query("account_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid account_id") + return + } + filter.AccountID = &id + } + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + filter.GroupID = &id + } + + if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid min_duration_ms") + return + } + filter.MinDurationMs = &parsed + } + if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil || parsed < 0 { + response.BadRequest(c, "Invalid max_duration_ms") + return + } + filter.MaxDurationMs = &parsed + } + + out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter) + if err != nil { + // Invalid sort/kind/platform etc should be a bad request; keep it simple. 
+ if strings.Contains(strings.ToLower(err.Error()), "invalid") { + response.BadRequest(c, err.Error()) + return + } + response.Error(c, http.StatusInternalServerError, "Failed to list request details") + return + } + + response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize) +} + +type opsRetryRequest struct { + Mode string `json:"mode"` + PinnedAccountID *int64 `json:"pinned_account_id"` +} + +// RetryErrorRequest retries a failed request using stored request_body. +// POST /api/v1/admin/ops/errors/:id/retry +func (h *OpsHandler) RetryErrorRequest(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + subject, ok := middleware.GetAuthSubjectFromContext(c) + if !ok || subject.UserID <= 0 { + response.Error(c, http.StatusUnauthorized, "Unauthorized") + return + } + + idStr := strings.TrimSpace(c.Param("id")) + id, err := strconv.ParseInt(idStr, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid error id") + return + } + + req := opsRetryRequest{Mode: service.OpsRetryModeClient} + if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) { + response.BadRequest(c, "Invalid request: "+err.Error()) + return + } + if strings.TrimSpace(req.Mode) == "" { + req.Mode = service.OpsRetryModeClient + } + + result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + response.Success(c, result) +} + +func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) { + startStr := strings.TrimSpace(c.Query("start_time")) + endStr := strings.TrimSpace(c.Query("end_time")) + + parseTS := func(s string) (time.Time, error) { + if s == "" { + return time.Time{}, nil + } + if t, err := time.Parse(time.RFC3339Nano, s); err == nil { + return t, nil + } + return time.Parse(time.RFC3339, s) + } + + start, err := parseTS(startStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + end, err := parseTS(endStr) + if err != nil { + return time.Time{}, time.Time{}, err + } + + // start/end explicitly provided (even partially) + if startStr != "" || endStr != "" { + if end.IsZero() { + end = time.Now() + } + if start.IsZero() { + dur, _ := parseOpsDuration(defaultRange) + start = end.Add(-dur) + } + if start.After(end) { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time") + } + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil + } + + // time_range fallback + tr := strings.TrimSpace(c.Query("time_range")) + if tr == "" { + tr = defaultRange + } + dur, ok := parseOpsDuration(tr) + if !ok { + dur, _ = parseOpsDuration(defaultRange) + } + + end = time.Now() + start = end.Add(-dur) + if end.Sub(start) > 30*24*time.Hour { + return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days") + } + return start, end, nil +} + +func parseOpsDuration(v string) (time.Duration, bool) { + switch strings.TrimSpace(v) { + case "5m": + return 5 * time.Minute, true + case "30m": + return 30 * time.Minute, true + case "1h": + return time.Hour, true + case "6h": + return 6 * time.Hour, true + case "24h": + return 24 * time.Hour, true + default: + return 0, false 
+ } +} diff --git a/backend/internal/handler/admin/ops_realtime_handler.go b/backend/internal/handler/admin/ops_realtime_handler.go new file mode 100644 index 00000000..0c23c13b --- /dev/null +++ b/backend/internal/handler/admin/ops_realtime_handler.go @@ -0,0 +1,120 @@ +package admin + +import ( + "net/http" + "strconv" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account. +// GET /api/v1/admin/ops/concurrency +func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformConcurrencyInfo{}, + "group": map[int64]*service.GroupConcurrencyInfo{}, + "account": map[int64]*service.AccountConcurrencyInfo{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platformFilter := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platform, + "group": group, + "account": account, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} + +// GetAccountAvailability returns account availability statistics. 
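+// When realtime monitoring is disabled it still responds 200 with enabled=false
+// and empty maps (mirroring GetConcurrencyStats above), so the dashboard can
+// render an empty state instead of an error.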
+// GET /api/v1/admin/ops/account-availability +// +// Query params: +// - platform: optional +// - group_id: optional +func (h *OpsHandler) GetAccountAvailability(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + response.Success(c, gin.H{ + "enabled": false, + "platform": map[string]*service.PlatformAvailability{}, + "group": map[int64]*service.GroupAvailability{}, + "account": map[int64]*service.AccountAvailability{}, + "timestamp": time.Now().UTC(), + }) + return + } + + platform := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID) + if err != nil { + response.ErrorFrom(c, err) + return + } + + payload := gin.H{ + "enabled": true, + "platform": platformStats, + "group": groupStats, + "account": accountStats, + } + if collectedAt != nil { + payload["timestamp"] = collectedAt.UTC() + } + response.Success(c, payload) +} diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go new file mode 100644 index 00000000..e76c1b20 --- /dev/null +++ b/backend/internal/handler/admin/ops_settings_handler.go @@ -0,0 +1,103 @@ +package admin + +import ( + "net/http" + + "github.com/Wei-Shaw/sub2api/internal/pkg/response" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +// GetEmailNotificationConfig returns Ops email notification config (DB-backed). +// GET /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get email notification config") + return + } + response.Success(c, cfg) +} + +// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed). +// PUT /api/v1/admin/ops/email-notification/config +func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsEmailNotificationConfigUpdateRequest + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req) + if err != nil { + // Most failures here are validation errors from request payload; treat as 400. 
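+		// (Persistence failures from the settings repository take the same path.)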
+ response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + +// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed). +// GET /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings") + return + } + response.Success(c, cfg) +} + +// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed). +// PUT /api/v1/admin/ops/runtime/alert +func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAlertRuntimeSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + diff --git a/backend/internal/handler/admin/ops_ws_handler.go b/backend/internal/handler/admin/ops_ws_handler.go new file mode 100644 index 00000000..4bbd9055 --- /dev/null +++ b/backend/internal/handler/admin/ops_ws_handler.go @@ -0,0 +1,765 @@ +package admin + +import ( + "context" + "encoding/json" + "log" + "math" + "net" + "net/http" + "net/netip" + "net/url" + "os" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/gorilla/websocket" +) + +type OpsWSProxyConfig struct { + TrustProxy bool + TrustedProxies []netip.Prefix + OriginPolicy string +} + +const ( + envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY" + envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES" + envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY" + envOpsWSMaxConns = "OPS_WS_MAX_CONNS" + envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP" +) + +const ( + OriginPolicyStrict = "strict" + OriginPolicyPermissive = "permissive" +) + +var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv() + +var upgrader = websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return isAllowedOpsWSOrigin(r) + }, + // Subprotocol negotiation: + // - The frontend passes ["sub2api-admin", "jwt."]. + // - We always select "sub2api-admin" so the token is never echoed back in the handshake response. 
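+	// - Illustrative client handshake (assumed frontend code, not part of this patch):
+	//     new WebSocket(url, ["sub2api-admin", "jwt." + adminJWT])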
+ Subprotocols: []string{"sub2api-admin"}, +} + +const ( + qpsWSPushInterval = 2 * time.Second + qpsWSRefreshInterval = 5 * time.Second + qpsWSRequestCountWindow = 1 * time.Minute + + defaultMaxWSConns = 100 + defaultMaxWSConnsPerIP = 20 +) + +var wsConnCount atomic.Int32 +var wsConnCountByIP sync.Map // map[string]*atomic.Int32 + +const qpsWSIdleStopDelay = 30 * time.Second + +const ( + opsWSCloseRealtimeDisabled = 4001 +) + +var qpsWSIdleStopMu sync.Mutex +var qpsWSIdleStopTimer *time.Timer + +func cancelQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopTimer.Stop() + qpsWSIdleStopTimer = nil + } + qpsWSIdleStopMu.Unlock() +} + +func scheduleQPSWSIdleStop() { + qpsWSIdleStopMu.Lock() + if qpsWSIdleStopTimer != nil { + qpsWSIdleStopMu.Unlock() + return + } + qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() { + // Only stop if truly idle at fire time. + if wsConnCount.Load() == 0 { + qpsWSCache.Stop() + } + qpsWSIdleStopMu.Lock() + qpsWSIdleStopTimer = nil + qpsWSIdleStopMu.Unlock() + }) + qpsWSIdleStopMu.Unlock() +} + +type opsWSRuntimeLimits struct { + MaxConns int32 + MaxConnsPerIP int32 +} + +var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv() + +const ( + qpsWSWriteTimeout = 10 * time.Second + qpsWSPongWait = 60 * time.Second + qpsWSPingInterval = 30 * time.Second + + // We don't expect clients to send application messages; we only read to process control frames (Pong/Close). + qpsWSMaxReadBytes = 1024 +) + +type opsWSQPSCache struct { + refreshInterval time.Duration + requestCountWindow time.Duration + + lastUpdatedUnixNano atomic.Int64 + payload atomic.Value // []byte + + opsService *service.OpsService + cancel context.CancelFunc + done chan struct{} + + mu sync.Mutex + running bool +} + +var qpsWSCache = &opsWSQPSCache{ + refreshInterval: qpsWSRefreshInterval, + requestCountWindow: qpsWSRequestCountWindow, +} + +func (c *opsWSQPSCache) start(opsService *service.OpsService) { + if c == nil || opsService == nil { + return + } + + for { + c.mu.Lock() + if c.running { + c.mu.Unlock() + return + } + + // If a previous refresh loop is currently stopping, wait for it to fully exit. + done := c.done + if done != nil { + c.mu.Unlock() + <-done + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() + continue + } + + c.opsService = opsService + ctx, cancel := context.WithCancel(context.Background()) + c.cancel = cancel + c.done = make(chan struct{}) + done = c.done + c.running = true + c.mu.Unlock() + + go func() { + defer close(done) + c.refreshLoop(ctx) + }() + return + } +} + +// Stop stops the background refresh loop. +// It is safe to call multiple times. 
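+// If a previous refresh loop is still draining, Stop blocks until its goroutine has fully exited.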
+func (c *opsWSQPSCache) Stop() { + if c == nil { + return + } + + c.mu.Lock() + if !c.running { + done := c.done + c.mu.Unlock() + if done != nil { + <-done + } + return + } + cancel := c.cancel + c.cancel = nil + c.running = false + c.opsService = nil + done := c.done + c.mu.Unlock() + + if cancel != nil { + cancel() + } + if done != nil { + <-done + } + + c.mu.Lock() + if c.done == done && !c.running { + c.done = nil + } + c.mu.Unlock() +} + +func (c *opsWSQPSCache) refreshLoop(ctx context.Context) { + ticker := time.NewTicker(c.refreshInterval) + defer ticker.Stop() + + c.refresh(ctx) + for { + select { + case <-ticker.C: + c.refresh(ctx) + case <-ctx.Done(): + return + } + } +} + +func (c *opsWSQPSCache) refresh(parentCtx context.Context) { + if c == nil { + return + } + + c.mu.Lock() + opsService := c.opsService + c.mu.Unlock() + if opsService == nil { + return + } + + if parentCtx == nil { + parentCtx = context.Background() + } + ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second) + defer cancel() + + now := time.Now().UTC() + stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now) + if err != nil || stats == nil { + if err != nil { + log.Printf("[OpsWS] refresh: get window stats failed: %v", err) + } + return + } + + requestCount := stats.SuccessCount + stats.ErrorCountTotal + qps := 0.0 + tps := 0.0 + if c.requestCountWindow > 0 { + seconds := c.requestCountWindow.Seconds() + qps = roundTo1DP(float64(requestCount) / seconds) + tps = roundTo1DP(float64(stats.TokenConsumed) / seconds) + } + + payload := gin.H{ + "type": "qps_update", + "timestamp": now.Format(time.RFC3339), + "data": gin.H{ + "qps": qps, + "tps": tps, + "request_count": requestCount, + }, + } + + msg, err := json.Marshal(payload) + if err != nil { + log.Printf("[OpsWS] refresh: marshal payload failed: %v", err) + return + } + + c.payload.Store(msg) + c.lastUpdatedUnixNano.Store(now.UnixNano()) +} + +func roundTo1DP(v float64) float64 { + return math.Round(v*10) / 10 +} + +func (c *opsWSQPSCache) getPayload() []byte { + if c == nil { + return nil + } + if cached, ok := c.payload.Load().([]byte); ok && cached != nil { + return cached + } + return nil +} + +func closeWS(conn *websocket.Conn, code int, reason string) { + if conn == nil { + return + } + msg := websocket.FormatCloseMessage(code, reason) + _ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout)) + _ = conn.Close() +} + +// QPSWSHandler handles realtime QPS push via WebSocket. +// GET /api/v1/admin/ops/ws/qps +func (h *OpsHandler) QPSWSHandler(c *gin.Context) { + clientIP := requestClientIP(c.Request) + + if h == nil || h.opsService == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"}) + return + } + + // If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close + // with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops. + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"}) + return + } + closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled") + return + } + + cancelQPSWSIdleStop() + // Lazily start the background refresh loop so unit tests that never hit the + // websocket route don't spawn goroutines that depend on DB/Redis stubs. 
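+	// start() is idempotent: concurrent handlers share a single refresh loop.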
+ qpsWSCache.start(h.opsService) + + // Reserve a global slot before upgrading the connection to keep the limit strict. + if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) { + log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer func() { + if wsConnCount.Add(-1) == 0 { + scheduleQPSWSIdleStop() + } + }() + + if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" { + if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) { + log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"}) + return + } + defer releaseOpsWSIPSlot(clientIP) + } + + conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) + if err != nil { + log.Printf("[OpsWS] upgrade failed: %v", err) + return + } + + defer func() { + _ = conn.Close() + }() + + handleQPSWebSocket(c.Request.Context(), conn) +} + +func tryAcquireOpsWSTotalSlot(limit int32) bool { + if limit <= 0 { + return true + } + for { + current := wsConnCount.Load() + if current >= limit { + return false + } + if wsConnCount.CompareAndSwap(current, current+1) { + return true + } + } +} + +func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool { + if strings.TrimSpace(clientIP) == "" || limit <= 0 { + return true + } + + v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{}) + counter := v.(*atomic.Int32) + + for { + current := counter.Load() + if current >= limit { + return false + } + if counter.CompareAndSwap(current, current+1) { + return true + } + } +} + +func releaseOpsWSIPSlot(clientIP string) { + if strings.TrimSpace(clientIP) == "" { + return + } + + v, ok := wsConnCountByIP.Load(clientIP) + if !ok { + return + } + counter := v.(*atomic.Int32) + next := counter.Add(-1) + if next <= 0 { + // Best-effort cleanup; safe even if a new slot was acquired concurrently. + wsConnCountByIP.Delete(clientIP) + } +} + +func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) { + if conn == nil { + return + } + + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + var closeOnce sync.Once + closeConn := func() { + closeOnce.Do(func() { + _ = conn.Close() + }) + } + + closeFrameCh := make(chan []byte, 1) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + defer cancel() + + conn.SetReadLimit(qpsWSMaxReadBytes) + if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil { + log.Printf("[OpsWS] set read deadline failed: %v", err) + return + } + conn.SetPongHandler(func(string) error { + return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)) + }) + conn.SetCloseHandler(func(code int, text string) error { + select { + case closeFrameCh <- websocket.FormatCloseMessage(code, text): + default: + } + cancel() + return nil + }) + + for { + _, _, err := conn.ReadMessage() + if err != nil { + if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) { + log.Printf("[OpsWS] read failed: %v", err) + } + return + } + } + }() + + // Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval). + pushTicker := time.NewTicker(qpsWSPushInterval) + defer pushTicker.Stop() + + // Heartbeat ping every 30 seconds. 
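+	// Peers that stop answering pings are dropped once the read deadline (qpsWSPongWait) expires.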
+ pingTicker := time.NewTicker(qpsWSPingInterval) + defer pingTicker.Stop() + + writeWithTimeout := func(messageType int, data []byte) error { + if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil { + return err + } + return conn.WriteMessage(messageType, data) + } + + sendClose := func(closeFrame []byte) { + if closeFrame == nil { + closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "") + } + _ = writeWithTimeout(websocket.CloseMessage, closeFrame) + } + + for { + select { + case <-pushTicker.C: + msg := qpsWSCache.getPayload() + if msg == nil { + continue + } + if err := writeWithTimeout(websocket.TextMessage, msg); err != nil { + log.Printf("[OpsWS] write failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case <-pingTicker.C: + if err := writeWithTimeout(websocket.PingMessage, nil); err != nil { + log.Printf("[OpsWS] ping failed: %v", err) + cancel() + closeConn() + wg.Wait() + return + } + + case closeFrame := <-closeFrameCh: + sendClose(closeFrame) + closeConn() + wg.Wait() + return + + case <-ctx.Done(): + var closeFrame []byte + select { + case closeFrame = <-closeFrameCh: + default: + } + sendClose(closeFrame) + + closeConn() + wg.Wait() + return + } + } +} + +func isAllowedOpsWSOrigin(r *http.Request) bool { + if r == nil { + return false + } + origin := strings.TrimSpace(r.Header.Get("Origin")) + if origin == "" { + switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) { + case OriginPolicyStrict: + return false + case OriginPolicyPermissive, "": + return true + default: + return true + } + } + parsed, err := url.Parse(origin) + if err != nil || parsed.Hostname() == "" { + return false + } + originHost := strings.ToLower(parsed.Hostname()) + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + reqHost := hostWithoutPort(r.Host) + if trustProxyHeaders { + xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host")) + if xfHost != "" { + xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0]) + if xfHost != "" { + reqHost = hostWithoutPort(xfHost) + } + } + } + reqHost = strings.ToLower(reqHost) + if reqHost == "" { + return false + } + return originHost == reqHost +} + +func shouldTrustOpsWSProxyHeaders(r *http.Request) bool { + if r == nil { + return false + } + if !opsWSProxyConfig.TrustProxy { + return false + } + peerIP, ok := requestPeerIP(r) + if !ok { + return false + } + return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies) +} + +func requestPeerIP(r *http.Request) (netip.Addr, bool) { + if r == nil { + return netip.Addr{}, false + } + host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)) + if err != nil { + host = strings.TrimSpace(r.RemoteAddr) + } + host = strings.TrimPrefix(host, "[") + host = strings.TrimSuffix(host, "]") + if host == "" { + return netip.Addr{}, false + } + addr, err := netip.ParseAddr(host) + if err != nil { + return netip.Addr{}, false + } + return addr.Unmap(), true +} + +func requestClientIP(r *http.Request) string { + if r == nil { + return "" + } + + trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) + if trustProxyHeaders { + xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For")) + if xff != "" { + // Use the left-most entry (original client). If multiple proxies add values, they are comma-separated. 
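+			// e.g. "X-Forwarded-For: 203.0.113.7, 10.0.0.2" resolves to 203.0.113.7 (illustrative addresses).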
+ xff = strings.TrimSpace(strings.Split(xff, ",")[0]) + xff = strings.TrimPrefix(xff, "[") + xff = strings.TrimSuffix(xff, "]") + if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() { + return addr.Unmap().String() + } + } + } + + if peer, ok := requestPeerIP(r); ok && peer.IsValid() { + return peer.String() + } + return "" +} + +func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool { + if !addr.IsValid() { + return false + } + for _, p := range trusted { + if p.Contains(addr) { + return true + } + } + return false +} + +func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig { + cfg := OpsWSProxyConfig{ + TrustProxy: true, + TrustedProxies: defaultTrustedProxies(), + OriginPolicy: OriginPolicyPermissive, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" { + if parsed, err := strconv.ParseBool(v); err == nil { + cfg.TrustProxy = parsed + } else { + log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy) + } + } + + if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" { + prefixes, invalid := parseTrustedProxyList(raw) + if len(invalid) > 0 { + log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", ")) + } + cfg.TrustedProxies = prefixes + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" { + normalized := strings.ToLower(v) + switch normalized { + case OriginPolicyStrict, OriginPolicyPermissive: + cfg.OriginPolicy = normalized + default: + log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy) + } + } + + return cfg +} + +func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits { + cfg := opsWSRuntimeLimits{ + MaxConns: defaultMaxWSConns, + MaxConnsPerIP: defaultMaxWSConnsPerIP, + } + + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 { + cfg.MaxConns = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns) + } + } + if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" { + if parsed, err := strconv.Atoi(v); err == nil && parsed >= 0 { + cfg.MaxConnsPerIP = int32(parsed) + } else { + log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP) + } + } + return cfg +} + +func defaultTrustedProxies() []netip.Prefix { + prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128") + return prefixes +} + +func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) { + for _, token := range strings.Split(raw, ",") { + item := strings.TrimSpace(token) + if item == "" { + continue + } + + var ( + p netip.Prefix + err error + ) + if strings.Contains(item, "/") { + p, err = netip.ParsePrefix(item) + } else { + var addr netip.Addr + addr, err = netip.ParseAddr(item) + if err == nil { + addr = addr.Unmap() + bits := 128 + if addr.Is4() { + bits = 32 + } + p = netip.PrefixFrom(addr, bits) + } + } + + if err != nil || !p.IsValid() { + invalid = append(invalid, item) + continue + } + + prefixes = append(prefixes, p.Masked()) + } + return prefixes, invalid +} + +func hostWithoutPort(hostport string) string { + hostport = strings.TrimSpace(hostport) + if hostport == "" { + return "" + } + if host, _, err := net.SplitHostPort(hostport); err == nil { + return host 
+ } + if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") { + return strings.Trim(hostport, "[]") + } + parts := strings.Split(hostport, ":") + return parts[0] +} diff --git a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go index 4c50cedf..6fd53b26 100644 --- a/backend/internal/handler/dto/settings.go +++ b/backend/internal/handler/dto/settings.go @@ -37,6 +37,11 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault string `json:"ops_query_mode_default"` } type PublicSettings struct { diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go new file mode 100644 index 00000000..b3a90c2f --- /dev/null +++ b/backend/internal/handler/ops_error_logger.go @@ -0,0 +1,681 @@ +package handler + +import ( + "bytes" + "context" + "encoding/json" + "log" + "runtime" + "runtime/debug" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + "unicode/utf8" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" +) + +const ( + opsModelKey = "ops_model" + opsStreamKey = "ops_stream" + opsRequestBodyKey = "ops_request_body" + opsAccountIDKey = "ops_account_id" +) + +const ( + opsErrorLogTimeout = 5 * time.Second + opsErrorLogDrainTimeout = 10 * time.Second + + opsErrorLogMinWorkerCount = 4 + opsErrorLogMaxWorkerCount = 32 + + opsErrorLogQueueSizePerWorker = 128 + opsErrorLogMinQueueSize = 256 + opsErrorLogMaxQueueSize = 8192 +) + +type opsErrorLogJob struct { + ops *service.OpsService + entry *service.OpsInsertErrorLogInput + requestBody []byte +} + +var ( + opsErrorLogOnce sync.Once + opsErrorLogQueue chan opsErrorLogJob + + opsErrorLogStopOnce sync.Once + opsErrorLogWorkersWg sync.WaitGroup + opsErrorLogMu sync.RWMutex + opsErrorLogStopping bool + opsErrorLogQueueLen atomic.Int64 + opsErrorLogEnqueued atomic.Int64 + opsErrorLogDropped atomic.Int64 + opsErrorLogProcessed atomic.Int64 + + opsErrorLogLastDropLogAt atomic.Int64 + + opsErrorLogShutdownCh = make(chan struct{}) + opsErrorLogShutdownOnce sync.Once + opsErrorLogDrained atomic.Bool +) + +func startOpsErrorLogWorkers() { + opsErrorLogMu.Lock() + defer opsErrorLogMu.Unlock() + + if opsErrorLogStopping { + return + } + + workerCount, queueSize := opsErrorLogConfig() + opsErrorLogQueue = make(chan opsErrorLogJob, queueSize) + opsErrorLogQueueLen.Store(0) + + opsErrorLogWorkersWg.Add(workerCount) + for i := 0; i < workerCount; i++ { + go func() { + defer opsErrorLogWorkersWg.Done() + for job := range opsErrorLogQueue { + opsErrorLogQueueLen.Add(-1) + if job.ops == nil || job.entry == nil { + continue + } + func() { + defer func() { + if r := recover(); r != nil { + log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack()) + } + }() + ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout) + _ = job.ops.RecordError(ctx, job.entry, job.requestBody) + cancel() + opsErrorLogProcessed.Add(1) + }() + } + }() + } +} + +func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) { + if ops == nil || entry 
== nil { + return + } + select { + case <-opsErrorLogShutdownCh: + return + default: + } + + opsErrorLogMu.RLock() + stopping := opsErrorLogStopping + opsErrorLogMu.RUnlock() + if stopping { + return + } + + opsErrorLogOnce.Do(startOpsErrorLogWorkers) + + opsErrorLogMu.RLock() + defer opsErrorLogMu.RUnlock() + if opsErrorLogStopping || opsErrorLogQueue == nil { + return + } + + select { + case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}: + opsErrorLogQueueLen.Add(1) + opsErrorLogEnqueued.Add(1) + default: + // Queue is full; drop to avoid blocking request handling. + opsErrorLogDropped.Add(1) + maybeLogOpsErrorLogDrop() + } +} + +func StopOpsErrorLogWorkers() bool { + opsErrorLogStopOnce.Do(func() { + opsErrorLogShutdownOnce.Do(func() { + close(opsErrorLogShutdownCh) + }) + opsErrorLogDrained.Store(stopOpsErrorLogWorkers()) + }) + return opsErrorLogDrained.Load() +} + +func stopOpsErrorLogWorkers() bool { + opsErrorLogMu.Lock() + opsErrorLogStopping = true + ch := opsErrorLogQueue + if ch != nil { + close(ch) + } + opsErrorLogQueue = nil + opsErrorLogMu.Unlock() + + if ch == nil { + opsErrorLogQueueLen.Store(0) + return true + } + + done := make(chan struct{}) + go func() { + opsErrorLogWorkersWg.Wait() + close(done) + }() + + select { + case <-done: + opsErrorLogQueueLen.Store(0) + return true + case <-time.After(opsErrorLogDrainTimeout): + return false + } +} + +func OpsErrorLogQueueLength() int64 { + return opsErrorLogQueueLen.Load() +} + +func OpsErrorLogQueueCapacity() int { + opsErrorLogMu.RLock() + ch := opsErrorLogQueue + opsErrorLogMu.RUnlock() + if ch == nil { + return 0 + } + return cap(ch) +} + +func OpsErrorLogDroppedTotal() int64 { + return opsErrorLogDropped.Load() +} + +func OpsErrorLogEnqueuedTotal() int64 { + return opsErrorLogEnqueued.Load() +} + +func OpsErrorLogProcessedTotal() int64 { + return opsErrorLogProcessed.Load() +} + +func maybeLogOpsErrorLogDrop() { + now := time.Now().Unix() + + for { + last := opsErrorLogLastDropLogAt.Load() + if last != 0 && now-last < 60 { + return + } + if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) { + break + } + } + + queued := opsErrorLogQueueLen.Load() + queueCap := OpsErrorLogQueueCapacity() + + log.Printf( + "[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)", + queued, + queueCap, + opsErrorLogEnqueued.Load(), + opsErrorLogDropped.Load(), + opsErrorLogProcessed.Load(), + ) +} + +func opsErrorLogConfig() (workerCount int, queueSize int) { + workerCount = runtime.GOMAXPROCS(0) * 2 + if workerCount < opsErrorLogMinWorkerCount { + workerCount = opsErrorLogMinWorkerCount + } + if workerCount > opsErrorLogMaxWorkerCount { + workerCount = opsErrorLogMaxWorkerCount + } + + queueSize = workerCount * opsErrorLogQueueSizePerWorker + if queueSize < opsErrorLogMinQueueSize { + queueSize = opsErrorLogMinQueueSize + } + if queueSize > opsErrorLogMaxQueueSize { + queueSize = opsErrorLogMaxQueueSize + } + + return workerCount, queueSize +} + +func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) { + if c == nil { + return + } + c.Set(opsModelKey, model) + c.Set(opsStreamKey, stream) + if len(requestBody) > 0 { + c.Set(opsRequestBodyKey, requestBody) + } +} + +func setOpsSelectedAccount(c *gin.Context, accountID int64) { + if c == nil || accountID <= 0 { + return + } + c.Set(opsAccountIDKey, accountID) +} + +type opsCaptureWriter struct { + gin.ResponseWriter + limit int + buf bytes.Buffer +} + 
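+// Write passes bytes through to the underlying writer and, only once the response
+// status is >= 400, mirrors up to `limit` bytes into the capture buffer.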
+func (w *opsCaptureWriter) Write(b []byte) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(b) > remaining { + _, _ = w.buf.Write(b[:remaining]) + } else { + _, _ = w.buf.Write(b) + } + } + return w.ResponseWriter.Write(b) +} + +func (w *opsCaptureWriter) WriteString(s string) (int, error) { + if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit { + remaining := w.limit - w.buf.Len() + if len(s) > remaining { + _, _ = w.buf.WriteString(s[:remaining]) + } else { + _, _ = w.buf.WriteString(s) + } + } + return w.ResponseWriter.WriteString(s) +} + +// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs. +// +// Notes: +// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic. +// - Streaming errors after the response has started (SSE) may still need explicit logging. +func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { + return func(c *gin.Context) { + w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024} + c.Writer = w + c.Next() + + status := c.Writer.Status() + if status < 400 { + return + } + if ops == nil { + return + } + if !ops.IsMonitoringEnabled(c.Request.Context()) { + return + } + + body := w.buf.Bytes() + parsed := parseOpsErrorResponse(body) + + apiKey, _ := middleware2.GetAPIKeyFromContext(c) + + clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string) + + model, _ := c.Get(opsModelKey) + streamV, _ := c.Get(opsStreamKey) + accountIDV, _ := c.Get(opsAccountIDKey) + + var modelName string + if s, ok := model.(string); ok { + modelName = s + } + stream := false + if b, ok := streamV.(bool); ok { + stream = b + } + var accountID *int64 + if v, ok := accountIDV.(int64); ok && v > 0 { + accountID = &v + } + + fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path) + platform := resolveOpsPlatform(apiKey, fallbackPlatform) + + requestID := c.Writer.Header().Get("X-Request-Id") + if requestID == "" { + requestID = c.Writer.Header().Get("x-request-id") + } + + phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code) + isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message) + + errorOwner := classifyOpsErrorOwner(phase, parsed.Message) + errorSource := classifyOpsErrorSource(phase, parsed.Message) + + entry := &service.OpsInsertErrorLogInput{ + RequestID: requestID, + ClientRequestID: clientRequestID, + + AccountID: accountID, + Platform: platform, + Model: modelName, + RequestPath: func() string { + if c.Request != nil && c.Request.URL != nil { + return c.Request.URL.Path + } + return "" + }(), + Stream: stream, + UserAgent: c.GetHeader("User-Agent"), + + ErrorPhase: phase, + ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code), + Severity: classifyOpsSeverity(parsed.ErrorType, status), + StatusCode: status, + IsBusinessLimited: isBusinessLimited, + + ErrorMessage: parsed.Message, + // Keep the full captured error body (capture is already capped at 64KB) so the + // service layer can sanitize JSON before truncating for storage. 
+ ErrorBody: string(body), + ErrorSource: errorSource, + ErrorOwner: errorOwner, + + IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status), + RetryCount: 0, + CreatedAt: time.Now(), + } + + if apiKey != nil { + entry.APIKeyID = &apiKey.ID + if apiKey.User != nil { + entry.UserID = &apiKey.User.ID + } + if apiKey.GroupID != nil { + entry.GroupID = apiKey.GroupID + } + // Prefer group platform if present (more stable than inferring from path). + if apiKey.Group != nil && apiKey.Group.Platform != "" { + entry.Platform = apiKey.Group.Platform + } + } + + var clientIP string + if ip := strings.TrimSpace(c.ClientIP()); ip != "" { + clientIP = ip + entry.ClientIP = &clientIP + } + + var requestBody []byte + if v, ok := c.Get(opsRequestBodyKey); ok { + if b, ok := v.([]byte); ok && len(b) > 0 { + requestBody = b + } + } + // Persist only a minimal, whitelisted set of request headers to improve retry fidelity. + // Do NOT store Authorization/Cookie/etc. + entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) + + enqueueOpsErrorLog(ops, entry, requestBody) + } +} + +var opsRetryRequestHeaderAllowlist = []string{ + "anthropic-beta", + "anthropic-version", +} + +func extractOpsRetryRequestHeaders(c *gin.Context) *string { + if c == nil || c.Request == nil { + return nil + } + + headers := make(map[string]string, 4) + for _, key := range opsRetryRequestHeaderAllowlist { + v := strings.TrimSpace(c.GetHeader(key)) + if v == "" { + continue + } + // Keep headers small even if a client sends something unexpected. + headers[key] = truncateString(v, 512) + } + if len(headers) == 0 { + return nil + } + + raw, err := json.Marshal(headers) + if err != nil { + return nil + } + s := string(raw) + return &s +} + +type parsedOpsError struct { + ErrorType string + Message string + Code string +} + +func parseOpsErrorResponse(body []byte) parsedOpsError { + if len(body) == 0 { + return parsedOpsError{} + } + + // Fast path: attempt to decode into a generic map. + var m map[string]any + if err := json.Unmarshal(body, &m); err != nil { + return parsedOpsError{Message: truncateString(string(body), 1024)} + } + + // Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } } + if errObj, ok := m["error"].(map[string]any); ok { + t, _ := errObj["type"].(string) + msg, _ := errObj["message"].(string) + // Gemini googleError also uses "error": { code, message, status } + if msg == "" { + if v, ok := errObj["message"]; ok { + msg, _ = v.(string) + } + } + if t == "" { + // Gemini error does not have "type" field. + t = "api_error" + } + // For gemini error, capture numeric code as string for business-limited mapping if needed. + var code string + if v, ok := errObj["code"]; ok { + switch n := v.(type) { + case float64: + code = strconvItoa(int(n)) + case int: + code = strconvItoa(n) + } + } + return parsedOpsError{ErrorType: t, Message: msg, Code: code} + } + + // APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." 
} + code, _ := m["code"].(string) + msg, _ := m["message"].(string) + if code != "" || msg != "" { + return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code} + } + + return parsedOpsError{Message: truncateString(string(body), 1024)} +} + +func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string { + if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" { + return apiKey.Group.Platform + } + return fallback +} + +func guessPlatformFromPath(path string) string { + p := strings.ToLower(path) + switch { + case strings.HasPrefix(p, "/antigravity/"): + return service.PlatformAntigravity + case strings.HasPrefix(p, "/v1beta/"): + return service.PlatformGemini + case strings.Contains(p, "/responses"): + return service.PlatformOpenAI + default: + return "" + } +} + +func normalizeOpsErrorType(errType string, code string) string { + if errType != "" { + return errType + } + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE": + return "billing_error" + case "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "subscription_error" + default: + return "api_error" + } +} + +func classifyOpsPhase(errType, message, code string) string { + msg := strings.ToLower(message) + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return "billing" + } + + switch errType { + case "authentication_error": + return "auth" + case "billing_error", "subscription_error": + return "billing" + case "rate_limit_error": + if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") { + return "concurrency" + } + return "upstream" + case "invalid_request_error": + return "response" + case "upstream_error", "overloaded_error": + return "upstream" + case "api_error": + if strings.Contains(msg, "no available accounts") { + return "scheduling" + } + return "internal" + default: + return "internal" + } +} + +func classifyOpsSeverity(errType string, status int) string { + switch errType { + case "invalid_request_error", "authentication_error", "billing_error", "subscription_error": + return "P3" + } + if status >= 500 { + return "P1" + } + if status == 429 { + return "P1" + } + if status >= 400 { + return "P2" + } + return "P3" +} + +func classifyOpsIsRetryable(errType string, statusCode int) bool { + switch errType { + case "authentication_error", "invalid_request_error": + return false + case "timeout_error": + return true + case "rate_limit_error": + // May be transient (upstream or queue); retry can help. + return true + case "billing_error", "subscription_error": + return false + case "upstream_error", "overloaded_error": + return statusCode >= 500 || statusCode == 429 || statusCode == 529 + default: + return statusCode >= 500 + } +} + +func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool { + switch strings.TrimSpace(code) { + case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID": + return true + } + if phase == "billing" || phase == "concurrency" { + // SLA/错误率排除“用户级业务限制” + return true + } + // Avoid treating upstream rate limits as business-limited. 
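+	// i.e. such errors still count toward SLA/error-rate statistics instead of being
+	// excluded as user-level business limits.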
+ if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") { + return false + } + _ = status + return false +} + +func classifyOpsErrorOwner(phase string, message string) string { + switch phase { + case "upstream", "network": + return "provider" + case "billing", "concurrency", "auth", "response": + return "client" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "provider" + } + return "sub2api" + } +} + +func classifyOpsErrorSource(phase string, message string) string { + switch phase { + case "upstream": + return "upstream_http" + case "network": + return "upstream_network" + case "billing": + return "billing" + case "concurrency": + return "concurrency" + default: + if strings.Contains(strings.ToLower(message), "upstream") { + return "upstream_http" + } + return "internal" + } +} + +func truncateString(s string, max int) string { + if max <= 0 { + return "" + } + if len(s) <= max { + return s + } + cut := s[:max] + // Ensure truncation does not split multi-byte characters. + for len(cut) > 0 && !utf8.ValidString(cut) { + cut = cut[:len(cut)-1] + } + return cut +} + +func strconvItoa(v int) string { + return strconv.Itoa(v) +} diff --git a/backend/internal/handler/wire.go b/backend/internal/handler/wire.go index 1695f8a9..e5d8d077 100644 --- a/backend/internal/handler/wire.go +++ b/backend/internal/handler/wire.go @@ -20,6 +20,7 @@ func ProvideAdminHandlers( proxyHandler *admin.ProxyHandler, redeemHandler *admin.RedeemHandler, settingHandler *admin.SettingHandler, + opsHandler *admin.OpsHandler, systemHandler *admin.SystemHandler, subscriptionHandler *admin.SubscriptionHandler, usageHandler *admin.UsageHandler, @@ -37,6 +38,7 @@ func ProvideAdminHandlers( Proxy: proxyHandler, Redeem: redeemHandler, Setting: settingHandler, + Ops: opsHandler, System: systemHandler, Subscription: subscriptionHandler, Usage: usageHandler, @@ -106,6 +108,7 @@ var ProviderSet = wire.NewSet( admin.NewProxyHandler, admin.NewRedeemHandler, admin.NewSettingHandler, + admin.NewOpsHandler, ProvideSystemHandler, admin.NewSubscriptionHandler, admin.NewUsageHandler, diff --git a/backend/internal/server/middleware/admin_auth.go b/backend/internal/server/middleware/admin_auth.go index e02a7b0a..8f30107c 100644 --- a/backend/internal/server/middleware/admin_auth.go +++ b/backend/internal/server/middleware/admin_auth.go @@ -30,6 +30,20 @@ func adminAuth( settingService *service.SettingService, ) gin.HandlerFunc { return func(c *gin.Context) { + // WebSocket upgrade requests cannot set Authorization headers in browsers. + // For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via + // Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item: + // Sec-WebSocket-Protocol: sub2api-admin, jwt. 
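+		// where "jwt." is immediately followed by the admin JWT (illustrative:
+		//   Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>).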
+ if isWebSocketUpgradeRequest(c) { + if token := extractJWTFromWebSocketSubprotocol(c); token != "" { + if !validateJWTForAdmin(c, token, authService, userService) { + return + } + c.Next() + return + } + } + // 检查 x-api-key header(Admin API Key 认证) apiKey := c.GetHeader("x-api-key") if apiKey != "" { @@ -58,6 +72,44 @@ func adminAuth( } } +func isWebSocketUpgradeRequest(c *gin.Context) bool { + if c == nil || c.Request == nil { + return false + } + // RFC6455 handshake uses: + // Connection: Upgrade + // Upgrade: websocket + upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade"))) + if upgrade != "websocket" { + return false + } + connection := strings.ToLower(c.GetHeader("Connection")) + return strings.Contains(connection, "upgrade") +} + +func extractJWTFromWebSocketSubprotocol(c *gin.Context) string { + if c == nil { + return "" + } + raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol")) + if raw == "" { + return "" + } + + // The header is a comma-separated list of tokens. We reserve the prefix "jwt." + // for carrying the admin JWT. + for _, part := range strings.Split(raw, ",") { + p := strings.TrimSpace(part) + if strings.HasPrefix(p, "jwt.") { + token := strings.TrimSpace(strings.TrimPrefix(p, "jwt.")) + if token != "" { + return token + } + } + } + return "" +} + // validateAdminAPIKey 验证管理员 API Key func validateAdminAPIKey( c *gin.Context, diff --git a/backend/internal/server/middleware/client_request_id.go b/backend/internal/server/middleware/client_request_id.go new file mode 100644 index 00000000..60d444ce --- /dev/null +++ b/backend/internal/server/middleware/client_request_id.go @@ -0,0 +1,31 @@ +package middleware + +import ( + "context" + + "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +// ClientRequestID ensures every request has a unique client_request_id in request.Context(). +// +// This is used by the Ops monitoring module for end-to-end request correlation. +func ClientRequestID() gin.HandlerFunc { + return func(c *gin.Context) { + if c.Request == nil { + c.Next() + return + } + + if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil { + c.Next() + return + } + + id := uuid.New().String() + c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id)) + c.Next() + } +} + diff --git a/backend/internal/server/middleware/ws_query_token_auth.go b/backend/internal/server/middleware/ws_query_token_auth.go new file mode 100644 index 00000000..3b8d086a --- /dev/null +++ b/backend/internal/server/middleware/ws_query_token_auth.go @@ -0,0 +1,54 @@ +package middleware + +import ( + "net/http" + "strings" + + "github.com/gin-gonic/gin" +) + +// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header +// for WebSocket handshake requests on a small allow-list of endpoints. +// +// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes +// are protected by header-based auth. This keeps the token support scoped to WS only. +func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc { + return func(c *gin.Context) { + if c == nil || c.Request == nil { + if c != nil { + c.Next() + } + return + } + + // Only GET websocket upgrades. + if c.Request.Method != http.MethodGet { + c.Next() + return + } + if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") { + c.Next() + return + } + + // If caller already supplied auth headers, don't override. 
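+		// (Authorization carries the admin JWT; x-api-key carries an admin API key.)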
+ if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" { + c.Next() + return + } + + // Allow-list ops websocket endpoints. + path := strings.TrimSpace(c.Request.URL.Path) + if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") { + c.Next() + return + } + + token := strings.TrimSpace(c.Query("token")) + if token != "" { + c.Request.Header.Set("Authorization", "Bearer "+token) + } + + c.Next() + } +} From e0d12b46d895dd9d9fa2d3bda35c40630398ae19 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:55:12 +0800 Subject: [PATCH 06/53] =?UTF-8?q?feat(=E8=B7=AF=E7=94=B1):=20=E9=9B=86?= =?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E8=B7=AF=E7=94=B1?= =?UTF-8?q?=E5=88=B0=E6=9C=8D=E5=8A=A1=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新路由器注册 ops 监控路由 - 添加 ops 管理路由(dashboard, alerts, realtime, settings, ws) - 更新 gateway 路由支持请求追踪 - 集成 ops 服务到 HTTP 服务器 --- backend/internal/server/http.go | 3 +- backend/internal/server/router.go | 8 +++- backend/internal/server/routes/admin.go | 51 +++++++++++++++++++++++ backend/internal/server/routes/gateway.go | 13 +++++- 4 files changed, 71 insertions(+), 4 deletions(-) diff --git a/backend/internal/server/http.go b/backend/internal/server/http.go index a8740ecc..7b273771 100644 --- a/backend/internal/server/http.go +++ b/backend/internal/server/http.go @@ -30,6 +30,7 @@ func ProvideRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, ) *gin.Engine { if cfg.Server.Mode == "release" { gin.SetMode(gin.ReleaseMode) @@ -47,7 +48,7 @@ func ProvideRouter( } } - return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg) + return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg) } // ProvideHTTPServer 提供 HTTP 服务器 diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 15a1b325..85df99bd 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -20,10 +20,13 @@ func SetupRouter( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, ) *gin.Engine { // 应用中间件 r.Use(middleware2.Logger()) + // WebSocket handshake auth helper (token via query param, WS endpoints only). 
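+	// Registered globally so it runs before the per-group admin auth middleware; it only
+	// touches GET upgrade requests under /api/v1/admin/ops/ws/ (see the middleware above).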
+ r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket()) r.Use(middleware2.CORS(cfg.CORS)) r.Use(middleware2.SecurityHeaders(cfg.Security.CSP)) @@ -33,7 +36,7 @@ func SetupRouter( } // 注册路由 - registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg) + registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg) return r } @@ -47,6 +50,7 @@ func registerRoutes( apiKeyAuth middleware2.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, ) { // 通用路由(健康检查、状态等) @@ -59,5 +63,5 @@ func registerRoutes( routes.RegisterAuthRoutes(v1, h, jwtAuth) routes.RegisterUserRoutes(v1, h, jwtAuth) routes.RegisterAdminRoutes(v1, h, adminAuth) - routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg) + routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg) } diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index 663c2d02..e69b1eb8 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -47,6 +47,9 @@ func RegisterAdminRoutes( // 系统设置 registerSettingsRoutes(admin, h) + // 运维监控(Ops) + registerOpsRoutes(admin, h) + // 系统管理 registerSystemRoutes(admin, h) @@ -61,6 +64,54 @@ func RegisterAdminRoutes( } } +func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { + ops := admin.Group("/ops") + { + // Realtime ops signals + ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats) + ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability) + + // Alerts (rules + events) + ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules) + ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule) + ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule) + ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule) + ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents) + + // Email notification config (DB-backed) + ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig) + ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig) + + // Runtime settings (DB-backed) + runtime := ops.Group("/runtime") + { + runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings) + runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings) + } + + // WebSocket realtime (QPS/TPS) + ws := ops.Group("/ws") + { + ws.GET("/qps", h.Admin.Ops.QPSWSHandler) + } + + // Error logs (MVP-1) + ops.GET("/errors", h.Admin.Ops.GetErrorLogs) + ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID) + ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest) + + // Request drilldown (success + error) + ops.GET("/requests", h.Admin.Ops.ListRequestDetails) + + // Dashboard (vNext - raw path for MVP) + ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview) + ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend) + ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram) + ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend) + ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution) + } +} + func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) { dashboard := admin.Group("/dashboard") { diff --git a/backend/internal/server/routes/gateway.go b/backend/internal/server/routes/gateway.go index 0b62185e..bf019ce3 100644 
--- a/backend/internal/server/routes/gateway.go +++ b/backend/internal/server/routes/gateway.go @@ -16,13 +16,18 @@ func RegisterGatewayRoutes( apiKeyAuth middleware.APIKeyAuthMiddleware, apiKeyService *service.APIKeyService, subscriptionService *service.SubscriptionService, + opsService *service.OpsService, cfg *config.Config, ) { bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize) + clientRequestID := middleware.ClientRequestID() + opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService) // API网关(Claude API兼容) gateway := r.Group("/v1") gateway.Use(bodyLimit) + gateway.Use(clientRequestID) + gateway.Use(opsErrorLogger) gateway.Use(gin.HandlerFunc(apiKeyAuth)) { gateway.POST("/messages", h.Gateway.Messages) @@ -36,6 +41,8 @@ func RegisterGatewayRoutes( // Gemini 原生 API 兼容层(Gemini SDK/CLI 直连) gemini := r.Group("/v1beta") gemini.Use(bodyLimit) + gemini.Use(clientRequestID) + gemini.Use(opsErrorLogger) gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { gemini.GET("/models", h.Gateway.GeminiV1BetaListModels) @@ -45,7 +52,7 @@ func RegisterGatewayRoutes( } // OpenAI Responses API(不带v1前缀的别名) - r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) + r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses) // Antigravity 模型列表 r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels) @@ -53,6 +60,8 @@ func RegisterGatewayRoutes( // Antigravity 专用路由(仅使用 antigravity 账户,不混合调度) antigravityV1 := r.Group("/antigravity/v1") antigravityV1.Use(bodyLimit) + antigravityV1.Use(clientRequestID) + antigravityV1.Use(opsErrorLogger) antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1.Use(gin.HandlerFunc(apiKeyAuth)) { @@ -64,6 +73,8 @@ func RegisterGatewayRoutes( antigravityV1Beta := r.Group("/antigravity/v1beta") antigravityV1Beta.Use(bodyLimit) + antigravityV1Beta.Use(clientRequestID) + antigravityV1Beta.Use(opsErrorLogger) antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity)) antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg)) { From d55dd56fd22732014738dcf4f91d740c17ba016c Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:55:52 +0800 Subject: [PATCH 07/53] =?UTF-8?q?feat(=E4=BE=9D=E8=B5=96=E6=B3=A8=E5=85=A5?= =?UTF-8?q?):=20=E9=9B=86=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?= =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=B3=A8=E5=85=A5=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新 wire.go 添加 ops 服务依赖注入提供者 - 重新生成 wire_gen.go 包含完整的依赖注入图 --- backend/cmd/server/wire.go | 28 ++++++++++++++ backend/cmd/server/wire_gen.go | 71 +++++++++++++++++++++++++--------- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/backend/cmd/server/wire.go b/backend/cmd/server/wire.go index ff6ab4e6..11c202f0 100644 --- a/backend/cmd/server/wire.go +++ b/backend/cmd/server/wire.go @@ -62,6 +62,10 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, tokenRefresh *service.TokenRefreshService, pricing 
*service.PricingService, emailQueue *service.EmailQueueService, @@ -80,6 +84,30 @@ func provideCleanup( name string fn func() error }{ + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go index 768254f9..2a254fd6 100644 --- a/backend/cmd/server/wire_gen.go +++ b/backend/cmd/server/wire_gen.go @@ -87,6 +87,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { geminiOAuthClient := repository.NewGeminiOAuthClient(configConfig) geminiCliCodeAssistClient := repository.NewGeminiCliCodeAssistClient() geminiOAuthService := service.NewGeminiOAuthService(proxyRepository, geminiOAuthClient, geminiCliCodeAssistClient, configConfig) + antigravityOAuthService := service.NewAntigravityOAuthService(proxyRepository) geminiQuotaService := service.NewGeminiQuotaService(configConfig, settingRepository) tempUnschedCache := repository.NewTempUnschedCache(redisClient) rateLimitService := service.NewRateLimitService(accountRepository, usageLogRepository, configConfig, geminiQuotaService, tempUnschedCache) @@ -97,13 +98,12 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { geminiTokenCache := repository.NewGeminiTokenCache(redisClient) geminiTokenProvider := service.NewGeminiTokenProvider(accountRepository, geminiTokenCache, geminiOAuthService) gatewayCache := repository.NewGatewayCache(redisClient) - antigravityOAuthService := service.NewAntigravityOAuthService(proxyRepository) antigravityTokenProvider := service.NewAntigravityTokenProvider(accountRepository, geminiTokenCache, antigravityOAuthService) httpUpstream := repository.NewHTTPUpstream(configConfig) antigravityGatewayService := service.NewAntigravityGatewayService(accountRepository, gatewayCache, antigravityTokenProvider, rateLimitService, httpUpstream, settingService) accountTestService := service.NewAccountTestService(accountRepository, geminiTokenProvider, antigravityGatewayService, httpUpstream, configConfig) concurrencyCache := repository.ProvideConcurrencyCache(redisClient, configConfig) - concurrencyService := service.NewConcurrencyService(concurrencyCache) + concurrencyService := service.ProvideConcurrencyService(concurrencyCache, accountRepository, configConfig) crsSyncService := service.NewCRSSyncService(accountRepository, proxyRepository, oAuthService, openAIOAuthService, geminiOAuthService, configConfig) accountHandler := admin.NewAccountHandler(adminService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, rateLimitService, accountUsageService, accountTestService, concurrencyService, crsSyncService) oAuthHandler := admin.NewOAuthHandler(oAuthService) @@ -113,18 +113,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { proxyHandler := admin.NewProxyHandler(adminService) adminRedeemHandler := admin.NewRedeemHandler(adminService) settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService) - updateCache := 
repository.NewUpdateCache(redisClient) - gitHubReleaseClient := repository.NewGitHubReleaseClient() - serviceBuildInfo := provideServiceBuildInfo(buildInfo) - updateService := service.ProvideUpdateService(updateCache, gitHubReleaseClient, serviceBuildInfo) - systemHandler := handler.ProvideSystemHandler(updateService) - adminSubscriptionHandler := admin.NewSubscriptionHandler(subscriptionService) - adminUsageHandler := admin.NewUsageHandler(usageService, apiKeyService, adminService) - userAttributeDefinitionRepository := repository.NewUserAttributeDefinitionRepository(client) - userAttributeValueRepository := repository.NewUserAttributeValueRepository(client) - userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository) - userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService) - adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) + opsRepository := repository.NewOpsRepository(db) pricingRemoteClient := repository.NewPricingRemoteClient(configConfig) pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) if err != nil { @@ -136,19 +125,37 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { timingWheelService := service.ProvideTimingWheelService() deferredService := service.ProvideDeferredService(accountRepository, timingWheelService) gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService) - geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) - gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig) openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService) + geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig) + opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService) + opsHandler := admin.NewOpsHandler(opsService) + updateCache := repository.NewUpdateCache(redisClient) + gitHubReleaseClient := repository.NewGitHubReleaseClient() + serviceBuildInfo := provideServiceBuildInfo(buildInfo) + updateService := service.ProvideUpdateService(updateCache, gitHubReleaseClient, serviceBuildInfo) + systemHandler := handler.ProvideSystemHandler(updateService) + adminSubscriptionHandler := admin.NewSubscriptionHandler(subscriptionService) + 
adminUsageHandler := admin.NewUsageHandler(usageService, apiKeyService, adminService) + userAttributeDefinitionRepository := repository.NewUserAttributeDefinitionRepository(client) + userAttributeValueRepository := repository.NewUserAttributeValueRepository(client) + userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository) + userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService) + adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler) + gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig) openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig) handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo) handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler) jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService) adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService) apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig) - engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService) + engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService) httpServer := server.ProvideHTTPServer(configConfig, engine) + opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, db, redisClient, configConfig) + opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig) + opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig) + opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig) tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig) - v := provideCleanup(client, redisClient, tokenRefreshService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) + v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, tokenRefreshService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) application := &Application{ Server: httpServer, Cleanup: v, @@ -173,6 +180,10 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo { func provideCleanup( entClient *ent.Client, rdb *redis.Client, + opsMetricsCollector *service.OpsMetricsCollector, + opsAggregation *service.OpsAggregationService, + 
opsAlertEvaluator *service.OpsAlertEvaluatorService, + opsCleanup *service.OpsCleanupService, tokenRefresh *service.TokenRefreshService, pricing *service.PricingService, emailQueue *service.EmailQueueService, @@ -190,6 +201,30 @@ func provideCleanup( name string fn func() error }{ + {"OpsCleanupService", func() error { + if opsCleanup != nil { + opsCleanup.Stop() + } + return nil + }}, + {"OpsAlertEvaluatorService", func() error { + if opsAlertEvaluator != nil { + opsAlertEvaluator.Stop() + } + return nil + }}, + {"OpsAggregationService", func() error { + if opsAggregation != nil { + opsAggregation.Stop() + } + return nil + }}, + {"OpsMetricsCollector", func() error { + if opsMetricsCollector != nil { + opsMetricsCollector.Stop() + } + return nil + }}, {"TokenRefreshService", func() error { tokenRefresh.Stop() return nil From fcdf839b6bb6defd344d26d07d8597110019b958 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:56:37 +0800 Subject: [PATCH 08/53] =?UTF-8?q?feat(=E7=BD=91=E5=85=B3):=20=E9=9B=86?= =?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=88=B0=20API=20?= =?UTF-8?q?=E7=BD=91=E5=85=B3=E5=A4=84=E7=90=86=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在 gateway_handler 中添加请求监控和错误追踪 - 在 openai_gateway_handler 中集成 ops 指标采集 - 在 gemini_v1beta_handler 中集成 ops 指标采集 - 更新 handler 基类支持 ops 错误日志记录 --- backend/internal/handler/gateway_handler.go | 78 ++++++++++++------- .../internal/handler/gemini_v1beta_handler.go | 41 ++++++---- backend/internal/handler/handler.go | 1 + .../handler/openai_gateway_handler.go | 45 +++++++---- 4 files changed, 112 insertions(+), 53 deletions(-) diff --git a/backend/internal/handler/gateway_handler.go b/backend/internal/handler/gateway_handler.go index de3cbad9..7d1eab28 100644 --- a/backend/internal/handler/gateway_handler.go +++ b/backend/internal/handler/gateway_handler.go @@ -88,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") @@ -96,6 +98,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) { reqModel := parsedReq.Model reqStream := parsedReq.Stream + setOpsRequestContext(c, reqModel, reqStream, body) + // 验证 model 必填 if reqModel == "" { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required") @@ -111,6 +115,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 0. 检查wait队列是否已满 maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -118,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // 确保在函数退出时减少wait计数 - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + // Ensure we decrement if we exit before acquiring the user slot. + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. 
首先获取用户并发槽位 userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -128,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting in the queue. + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -174,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -190,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -203,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + // Ensure the wait counter is decremented if we exit before acquiring the slot. + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -219,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + // Slot acquired: no longer waiting in queue. 
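Both the user and account branches above replace the unconditional deferred decrement with a waitCounted / accountWaitCounted flag, so the wait counter is decremented exactly once: immediately when the slot is acquired, or via the deferred guard on any early return. A compressed sketch of that bookkeeping in isolation; the function and its parameters are placeholders, not the real concurrencyHelper API.

    import "errors"

    var errTooManyPending = errors.New("too many pending requests")

    // Sketch of the wait-queue accounting pattern introduced above.
    func runWithWaitQueue(
        incrWait func() (bool, error),
        decrWait func(),
        acquireSlot func() (func(), error),
        forward func() error,
    ) error {
        canWait, err := incrWait()
        waitCounted := false
        if err == nil && !canWait {
            return errTooManyPending // queue full: reject before waiting
        }
        if err == nil && canWait {
            waitCounted = true
        }
        // Guard: any exit before a slot is acquired undoes the wait count.
        defer func() {
            if waitCounted {
                decrWait()
            }
        }()

        release, err := acquireSlot()
        if err != nil {
            return err // deferred guard decrements the wait count
        }
        // Slot acquired: no longer waiting, so decrement now and disarm the
        // deferred guard to avoid a double decrement.
        if waitCounted {
            decrWait()
            waitCounted = false
        }
        defer release()

        return forward()
    }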
+ if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -244,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -301,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 检查预热请求拦截(在账号选择后、转发前检查) if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) { @@ -317,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) { // 3. 获取账号并发槽位 accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -330,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -346,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 转发请求 - 根据账号平台分流 var result *service.ForwardResult @@ -371,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { @@ -672,6 +693,8 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + 
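setOpsRequestContext is called twice per request, first with an empty model right after the body is read and again once parsing succeeds, so the ops error logger always sees whatever was known at failure time; setOpsSelectedAccount records which account the request was routed to. Their definitions are not in these hunks; a plausible gin-based sketch, where the context key names and the int64 account ID are assumptions:

    import "github.com/gin-gonic/gin"

    // Hypothetical ops context helpers matching the call pattern above.
    const (
        opsCtxModel     = "ops_model"
        opsCtxStream    = "ops_stream"
        opsCtxBodySize  = "ops_body_bytes"
        opsCtxAccountID = "ops_account_id"
    )

    func setOpsRequestContext(c *gin.Context, model string, stream bool, body []byte) {
        if model != "" {
            c.Set(opsCtxModel, model)
        }
        c.Set(opsCtxStream, stream)
        c.Set(opsCtxBodySize, len(body))
    }

    func setOpsSelectedAccount(c *gin.Context, accountID int64) {
        c.Set(opsCtxAccountID, accountID)
    }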
parsedReq, err := service.ParseGatewayRequest(body) if err != nil { h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body") @@ -684,6 +707,8 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { return } + setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body) + // 获取订阅信息(可能为nil) subscription, _ := middleware2.GetSubscriptionFromContext(c) @@ -704,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) { h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error()) return } + setOpsSelectedAccount(c, account.ID) // 转发请求(不记录使用量) if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil { diff --git a/backend/internal/handler/gemini_v1beta_handler.go b/backend/internal/handler/gemini_v1beta_handler.go index aaf651e9..73550575 100644 --- a/backend/internal/handler/gemini_v1beta_handler.go +++ b/backend/internal/handler/gemini_v1beta_handler.go @@ -161,6 +161,8 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } + setOpsRequestContext(c, modelName, stream, body) + // Get subscription (may be nil) subscription, _ := middleware.GetSubscriptionFromContext(c) @@ -170,13 +172,21 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { // 0) wait queue check maxWait := service.CalculateMaxWait(authSubject.Concurrency) canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) } else if !canWait { googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return } - defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + } + }() // 1) user concurrency slot streamStarted := false @@ -185,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { googleError(c, http.StatusTooManyRequests, err.Error()) return } + if waitCounted { + geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -221,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { return } account := selection.Account + setOpsSelectedAccount(c, account.ID) // 4) account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts") return } + accountWaitCounted := false canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -237,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later") return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { 
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout( c, @@ -253,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } googleError(c, http.StatusTooManyRequests, err.Error()) return } + if accountWaitCounted { + geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // 5) forward (根据平台分流) var result *service.ForwardResult @@ -277,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { diff --git a/backend/internal/handler/handler.go b/backend/internal/handler/handler.go index 817b71d3..030ebd68 100644 --- a/backend/internal/handler/handler.go +++ b/backend/internal/handler/handler.go @@ -17,6 +17,7 @@ type AdminHandlers struct { Proxy *admin.ProxyHandler Redeem *admin.RedeemHandler Setting *admin.SettingHandler + Ops *admin.OpsHandler System *admin.SystemHandler Subscription *admin.SubscriptionHandler Usage *admin.UsageHandler diff --git a/backend/internal/handler/openai_gateway_handler.go b/backend/internal/handler/openai_gateway_handler.go index 04d268a5..2ddf77ed 100644 --- a/backend/internal/handler/openai_gateway_handler.go +++ b/backend/internal/handler/openai_gateway_handler.go @@ -75,6 +75,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { return } + setOpsRequestContext(c, "", false, body) + // Parse request body to map for potential modification var reqBody map[string]any if err := json.Unmarshal(body, &reqBody); err != nil { @@ -104,6 +106,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } } + setOpsRequestContext(c, reqModel, reqStream, body) + // Track if we've started streaming (for error handling) streamStarted := false @@ -113,6 +117,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { // 0. Check if wait queue is full maxWait := service.CalculateMaxWait(subject.Concurrency) canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait) + waitCounted := false if err != nil { log.Printf("Increment wait count failed: %v", err) // On error, allow request to proceed @@ -120,8 +125,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later") return } - // Ensure wait count is decremented when function exits - defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + if err == nil && canWait { + waitCounted = true + } + defer func() { + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + } + }() // 1. 
First acquire user concurrency slot userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted) @@ -130,6 +141,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.handleConcurrencyError(c, err, "user", streamStarted) return } + // User slot acquired: no longer waiting. + if waitCounted { + h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID) + waitCounted = false + } // 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏 userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc) if userReleaseFunc != nil { @@ -167,15 +183,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { } account := selection.Account log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name) + setOpsSelectedAccount(c, account.ID) // 3. Acquire account concurrency slot accountReleaseFunc := selection.ReleaseFunc - var accountWaitRelease func() if !selection.Acquired { if selection.WaitPlan == nil { h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted) return } + accountWaitCounted := false canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting) if err != nil { log.Printf("Increment account wait count failed: %v", err) @@ -183,12 +200,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { log.Printf("Account wait queue full: account=%d", account.ID) h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted) return - } else { - // Only set release function if increment succeeded - accountWaitRelease = func() { + } + if err == nil && canWait { + accountWaitCounted = true + } + defer func() { + if accountWaitCounted { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) } - } + }() accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout( c, @@ -199,29 +219,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { &streamStarted, ) if err != nil { - if accountWaitRelease != nil { - accountWaitRelease() - } log.Printf("Account concurrency acquire failed: %v", err) h.handleConcurrencyError(c, err, "account", streamStarted) return } + if accountWaitCounted { + h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) + accountWaitCounted = false + } if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionHash, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } // 账号槽位/等待计数需要在超时或断开时安全回收 accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc) - accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease) // Forward request result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body) if accountReleaseFunc != nil { accountReleaseFunc() } - if accountWaitRelease != nil { - accountWaitRelease() - } if err != nil { var failoverErr *service.UpstreamFailoverError if errors.As(err, &failoverErr) { From 2d123a11ad208aef42b982655c825e5347c8b7f9 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:57:32 +0800 Subject: [PATCH 09/53] =?UTF-8?q?feat(=E8=AE=BE=E7=BD=AE):=20=E9=9B=86?= =?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E5=88=B0=E7=B3=BB=E7=BB=9F=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 扩展 setting_handler 支持 ops 配置管理 - 扩展 setting_service 支持 ops 配置持久化 - 更新 settings_view 包含 ops 配置视图 --- .../internal/handler/admin/setting_handler.go | 38 +++++++++++++++++++ backend/internal/service/setting_service.go | 25 ++++++++++++ backend/internal/service/settings_view.go | 5 +++ 3 files changed, 68 insertions(+) diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go index 743c4268..4d4d5639 100644 --- a/backend/internal/handler/admin/setting_handler.go +++ b/backend/internal/handler/admin/setting_handler.go @@ -65,6 +65,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { FallbackModelAntigravity: settings.FallbackModelAntigravity, EnableIdentityPatch: settings.EnableIdentityPatch, IdentityPatchPrompt: settings.IdentityPatchPrompt, + OpsMonitoringEnabled: settings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: settings.OpsQueryModeDefault, }) } @@ -110,6 +113,11 @@ type UpdateSettingsRequest struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` + OpsQueryModeDefault *string `json:"ops_query_mode_default"` } // UpdateSettings 更新系统设置 @@ -193,6 +201,24 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: req.FallbackModelAntigravity, EnableIdentityPatch: req.EnableIdentityPatch, IdentityPatchPrompt: req.IdentityPatchPrompt, + OpsMonitoringEnabled: func() bool { + if req.OpsMonitoringEnabled != nil { + return *req.OpsMonitoringEnabled + } + return previousSettings.OpsMonitoringEnabled + }(), + OpsRealtimeMonitoringEnabled: func() bool { + if req.OpsRealtimeMonitoringEnabled != nil { + return *req.OpsRealtimeMonitoringEnabled + } + return previousSettings.OpsRealtimeMonitoringEnabled + }(), + OpsQueryModeDefault: func() string { + if req.OpsQueryModeDefault != nil { + return *req.OpsQueryModeDefault + } + return previousSettings.OpsQueryModeDefault + }(), } if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { @@ -237,6 +263,9 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity, EnableIdentityPatch: updatedSettings.EnableIdentityPatch, IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt, + OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, + OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, + OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, }) } @@ -337,6 +366,15 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings, if before.FallbackModelAntigravity != after.FallbackModelAntigravity { changed = append(changed, "fallback_model_antigravity") } + if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled { + changed = append(changed, "ops_monitoring_enabled") + } + if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled { + changed = append(changed, "ops_realtime_monitoring_enabled") + } + if before.OpsQueryModeDefault != after.OpsQueryModeDefault { + changed = append(changed, "ops_query_mode_default") + } return changed } diff --git 
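The three new Ops* fields on UpdateSettingsRequest are pointers so the admin UI can omit them without clobbering stored values: a nil pointer falls back to previousSettings, a non-nil pointer overrides. The inline closures above could equally be written with two tiny helpers; a sketch of that equivalent form (helper names are mine, not part of the patch):

    // Equivalent helpers for the nil-pointer fallback used in UpdateSettings.
    func boolOr(p *bool, fallback bool) bool {
        if p != nil {
            return *p
        }
        return fallback
    }

    func stringOr(p *string, fallback string) string {
        if p != nil {
            return *p
        }
        return fallback
    }

    // Usage, mirroring the closures in the hunk above:
    //   OpsMonitoringEnabled:         boolOr(req.OpsMonitoringEnabled, previousSettings.OpsMonitoringEnabled),
    //   OpsRealtimeMonitoringEnabled: boolOr(req.OpsRealtimeMonitoringEnabled, previousSettings.OpsRealtimeMonitoringEnabled),
    //   OpsQueryModeDefault:          stringOr(req.OpsQueryModeDefault, previousSettings.OpsQueryModeDefault),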
a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 6ce8ba2b..1aea32be 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "strconv" + "strings" "github.com/Wei-Shaw/sub2api/internal/config" infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" @@ -134,6 +135,11 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt + // Ops monitoring (vNext) + updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) + updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) + updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) + return s.settingRepo.SetMultiple(ctx, updates) } @@ -220,6 +226,11 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { // Identity patch defaults SettingKeyEnableIdentityPatch: "true", SettingKeyIdentityPatchPrompt: "", + + // Ops monitoring defaults (vNext) + SettingKeyOpsMonitoringEnabled: "true", + SettingKeyOpsRealtimeMonitoringEnabled: "true", + SettingKeyOpsQueryModeDefault: "auto", } return s.settingRepo.SetMultiple(ctx, defaults) @@ -286,9 +297,23 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin } result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] + // Ops monitoring settings (default: enabled, fail-open) + result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) + result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) + result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) + return result } +func isFalseSettingValue(value string) bool { + switch strings.ToLower(strings.TrimSpace(value)) { + case "false", "0", "off", "disabled": + return true + default: + return false + } +} + // getStringOrDefault 获取字符串值或默认值 func (s *SettingService) getStringOrDefault(settings map[string]string, key, defaultValue string) string { if value, ok := settings[key]; ok && value != "" { diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go index de0331f7..e9d07bca 100644 --- a/backend/internal/service/settings_view.go +++ b/backend/internal/service/settings_view.go @@ -38,6 +38,11 @@ type SystemSettings struct { // Identity patch configuration (Claude -> Gemini) EnableIdentityPatch bool `json:"enable_identity_patch"` IdentityPatchPrompt string `json:"identity_patch_prompt"` + + // Ops monitoring (vNext) + OpsMonitoringEnabled bool + OpsRealtimeMonitoringEnabled bool + OpsQueryModeDefault string } type PublicSettings struct { From e846458009c525e045b232ffdcc483e702d23153 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:58:01 +0800 Subject: [PATCH 10/53] =?UTF-8?q?test(=E5=90=8E=E7=AB=AF):=20=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=20API=20=E5=A5=91=E7=BA=A6=E6=B5=8B=E8=AF=95=E6=94=AF?= =?UTF-8?q?=E6=8C=81=20ops=20=E7=9B=91=E6=8E=A7=E7=AB=AF=E7=82=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新 api_contract_test.go 包含 ops 相关端点测试 --- 
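The parseSettings additions above are fail-open: only an explicitly false-like stored value disables ops monitoring, so a missing or malformed settings row leaves it on. A small table-driven test sketch of that behavior, assuming it sits next to setting_service.go in the same package:

    import "testing"

    // Behavior implied by isFalseSettingValue above (sketch, not in the patch).
    func TestOpsMonitoringFailOpen(t *testing.T) {
        cases := map[string]bool{
            "":         true,  // missing key -> enabled (fail-open)
            "true":     true,
            "garbage":  true,  // unknown value -> enabled
            "false":    false,
            "0":        false,
            "off":      false,
            "disabled": false,
        }
        for raw, wantEnabled := range cases {
            if got := !isFalseSettingValue(raw); got != wantEnabled {
                t.Errorf("value %q: enabled=%v, want %v", raw, got, wantEnabled)
            }
        }
    }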
backend/internal/server/api_contract_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go index f98ebc59..23cab19c 100644 --- a/backend/internal/server/api_contract_test.go +++ b/backend/internal/server/api_contract_test.go @@ -317,7 +317,9 @@ func TestAPIContracts(t *testing.T) { "fallback_model_gemini": "gemini-2.5-pro", "fallback_model_openai": "gpt-4o", "enable_identity_patch": true, - "identity_patch_prompt": "" + "identity_patch_prompt": "", + "ops_monitoring_enabled": true, + "ops_realtime_monitoring_enabled": true } }`, }, From 11d063e3c4b9fcc146ca318fe41a47c5bbf55530 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:58:33 +0800 Subject: [PATCH 11/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AFAPI):=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=20API=20=E5=AE=A2?= =?UTF-8?q?=E6=88=B7=E7=AB=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ops API 客户端(ops.ts) - 扩展 settings API 支持 ops 配置 - 更新 admin API 索引导出 ops 模块 - 扩展 API 客户端支持 WebSocket 连接 --- frontend/src/api/admin/index.ts | 7 +- frontend/src/api/admin/ops.ts | 906 +++++++++++++++++++++++++++++ frontend/src/api/admin/settings.ts | 21 + frontend/src/api/client.ts | 40 +- 4 files changed, 970 insertions(+), 4 deletions(-) create mode 100644 frontend/src/api/admin/ops.ts diff --git a/frontend/src/api/admin/index.ts b/frontend/src/api/admin/index.ts index ea12f6d2..9e719a90 100644 --- a/frontend/src/api/admin/index.ts +++ b/frontend/src/api/admin/index.ts @@ -16,6 +16,7 @@ import usageAPI from './usage' import geminiAPI from './gemini' import antigravityAPI from './antigravity' import userAttributesAPI from './userAttributes' +import opsAPI from './ops' /** * Unified admin API object for convenient access @@ -33,7 +34,8 @@ export const adminAPI = { usage: usageAPI, gemini: geminiAPI, antigravity: antigravityAPI, - userAttributes: userAttributesAPI + userAttributes: userAttributesAPI, + ops: opsAPI } export { @@ -49,7 +51,8 @@ export { usageAPI, geminiAPI, antigravityAPI, - userAttributesAPI + userAttributesAPI, + opsAPI } export default adminAPI diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts new file mode 100644 index 00000000..3c3529a9 --- /dev/null +++ b/frontend/src/api/admin/ops.ts @@ -0,0 +1,906 @@ +/** + * Admin Ops API endpoints (vNext) + * - Error logs list/detail + retry (client/upstream) + * - Dashboard overview (raw path) + */ + +import { apiClient } from '../client' +import type { PaginatedResponse } from '@/types' + +export type OpsRetryMode = 'client' | 'upstream' +export type OpsQueryMode = 'auto' | 'raw' | 'preagg' + +export interface OpsRequestOptions { + signal?: AbortSignal +} + +export interface OpsRetryRequest { + mode: OpsRetryMode + pinned_account_id?: number +} + +export interface OpsRetryResult { + attempt_id: number + mode: OpsRetryMode + status: 'running' | 'succeeded' | 'failed' | string + + pinned_account_id?: number | null + used_account_id?: number | null + + http_status_code: number + upstream_request_id: string + + response_preview: string + response_truncated: boolean + + error_message: string + + started_at: string + finished_at: string + duration_ms: number +} + +export interface OpsDashboardOverview { + start_time: string + end_time: string + platform: string + group_id?: number | null + + system_metrics?: 
OpsSystemMetricsSnapshot | null + job_heartbeats?: OpsJobHeartbeat[] | null + + success_count: number + error_count_total: number + business_limited_count: number + error_count_sla: number + request_count_total: number + request_count_sla: number + + token_consumed: number + + sla: number + error_rate: number + upstream_error_rate: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number + + qps: { + current: number + peak: number + avg: number + } + tps: { + current: number + peak: number + avg: number + } + + duration: OpsPercentiles + ttft: OpsPercentiles +} + +export interface OpsPercentiles { + p50_ms?: number | null + p90_ms?: number | null + p95_ms?: number | null + p99_ms?: number | null + avg_ms?: number | null + max_ms?: number | null +} + +export interface OpsThroughputTrendPoint { + bucket_start: string + request_count: number + token_consumed: number + qps: number + tps: number +} + +export interface OpsThroughputPlatformBreakdownItem { + platform: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputGroupBreakdownItem { + group_id: number + group_name: string + request_count: number + token_consumed: number +} + +export interface OpsThroughputTrendResponse { + bucket: string + points: OpsThroughputTrendPoint[] + by_platform?: OpsThroughputPlatformBreakdownItem[] + top_groups?: OpsThroughputGroupBreakdownItem[] +} + +export type OpsRequestKind = 'success' | 'error' +export type OpsRequestDetailsKind = OpsRequestKind | 'all' +export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc' + +export interface OpsRequestDetail { + kind: OpsRequestKind + created_at: string + request_id: string + + platform?: string + model?: string + duration_ms?: number | null + status_code?: number | null + + error_id?: number | null + phase?: string + severity?: string + message?: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + stream?: boolean +} + +export interface OpsRequestDetailsParams { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + + kind?: OpsRequestDetailsKind + + platform?: string + group_id?: number | null + + user_id?: number + api_key_id?: number + account_id?: number + + model?: string + request_id?: string + q?: string + + min_duration_ms?: number + max_duration_ms?: number + + sort?: OpsRequestDetailsSort + + page?: number + page_size?: number +} + +export type OpsRequestDetailsResponse = PaginatedResponse + +export interface OpsLatencyHistogramBucket { + range: string + count: number +} + +export interface OpsLatencyHistogramResponse { + start_time: string + end_time: string + platform: string + group_id?: number | null + + total_requests: number + buckets: OpsLatencyHistogramBucket[] +} + +export interface OpsErrorTrendPoint { + bucket_start: string + error_count_total: number + business_limited_count: number + error_count_sla: number + upstream_error_count_excl_429_529: number + upstream_429_count: number + upstream_529_count: number +} + +export interface OpsErrorTrendResponse { + bucket: string + points: OpsErrorTrendPoint[] +} + +export interface OpsErrorDistributionItem { + status_code: number + total: number + sla: number + business_limited: number +} + +export interface OpsErrorDistributionResponse { + total: number + items: OpsErrorDistributionItem[] +} + +export interface OpsSystemMetricsSnapshot { + id: number + created_at: string + window_minutes: 
number + + cpu_usage_percent?: number | null + memory_used_mb?: number | null + memory_total_mb?: number | null + memory_usage_percent?: number | null + + db_ok?: boolean | null + redis_ok?: boolean | null + + db_conn_active?: number | null + db_conn_idle?: number | null + db_conn_waiting?: number | null + + goroutine_count?: number | null + concurrency_queue_depth?: number | null +} + +export interface OpsJobHeartbeat { + job_name: string + last_run_at?: string | null + last_success_at?: string | null + last_error_at?: string | null + last_error?: string | null + last_duration_ms?: number | null + updated_at: string +} + +export interface PlatformConcurrencyInfo { + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface GroupConcurrencyInfo { + group_id: number + group_name: string + platform: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface AccountConcurrencyInfo { + account_id: number + account_name?: string + platform: string + group_id: number + group_name: string + current_in_use: number + max_capacity: number + load_percentage: number + waiting_in_queue: number +} + +export interface OpsConcurrencyStatsResponse { + enabled: boolean + platform: Record + group: Record + account: Record + timestamp?: string +} + +export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise { + const params: Record = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + + const { data } = await apiClient.get('/admin/ops/concurrency', { params }) + return data +} + +export interface PlatformAvailability { + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface GroupAvailability { + group_id: number + group_name: string + platform: string + total_accounts: number + available_count: number + rate_limit_count: number + error_count: number +} + +export interface AccountAvailability { + account_id: number + account_name: string + platform: string + group_id: number + group_name: string + status: string + is_available: boolean + is_rate_limited: boolean + rate_limit_reset_at?: string + rate_limit_remaining_sec?: number + is_overloaded: boolean + overload_until?: string + overload_remaining_sec?: number + has_error: boolean + error_message?: string +} + +export interface OpsAccountAvailabilityStatsResponse { + enabled: boolean + platform: Record + group: Record + account: Record + timestamp?: string +} + +export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise { + const params: Record = {} + if (platform) { + params.platform = platform + } + if (typeof groupId === 'number' && groupId > 0) { + params.group_id = groupId + } + const { data } = await apiClient.get('/admin/ops/account-availability', { params }) + return data +} + +/** + * Subscribe to realtime QPS updates via WebSocket. + * + * Note: browsers cannot set Authorization headers for WebSockets. 
+ * We authenticate via Sec-WebSocket-Protocol using a prefixed token item: + * ["sub2api-admin", "jwt."] + */ +export interface SubscribeQPSOptions { + token?: string | null + onOpen?: () => void + onClose?: (event: CloseEvent) => void + onError?: (event: Event) => void + /** + * Called when the server closes with an application close code that indicates + * reconnecting is not useful (e.g. feature flag disabled). + */ + onFatalClose?: (event: CloseEvent) => void + /** + * More granular status updates for UI (connecting/reconnecting/offline/etc). + */ + onStatusChange?: (status: OpsWSStatus) => void + /** + * Called when a reconnect is scheduled (helps display "retry in Xs"). + */ + onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void + wsBaseUrl?: string + /** + * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live. + * Set to 0 to disable reconnect. + */ + maxReconnectAttempts?: number + reconnectBaseDelayMs?: number + reconnectMaxDelayMs?: number + /** + * Stale connection detection (heartbeat-by-observation). + * If no messages are received within this window, the socket is closed to trigger a reconnect. + * Set to 0 to disable. + */ + staleTimeoutMs?: number + /** + * How often to check staleness. Only used when `staleTimeoutMs > 0`. + */ + staleCheckIntervalMs?: number +} + +export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed' + +export const OPS_WS_CLOSE_CODES = { + REALTIME_DISABLED: 4001 +} as const + +const OPS_WS_BASE_PROTOCOL = 'sub2api-admin' + +export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void { + let ws: WebSocket | null = null + let reconnectAttempts = 0 + const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number) + ? (options.maxReconnectAttempts as number) + : Infinity + const baseDelayMs = options.reconnectBaseDelayMs ?? 1000 + const maxDelayMs = options.reconnectMaxDelayMs ?? 30000 + let reconnectTimer: ReturnType | null = null + let shouldReconnect = true + let isConnecting = false + let hasConnectedOnce = false + let lastMessageAt = 0 + const staleTimeoutMs = options.staleTimeoutMs ?? 120_000 + const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000 + let staleTimer: ReturnType | null = null + + const setStatus = (status: OpsWSStatus) => { + options.onStatusChange?.(status) + } + + const clearReconnectTimer = () => { + if (reconnectTimer) { + clearTimeout(reconnectTimer) + reconnectTimer = null + } + } + + const clearStaleTimer = () => { + if (staleTimer) { + clearInterval(staleTimer) + staleTimer = null + } + } + + const startStaleTimer = () => { + clearStaleTimer() + if (!staleTimeoutMs || staleTimeoutMs <= 0) return + staleTimer = setInterval(() => { + if (!shouldReconnect) return + if (!ws || ws.readyState !== WebSocket.OPEN) return + if (!lastMessageAt) return + const ageMs = Date.now() - lastMessageAt + if (ageMs > staleTimeoutMs) { + // Treat as a half-open connection; closing triggers the normal reconnect path. + ws.close() + } + }, staleCheckIntervalMs) + } + + const scheduleReconnect = () => { + if (!shouldReconnect) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + // If we're offline, wait for the browser to come back online. 
+ if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) { + setStatus('offline') + return + } + + const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts) + const delay = Math.min(expDelay, maxDelayMs) + const jitter = Math.floor(Math.random() * 250) + clearReconnectTimer() + reconnectTimer = setTimeout(() => { + reconnectAttempts++ + connect() + }, delay + jitter) + options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter }) + } + + const handleOnline = () => { + if (!shouldReconnect) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + connect() + } + + const handleOffline = () => { + setStatus('offline') + } + + const connect = () => { + if (!shouldReconnect) return + if (isConnecting) return + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return + if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return + + isConnecting = true + setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting') + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host + const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`) + + // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc). + // Browsers cannot set Authorization headers for WebSockets, so we pass the token via + // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt."]. + const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim() + const protocols: string[] = [OPS_WS_BASE_PROTOCOL] + if (rawToken) protocols.push(`jwt.${rawToken}`) + + ws = new WebSocket(wsURL.toString(), protocols) + + ws.onopen = () => { + reconnectAttempts = 0 + isConnecting = false + hasConnectedOnce = true + clearReconnectTimer() + lastMessageAt = Date.now() + startStaleTimer() + setStatus('connected') + options.onOpen?.() + } + + ws.onmessage = (e) => { + try { + const data = JSON.parse(e.data) + lastMessageAt = Date.now() + onMessage(data) + } catch (err) { + console.warn('[OpsWS] Failed to parse message:', err) + } + } + + ws.onerror = (error) => { + console.error('[OpsWS] Connection error:', error) + options.onError?.(error) + } + + ws.onclose = (event) => { + isConnecting = false + options.onClose?.(event) + clearStaleTimer() + ws = null + + // If the server explicitly tells us to stop reconnecting, honor it. 
+ if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) { + shouldReconnect = false + clearReconnectTimer() + setStatus('closed') + options.onFatalClose?.(event) + return + } + + scheduleReconnect() + } + } + + window.addEventListener('online', handleOnline) + window.addEventListener('offline', handleOffline) + connect() + + return () => { + shouldReconnect = false + window.removeEventListener('online', handleOnline) + window.removeEventListener('offline', handleOffline) + clearReconnectTimer() + clearStaleTimer() + if (ws) ws.close() + ws = null + setStatus('closed') + } +} + +export type OpsSeverity = string +export type OpsPhase = string + +export type AlertSeverity = 'critical' | 'warning' | 'info' +export type ThresholdMode = 'count' | 'percentage' | 'both' +export type MetricType = + | 'success_rate' + | 'error_rate' + | 'upstream_error_rate' + | 'p95_latency_ms' + | 'p99_latency_ms' + | 'cpu_usage_percent' + | 'memory_usage_percent' + | 'concurrency_queue_depth' +export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!=' + +export interface AlertRule { + id?: number + name: string + description?: string + enabled: boolean + metric_type: MetricType + operator: Operator + threshold: number + window_minutes: number + sustained_minutes: number + severity: OpsSeverity + cooldown_minutes: number + notify_email: boolean + filters?: Record + created_at?: string + updated_at?: string + last_triggered_at?: string | null +} + +export interface AlertEvent { + id: number + rule_id: number + severity: OpsSeverity | string + status: 'firing' | 'resolved' | string + title?: string + description?: string + metric_value?: number + threshold_value?: number + dimensions?: Record + fired_at: string + resolved_at?: string | null + email_sent: boolean + created_at: string +} + +export interface EmailNotificationConfig { + alert: { + enabled: boolean + recipients: string[] + min_severity: AlertSeverity | '' + rate_limit_per_hour: number + batching_window_seconds: number + include_resolved_alerts: boolean + } + report: { + enabled: boolean + recipients: string[] + daily_summary_enabled: boolean + daily_summary_schedule: string + weekly_summary_enabled: boolean + weekly_summary_schedule: string + error_digest_enabled: boolean + error_digest_schedule: string + error_digest_min_count: number + account_health_enabled: boolean + account_health_schedule: string + account_health_error_rate_threshold: number + } +} + +export interface OpsDistributedLockSettings { + enabled: boolean + key: string + ttl_seconds: number +} + +export interface OpsAlertRuntimeSettings { + evaluation_interval_seconds: number + distributed_lock: OpsDistributedLockSettings + silencing: { + enabled: boolean + global_until_rfc3339: string + global_reason: string + entries?: Array<{ + rule_id?: number + severities?: Array + until_rfc3339: string + reason: string + }> + } +} + +export interface OpsErrorLog { + id: number + created_at: string + phase: OpsPhase + type: string + severity: OpsSeverity + status_code: number + platform: string + model: string + latency_ms?: number | null + client_request_id: string + request_id: string + message: string + + user_id?: number | null + api_key_id?: number | null + account_id?: number | null + group_id?: number | null + + client_ip?: string | null + request_path?: string + stream?: boolean +} + +export interface OpsErrorDetail extends OpsErrorLog { + error_body: string + user_agent: string + + auth_latency_ms?: number | null + routing_latency_ms?: number | 
null + upstream_latency_ms?: number | null + response_latency_ms?: number | null + time_to_first_token_ms?: number | null + + request_body: string + request_body_truncated: boolean + request_body_bytes?: number | null + + is_business_limited: boolean +} + +export type OpsErrorLogsResponse = PaginatedResponse + +export async function getDashboardOverview( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise { + const { data } = await apiClient.get('/admin/ops/dashboard/overview', { + params, + signal: options.signal + }) + return data +} + +export async function getThroughputTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise { + const { data } = await apiClient.get('/admin/ops/dashboard/throughput-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getLatencyHistogram( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise { + const { data } = await apiClient.get('/admin/ops/dashboard/latency-histogram', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorTrend( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise { + const { data } = await apiClient.get('/admin/ops/dashboard/error-trend', { + params, + signal: options.signal + }) + return data +} + +export async function getErrorDistribution( + params: { + time_range?: '5m' | '30m' | '1h' | '6h' | '24h' + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + mode?: OpsQueryMode + }, + options: OpsRequestOptions = {} +): Promise { + const { data } = await apiClient.get('/admin/ops/dashboard/error-distribution', { + params, + signal: options.signal + }) + return data +} + +export async function listErrorLogs(params: { + page?: number + page_size?: number + time_range?: string + start_time?: string + end_time?: string + platform?: string + group_id?: number | null + account_id?: number | null + phase?: string + q?: string + status_codes?: string +}): Promise { + const { data } = await apiClient.get('/admin/ops/errors', { params }) + return data +} + +export async function getErrorLogDetail(id: number): Promise { + const { data } = await apiClient.get(`/admin/ops/errors/${id}`) + return data +} + +export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise { + const { data } = await apiClient.post(`/admin/ops/errors/${id}/retry`, req) + return data +} + +export async function listRequestDetails(params: OpsRequestDetailsParams): Promise { + const { data } = await apiClient.get('/admin/ops/requests', { params }) + return data +} + +// Alert rules +export async function listAlertRules(): Promise { + const { data } = await apiClient.get('/admin/ops/alert-rules') + return data +} + +export async function createAlertRule(rule: AlertRule): Promise { + const { data } = await apiClient.post('/admin/ops/alert-rules', rule) + return data +} 
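subscribeQPS (defined earlier in this file) sends the admin JWT as a "jwt.<token>" entry in Sec-WebSocket-Protocol and treats close code 4001 (OPS_WS_CLOSE_CODES.REALTIME_DISABLED) as fatal. The matching server side is not part of this hunk; a hedged gorilla/websocket sketch of how the subprotocol could be parsed and the close code emitted when realtime monitoring is disabled. The handler name, the realtimeEnabled flag, and validateAdminJWT are assumptions.

    import (
        "net/http"
        "strings"
        "time"

        "github.com/gorilla/websocket"
    )

    // Sketch only: negotiate the "sub2api-admin" subprotocol, pull the JWT out
    // of the "jwt.<token>" entry, and close with 4001 when realtime is off.
    var qpsUpgrader = websocket.Upgrader{
        Subprotocols: []string{"sub2api-admin"},
    }

    func handleOpsQPSWS(w http.ResponseWriter, r *http.Request, realtimeEnabled bool, validateAdminJWT func(string) error) {
        var token string
        for _, p := range websocket.Subprotocols(r) {
            if strings.HasPrefix(p, "jwt.") {
                token = strings.TrimPrefix(p, "jwt.")
            }
        }
        if token == "" || validateAdminJWT(token) != nil {
            http.Error(w, "unauthorized", http.StatusUnauthorized)
            return
        }

        conn, err := qpsUpgrader.Upgrade(w, r, nil)
        if err != nil {
            return
        }
        defer conn.Close()

        if !realtimeEnabled {
            // 4001 matches OPS_WS_CLOSE_CODES.REALTIME_DISABLED on the frontend,
            // which stops the client from reconnecting.
            msg := websocket.FormatCloseMessage(4001, "realtime monitoring disabled")
            _ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(time.Second))
            return
        }

        // ... push QPS snapshots to the client as JSON frames ...
    }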
+ +export async function updateAlertRule(id: number, rule: Partial): Promise { + const { data } = await apiClient.put(`/admin/ops/alert-rules/${id}`, rule) + return data +} + +export async function deleteAlertRule(id: number): Promise { + await apiClient.delete(`/admin/ops/alert-rules/${id}`) +} + +export async function listAlertEvents(limit = 100): Promise { + const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } }) + return data +} + +// Email notification config +export async function getEmailNotificationConfig(): Promise { + const { data } = await apiClient.get('/admin/ops/email-notification/config') + return data +} + +export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise { + const { data } = await apiClient.put('/admin/ops/email-notification/config', config) + return data +} + +// Runtime settings (DB-backed) +export async function getAlertRuntimeSettings(): Promise { + const { data } = await apiClient.get('/admin/ops/runtime/alert') + return data +} + +export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise { + const { data } = await apiClient.put('/admin/ops/runtime/alert', config) + return data +} + +export const opsAPI = { + getDashboardOverview, + getThroughputTrend, + getLatencyHistogram, + getErrorTrend, + getErrorDistribution, + getConcurrencyStats, + getAccountAvailabilityStats, + subscribeQPS, + listErrorLogs, + getErrorLogDetail, + retryErrorRequest, + listRequestDetails, + listAlertRules, + createAlertRule, + updateAlertRule, + deleteAlertRule, + listAlertEvents, + getEmailNotificationConfig, + updateEmailNotificationConfig, + getAlertRuntimeSettings, + updateAlertRuntimeSettings +} + +export default opsAPI diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts index 6b46de7d..37b12e40 100644 --- a/frontend/src/api/admin/settings.ts +++ b/frontend/src/api/admin/settings.ts @@ -34,9 +34,22 @@ export interface SystemSettings { turnstile_enabled: boolean turnstile_site_key: string turnstile_secret_key_configured: boolean + + // Model fallback configuration + enable_model_fallback: boolean + fallback_model_anthropic: string + fallback_model_openai: string + fallback_model_gemini: string + fallback_model_antigravity: string + // Identity patch configuration (Claude -> Gemini) enable_identity_patch: boolean identity_patch_prompt: string + + // Ops Monitoring (vNext) + ops_monitoring_enabled: boolean + ops_realtime_monitoring_enabled: boolean + ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string } export interface UpdateSettingsRequest { @@ -60,8 +73,16 @@ export interface UpdateSettingsRequest { turnstile_enabled?: boolean turnstile_site_key?: string turnstile_secret_key?: string + enable_model_fallback?: boolean + fallback_model_anthropic?: string + fallback_model_openai?: string + fallback_model_gemini?: string + fallback_model_antigravity?: string enable_identity_patch?: boolean identity_patch_prompt?: string + ops_monitoring_enabled?: boolean + ops_realtime_monitoring_enabled?: boolean + ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string } /** diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index 4e53069a..3827498b 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -80,9 +80,45 @@ apiClient.interceptors.response.use( return response }, (error: AxiosError>) => { + // Request cancellation: keep the original axios cancellation error so callers can ignore it. 
+ // Otherwise we'd misclassify it as a generic "network error". + if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) { + return Promise.reject(error) + } + // Handle common errors if (error.response) { const { status, data } = error.response + const url = String(error.config?.url || '') + + // Validate `data` shape to avoid HTML error pages breaking our error handling. + const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record + + // Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away + // from ops pages to avoid broken UI states. + if (status === 404 && apiData.message === 'Ops monitoring is disabled') { + try { + localStorage.setItem('ops_monitoring_enabled_cached', 'false') + } catch { + // ignore localStorage failures + } + try { + window.dispatchEvent(new CustomEvent('ops-monitoring-disabled')) + } catch { + // ignore event failures + } + + if (window.location.pathname.startsWith('/admin/ops')) { + window.location.href = '/admin/settings' + } + + return Promise.reject({ + status, + code: 'OPS_DISABLED', + message: apiData.message || error.message, + url + }) + } // 401: Unauthorized - clear token and redirect to login if (status === 401) { @@ -113,8 +149,8 @@ apiClient.interceptors.response.use( // Return structured error return Promise.reject({ status, - code: data?.code, - message: data?.message || error.message + code: apiData.code, + message: apiData.message || apiData.detail || error.message }) } From 337a188660174d8f7bd7534ff60f0e9e8acec866 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:59:02 +0800 Subject: [PATCH 12/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AF=E7=8A=B6=E6=80=81?= =?UTF-8?q?):=20=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?= =?UTF-8?q?=E7=8A=B6=E6=80=81=E7=AE=A1=E7=90=86=E5=92=8C=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 adminSettings store 管理 ops 配置状态 - 注册 adminSettings store 到全局 store - 添加 ops 监控相关路由(dashboard, alerts, realtime, settings) --- frontend/src/router/index.ts | 12 +++ frontend/src/stores/adminSettings.ts | 130 +++++++++++++++++++++++++++ frontend/src/stores/index.ts | 1 + 3 files changed, 143 insertions(+) create mode 100644 frontend/src/stores/adminSettings.ts diff --git a/frontend/src/router/index.ts b/frontend/src/router/index.ts index 48a6f0fd..c8d0214c 100644 --- a/frontend/src/router/index.ts +++ b/frontend/src/router/index.ts @@ -163,6 +163,18 @@ const routes: RouteRecordRaw[] = [ descriptionKey: 'admin.dashboard.description' } }, + { + path: '/admin/ops', + name: 'AdminOps', + component: () => import('@/views/admin/ops/OpsDashboard.vue'), + meta: { + requiresAuth: true, + requiresAdmin: true, + title: 'Ops Monitoring', + titleKey: 'admin.ops.title', + descriptionKey: 'admin.ops.description' + } + }, { path: '/admin/users', name: 'AdminUsers', diff --git a/frontend/src/stores/adminSettings.ts b/frontend/src/stores/adminSettings.ts new file mode 100644 index 00000000..460cc92b --- /dev/null +++ b/frontend/src/stores/adminSettings.ts @@ -0,0 +1,130 @@ +import { defineStore } from 'pinia' +import { ref } from 'vue' +import { adminAPI } from '@/api' + +export const useAdminSettingsStore = defineStore('adminSettings', () => { + const loaded = ref(false) + const loading = ref(false) + + const readCachedBool = (key: string, defaultValue: boolean): boolean => { + try { + const raw = localStorage.getItem(key) + if (raw === 'true') return true 
+ if (raw === 'false') return false + } catch { + // ignore localStorage failures + } + return defaultValue + } + + const writeCachedBool = (key: string, value: boolean) => { + try { + localStorage.setItem(key, value ? 'true' : 'false') + } catch { + // ignore localStorage failures + } + } + + const readCachedString = (key: string, defaultValue: string): string => { + try { + const raw = localStorage.getItem(key) + if (typeof raw === 'string' && raw.length > 0) return raw + } catch { + // ignore localStorage failures + } + return defaultValue + } + + const writeCachedString = (key: string, value: string) => { + try { + localStorage.setItem(key, value) + } catch { + // ignore localStorage failures + } + } + + // Default open, but honor cached value to reduce UI flicker on first paint. + const opsMonitoringEnabled = ref(readCachedBool('ops_monitoring_enabled_cached', true)) + const opsRealtimeMonitoringEnabled = ref(readCachedBool('ops_realtime_monitoring_enabled_cached', true)) + const opsQueryModeDefault = ref(readCachedString('ops_query_mode_default_cached', 'auto')) + + async function fetch(force = false): Promise { + if (loaded.value && !force) return + if (loading.value) return + + loading.value = true + try { + const settings = await adminAPI.settings.getSettings() + opsMonitoringEnabled.value = settings.ops_monitoring_enabled ?? true + writeCachedBool('ops_monitoring_enabled_cached', opsMonitoringEnabled.value) + + opsRealtimeMonitoringEnabled.value = settings.ops_realtime_monitoring_enabled ?? true + writeCachedBool('ops_realtime_monitoring_enabled_cached', opsRealtimeMonitoringEnabled.value) + + opsQueryModeDefault.value = settings.ops_query_mode_default || 'auto' + writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value) + + loaded.value = true + } catch (err) { + // Keep cached/default value: do not "flip" the UI based on a transient fetch failure. + loaded.value = true + console.error('[adminSettings] Failed to fetch settings:', err) + } finally { + loading.value = false + } + } + + function setOpsMonitoringEnabledLocal(value: boolean) { + opsMonitoringEnabled.value = value + writeCachedBool('ops_monitoring_enabled_cached', value) + loaded.value = true + } + + function setOpsRealtimeMonitoringEnabledLocal(value: boolean) { + opsRealtimeMonitoringEnabled.value = value + writeCachedBool('ops_realtime_monitoring_enabled_cached', value) + loaded.value = true + } + + function setOpsQueryModeDefaultLocal(value: string) { + opsQueryModeDefault.value = value || 'auto' + writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value) + loaded.value = true + } + + // Keep UI consistent if we learn that ops is disabled via feature-gated 404s. 
+ // (event is dispatched from the axios interceptor) + let eventHandlerCleanup: (() => void) | null = null + + function initializeEventListeners() { + if (eventHandlerCleanup) return + + try { + const handler = () => { + setOpsMonitoringEnabledLocal(false) + } + window.addEventListener('ops-monitoring-disabled', handler) + eventHandlerCleanup = () => { + window.removeEventListener('ops-monitoring-disabled', handler) + } + } catch { + // ignore window access failures (SSR) + } + } + + if (typeof window !== 'undefined') { + initializeEventListeners() + } + + return { + loaded, + loading, + opsMonitoringEnabled, + opsRealtimeMonitoringEnabled, + opsQueryModeDefault, + fetch, + setOpsMonitoringEnabledLocal, + setOpsRealtimeMonitoringEnabledLocal, + setOpsQueryModeDefaultLocal + } +}) diff --git a/frontend/src/stores/index.ts b/frontend/src/stores/index.ts index 0e4caef0..05c18e7e 100644 --- a/frontend/src/stores/index.ts +++ b/frontend/src/stores/index.ts @@ -5,6 +5,7 @@ export { useAuthStore } from './auth' export { useAppStore } from './app' +export { useAdminSettingsStore } from './adminSettings' export { useSubscriptionStore } from './subscriptions' export { useOnboardingStore } from './onboarding' From fc32b577986ecc4478f901da2479972f84f7c553 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:59:33 +0800 Subject: [PATCH 13/53] =?UTF-8?q?feat(=E5=9B=BD=E9=99=85=E5=8C=96):=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=A4=9A?= =?UTF-8?q?=E8=AF=AD=E8=A8=80=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加英文翻译(en.ts)包含 ops 监控所有文案 - 添加中文翻译(zh.ts)包含 ops 监控所有文案 --- frontend/src/i18n/locales/en.ts | 382 ++++++++++++++++++++++++++++++++ frontend/src/i18n/locales/zh.ts | 382 ++++++++++++++++++++++++++++++++ 2 files changed, 764 insertions(+) diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 393641a7..f80a235f 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -131,6 +131,7 @@ export default { noData: 'No data', success: 'Success', error: 'Error', + critical: 'Critical', warning: 'Warning', info: 'Info', active: 'Active', @@ -145,6 +146,8 @@ export default { copiedToClipboard: 'Copied to clipboard', copyFailed: 'Failed to copy', contactSupport: 'Contact Support', + add: 'Add', + invalidEmail: 'Please enter a valid email address', selectOption: 'Select an option', searchPlaceholder: 'Search...', noOptionsFound: 'No options found', @@ -177,6 +180,7 @@ export default { accounts: 'Accounts', proxies: 'Proxies', redeemCodes: 'Redeem Codes', + ops: 'Ops', settings: 'Settings', myAccount: 'My Account', lightMode: 'Light Mode', @@ -1713,6 +1717,370 @@ export default { failedToLoad: 'Failed to load usage records' }, + // Ops Monitoring + ops: { + title: 'Ops Monitoring', + description: 'Operational monitoring and troubleshooting', + // Dashboard + systemHealth: 'System Health', + overview: 'Overview', + noSystemMetrics: 'No system metrics collected yet.', + collectedAt: 'Collected at:', + window: 'window', + cpu: 'CPU', + memory: 'Memory', + db: 'DB', + redis: 'Redis', + goroutines: 'Goroutines', + jobs: 'Jobs', + active: 'active', + idle: 'idle', + ok: 'ok', + lastRun: 'last_run:', + lastSuccess: 'last_success:', + lastError: 'last_error:', + noData: 'No data.', + loadingText: 'loading', + ready: 'ready', + requestsTotal: 'Requests (total)', + slaScope: 'SLA scope:', + tokens: 
'Tokens', + tps: 'TPS:', + current: 'current', + peak: 'peak', + sla: 'SLA (excl business limits)', + businessLimited: 'business_limited:', + errors: 'Errors', + errorRate: 'error_rate:', + upstreamRate: 'upstream_rate:', + latencyDuration: 'Latency (duration_ms)', + ttftLabel: 'TTFT (first_token_ms)', + p50: 'p50:', + p90: 'p90:', + p95: 'p95:', + p99: 'p99:', + avg: 'avg:', + max: 'max:', + qps: 'QPS', + requests: 'Requests', + upstream: 'Upstream', + client: 'Client', + system: 'System', + other: 'Other', + errorsSla: 'Errors (SLA scope)', + upstreamExcl429529: 'Upstream (excl 429/529)', + failedToLoadData: 'Failed to load ops data.', + tpsK: 'TPS (K)', + top: 'Top:', + throughputTrend: 'Throughput Trend', + latencyHistogram: 'Latency Histogram', + errorTrend: 'Error Trend', + errorDistribution: 'Error Distribution', + // Error Log + errorLog: { + timeId: 'Time / ID', + context: 'Context', + status: 'Status', + message: 'Message', + latency: 'Latency', + action: 'Action', + noErrors: 'No errors in this window.', + grp: 'GRP:', + acc: 'ACC:', + details: 'Details', + phase: 'Phase' + }, + // Error Details Modal + errorDetails: { + upstreamErrors: 'Upstream Errors', + requestErrors: 'Request Errors', + total: 'Total:', + searchPlaceholder: 'Search request_id / client_request_id / message', + accountIdPlaceholder: 'account_id' + }, + // Error Detail Modal + errorDetail: { + loading: 'Loading…', + requestId: 'Request ID', + time: 'Time', + phase: 'Phase', + status: 'Status', + message: 'Message', + basicInfo: 'Basic Info', + platform: 'Platform', + model: 'Model', + latency: 'Latency', + ttft: 'TTFT', + businessLimited: 'Business Limited', + requestPath: 'Request Path', + timings: 'Timings', + auth: 'Auth', + routing: 'Routing', + upstream: 'Upstream', + response: 'Response', + retry: 'Retry', + retryClient: 'Retry (Client)', + retryUpstream: 'Retry (Upstream pinned)', + pinnedAccountId: 'Pinned account_id', + retryNotes: 'Retry Notes', + requestBody: 'Request Body', + errorBody: 'Error Body', + trimmed: 'trimmed', + confirmRetry: 'Confirm Retry', + retrySuccess: 'Retry succeeded', + retryFailed: 'Retry failed', + na: 'N/A', + retryHint: 'Retry will resend the request with the same parameters', + retryClientHint: 'Use client retry (no account pinning)', + retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)', + pinnedAccountIdHint: '(auto from error log)', + retryNote1: 'Retry will use the same request body and parameters', + retryNote2: 'If the original request failed due to account issues, pinned retry may still fail', + retryNote3: 'Client retry will reselect an account', + confirmRetryMessage: 'Confirm retry this request?', + confirmRetryHint: 'Will resend with the same request parameters' + }, + requestDetails: { + title: 'Request Details', + details: 'Details', + rangeLabel: 'Window: {range}', + rangeMinutes: '{n} minutes', + rangeHours: '{n} hours', + empty: 'No requests in this window.', + emptyHint: 'Try a different time range or remove filters.', + failedToLoad: 'Failed to load request details', + requestIdCopied: 'Request ID copied', + copyFailed: 'Copy failed', + copy: 'Copy', + viewError: 'View Error', + kind: { + success: 'SUCCESS', + error: 'ERROR' + }, + table: { + time: 'Time', + kind: 'Kind', + platform: 'Platform', + model: 'Model', + duration: 'Duration', + status: 'Status', + requestId: 'Request ID', + actions: 'Actions' + } + }, + alertEvents: { + title: 'Alert Events', + description: 'Recent alert firing/resolution records (email-only)', + loading: 
'Loading...', + empty: 'No alert events', + loadFailed: 'Failed to load alert events', + table: { + time: 'Time', + status: 'Status', + severity: 'Severity', + title: 'Title', + metric: 'Metric / Threshold', + email: 'Email Sent' + } + }, + alertRules: { + title: 'Alert Rules', + description: 'Create and manage threshold-based system alerts (email-only)', + loading: 'Loading...', + empty: 'No alert rules', + loadFailed: 'Failed to load alert rules', + saveFailed: 'Failed to save alert rule', + deleteFailed: 'Failed to delete alert rule', + create: 'Create Rule', + createTitle: 'Create Alert Rule', + editTitle: 'Edit Alert Rule', + deleteConfirmTitle: 'Delete this rule?', + deleteConfirmMessage: 'This will remove the rule and its related events. Continue?', + metrics: { + successRate: 'Success Rate (%)', + errorRate: 'Error Rate (%)', + p95: 'P95 Latency (ms)', + p99: 'P99 Latency (ms)', + cpu: 'CPU Usage (%)', + memory: 'Memory Usage (%)', + queueDepth: 'Concurrency Queue Depth' + }, + table: { + name: 'Name', + metric: 'Metric', + severity: 'Severity', + enabled: 'Enabled', + actions: 'Actions' + }, + form: { + name: 'Name', + description: 'Description', + metric: 'Metric', + operator: 'Operator', + threshold: 'Threshold', + severity: 'Severity', + window: 'Window (minutes)', + sustained: 'Sustained (samples)', + cooldown: 'Cooldown (minutes)', + enabled: 'Enabled', + notifyEmail: 'Send email notifications' + }, + validation: { + title: 'Please fix the following issues', + invalid: 'Invalid rule', + nameRequired: 'Name is required', + metricRequired: 'Metric is required', + operatorRequired: 'Operator is required', + thresholdRequired: 'Threshold must be a number', + windowRange: 'Window must be one of: 1, 5, 60 minutes', + sustainedRange: 'Sustained must be between 1 and 1440 samples', + cooldownRange: 'Cooldown must be between 0 and 1440 minutes' + } + }, + runtime: { + title: 'Ops Runtime Settings', + description: 'Stored in database; changes take effect without editing config files.', + loading: 'Loading...', + noData: 'No runtime settings available', + loadFailed: 'Failed to load runtime settings', + saveSuccess: 'Runtime settings saved', + saveFailed: 'Failed to save runtime settings', + alertTitle: 'Alert Evaluator', + groupAvailabilityTitle: 'Group Availability Monitor', + evalIntervalSeconds: 'Evaluation Interval (seconds)', + silencing: { + title: 'Alert Silencing (Maintenance Mode)', + enabled: 'Enable silencing', + globalUntil: 'Silence until (RFC3339)', + untilPlaceholder: '2026-01-05T00:00:00Z', + untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).', + reason: 'Reason', + reasonPlaceholder: 'e.g., planned maintenance', + entries: { + title: 'Advanced: targeted silencing', + hint: 'Optional: silence only certain rules or severities. 
Leave fields empty to match all.', + add: 'Add Entry', + empty: 'No targeted entries', + entryTitle: 'Entry #{n}', + ruleId: 'Rule ID (optional)', + ruleIdPlaceholder: 'e.g., 1', + severities: 'Severities (optional)', + severitiesPlaceholder: 'e.g., P0,P1 (empty = all)', + until: 'Until (RFC3339)', + reason: 'Reason', + validation: { + untilRequired: 'Entry until time is required', + untilFormat: 'Entry until time must be a valid RFC3339 timestamp', + ruleIdPositive: 'Entry rule_id must be a positive integer', + severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3' + } + }, + validation: { + timeFormat: 'Silence time must be a valid RFC3339 timestamp' + } + }, + lockEnabled: 'Distributed Lock Enabled', + lockKey: 'Distributed Lock Key', + lockTTLSeconds: 'Distributed Lock TTL (seconds)', + showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)', + advancedSettingsSummary: 'Advanced settings (Distributed Lock)', + evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.', + validation: { + title: 'Please fix the following issues', + invalid: 'Invalid settings', + evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds', + lockKeyRequired: 'Distributed lock key is required when lock is enabled', + lockKeyPrefix: 'Distributed lock key must start with "{prefix}"', + lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts', + lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds' + } + }, + email: { + title: 'Email Notification', + description: 'Configure alert/report email notifications (stored in database).', + loading: 'Loading...', + noData: 'No email notification config', + loadFailed: 'Failed to load email notification config', + saveSuccess: 'Email notification config saved', + saveFailed: 'Failed to save email notification config', + alertTitle: 'Alert Emails', + reportTitle: 'Report Emails', + recipients: 'Recipients', + recipientsHint: 'If empty, the system may fallback to the first admin email.', + minSeverity: 'Min Severity', + minSeverityAll: 'All severities', + rateLimitPerHour: 'Rate limit per hour', + batchWindowSeconds: 'Batch window (seconds)', + includeResolved: 'Include resolved alerts', + dailySummary: 'Daily summary', + weeklySummary: 'Weekly summary', + errorDigest: 'Error digest', + errorDigestMinCount: 'Min errors for digest', + accountHealth: 'Account health', + accountHealthThreshold: 'Error rate threshold (%)', + cronPlaceholder: 'Cron expression', + reportHint: 'Schedules use cron syntax; leave empty to use defaults.', + validation: { + title: 'Please fix the following issues', + invalid: 'Invalid email notification config', + alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured', + reportRecipientsRequired: 'Report emails are enabled but no recipients are configured', + invalidRecipients: 'One or more recipient emails are invalid', + rateLimitRange: 'Rate limit per hour must be a number ≥ 0', + batchWindowRange: 'Batch window must be between 0 and 86400 seconds', + cronRequired: 'A cron expression is required when schedule is enabled', + cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)', + digestMinCountRange: 'Min errors for digest must be a number ≥ 0', + accountHealthThresholdRange: 'Account health threshold must be between 0 and 100' + } + }, + concurrency: { + title: 'Concurrency / Queue', + byPlatform: 'By Platform', + byGroup: 'By Group', + byAccount: 'By Account', + 
totalRows: '{count} rows', + disabledHint: 'Realtime monitoring is disabled in settings.', + empty: 'No data', + queued: 'Queue {count}', + rateLimited: 'Rate-limited {count}', + errorAccounts: 'Errors {count}', + loadFailed: 'Failed to load concurrency data' + }, + realtime: { + connected: 'Realtime connected', + connecting: 'Realtime connecting', + reconnecting: 'Realtime reconnecting', + offline: 'Realtime offline', + closed: 'Realtime closed', + reconnectIn: 'retry in {seconds}s' + }, + queryMode: { + auto: 'Auto', + raw: 'Raw', + preagg: 'Preagg' + }, + accountAvailability: { + available: 'Available', + unavailable: 'Unavailable', + accountError: 'Error' + }, + tooltips: { + throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', + latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', + errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', + errorDistribution: 'Error distribution by status code.' + }, + charts: { + emptyRequest: 'No requests in this window.', + emptyError: 'No errors in this window.', + resetZoom: 'Reset', + resetZoomHint: 'Reset zoom (if enabled)', + downloadChart: 'Download', + downloadChartHint: 'Download chart as image' + } + }, + // Settings settings: { title: 'System Settings', @@ -1803,6 +2171,20 @@ export default { sending: 'Sending...', enterRecipientHint: 'Please enter a recipient email address' }, + opsMonitoring: { + title: 'Ops Monitoring', + description: 'Enable ops monitoring for troubleshooting and health visibility', + disabled: 'Ops monitoring is disabled', + enabled: 'Enable Ops Monitoring', + enabledHint: 'Enable the ops monitoring module (admin only)', + realtimeEnabled: 'Enable Realtime Monitoring', + realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)', + queryMode: 'Default Query Mode', + queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)', + queryModeAuto: 'Auto (recommended)', + queryModeRaw: 'Raw (most accurate, slower)', + queryModePreagg: 'Preagg (fastest, requires aggregation)' + }, adminApiKey: { title: 'Admin API Key', description: 'Global API key for external system integration with full admin access', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index fb46bbbe..646511f4 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -128,6 +128,7 @@ export default { noData: '暂无数据', success: '成功', error: '错误', + critical: '严重', warning: '警告', info: '提示', active: '启用', @@ -142,6 +143,8 @@ export default { copiedToClipboard: '已复制到剪贴板', copyFailed: '复制失败', contactSupport: '联系客服', + add: '添加', + invalidEmail: '请输入有效的邮箱地址', selectOption: '请选择', searchPlaceholder: '搜索...', noOptionsFound: '无匹配选项', @@ -175,6 +178,7 @@ export default { accounts: '账号管理', proxies: 'IP管理', redeemCodes: '兑换码', + ops: '运维监控', settings: '系统设置', myAccount: '我的账户', lightMode: '浅色模式', @@ -1858,6 +1862,370 @@ export default { failedToLoad: '加载使用记录失败' }, + // Ops Monitoring + ops: { + title: '运维监控', + description: '运维监控与排障', + // Dashboard + systemHealth: '系统健康', + overview: '概览', + noSystemMetrics: '尚未收集系统指标。', + collectedAt: '采集时间:', + window: '窗口', + cpu: 'CPU', + memory: '内存', + db: '数据库', + redis: 'Redis', + goroutines: '协程', + jobs: '后台任务', + active: '活跃', + idle: '空闲', + ok: '正常', + lastRun: '最近运行', + lastSuccess: '最近成功', + lastError: '最近错误', + noData: '暂无数据', + loadingText: '加载中...', + ready: '就绪', + requestsTotal: '请求(总计)', + slaScope: 'SLA 范围:', + tokens: 'Token', + tps: 'TPS', + 
current: '当前', + peak: '峰值', + sla: 'SLA(排除业务限制)', + businessLimited: '业务限制:', + errors: '错误', + errorRate: '错误率:', + upstreamRate: '上游错误率:', + latencyDuration: '延迟 (duration_ms)', + ttftLabel: 'TTFT (first_token_ms)', + p50: 'p50', + p90: 'p90', + p95: 'p95', + p99: 'p99', + avg: 'avg', + max: 'max', + qps: 'QPS', + requests: '请求', + upstream: '上游', + client: '客户端', + system: '系统', + other: '其他', + errorsSla: '错误(SLA范围)', + upstreamExcl429529: '上游(排除429/529)', + failedToLoadData: '加载运维数据失败', + tpsK: 'TPS (K)', + top: '最高:', + throughputTrend: '吞吐趋势', + latencyHistogram: '延迟分布', + errorTrend: '错误趋势', + errorDistribution: '错误分布', + // Error Log + errorLog: { + timeId: '时间 / ID', + context: '上下文', + status: '状态码', + message: '消息', + latency: '延迟', + action: '操作', + noErrors: '该窗口内暂无错误。', + grp: 'GRP:', + acc: 'ACC:', + details: '详情', + phase: '阶段' + }, + // Error Details Modal + errorDetails: { + upstreamErrors: '上游错误', + requestErrors: '请求错误', + total: '总计:', + searchPlaceholder: '搜索 request_id / client_request_id / message', + accountIdPlaceholder: 'account_id' + }, + // Error Detail Modal + errorDetail: { + loading: '加载中…', + requestId: '请求 ID', + time: '时间', + phase: '阶段', + status: '状态码', + message: '消息', + basicInfo: '基本信息', + platform: '平台', + model: '模型', + latency: '延迟', + ttft: 'TTFT', + businessLimited: '业务限制', + requestPath: '请求路径', + timings: '时序信息', + auth: '认证', + routing: '路由', + upstream: '上游', + response: '响应', + retry: '重试', + retryClient: '重试(客户端)', + retryUpstream: '重试(上游固定)', + pinnedAccountId: '固定 account_id', + retryNotes: '重试说明', + requestBody: '请求体', + errorBody: '错误体', + trimmed: '已截断', + confirmRetry: '确认重试', + retrySuccess: '重试成功', + retryFailed: '重试失败', + na: 'N/A', + retryHint: '重试将使用相同的请求参数重新发送请求', + retryClientHint: '使用客户端重试(不固定账号)', + retryUpstreamHint: '使用上游固定重试(固定到错误的账号)', + pinnedAccountIdHint: '(自动从错误日志获取)', + retryNote1: '重试会使用相同的请求体和参数', + retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败', + retryNote3: '客户端重试会重新选择账号', + confirmRetryMessage: '确认要重试该请求吗?', + confirmRetryHint: '将使用相同的请求参数重新发送' + }, + requestDetails: { + title: '请求明细', + details: '明细', + rangeLabel: '窗口:{range}', + rangeMinutes: '{n} 分钟', + rangeHours: '{n} 小时', + empty: '该窗口内暂无请求。', + emptyHint: '可尝试调整时间范围或取消部分筛选。', + failedToLoad: '加载请求明细失败', + requestIdCopied: '请求ID已复制', + copyFailed: '复制失败', + copy: '复制', + viewError: '查看错误', + kind: { + success: '成功', + error: '失败' + }, + table: { + time: '时间', + kind: '类型', + platform: '平台', + model: '模型', + duration: '耗时', + status: '状态码', + requestId: '请求ID', + actions: '操作' + } + }, + alertEvents: { + title: '告警事件', + description: '最近的告警触发/恢复记录(仅邮件通知)', + loading: '加载中...', + empty: '暂无告警事件', + loadFailed: '加载告警事件失败', + table: { + time: '时间', + status: '状态', + severity: '级别', + title: '标题', + metric: '指标 / 阈值', + email: '邮件已发送' + } + }, + alertRules: { + title: '告警规则', + description: '创建与管理系统阈值告警(仅邮件通知)', + loading: '加载中...', + empty: '暂无告警规则', + loadFailed: '加载告警规则失败', + saveFailed: '保存告警规则失败', + deleteFailed: '删除告警规则失败', + create: '新建规则', + createTitle: '新建告警规则', + editTitle: '编辑告警规则', + deleteConfirmTitle: '确认删除该规则?', + deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?', + metrics: { + successRate: '成功率 (%)', + errorRate: '错误率 (%)', + p95: 'P95 延迟 (ms)', + p99: 'P99 延迟 (ms)', + cpu: 'CPU 使用率 (%)', + memory: '内存使用率 (%)', + queueDepth: '并发排队深度' + }, + table: { + name: '名称', + metric: '指标', + severity: '级别', + enabled: '启用', + actions: '操作' + }, + form: { + name: '名称', + description: '描述', + metric: '指标', + operator: '运算符', + threshold: '阈值', + severity: '级别', 
+ window: '统计窗口(分钟)', + sustained: '连续样本数(每分钟)', + cooldown: '冷却期(分钟)', + enabled: '启用', + notifyEmail: '发送邮件通知' + }, + validation: { + title: '请先修正以下问题', + invalid: '规则不合法', + nameRequired: '名称不能为空', + metricRequired: '指标不能为空', + operatorRequired: '运算符不能为空', + thresholdRequired: '阈值必须为数字', + windowRange: '统计窗口必须为 1 / 5 / 60 分钟之一', + sustainedRange: '连续样本数必须在 1 到 1440 之间', + cooldownRange: '冷却期必须在 0 到 1440 分钟之间' + } + }, + runtime: { + title: '运维监控运行设置', + description: '配置存储在数据库中,无需修改 config 文件即可生效。', + loading: '加载中...', + noData: '暂无运行设置', + loadFailed: '加载运行设置失败', + saveSuccess: '运行设置已保存', + saveFailed: '保存运行设置失败', + alertTitle: '告警评估器', + groupAvailabilityTitle: '分组可用性监控', + evalIntervalSeconds: '评估间隔(秒)', + silencing: { + title: '告警静默(维护模式)', + enabled: '启用静默', + globalUntil: '静默截止时间(RFC3339)', + untilPlaceholder: '2026-01-05T00:00:00Z', + untilHint: '建议填写截止时间,避免忘记关闭静默。', + reason: '原因', + reasonPlaceholder: '例如:计划维护', + entries: { + title: '高级:定向静默', + hint: '可选:仅静默特定规则或特定级别。字段留空表示匹配全部。', + add: '新增条目', + empty: '暂无定向静默条目', + entryTitle: '条目 #{n}', + ruleId: '规则ID(可选)', + ruleIdPlaceholder: '例如:1', + severities: '级别(可选)', + severitiesPlaceholder: '例如:P0,P1(留空=全部)', + until: '截止时间(RFC3339)', + reason: '原因', + validation: { + untilRequired: '条目截止时间不能为空', + untilFormat: '条目截止时间必须为合法的 RFC3339 时间戳', + ruleIdPositive: '条目 rule_id 必须为正整数', + severitiesFormat: '条目级别必须为 P0..P3 的逗号分隔列表' + } + }, + validation: { + timeFormat: '静默时间必须为合法的 RFC3339 时间戳' + } + }, + lockEnabled: '启用分布式锁', + lockKey: '分布式锁 Key', + lockTTLSeconds: '分布式锁 TTL(秒)', + showAdvancedDeveloperSettings: '显示高级开发者设置 (Distributed Lock)', + advancedSettingsSummary: '高级设置 (分布式锁)', + evalIntervalHint: '检测任务的执行频率,建议保持默认。', + validation: { + title: '请先修正以下问题', + invalid: '设置不合法', + evalIntervalRange: '评估间隔必须在 1 到 86400 秒之间', + lockKeyRequired: '启用分布式锁时必须填写 Lock Key', + lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头', + lockKeyHint: '建议以「{prefix}」开头以避免冲突', + lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间' + } + }, + email: { + title: '邮件通知配置', + description: '配置告警/报告邮件通知(存储在数据库中)。', + loading: '加载中...', + noData: '暂无邮件通知配置', + loadFailed: '加载邮件通知配置失败', + saveSuccess: '邮件通知配置已保存', + saveFailed: '保存邮件通知配置失败', + alertTitle: '告警邮件', + reportTitle: '报告邮件', + recipients: '收件人', + recipientsHint: '若为空,系统可能会回退使用第一个管理员邮箱。', + minSeverity: '最低级别', + minSeverityAll: '全部级别', + rateLimitPerHour: '每小时限额', + batchWindowSeconds: '合并窗口(秒)', + includeResolved: '包含恢复通知', + dailySummary: '每日摘要', + weeklySummary: '每周摘要', + errorDigest: '错误摘要', + errorDigestMinCount: '错误摘要最小数量', + accountHealth: '账号健康报告', + accountHealthThreshold: '错误率阈值(%)', + cronPlaceholder: 'Cron 表达式', + reportHint: '发送时间使用 Cron 语法;留空将使用默认值。', + validation: { + title: '请先修正以下问题', + invalid: '邮件通知配置不合法', + alertRecipientsRequired: '已启用告警邮件,但未配置任何收件人', + reportRecipientsRequired: '已启用报告邮件,但未配置任何收件人', + invalidRecipients: '存在不合法的收件人邮箱', + rateLimitRange: '每小时限额必须为 ≥ 0 的数字', + batchWindowRange: '合并窗口必须在 0 到 86400 秒之间', + cronRequired: '启用定时任务时必须填写 Cron 表达式', + cronFormat: 'Cron 表达式格式可能不正确(至少应包含 5 段)', + digestMinCountRange: '错误摘要最小数量必须为 ≥ 0 的数字', + accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间' + } + }, + concurrency: { + title: '并发 / 排队', + byPlatform: '按平台', + byGroup: '按分组', + byAccount: '按账号', + totalRows: '共 {count} 项', + disabledHint: '已在设置中关闭实时监控。', + empty: '暂无数据', + queued: '队列 {count}', + rateLimited: '限流 {count}', + errorAccounts: '异常 {count}', + loadFailed: '加载并发数据失败' + }, + realtime: { + connected: '实时已连接', + connecting: '实时连接中', + reconnecting: '实时重连中', + offline: '实时离线', + 
closed: '实时已关闭', + reconnectIn: '重连 {seconds}s' + }, + queryMode: { + auto: 'Auto(自动)', + raw: 'Raw(不聚合)', + preagg: 'Preagg(聚合)' + }, + accountAvailability: { + available: '可用', + unavailable: '不可用', + accountError: '异常' + }, + tooltips: { + throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', + latencyHistogram: '成功请求的延迟分布(duration_ms)。', + errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', + errorDistribution: '按状态码统计的错误分布。' + }, + charts: { + emptyRequest: '该时间窗口内暂无请求。', + emptyError: '该时间窗口内暂无错误。', + resetZoom: '重置', + resetZoomHint: '重置缩放(若启用)', + downloadChart: '下载', + downloadChartHint: '下载图表图片' + } + }, + // Settings settings: { title: '系统设置', @@ -1947,6 +2315,20 @@ export default { sending: '发送中...', enterRecipientHint: '请输入收件人邮箱地址' }, + opsMonitoring: { + title: '运维监控', + description: '启用运维监控模块,用于排障与健康可视化', + disabled: '运维监控已关闭', + enabled: '启用运维监控', + enabledHint: '启用 Ops 运维监控模块(仅管理员可见)', + realtimeEnabled: '启用实时监控', + realtimeEnabledHint: '启用实时 QPS/指标推送(WebSocket)', + queryMode: '默认查询模式', + queryModeHint: 'Ops Dashboard 默认查询模式(auto/raw/preagg)', + queryModeAuto: '自动(推荐)', + queryModeRaw: 'Raw(最准,但较慢)', + queryModePreagg: 'Preagg(最快,需预聚合)' + }, adminApiKey: { title: '管理员 API Key', description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限', From 8ae75e7f6ecc21fe7e45df143ff0fbe602d95be6 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Fri, 9 Jan 2026 21:00:04 +0800 Subject: [PATCH 14/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AFUI):=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=89=8D=E7=AB=AF?= =?UTF-8?q?=E7=95=8C=E9=9D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增帮助提示组件(HelpTooltip.vue) - 更新侧边栏添加 ops 监控菜单项 - 扩展设置视图集成 ops 配置面板 - 新增 ops 监控视图目录(dashboard, alerts, realtime, settings 等) --- .../src/components/common/HelpTooltip.vue | 44 + frontend/src/components/layout/AppSidebar.vue | 25 +- frontend/src/views/admin/SettingsView.vue | 104 ++- frontend/src/views/admin/ops/OpsDashboard.vue | 854 ++++++++++++++++++ .../ops/components/OpsAlertEventsCard.vue | 165 ++++ .../ops/components/OpsAlertRulesCard.vue | 357 ++++++++ .../ops/components/OpsConcurrencyCard.vue | 525 +++++++++++ .../ops/components/OpsDashboardHeader.vue | 374 ++++++++ .../ops/components/OpsDashboardSkeleton.vue | 53 ++ .../components/OpsEmailNotificationCard.vue | 441 +++++++++ .../ops/components/OpsErrorDetailModal.vue | 360 ++++++++ .../ops/components/OpsErrorDetailsModal.vue | 293 ++++++ .../components/OpsErrorDistributionChart.vue | 157 ++++ .../admin/ops/components/OpsErrorLogTable.vue | 238 +++++ .../ops/components/OpsErrorTrendChart.vue | 185 ++++ .../admin/ops/components/OpsLatencyChart.vue | 101 +++ .../ops/components/OpsRequestDetailsModal.vue | 309 +++++++ .../ops/components/OpsRuntimeSettingsCard.vue | 439 +++++++++ .../components/OpsThroughputTrendChart.vue | 252 ++++++ frontend/src/views/admin/ops/types.ts | 17 + .../views/admin/ops/utils/opsFormatters.ts | 75 ++ 21 files changed, 5362 insertions(+), 6 deletions(-) create mode 100644 frontend/src/components/common/HelpTooltip.vue create mode 100644 frontend/src/views/admin/ops/OpsDashboard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsDashboardHeader.vue create mode 100644 
frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue create mode 100644 frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue create mode 100644 frontend/src/views/admin/ops/components/OpsErrorLogTable.vue create mode 100644 frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue create mode 100644 frontend/src/views/admin/ops/components/OpsLatencyChart.vue create mode 100644 frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue create mode 100644 frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue create mode 100644 frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue create mode 100644 frontend/src/views/admin/ops/types.ts create mode 100644 frontend/src/views/admin/ops/utils/opsFormatters.ts diff --git a/frontend/src/components/common/HelpTooltip.vue b/frontend/src/components/common/HelpTooltip.vue new file mode 100644 index 00000000..7679ced4 --- /dev/null +++ b/frontend/src/components/common/HelpTooltip.vue @@ -0,0 +1,44 @@ + + + + diff --git a/frontend/src/components/layout/AppSidebar.vue b/frontend/src/components/layout/AppSidebar.vue index 791327a1..78217ec8 100644 --- a/frontend/src/components/layout/AppSidebar.vue +++ b/frontend/src/components/layout/AppSidebar.vue @@ -144,10 +144,10 @@ diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue new file mode 100644 index 00000000..c2c6adb6 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -0,0 +1,374 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue new file mode 100644 index 00000000..5bbadd03 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue @@ -0,0 +1,53 @@ + + diff --git a/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue b/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue new file mode 100644 index 00000000..0204cbeb --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue @@ -0,0 +1,441 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue new file mode 100644 index 00000000..118a1f3a --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue @@ -0,0 +1,360 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue new file mode 100644 index 00000000..f4a522de --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue @@ -0,0 +1,293 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue b/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue new file mode 100644 index 00000000..a52b5442 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue @@ -0,0 +1,157 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue new file mode 100644 index 00000000..6a4be1a7 --- 
/dev/null +++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue @@ -0,0 +1,238 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue b/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue new file mode 100644 index 00000000..032e1205 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue @@ -0,0 +1,185 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsLatencyChart.vue b/frontend/src/views/admin/ops/components/OpsLatencyChart.vue new file mode 100644 index 00000000..c62b3aa9 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsLatencyChart.vue @@ -0,0 +1,101 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue new file mode 100644 index 00000000..541aa3ed --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue @@ -0,0 +1,309 @@ + + + diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue new file mode 100644 index 00000000..e9df347d --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue @@ -0,0 +1,439 @@ + + + + diff --git a/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue b/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue new file mode 100644 index 00000000..e3bd26c2 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue @@ -0,0 +1,252 @@ + + + diff --git a/frontend/src/views/admin/ops/types.ts b/frontend/src/views/admin/ops/types.ts new file mode 100644 index 00000000..08830542 --- /dev/null +++ b/frontend/src/views/admin/ops/types.ts @@ -0,0 +1,17 @@ +// Ops 前端视图层的共享类型(与后端 DTO 解耦)。 + +export type ChartState = 'loading' | 'empty' | 'ready' + +// Re-export ops alert/settings types so view components can import from a single place +// while keeping the API contract centralized in `@/api/admin/ops`. +export type { + AlertRule, + AlertEvent, + AlertSeverity, + ThresholdMode, + MetricType, + Operator, + EmailNotificationConfig, + OpsDistributedLockSettings, + OpsAlertRuntimeSettings +} from '@/api/admin/ops' diff --git a/frontend/src/views/admin/ops/utils/opsFormatters.ts b/frontend/src/views/admin/ops/utils/opsFormatters.ts new file mode 100644 index 00000000..d503b5a5 --- /dev/null +++ b/frontend/src/views/admin/ops/utils/opsFormatters.ts @@ -0,0 +1,75 @@ +/** + * Ops 页面共享的格式化/样式工具。 + * + * 目标:尽量对齐 `docs/sub2api` 备份版本的视觉表现(需求一致部分保持一致), + * 同时避免引入额外 UI 依赖。 + */ + +import type { OpsSeverity } from '@/api/admin/ops' +import { formatBytes } from '@/utils/format' + +export function getSeverityClass(severity: OpsSeverity): string { + const classes: Record = { + P0: 'bg-red-100 text-red-800 dark:bg-red-900/30 dark:text-red-400', + P1: 'bg-orange-100 text-orange-800 dark:bg-orange-900/30 dark:text-orange-400', + P2: 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900/30 dark:text-yellow-400', + P3: 'bg-blue-100 text-blue-800 dark:bg-blue-900/30 dark:text-blue-400' + } + return classes[String(severity || '')] || classes.P3 +} + +export function truncateMessage(msg: string, maxLength = 80): string { + if (!msg) return '' + return msg.length > maxLength ? msg.substring(0, maxLength) + '...' 
: msg +} + +/** + * 格式化日期时间(短格式,和旧 Ops 页面一致)。 + * 输出: `MM-DD HH:mm:ss` + */ +export function formatDateTime(dateStr: string): string { + const d = new Date(dateStr) + if (Number.isNaN(d.getTime())) return '' + return `${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')} ${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}:${String(d.getSeconds()).padStart(2, '0')}` +} + +export function sumNumbers(values: Array): number { + return values.reduce((acc, v) => { + const n = typeof v === 'number' && Number.isFinite(v) ? v : 0 + return acc + n + }, 0) +} + +/** + * 解析 time_range 为分钟数。 + * 支持:`5m/30m/1h/6h/24h` + */ +export function parseTimeRangeMinutes(range: string): number { + const trimmed = (range || '').trim() + if (!trimmed) return 60 + if (trimmed.endsWith('m')) { + const v = Number.parseInt(trimmed.slice(0, -1), 10) + return Number.isFinite(v) && v > 0 ? v : 60 + } + if (trimmed.endsWith('h')) { + const v = Number.parseInt(trimmed.slice(0, -1), 10) + return Number.isFinite(v) && v > 0 ? v * 60 : 60 + } + return 60 +} + +export function formatHistoryLabel(date: string | undefined, timeRange: string): string { + if (!date) return '' + const d = new Date(date) + if (Number.isNaN(d.getTime())) return '' + const minutes = parseTimeRangeMinutes(timeRange) + if (minutes >= 24 * 60) { + return `${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')} ${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}` + } + return `${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}` +} + +export function formatByteRate(bytes: number, windowMinutes: number): string { + const seconds = Math.max(1, (windowMinutes || 1) * 60) + return `${formatBytes(bytes / seconds, 1)}/s` +} From 585257d34030c5f068a444cc1b718cc73ae9fa37 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sat, 10 Jan 2026 01:38:47 +0800 Subject: [PATCH 15/53] =?UTF-8?q?feat(=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?= =?UTF-8?q?):=20=E5=A2=9E=E5=BC=BA=E7=9B=91=E6=8E=A7=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E5=92=8C=E5=81=A5=E5=BA=B7=E8=AF=84=E5=88=86=E7=B3=BB=E7=BB=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 后端改进: - 新增健康评分计算服务(ops_health_score.go) - 添加分布式锁支持(ops_advisory_lock.go) - 优化指标采集和聚合逻辑 - 新增运维指标采集间隔配置(60-3600秒) - 移除未使用的WebSocket查询token认证中间件 - 改进清理服务和告警评估逻辑 前端改进: - 简化OpsDashboard组件结构 - 完善国际化文本(中英文) - 新增运维监控相关API类型定义 - 添加运维指标采集间隔设置界面 - 优化错误详情模态框 测试: - 添加健康评分单元测试 - 更新API契约测试 --- .../internal/handler/admin/setting_handler.go | 28 +- backend/internal/handler/dto/settings.go | 5 +- .../internal/repository/ops_repo_metrics.go | 27 +- backend/internal/server/api_contract_test.go | 4 +- .../server/middleware/ws_query_token_auth.go | 54 ---- backend/internal/server/router.go | 2 - backend/internal/service/ops_advisory_lock.go | 46 ++++ .../service/ops_aggregation_service.go | 33 ++- .../service/ops_alert_evaluator_service.go | 7 +- .../internal/service/ops_cleanup_service.go | 36 +-- backend/internal/service/ops_dashboard.go | 13 + .../internal/service/ops_dashboard_models.go | 4 + backend/internal/service/ops_health_score.go | 126 +++++++++ .../internal/service/ops_health_score_test.go | 60 +++++ .../internal/service/ops_metrics_collector.go | 66 ++--- backend/internal/service/ops_port.go | 10 + backend/internal/service/setting_service.go | 16 ++ backend/internal/service/settings_view.go | 1 + 
frontend/src/api/admin/ops.ts | 5 + frontend/src/api/admin/settings.ts | 2 + frontend/src/i18n/locales/en.ts | 60 ++++- frontend/src/i18n/locales/zh.ts | 76 +++++- frontend/src/views/admin/SettingsView.vue | 25 +- frontend/src/views/admin/ops/OpsDashboard.vue | 245 +----------------- .../ops/components/OpsErrorDetailModal.vue | 4 +- 25 files changed, 570 insertions(+), 385 deletions(-) delete mode 100644 backend/internal/server/middleware/ws_query_token_auth.go create mode 100644 backend/internal/service/ops_advisory_lock.go create mode 100644 backend/internal/service/ops_health_score.go create mode 100644 backend/internal/service/ops_health_score_test.go diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go index 4d4d5639..59f47010 100644 --- a/backend/internal/handler/admin/setting_handler.go +++ b/backend/internal/handler/admin/setting_handler.go @@ -68,6 +68,7 @@ func (h *SettingHandler) GetSettings(c *gin.Context) { OpsMonitoringEnabled: settings.OpsMonitoringEnabled, OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, OpsQueryModeDefault: settings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds, }) } @@ -115,9 +116,10 @@ type UpdateSettingsRequest struct { IdentityPatchPrompt string `json:"identity_patch_prompt"` // Ops monitoring (vNext) - OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` - OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` + OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` OpsQueryModeDefault *string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"` } // UpdateSettings 更新系统设置 @@ -173,6 +175,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { } } + // Ops metrics collector interval validation (seconds). 
+ if req.OpsMetricsIntervalSeconds != nil { + v := *req.OpsMetricsIntervalSeconds + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + req.OpsMetricsIntervalSeconds = &v + } + settings := &service.SystemSettings{ RegistrationEnabled: req.RegistrationEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled, @@ -219,6 +233,12 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { } return previousSettings.OpsQueryModeDefault }(), + OpsMetricsIntervalSeconds: func() int { + if req.OpsMetricsIntervalSeconds != nil { + return *req.OpsMetricsIntervalSeconds + } + return previousSettings.OpsMetricsIntervalSeconds + }(), } if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { @@ -266,6 +286,7 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) { OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, + OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds, }) } @@ -375,6 +396,9 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings, if before.OpsQueryModeDefault != after.OpsQueryModeDefault { changed = append(changed, "ops_query_mode_default") } + if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds { + changed = append(changed, "ops_metrics_interval_seconds") + } return changed } diff --git a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go index 6fd53b26..3f631bfa 100644 --- a/backend/internal/handler/dto/settings.go +++ b/backend/internal/handler/dto/settings.go @@ -39,9 +39,10 @@ type SystemSettings struct { IdentityPatchPrompt string `json:"identity_patch_prompt"` // Ops monitoring (vNext) - OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` - OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` + OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` + OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` OpsQueryModeDefault string `json:"ops_query_mode_default"` + OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"` } type PublicSettings struct { diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go index 96bad88a..75345595 100644 --- a/backend/internal/repository/ops_repo_metrics.go +++ b/backend/internal/repository/ops_repo_metrics.go @@ -68,6 +68,9 @@ INSERT INTO ops_system_metrics ( db_ok, redis_ok, + redis_conn_total, + redis_conn_idle, + db_conn_active, db_conn_idle, db_conn_waiting, @@ -83,8 +86,9 @@ INSERT INTO ops_system_metrics ( $21,$22,$23,$24,$25,$26, $27,$28,$29,$30, $31,$32, - $33,$34,$35, - $36,$37 + $33,$34, + $35,$36,$37, + $38,$39 )` _, err := r.db.ExecContext( @@ -130,6 +134,9 @@ INSERT INTO ops_system_metrics ( opsNullBool(input.DBOK), opsNullBool(input.RedisOK), + opsNullInt(input.RedisConnTotal), + opsNullInt(input.RedisConnIdle), + opsNullInt(input.DBConnActive), opsNullInt(input.DBConnIdle), opsNullInt(input.DBConnWaiting), @@ -162,6 +169,9 @@ SELECT db_ok, redis_ok, + redis_conn_total, + redis_conn_idle, + db_conn_active, db_conn_idle, db_conn_waiting, @@ -182,6 +192,8 @@ LIMIT 1` var memPct sql.NullFloat64 var dbOK sql.NullBool var redisOK sql.NullBool + var redisTotal sql.NullInt64 + var redisIdle sql.NullInt64 var dbActive sql.NullInt64 var dbIdle sql.NullInt64 var dbWaiting sql.NullInt64 @@ -198,6 +210,8 @@ LIMIT 1` &memPct, &dbOK, &redisOK, + &redisTotal, + 
&redisIdle, &dbActive, &dbIdle, &dbWaiting, @@ -231,6 +245,14 @@ LIMIT 1` v := redisOK.Bool out.RedisOK = &v } + if redisTotal.Valid { + v := int(redisTotal.Int64) + out.RedisConnTotal = &v + } + if redisIdle.Valid { + v := int(redisIdle.Int64) + out.RedisConnIdle = &v + } if dbActive.Valid { v := int(dbActive.Int64) out.DBConnActive = &v @@ -398,4 +420,3 @@ func opsNullTime(v *time.Time) any { } return sql.NullTime{Time: *v, Valid: true} } - diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go index 23cab19c..f8140fe6 100644 --- a/backend/internal/server/api_contract_test.go +++ b/backend/internal/server/api_contract_test.go @@ -319,7 +319,9 @@ func TestAPIContracts(t *testing.T) { "enable_identity_patch": true, "identity_patch_prompt": "", "ops_monitoring_enabled": true, - "ops_realtime_monitoring_enabled": true + "ops_realtime_monitoring_enabled": true, + "ops_query_mode_default": "auto", + "ops_metrics_interval_seconds": 60 } }`, }, diff --git a/backend/internal/server/middleware/ws_query_token_auth.go b/backend/internal/server/middleware/ws_query_token_auth.go deleted file mode 100644 index 3b8d086a..00000000 --- a/backend/internal/server/middleware/ws_query_token_auth.go +++ /dev/null @@ -1,54 +0,0 @@ -package middleware - -import ( - "net/http" - "strings" - - "github.com/gin-gonic/gin" -) - -// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header -// for WebSocket handshake requests on a small allow-list of endpoints. -// -// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes -// are protected by header-based auth. This keeps the token support scoped to WS only. -func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc { - return func(c *gin.Context) { - if c == nil || c.Request == nil { - if c != nil { - c.Next() - } - return - } - - // Only GET websocket upgrades. - if c.Request.Method != http.MethodGet { - c.Next() - return - } - if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") { - c.Next() - return - } - - // If caller already supplied auth headers, don't override. - if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" { - c.Next() - return - } - - // Allow-list ops websocket endpoints. - path := strings.TrimSpace(c.Request.URL.Path) - if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") { - c.Next() - return - } - - token := strings.TrimSpace(c.Query("token")) - if token != "" { - c.Request.Header.Set("Authorization", "Bearer "+token) - } - - c.Next() - } -} diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 85df99bd..3ea087d6 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -25,8 +25,6 @@ func SetupRouter( ) *gin.Engine { // 应用中间件 r.Use(middleware2.Logger()) - // WebSocket handshake auth helper (token via query param, WS endpoints only). 
- r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket()) r.Use(middleware2.CORS(cfg.CORS)) r.Use(middleware2.SecurityHeaders(cfg.Security.CSP)) diff --git a/backend/internal/service/ops_advisory_lock.go b/backend/internal/service/ops_advisory_lock.go new file mode 100644 index 00000000..f7ef4cee --- /dev/null +++ b/backend/internal/service/ops_advisory_lock.go @@ -0,0 +1,46 @@ +package service + +import ( + "context" + "database/sql" + "hash/fnv" + "time" +) + +func hashAdvisoryLockID(key string) int64 { + h := fnv.New64a() + _, _ = h.Write([]byte(key)) + return int64(h.Sum64()) +} + +func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) { + if db == nil { + return nil, false + } + if ctx == nil { + ctx = context.Background() + } + + conn, err := db.Conn(ctx) + if err != nil { + return nil, false + } + + acquired := false + if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil { + _ = conn.Close() + return nil, false + } + if !acquired { + _ = conn.Close() + return nil, false + } + + release := func() { + unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID) + _ = conn.Close() + } + return release, true +} diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go index 04dbb11b..2a6afbba 100644 --- a/backend/internal/service/ops_aggregation_service.go +++ b/backend/internal/service/ops_aggregation_service.go @@ -376,28 +376,37 @@ return 0 `) func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { - if s == nil || s.redisClient == nil { - return nil, true + if s == nil { + return nil, false } if ctx == nil { ctx = context.Background() } - ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() - if err != nil { - // Fail-open: do not block single-instance deployments. - return nil, true + // Prefer Redis leader lock when available (multi-instance), but avoid stampeding + // the DB when Redis is flaky by falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + s.maybeLogSkip(logPrefix) + return nil, false + } + release := func() { + ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() + } + return release, true + } + // Redis error: fall through to DB advisory lock. 
} + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) if !ok { s.maybeLogSkip(logPrefix) return nil, false } - - release := func() { - ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result() - } return release, true } diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go index b970c720..81712136 100644 --- a/backend/internal/service/ops_alert_evaluator_service.go +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -720,11 +720,12 @@ func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, loc ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() if err != nil { - // Fail-open for single-node environments, but warn. + // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky. + // Single-node deployments can disable the distributed lock via runtime settings. s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err) + log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err) }) - return nil, true + return nil, false } if !ok { s.maybeLogSkip(key) diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go index ef825c04..08c6a16e 100644 --- a/backend/internal/service/ops_cleanup_service.go +++ b/backend/internal/service/ops_cleanup_service.go @@ -300,30 +300,36 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b return nil, true } - if s.redisClient == nil { - s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsCleanup] redis not configured; running without distributed lock") - }) - return nil, true - } - key := opsCleanupLeaderLockKeyDefault ttl := opsCleanupLeaderLockTTLDefault - ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() - if err != nil { + // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by + // falling back to a DB advisory lock. + if s.redisClient != nil { + ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() + if err == nil { + if !ok { + return nil, false + } + return func() { + _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() + }, true + } + // Redis error: fall back to DB advisory lock. 
s.warnNoRedisOnce.Do(func() { - log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) + log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err) + }) + } else { + s.warnNoRedisOnce.Do(func() { + log.Printf("[OpsCleanup] redis not configured; using DB advisory lock") }) - return nil, true } + + release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key)) if !ok { return nil, false } - - return func() { - _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result() - }, true + return release, true } func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go index 23d6d82f..31822ba8 100644 --- a/backend/internal/service/ops_dashboard.go +++ b/backend/internal/service/ops_dashboard.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "log" + "time" infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" ) @@ -39,6 +40,16 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo // Best-effort system health + jobs; dashboard metrics should still render if these are missing. if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { + // Attach config-derived limits so the UI can show "current / max" for connection pools. + // These are best-effort and should never block the dashboard rendering. + if s != nil && s.cfg != nil { + if s.cfg.Database.MaxOpenConns > 0 { + metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns) + } + if s.cfg.Redis.PoolSize > 0 { + metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize) + } + } overview.SystemMetrics = metrics } else if err != nil && !errors.Is(err, sql.ErrNoRows) { log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) @@ -50,6 +61,8 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo log.Printf("[Ops] ListJobHeartbeats failed: %v", err) } + overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview) + return overview, nil } diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go index 51a0b1fb..f189031b 100644 --- a/backend/internal/service/ops_dashboard_models.go +++ b/backend/internal/service/ops_dashboard_models.go @@ -35,6 +35,10 @@ type OpsDashboardOverview struct { Platform string `json:"platform"` GroupID *int64 `json:"group_id"` + // HealthScore is a backend-computed overall health score (0-100). + // It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats. + HealthScore int `json:"health_score"` + // Latest system-level snapshot (window=1m, global). SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go new file mode 100644 index 00000000..68cfc10d --- /dev/null +++ b/backend/internal/service/ops_health_score.go @@ -0,0 +1,126 @@ +package service + +import ( + "math" + "time" +) + +// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview. +// +// Design goals: +// - Backend-owned scoring (UI only displays). +// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs). +// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. 
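+//
+// Illustrative walkthrough (editor's sketch; the numbers are hypothetical, not from this change):
+// with SLA 99.0%, error rate 2%, upstream error rate 0.5%, duration p99 = 3000ms and healthy infra,
+// the penalties below come to roughly 6 (SLA: (99.5-99.0)*12), 4 (errors: (2-1)*4), 0 (upstream <= 1%)
+// and ~1.1 (latency: (3000-2000)/900), so the overall score lands around 89.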
+func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { + if overview == nil { + return 0 + } + + // Idle/no-data: avoid showing a "bad" score when there is no traffic. + // UI can still render a gray/idle state based on QPS + error rate. + if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 { + return 100 + } + + score := 100.0 + + // --- SLA (primary signal) --- + // SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later. + slaPct := clampFloat64(overview.SLA*100, 0, 100) + if slaPct < 99.5 { + // Up to -45 points as SLA drops. + score -= math.Min(45, (99.5-slaPct)*12) + } + + // --- Error rates (secondary signal) --- + errorPct := clampFloat64(overview.ErrorRate*100, 0, 100) + if errorPct > 1 { + // Cap at -20 points by 6% error rate. + score -= math.Min(20, (errorPct-1)*4) + } + + upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100) + if upstreamPct > 1 { + // Upstream instability deserves extra weight, but keep it smaller than SLA/error. + score -= math.Min(15, (upstreamPct-1)*3) + } + + // --- Latency (tail-focused) --- + // Use p99 of duration + TTFT. Penalize only when clearly elevated. + if overview.Duration.P99 != nil { + p99 := float64(*overview.Duration.P99) + if p99 > 2000 { + // From 2s upward, gradually penalize up to -20. + score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20 + } + } + if overview.TTFT.P99 != nil { + p99 := float64(*overview.TTFT.P99) + if p99 > 500 { + // TTFT > 500ms starts hurting; cap at -10. + score -= math.Min(10, (p99-500)/200) // 2.5s => -10 + } + } + + // --- System metrics snapshot (best-effort) --- + if overview.SystemMetrics != nil { + if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK { + score -= 20 + } + if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK { + score -= 15 + } + + if overview.SystemMetrics.CPUUsagePercent != nil { + cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100) + if cpuPct > 85 { + score -= math.Min(10, (cpuPct-85)*1.5) + } + } + if overview.SystemMetrics.MemoryUsagePercent != nil { + memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100) + if memPct > 90 { + score -= math.Min(10, (memPct-90)*1.0) + } + } + + if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 { + waiting := float64(*overview.SystemMetrics.DBConnWaiting) + score -= math.Min(10, waiting*2) + } + if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 { + depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth) + score -= math.Min(10, depth*0.5) + } + } + + // --- Job heartbeats (best-effort) --- + // Penalize only clear "error after last success" signals, and cap the impact. 
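+ // Illustrative example (editor's note, hypothetical numbers): two jobs whose last success is older
+ // than 15 minutes plus one job whose last error is newer than its last success add 2+2+5 = 9 points,
+ // still below the 15-point cap applied after the loop.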
+ jobPenalty := 0.0 + for _, hb := range overview.JobHeartbeats { + if hb == nil { + continue + } + if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) { + jobPenalty += 5 + continue + } + if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute { + jobPenalty += 2 + } + } + score -= math.Min(15, jobPenalty) + + score = clampFloat64(score, 0, 100) + return int(math.Round(score)) +} + +func clampFloat64(v float64, min float64, max float64) float64 { + if v < min { + return min + } + if v > max { + return max + } + return v +} diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go new file mode 100644 index 00000000..d7e5dd8c --- /dev/null +++ b/backend/internal/service/ops_health_score_test.go @@ -0,0 +1,60 @@ +//go:build unit + +package service + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) { + t.Parallel() + + score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{}) + require.Equal(t, 100, score) +} + +func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) { + t.Parallel() + + ov := &OpsDashboardOverview{ + RequestCountTotal: 100, + RequestCountSLA: 100, + SuccessCount: 90, + ErrorCountTotal: 10, + ErrorCountSLA: 10, + + SLA: 0.90, + ErrorRate: 0.10, + UpstreamErrorRate: 0.08, + + Duration: OpsPercentiles{P99: intPtr(20_000)}, + TTFT: OpsPercentiles{P99: intPtr(2_000)}, + + SystemMetrics: &OpsSystemMetricsSnapshot{ + DBOK: boolPtr(false), + RedisOK: boolPtr(false), + CPUUsagePercent: float64Ptr(98.0), + MemoryUsagePercent: float64Ptr(97.0), + DBConnWaiting: intPtr(3), + ConcurrencyQueueDepth: intPtr(10), + }, + JobHeartbeats: []*OpsJobHeartbeat{ + { + JobName: "job-a", + LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)), + LastError: stringPtr("boom"), + }, + }, + } + + score := computeDashboardHealthScore(time.Now().UTC(), ov) + require.Less(t, score, 80) + require.GreaterOrEqual(t, score, 0) +} + +func timePtr(v time.Time) *time.Time { return &v } + +func stringPtr(v string) *string { return &v } diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go index cd90e1bd..e55e365b 100644 --- a/backend/internal/service/ops_metrics_collector.go +++ b/backend/internal/service/ops_metrics_collector.go @@ -5,7 +5,6 @@ import ( "database/sql" "errors" "fmt" - "hash/fnv" "log" "math" "os" @@ -262,6 +261,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { dbOK := c.checkDB(ctx) redisOK := c.checkRedis(ctx) active, idle := c.dbPoolStats() + redisTotal, redisIdle, redisStatsOK := c.redisPoolStats() successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) if err != nil { @@ -327,6 +327,19 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error { DBOK: boolPtr(dbOK), RedisOK: boolPtr(redisOK), + RedisConnTotal: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisTotal) + }(), + RedisConnIdle: func() *int { + if !redisStatsOK { + return nil + } + return intPtr(redisIdle) + }(), + DBConnActive: intPtr(active), DBConnIdle: intPtr(idle), GoroutineCount: intPtr(goroutines), @@ -722,6 +735,17 @@ func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool { return c.redisClient.Ping(ctx).Err() == nil } +func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) 
{ + if c == nil || c.redisClient == nil { + return 0, 0, false + } + stats := c.redisClient.PoolStats() + if stats == nil { + return 0, 0, false + } + return int(stats.TotalConns), int(stats.IdleConns), true +} + func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { if c == nil || c.db == nil { return 0, 0 @@ -749,7 +773,7 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), if err != nil { // Prefer fail-closed to avoid stampeding the database when Redis is flaky. // Fallback to a DB advisory lock when Redis is present but unavailable. - release, ok := c.tryAcquireDBAdvisoryLock(ctx) + release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID) if !ok { c.maybeLogSkip() return nil, false @@ -769,38 +793,6 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), return release, true } -func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) { - if c == nil || c.db == nil { - return nil, false - } - if ctx == nil { - ctx = context.Background() - } - - conn, err := c.db.Conn(ctx) - if err != nil { - return nil, false - } - - acquired := false - if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil { - _ = conn.Close() - return nil, false - } - if !acquired { - _ = conn.Close() - return nil, false - } - - release := func() { - unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID) - _ = conn.Close() - } - return release, true -} - func (c *OpsMetricsCollector) maybeLogSkip() { c.skipLogMu.Lock() defer c.skipLogMu.Unlock() @@ -853,9 +845,3 @@ func float64Ptr(v float64) *float64 { out := v return &out } - -func hashAdvisoryLockID(s string) int64 { - h := fnv.New64a() - _, _ = h.Write([]byte(s)) - return int64(h.Sum64()) -} diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index a3d847e0..90591a56 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -165,6 +165,9 @@ type OpsInsertSystemMetricsInput struct { DBOK *bool RedisOK *bool + RedisConnTotal *int + RedisConnIdle *int + DBConnActive *int DBConnIdle *int DBConnWaiting *int @@ -186,6 +189,13 @@ type OpsSystemMetricsSnapshot struct { DBOK *bool `json:"db_ok"` RedisOK *bool `json:"redis_ok"` + // Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max". 
+ DBMaxOpenConns *int `json:"db_max_open_conns"` + RedisPoolSize *int `json:"redis_pool_size"` + + RedisConnTotal *int `json:"redis_conn_total"` + RedisConnIdle *int `json:"redis_conn_idle"` + DBConnActive *int `json:"db_conn_active"` DBConnIdle *int `json:"db_conn_idle"` DBConnWaiting *int `json:"db_conn_waiting"` diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 1aea32be..09772616 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -139,6 +139,9 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) + if settings.OpsMetricsIntervalSeconds > 0 { + updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds) + } return s.settingRepo.SetMultiple(ctx, updates) } @@ -231,6 +234,7 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { SettingKeyOpsMonitoringEnabled: "true", SettingKeyOpsRealtimeMonitoringEnabled: "true", SettingKeyOpsQueryModeDefault: "auto", + SettingKeyOpsMetricsIntervalSeconds: "60", } return s.settingRepo.SetMultiple(ctx, defaults) @@ -301,6 +305,18 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) + result.OpsMetricsIntervalSeconds = 60 + if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" { + if v, err := strconv.Atoi(raw); err == nil { + if v < 60 { + v = 60 + } + if v > 3600 { + v = 3600 + } + result.OpsMetricsIntervalSeconds = v + } + } return result } diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go index e9d07bca..1f3d925a 100644 --- a/backend/internal/service/settings_view.go +++ b/backend/internal/service/settings_view.go @@ -43,6 +43,7 @@ type SystemSettings struct { OpsMonitoringEnabled bool OpsRealtimeMonitoringEnabled bool OpsQueryModeDefault string + OpsMetricsIntervalSeconds int } type PublicSettings struct { diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 3c3529a9..851993ca 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -46,6 +46,8 @@ export interface OpsDashboardOverview { platform: string group_id?: number | null + health_score?: number + system_metrics?: OpsSystemMetricsSnapshot | null job_heartbeats?: OpsJobHeartbeat[] | null @@ -228,6 +230,9 @@ export interface OpsSystemMetricsSnapshot { db_ok?: boolean | null redis_ok?: boolean | null + redis_conn_total?: number | null + redis_conn_idle?: number | null + db_conn_active?: number | null db_conn_idle?: number | null db_conn_waiting?: number | null diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts index 37b12e40..9ddeb5bf 100644 --- a/frontend/src/api/admin/settings.ts +++ b/frontend/src/api/admin/settings.ts @@ -50,6 +50,7 @@ export interface SystemSettings { ops_monitoring_enabled: boolean 
ops_realtime_monitoring_enabled: boolean ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds: number } export interface UpdateSettingsRequest { @@ -83,6 +84,7 @@ export interface UpdateSettingsRequest { ops_monitoring_enabled?: boolean ops_realtime_monitoring_enabled?: boolean ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string + ops_metrics_interval_seconds?: number } /** diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index f80a235f..1caae1d5 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1733,8 +1733,10 @@ export default { redis: 'Redis', goroutines: 'Goroutines', jobs: 'Jobs', + jobsHelp: 'Click “Details” to view job heartbeats and recent errors', active: 'active', idle: 'idle', + waiting: 'waiting', ok: 'ok', lastRun: 'last_run:', lastSuccess: 'last_success:', @@ -1770,12 +1772,50 @@ export default { errorsSla: 'Errors (SLA scope)', upstreamExcl429529: 'Upstream (excl 429/529)', failedToLoadData: 'Failed to load ops data.', + failedToLoadOverview: 'Failed to load overview', + failedToLoadThroughputTrend: 'Failed to load throughput trend', + failedToLoadLatencyHistogram: 'Failed to load latency histogram', + failedToLoadErrorTrend: 'Failed to load error trend', + failedToLoadErrorDistribution: 'Failed to load error distribution', + failedToLoadErrorDetail: 'Failed to load error detail', + retryFailed: 'Retry failed', tpsK: 'TPS (K)', top: 'Top:', throughputTrend: 'Throughput Trend', latencyHistogram: 'Latency Histogram', errorTrend: 'Error Trend', errorDistribution: 'Error Distribution', + // Health Score & Diagnosis + health: 'Health', + healthCondition: 'Health Condition', + healthHelp: 'Overall system health score based on SLA, error rate, and resource usage', + healthyStatus: 'Healthy', + riskyStatus: 'At Risk', + idleStatus: 'Idle', + diagnosis: { + title: 'Smart Diagnosis', + footer: 'Automated diagnostic suggestions based on current metrics', + idle: 'System is currently idle', + idleImpact: 'No active traffic', + upstreamCritical: 'Upstream error rate critically high ({rate}%)', + upstreamCriticalImpact: 'May affect many user requests', + upstreamHigh: 'Upstream error rate elevated ({rate}%)', + upstreamHighImpact: 'Recommend checking upstream service status', + slaCritical: 'SLA critically below target ({sla}%)', + slaCriticalImpact: 'User experience severely degraded', + slaLow: 'SLA below target ({sla}%)', + slaLowImpact: 'Service quality needs attention', + errorHigh: 'Error rate too high ({rate}%)', + errorHighImpact: 'Many requests failing', + errorElevated: 'Error rate elevated ({rate}%)', + errorElevatedImpact: 'Recommend checking error logs', + healthCritical: 'Overall health score critically low ({score})', + healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation', + healthLow: 'Overall health score low ({score})', + healthLowImpact: 'May indicate minor instability; monitor SLA and error rates', + healthy: 'All system metrics normal', + healthyImpact: 'Service running stable' + }, // Error Log errorLog: { timeId: 'Time / ID', @@ -2069,7 +2109,21 @@ export default { throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', - errorDistribution: 'Error distribution by status code.' 
+ errorDistribution: 'Error distribution by status code.', + goroutines: + 'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.', + cpu: 'CPU usage percentage, showing system processor load.', + memory: 'Memory usage, including used and total available memory.', + db: 'Database connection pool status, including active, idle, and waiting connections.', + redis: 'Redis connection pool status, showing active and idle connections.', + jobs: 'Background job execution status, including last run time, success time, and error information.', + qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.', + tokens: 'Total number of tokens processed in the current time window.', + sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).', + errors: 'Error statistics, including total errors, error rate, and upstream error rate.', + latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.', + ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.', + health: 'System health score (0-100), considering SLA, error rate, and resource usage.' }, charts: { emptyRequest: 'No requests in this window.', @@ -2183,7 +2237,9 @@ export default { queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)', queryModeAuto: 'Auto (recommended)', queryModeRaw: 'Raw (most accurate, slower)', - queryModePreagg: 'Preagg (fastest, requires aggregation)' + queryModePreagg: 'Preagg (fastest, requires aggregation)', + metricsInterval: 'Metrics Collection Interval (seconds)', + metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)' }, adminApiKey: { title: 'Admin API Key', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 646511f4..d8ce293c 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -1878,8 +1878,10 @@ export default { redis: 'Redis', goroutines: '协程', jobs: '后台任务', + jobsHelp: '点击“明细”查看任务心跳与报错信息', active: '活跃', idle: '空闲', + waiting: '等待', ok: '正常', lastRun: '最近运行', lastSuccess: '最近成功', @@ -1898,8 +1900,8 @@ export default { errors: '错误', errorRate: '错误率:', upstreamRate: '上游错误率:', - latencyDuration: '延迟 (duration_ms)', - ttftLabel: 'TTFT (first_token_ms)', + latencyDuration: '延迟(毫秒)', + ttftLabel: '首字延迟(毫秒)', p50: 'p50', p90: 'p90', p95: 'p95', @@ -1915,12 +1917,50 @@ export default { errorsSla: '错误(SLA范围)', upstreamExcl429529: '上游(排除429/529)', failedToLoadData: '加载运维数据失败', - tpsK: 'TPS (K)', + failedToLoadOverview: '加载概览数据失败', + failedToLoadThroughputTrend: '加载吞吐趋势失败', + failedToLoadLatencyHistogram: '加载延迟分布失败', + failedToLoadErrorTrend: '加载错误趋势失败', + failedToLoadErrorDistribution: '加载错误分布失败', + failedToLoadErrorDetail: '加载错误详情失败', + retryFailed: '重试失败', + tpsK: 'TPS(千)', top: '最高:', throughputTrend: '吞吐趋势', latencyHistogram: '延迟分布', errorTrend: '错误趋势', errorDistribution: '错误分布', + // Health Score & Diagnosis + health: '健康', + healthCondition: '健康状况', + healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分', + healthyStatus: '健康', + riskyStatus: '风险', + idleStatus: '待机', + diagnosis: { + title: '智能诊断', + footer: '基于当前指标的自动诊断建议', + idle: '系统当前处于待机状态', + idleImpact: '无活跃流量', + upstreamCritical: '上游错误率严重偏高 ({rate}%)', + upstreamCriticalImpact: '可能影响大量用户请求', + upstreamHigh: '上游错误率偏高 ({rate}%)', + upstreamHighImpact: 
'建议检查上游服务状态', + slaCritical: 'SLA 严重低于目标 ({sla}%)', + slaCriticalImpact: '用户体验严重受损', + slaLow: 'SLA 低于目标 ({sla}%)', + slaLowImpact: '需要关注服务质量', + errorHigh: '错误率过高 ({rate}%)', + errorHighImpact: '大量请求失败', + errorElevated: '错误率偏高 ({rate}%)', + errorElevatedImpact: '建议检查错误日志', + healthCritical: '综合健康评分过低 ({score})', + healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟', + healthLow: '综合健康评分偏低 ({score})', + healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率', + healthy: '所有系统指标正常', + healthyImpact: '服务运行稳定' + }, // Error Log errorLog: { timeId: '时间 / ID', @@ -2212,9 +2252,23 @@ export default { }, tooltips: { throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', - latencyHistogram: '成功请求的延迟分布(duration_ms)。', + latencyHistogram: '成功请求的延迟分布(毫秒)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', - errorDistribution: '按状态码统计的错误分布。' + errorDistribution: '按状态码统计的错误分布。', + goroutines: + 'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。', + cpu: 'CPU 使用率,显示系统处理器的负载情况。', + memory: '内存使用率,包括已使用和总可用内存。', + db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。', + redis: 'Redis 连接池状态,显示活跃和空闲的连接数。', + jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。', + qps: '每秒查询数(QPS)和每秒Token数(TPS),实时显示系统吞吐量。', + tokens: '当前时间窗口内处理的总Token数量。', + sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。', + errors: '错误统计,包括总错误数、错误率和上游错误率。', + latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。', + ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。', + health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。' }, charts: { emptyRequest: '该时间窗口内暂无请求。', @@ -2320,14 +2374,16 @@ export default { description: '启用运维监控模块,用于排障与健康可视化', disabled: '运维监控已关闭', enabled: '启用运维监控', - enabledHint: '启用 Ops 运维监控模块(仅管理员可见)', + enabledHint: '启用运维监控模块(仅管理员可见)', realtimeEnabled: '启用实时监控', - realtimeEnabledHint: '启用实时 QPS/指标推送(WebSocket)', + realtimeEnabledHint: '启用实时请求速率和指标推送(WebSocket)', queryMode: '默认查询模式', - queryModeHint: 'Ops Dashboard 默认查询模式(auto/raw/preagg)', + queryModeHint: '运维监控默认查询模式(自动/原始/预聚合)', queryModeAuto: '自动(推荐)', - queryModeRaw: 'Raw(最准,但较慢)', - queryModePreagg: 'Preagg(最快,需预聚合)' + queryModeRaw: '原始(最准确,但较慢)', + queryModePreagg: '预聚合(最快,需预聚合)', + metricsInterval: '采集频率(秒)', + metricsIntervalHint: '系统/请求指标采集频率(60-3600 秒)' }, adminApiKey: { title: '管理员 API Key', diff --git a/frontend/src/views/admin/SettingsView.vue b/frontend/src/views/admin/SettingsView.vue index 4375a6cc..cf7a2867 100644 --- a/frontend/src/views/admin/SettingsView.vue +++ b/frontend/src/views/admin/SettingsView.vue @@ -715,6 +715,25 @@ class="w-[220px]" /> + +
+          <!-- metrics collection interval input (surrounding markup lost in extraction); field hint: -->
+          <!-- {{ t('admin.settings.opsMonitoring.metricsIntervalHint') }} -->
@@ -824,7 +843,8 @@ const form = reactive({ // Ops Monitoring (vNext) ops_monitoring_enabled: true, ops_realtime_monitoring_enabled: true, - ops_query_mode_default: 'auto' + ops_query_mode_default: 'auto', + ops_metrics_interval_seconds: 60 }) const opsQueryModeOptions = computed(() => [ @@ -922,7 +942,8 @@ async function saveSettings() { identity_patch_prompt: form.identity_patch_prompt, ops_monitoring_enabled: form.ops_monitoring_enabled, ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled, - ops_query_mode_default: form.ops_query_mode_default + ops_query_mode_default: form.ops_query_mode_default, + ops_metrics_interval_seconds: form.ops_metrics_interval_seconds } const updated = await adminAPI.settings.updateSettings(payload) Object.assign(form, updated) diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index 56add66f..212717fb 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -33,190 +33,6 @@ @open-error-details="openErrorDetails" /> - -
-        <!-- removed inline "System Health" card (markup lost in extraction): collected-at / window,
-             CPU %, memory used/total (MB), DB ok + active/idle connections, Redis ok, goroutine count,
-             and per-job heartbeats (last run / last success / last error) -->
-        <!-- removed inline "Overview" card (markup lost in extraction): requests total + SLA scope,
-             tokens + TPS current/peak, SLA + business-limited count, error rate / upstream rate /
-             429 / 529 counts, and duration & TTFT percentiles (p50/p90/p95/p99/avg/max) -->
@@ -308,7 +124,6 @@ import OpsLatencyChart from './components/OpsLatencyChart.vue' import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue' import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue' import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue' -import { formatDateTime, formatNumberLocaleString } from '@/utils/format' const route = useRoute() const router = useRouter() @@ -486,7 +301,6 @@ const syncQueryToRoute = useDebounceFn(async () => { }, 250) const overview = ref(null) -const loadingOverview = ref(false) const throughputTrend = ref(null) const loadingTrend = ref(false) @@ -523,12 +337,15 @@ function handleThroughputSelectGroup(nextGroupId: number) { groupId.value = id } -function handleOpenRequestDetails() { - requestDetailsPreset.value = { +function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) { + const basePreset: OpsRequestDetailsPreset = { title: t('admin.ops.requestDetails.title'), kind: 'all', sort: 'created_at_desc' } + + requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) } + if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title showRequestDetails.value = true } @@ -573,46 +390,8 @@ function openError(id: number) { showErrorModal.value = true } -function formatInt(v: number | null | undefined): string { - if (typeof v !== 'number') return '0' - return formatNumberLocaleString(v) -} - -function formatPercent(v: number | null | undefined): string { - if (typeof v !== 'number') return '-' - return `${(v * 100).toFixed(2)}%` -} - -function formatPercent0to100(v: number | null | undefined): string { - if (typeof v !== 'number') return '-' - return `${v.toFixed(1)}%` -} - -function formatMBPair(used: number | null | undefined, total: number | null | undefined): string { - if (typeof used !== 'number' || typeof total !== 'number') return '-' - return `${formatNumberLocaleString(used)} / ${formatNumberLocaleString(total)} MB` -} - -function boolOkLabel(v: boolean | null | undefined): string { - if (v === true) return 'OK' - if (v === false) return 'FAIL' - return '-' -} - -function boolOkClass(v: boolean | null | undefined): string { - if (v === true) return 'text-emerald-600 dark:text-emerald-400' - if (v === false) return 'text-rose-600 dark:text-rose-400' - return 'text-gray-900 dark:text-white' -} - -function formatMs(v: number | null | undefined): string { - if (v == null) return '-' - return `${v}ms` -} - async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) { if (!opsEnabled.value) return - loadingOverview.value = true try { const data = await opsAPI.getDashboardOverview( { @@ -628,11 +407,7 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) } catch (err: any) { if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return overview.value = null - appStore.showError(err?.message || 'Failed to load overview') - } finally { - if (fetchSeq === dashboardFetchSeq) { - loadingOverview.value = false - } + appStore.showError(err?.message || t('admin.ops.failedToLoadOverview')) } } @@ -654,7 +429,7 @@ async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortS } catch (err: any) { if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return throughputTrend.value = null - appStore.showError(err?.message || 'Failed to load throughput trend') + appStore.showError(err?.message || t('admin.ops.failedToLoadThroughputTrend')) } finally { if 
(fetchSeq === dashboardFetchSeq) { loadingTrend.value = false @@ -680,7 +455,7 @@ async function refreshLatencyHistogramWithCancel(fetchSeq: number, signal: Abort } catch (err: any) { if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return latencyHistogram.value = null - appStore.showError(err?.message || 'Failed to load latency histogram') + appStore.showError(err?.message || t('admin.ops.failedToLoadLatencyHistogram')) } finally { if (fetchSeq === dashboardFetchSeq) { loadingLatency.value = false @@ -706,7 +481,7 @@ async function refreshErrorTrendWithCancel(fetchSeq: number, signal: AbortSignal } catch (err: any) { if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return errorTrend.value = null - appStore.showError(err?.message || 'Failed to load error trend') + appStore.showError(err?.message || t('admin.ops.failedToLoadErrorTrend')) } finally { if (fetchSeq === dashboardFetchSeq) { loadingErrorTrend.value = false @@ -732,7 +507,7 @@ async function refreshErrorDistributionWithCancel(fetchSeq: number, signal: Abor } catch (err: any) { if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return errorDistribution.value = null - appStore.showError(err?.message || 'Failed to load error distribution') + appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDistribution')) } finally { if (fetchSeq === dashboardFetchSeq) { loadingErrorDistribution.value = false diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue index 118a1f3a..f8166040 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue @@ -286,7 +286,7 @@ async function fetchDetail(id: number) { } } catch (err: any) { detail.value = null - appStore.showError(err?.message || 'Failed to load error detail') + appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDetail')) } finally { loading.value = false } @@ -348,7 +348,7 @@ async function runConfirmedRetry() { const summary = res.status === 'succeeded' ? 
t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed') appStore.showSuccess(summary) } catch (err: any) { - appStore.showError(err?.message || 'Retry failed') + appStore.showError(err?.message || t('admin.ops.retryFailed')) } finally { retrying.value = false } From c48dc097ff5ddb59552c2f51c6432007951f4231 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sat, 10 Jan 2026 02:17:38 +0800 Subject: [PATCH 16/53] =?UTF-8?q?feat(=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?= =?UTF-8?q?):=20=E9=87=8D=E6=9E=84=E4=BB=AA=E8=A1=A8=E6=9D=BF=E5=B8=83?= =?UTF-8?q?=E5=B1=80=E5=92=8C=E5=A2=9E=E5=BC=BA=E6=95=B0=E6=8D=AE=E5=B1=95?= =?UTF-8?q?=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要改动: - 重构仪表板为左右布局(5:7比例) - 左侧:健康评分 + 实时信息(当前/峰值/平均 QPS/TPS) - 右侧:6个卡片展示详细指标(3列x2行) - 总请求:请求数、Token数、平均QPS/TPS、平均延迟/TTFT - SLA:百分比、异常数、进度条 - 延迟:P99/P95/P90/P50/Avg/Max(带颜色编码) - TTFT:P99/P95/P90/P50/Avg/Max(带颜色编码) - 请求错误:错误率、错误数、业务限制数 - 上游错误:错误率、错误数(排除429/529)、429/529数 - 添加延迟/TTFT颜色编码(<500ms绿色,<1s黄色,<2s橙色,≥2s红色) - 添加实时窗口选择器(1min/5min/30min/1h) - 优化时间段选择器标签("近5分钟"等) - 完善中英文i18n翻译 - 数据库:添加Redis连接池字段(redis_conn_total, redis_conn_idle) --- .../migrations/030_ops_monitoring_vnext.sql | 10 + frontend/src/api/admin/ops.ts | 4 + frontend/src/i18n/locales/en.ts | 49 + frontend/src/i18n/locales/zh.ts | 49 + .../ops/components/OpsDashboardHeader.vue | 1103 +++++++++++++++-- 5 files changed, 1104 insertions(+), 111 deletions(-) diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/030_ops_monitoring_vnext.sql index 39b19e5d..a18c061d 100644 --- a/backend/migrations/030_ops_monitoring_vnext.sql +++ b/backend/migrations/030_ops_monitoring_vnext.sql @@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules ( '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)', true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW() ) ON CONFLICT (name) DO NOTHING; + +-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots. +-- This migration is intentionally idempotent. + +ALTER TABLE ops_system_metrics + ADD COLUMN IF NOT EXISTS redis_conn_total INT, + ADD COLUMN IF NOT EXISTS redis_conn_idle INT; + +COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).'; +COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).'; diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 851993ca..42b9e70d 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot { db_ok?: boolean | null redis_ok?: boolean | null + // Config-derived limits (best-effort) for rendering "current vs max". 
+ db_max_open_conns?: number | null + redis_pool_size?: number | null + redis_conn_total?: number | null redis_conn_idle?: number | null diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 1caae1d5..a4c631cb 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1737,6 +1737,8 @@ export default { active: 'active', idle: 'idle', waiting: 'waiting', + conns: 'conns', + queue: 'queue', ok: 'ok', lastRun: 'last_run:', lastSuccess: 'last_success:', @@ -1750,6 +1752,17 @@ export default { tps: 'TPS:', current: 'current', peak: 'peak', + average: 'average', + totalRequests: 'Total Requests', + avgQps: 'Avg QPS', + avgTps: 'Avg TPS', + avgLatency: 'Avg Latency', + avgTtft: 'Avg TTFT', + exceptions: 'Exceptions', + requestErrors: 'Request Errors', + errorCount: 'Error Count', + upstreamErrors: 'Upstream Errors', + errorCountExcl429529: 'Error Count (excl 429/529)', sla: 'SLA (excl business limits)', businessLimited: 'business_limited:', errors: 'Errors', @@ -1792,6 +1805,42 @@ export default { healthyStatus: 'Healthy', riskyStatus: 'At Risk', idleStatus: 'Idle', + realtime: { + title: 'Realtime', + connected: 'Connected', + connecting: 'Connecting', + reconnecting: 'Reconnecting', + offline: 'Offline', + closed: 'Closed', + reconnectIn: 'Reconnect in {seconds}s' + }, + tooltips: { + qps: 'Queries per second - real-time request rate', + sla: 'Service Level Agreement - percentage of requests within acceptable latency', + latency: 'Request duration from start to finish', + ttft: 'Time to First Token - latency until first response token', + errors: 'Request errors within SLA scope', + upstreamErrors: 'Errors from upstream services (excluding rate limits)', + totalRequests: 'Total requests and tokens consumed in this time window', + cpu: 'CPU usage percentage', + memory: 'Memory usage percentage', + db: 'Database connection pool status', + redis: 'Redis connection pool status', + goroutines: 'Go routine count (concurrent tasks)', + jobs: 'Background job health status' + }, + timeRange: { + '5m': 'Last 5 minutes', + '30m': 'Last 30 minutes', + '1h': 'Last 1 hour', + '6h': 'Last 6 hours', + '24h': 'Last 24 hours' + }, + queryMode: { + auto: 'Auto', + raw: 'Raw Query', + preagg: 'Pre-aggregated' + }, diagnosis: { title: 'Smart Diagnosis', footer: 'Automated diagnostic suggestions based on current metrics', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index d8ce293c..ced386d5 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -1882,6 +1882,8 @@ export default { active: '活跃', idle: '空闲', waiting: '等待', + conns: '连接', + queue: '队列', ok: '正常', lastRun: '最近运行', lastSuccess: '最近成功', @@ -1895,6 +1897,17 @@ export default { tps: 'TPS', current: '当前', peak: '峰值', + average: '平均', + totalRequests: '总请求', + avgQps: '平均 QPS', + avgTps: '平均 TPS', + avgLatency: '平均延迟', + avgTtft: '平均首字延迟', + exceptions: '异常数', + requestErrors: '请求错误', + errorCount: '错误数', + upstreamErrors: '上游错误', + errorCountExcl429529: '错误数(排除429/529)', sla: 'SLA(排除业务限制)', businessLimited: '业务限制:', errors: '错误', @@ -1937,6 +1950,42 @@ export default { healthyStatus: '健康', riskyStatus: '风险', idleStatus: '待机', + realtime: { + title: '实时信息', + connected: '已连接', + connecting: '连接中', + reconnecting: '重连中', + offline: '离线', + closed: '已关闭', + reconnectIn: '{seconds}秒后重连' + }, + tooltips: { + qps: '每秒查询数 - 实时请求速率', + sla: '服务等级协议 - 可接受延迟范围内的请求百分比', + latency: '从开始到结束的请求持续时间', + ttft: '首字延迟 - 直到第一个响应令牌的延迟', + errors: 'SLA 
范围内的请求错误', + upstreamErrors: '上游服务错误(不包括速率限制)', + totalRequests: '此时间窗口内的总请求数和消耗的令牌数', + cpu: 'CPU 使用率', + memory: '内存使用率', + db: '数据库连接池状态', + redis: 'Redis 连接池状态', + goroutines: 'Go 协程数(并发任务)', + jobs: '后台任务健康状态' + }, + timeRange: { + '5m': '近5分钟', + '30m': '近30分钟', + '1h': '近1小时', + '6h': '近6小时', + '24h': '近24小时' + }, + queryMode: { + auto: '自动', + raw: '原始查询', + preagg: '预聚合' + }, diagnosis: { title: '智能诊断', footer: '基于当前指标的自动诊断建议', diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index c2c6adb6..04cae822 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -2,10 +2,15 @@ import { computed, onMounted, ref, watch } from 'vue' import { useI18n } from 'vue-i18n' import Select from '@/components/common/Select.vue' +import HelpTooltip from '@/components/common/HelpTooltip.vue' +import BaseDialog from '@/components/common/BaseDialog.vue' import { adminAPI } from '@/api' import type { OpsDashboardOverview, OpsWSStatus } from '@/api/admin/ops' +import type { OpsRequestDetailsPreset } from './OpsRequestDetailsModal.vue' import { formatNumber } from '@/utils/format' +type RealtimeWindow = '1min' | '5min' | '30min' | '1h' + interface Props { overview?: OpsDashboardOverview | null wsStatus: OpsWSStatus @@ -27,7 +32,7 @@ interface Emits { (e: 'update:timeRange', value: string): void (e: 'update:queryMode', value: string): void (e: 'refresh'): void - (e: 'openRequestDetails'): void + (e: 'openRequestDetails', preset?: OpsRequestDetailsPreset): void (e: 'openErrorDetails', kind: 'request' | 'upstream'): void } @@ -36,6 +41,13 @@ const emit = defineEmits() const { t } = useI18n() +const realtimeWindow = ref('1min') + +const overview = computed(() => props.overview ?? null) +const systemMetrics = computed(() => overview.value?.system_metrics ?? null) + +// --- Filters --- + const groups = ref>([]) const platformOptions = computed(() => [ @@ -47,11 +59,11 @@ const platformOptions = computed(() => [ ]) const timeRangeOptions = computed(() => [ - { value: '5m', label: '5m' }, - { value: '30m', label: '30m' }, - { value: '1h', label: '1h' }, - { value: '6h', label: '6h' }, - { value: '24h', label: '24h' } + { value: '5m', label: t('admin.ops.timeRange.5m') }, + { value: '30m', label: t('admin.ops.timeRange.30m') }, + { value: '1h', label: t('admin.ops.timeRange.1h') }, + { value: '6h', label: t('admin.ops.timeRange.6h') }, + { value: '24h', label: t('admin.ops.timeRange.24h') } ]) const queryModeOptions = computed(() => [ @@ -107,65 +119,107 @@ function handleQueryModeChange(val: string | number | boolean | null) { emit('update:queryMode', String(val || 'auto')) } +function openDetails(preset?: OpsRequestDetailsPreset) { + emit('openRequestDetails', preset) +} + +function openErrorDetails(kind: 'request' | 'upstream') { + emit('openErrorDetails', kind) +} + const updatedAtLabel = computed(() => { if (!props.lastUpdated) return t('common.unknown') return props.lastUpdated.toLocaleTimeString() }) -const totalRequestsLabel = computed(() => { - const n = props.overview?.request_count_total ?? 
0 - return formatNumber(n) -}) +// --- Color coding for latency/TTFT --- +function getLatencyColor(ms: number | null | undefined): string { + if (ms == null) return 'text-gray-900 dark:text-white' + if (ms < 500) return 'text-green-600 dark:text-green-400' + if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400' + if (ms < 2000) return 'text-orange-600 dark:text-orange-400' + return 'text-red-600 dark:text-red-400' +} -const totalTokensLabel = computed(() => { - const n = props.overview?.token_consumed ?? 0 - return formatNumber(n) -}) +// --- Realtime / Overview labels --- -const qpsLabel = computed(() => { +const totalRequestsLabel = computed(() => formatNumber(overview.value?.request_count_total ?? 0)) +const totalTokensLabel = computed(() => formatNumber(overview.value?.token_consumed ?? 0)) + +const displayRealTimeQps = computed(() => { + const ov = overview.value + if (!ov) return 0 const useRealtime = props.wsStatus === 'connected' && !!props.wsHasData - const n = useRealtime ? props.realTimeQps : props.overview?.qps?.current - if (typeof n !== 'number') return '-' - return n.toFixed(1) + const v = useRealtime ? props.realTimeQps : ov.qps?.current + return typeof v === 'number' && Number.isFinite(v) ? v : 0 }) -const tpsLabel = computed(() => { +const displayRealTimeTps = computed(() => { + const ov = overview.value + if (!ov) return 0 const useRealtime = props.wsStatus === 'connected' && !!props.wsHasData - const n = useRealtime ? props.realTimeTps : props.overview?.tps?.current - if (typeof n !== 'number') return '-' - return n.toFixed(1) + const v = useRealtime ? props.realTimeTps : ov.tps?.current + return typeof v === 'number' && Number.isFinite(v) ? v : 0 }) const qpsPeakLabel = computed(() => { - const n = props.overview?.qps?.peak - if (typeof n !== 'number') return '-' - return n.toFixed(1) + const v = overview.value?.qps?.peak + if (typeof v !== 'number') return '-' + return v.toFixed(1) }) const tpsPeakLabel = computed(() => { - const n = props.overview?.tps?.peak - if (typeof n !== 'number') return '-' - return n.toFixed(1) + const v = overview.value?.tps?.peak + if (typeof v !== 'number') return '-' + return v.toFixed(1) }) -const slaLabel = computed(() => { - const v = props.overview?.sla +const qpsAvgLabel = computed(() => { + const v = overview.value?.qps?.avg if (typeof v !== 'number') return '-' - return `${(v * 100).toFixed(3)}%` + return v.toFixed(1) }) -const errorRateLabel = computed(() => { - const v = props.overview?.error_rate +const tpsAvgLabel = computed(() => { + const v = overview.value?.tps?.avg if (typeof v !== 'number') return '-' - return `${(v * 100).toFixed(2)}%` + return v.toFixed(1) }) -const upstreamErrorRateLabel = computed(() => { - const v = props.overview?.upstream_error_rate - if (typeof v !== 'number') return '-' - return `${(v * 100).toFixed(2)}%` +const slaPercent = computed(() => { + const v = overview.value?.sla + if (typeof v !== 'number') return null + return v * 100 }) +const errorRatePercent = computed(() => { + const v = overview.value?.error_rate + if (typeof v !== 'number') return null + return v * 100 +}) + +const upstreamErrorRatePercent = computed(() => { + const v = overview.value?.upstream_error_rate + if (typeof v !== 'number') return null + return v * 100 +}) + +const durationP99Ms = computed(() => overview.value?.duration?.p99_ms ?? null) +const durationP95Ms = computed(() => overview.value?.duration?.p95_ms ?? null) +const durationP90Ms = computed(() => overview.value?.duration?.p90_ms ?? 
null) +const durationP50Ms = computed(() => overview.value?.duration?.p50_ms ?? null) +const durationAvgMs = computed(() => overview.value?.duration?.avg_ms ?? null) +const durationMaxMs = computed(() => overview.value?.duration?.max_ms ?? null) + +const ttftP99Ms = computed(() => overview.value?.ttft?.p99_ms ?? null) +const ttftP95Ms = computed(() => overview.value?.ttft?.p95_ms ?? null) +const ttftP90Ms = computed(() => overview.value?.ttft?.p90_ms ?? null) +const ttftP50Ms = computed(() => overview.value?.ttft?.p50_ms ?? null) +const ttftAvgMs = computed(() => overview.value?.ttft?.avg_ms ?? null) +const ttftMaxMs = computed(() => overview.value?.ttft?.max_ms ?? null) + +// --- WebSocket status --- + const wsStatusLabel = computed(() => { switch (props.wsStatus) { case 'connected': @@ -204,11 +258,365 @@ const wsReconnectHint = computed(() => { const sec = Math.max(1, Math.ceil(delayMs / 1000)) return t('admin.ops.realtime.reconnectIn', { seconds: sec }) }) + +// --- Health Score & Diagnosis (primary) --- + +const isSystemIdle = computed(() => { + const ov = overview.value + if (!ov) return true + const qps = props.wsStatus === 'connected' && props.wsHasData ? props.realTimeQps : ov.qps?.current + const errorRate = ov.error_rate ?? 0 + return (qps ?? 0) === 0 && errorRate === 0 +}) + +const healthScoreValue = computed(() => { + const v = overview.value?.health_score + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const healthScoreColor = computed(() => { + if (isSystemIdle.value) return '#9ca3af' // gray-400 + const score = healthScoreValue.value + if (score == null) return '#9ca3af' + if (score >= 90) return '#10b981' // green + if (score >= 60) return '#f59e0b' // yellow + return '#ef4444' // red +}) + +const healthScoreClass = computed(() => { + if (isSystemIdle.value) return 'text-gray-400' + const score = healthScoreValue.value + if (score == null) return 'text-gray-400' + if (score >= 90) return 'text-green-500' + if (score >= 60) return 'text-yellow-500' + return 'text-red-500' +}) + +const circleSize = 100 +const strokeWidth = 8 +const radius = (circleSize - strokeWidth) / 2 +const circumference = 2 * Math.PI * radius +const dashOffset = computed(() => { + if (isSystemIdle.value) return 0 + if (healthScoreValue.value == null) return 0 + const score = Math.max(0, Math.min(100, healthScoreValue.value)) + return circumference - (score / 100) * circumference +}) + +interface DiagnosisItem { + type: 'critical' | 'warning' | 'info' + message: string + impact: string +} + +const diagnosisReport = computed(() => { + const ov = overview.value + if (!ov) return [] + + const report: DiagnosisItem[] = [] + + if (isSystemIdle.value) { + report.push({ + type: 'info', + message: t('admin.ops.diagnosis.idle'), + impact: t('admin.ops.diagnosis.idleImpact') + }) + return report + } + + const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100 + if (upstreamRatePct > 10) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }), + impact: t('admin.ops.diagnosis.upstreamCriticalImpact') + }) + } else if (upstreamRatePct > 3) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }), + impact: t('admin.ops.diagnosis.upstreamHighImpact') + }) + } + + const slaPct = (ov.sla ?? 
0) * 100 + if (slaPct < 90) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.slaCritical', { sla: slaPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.slaCriticalImpact') + }) + } else if (slaPct < 98) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.slaLow', { sla: slaPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.slaLowImpact') + }) + } + + const errorPct = (ov.error_rate ?? 0) * 100 + if (errorPct > 5) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.errorHighImpact') + }) + } else if (errorPct > 1) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }), + impact: t('admin.ops.diagnosis.errorElevatedImpact') + }) + } + + if (healthScoreValue.value != null) { + if (healthScoreValue.value < 60) { + report.push({ + type: 'critical', + message: t('admin.ops.diagnosis.healthCritical', { score: healthScoreValue.value }), + impact: t('admin.ops.diagnosis.healthCriticalImpact') + }) + } else if (healthScoreValue.value < 90) { + report.push({ + type: 'warning', + message: t('admin.ops.diagnosis.healthLow', { score: healthScoreValue.value }), + impact: t('admin.ops.diagnosis.healthLowImpact') + }) + } + } + + if (report.length === 0) { + report.push({ + type: 'info', + message: t('admin.ops.diagnosis.healthy'), + impact: t('admin.ops.diagnosis.healthyImpact') + }) + } + + return report +}) + +// --- System health (secondary) --- + +function formatTimeShort(ts?: string | null): string { + if (!ts) return '-' + const d = new Date(ts) + if (Number.isNaN(d.getTime())) return '-' + return d.toLocaleTimeString() +} + +const cpuPercentValue = computed(() => { + const v = systemMetrics.value?.cpu_usage_percent + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const cpuPercentClass = computed(() => { + const v = cpuPercentValue.value + if (v == null) return 'text-gray-900 dark:text-white' + if (v >= 95) return 'text-rose-600 dark:text-rose-400' + if (v >= 80) return 'text-yellow-600 dark:text-yellow-400' + return 'text-emerald-600 dark:text-emerald-400' +}) + +const memPercentValue = computed(() => { + const v = systemMetrics.value?.memory_usage_percent + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const memPercentClass = computed(() => { + const v = memPercentValue.value + if (v == null) return 'text-gray-900 dark:text-white' + if (v >= 95) return 'text-rose-600 dark:text-rose-400' + if (v >= 85) return 'text-yellow-600 dark:text-yellow-400' + return 'text-emerald-600 dark:text-emerald-400' +}) + +const dbConnActiveValue = computed(() => { + const v = systemMetrics.value?.db_conn_active + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const dbConnIdleValue = computed(() => { + const v = systemMetrics.value?.db_conn_idle + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const dbConnWaitingValue = computed(() => { + const v = systemMetrics.value?.db_conn_waiting + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const dbConnOpenValue = computed(() => { + if (dbConnActiveValue.value == null || dbConnIdleValue.value == null) return null + return dbConnActiveValue.value + dbConnIdleValue.value +}) + +const dbMaxOpenConnsValue = computed(() => { + const v = systemMetrics.value?.db_max_open_conns + return typeof v === 'number' && Number.isFinite(v) ? 
v : null +}) + +const dbUsagePercent = computed(() => { + if (dbConnOpenValue.value == null || dbMaxOpenConnsValue.value == null || dbMaxOpenConnsValue.value <= 0) return null + return Math.min(100, Math.max(0, (dbConnOpenValue.value / dbMaxOpenConnsValue.value) * 100)) +}) + +const dbMiddleLabel = computed(() => { + if (systemMetrics.value?.db_ok === false) return 'FAIL' + if (dbUsagePercent.value != null) return `${dbUsagePercent.value.toFixed(0)}%` + if (systemMetrics.value?.db_ok === true) return t('admin.ops.ok') + return t('admin.ops.noData') +}) + +const dbMiddleClass = computed(() => { + if (systemMetrics.value?.db_ok === false) return 'text-rose-600 dark:text-rose-400' + if (dbUsagePercent.value != null) { + if (dbUsagePercent.value >= 90) return 'text-rose-600 dark:text-rose-400' + if (dbUsagePercent.value >= 70) return 'text-yellow-600 dark:text-yellow-400' + return 'text-emerald-600 dark:text-emerald-400' + } + if (systemMetrics.value?.db_ok === true) return 'text-emerald-600 dark:text-emerald-400' + return 'text-gray-900 dark:text-white' +}) + +const redisConnTotalValue = computed(() => { + const v = systemMetrics.value?.redis_conn_total + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const redisConnIdleValue = computed(() => { + const v = systemMetrics.value?.redis_conn_idle + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const redisConnActiveValue = computed(() => { + if (redisConnTotalValue.value == null || redisConnIdleValue.value == null) return null + return Math.max(redisConnTotalValue.value - redisConnIdleValue.value, 0) +}) + +const redisPoolSizeValue = computed(() => { + const v = systemMetrics.value?.redis_pool_size + return typeof v === 'number' && Number.isFinite(v) ? v : null +}) + +const redisUsagePercent = computed(() => { + if (redisConnTotalValue.value == null || redisPoolSizeValue.value == null || redisPoolSizeValue.value <= 0) return null + return Math.min(100, Math.max(0, (redisConnTotalValue.value / redisPoolSizeValue.value) * 100)) +}) + +const redisMiddleLabel = computed(() => { + if (systemMetrics.value?.redis_ok === false) return 'FAIL' + if (redisUsagePercent.value != null) return `${redisUsagePercent.value.toFixed(0)}%` + if (systemMetrics.value?.redis_ok === true) return t('admin.ops.ok') + return t('admin.ops.noData') +}) + +const redisMiddleClass = computed(() => { + if (systemMetrics.value?.redis_ok === false) return 'text-rose-600 dark:text-rose-400' + if (redisUsagePercent.value != null) { + if (redisUsagePercent.value >= 90) return 'text-rose-600 dark:text-rose-400' + if (redisUsagePercent.value >= 70) return 'text-yellow-600 dark:text-yellow-400' + return 'text-emerald-600 dark:text-emerald-400' + } + if (systemMetrics.value?.redis_ok === true) return 'text-emerald-600 dark:text-emerald-400' + return 'text-gray-900 dark:text-white' +}) + +const goroutineCountValue = computed(() => { + const v = systemMetrics.value?.goroutine_count + return typeof v === 'number' && Number.isFinite(v) ? 
v : null +}) + +const goroutinesWarnThreshold = 8_000 +const goroutinesCriticalThreshold = 15_000 + +const goroutineStatus = computed<'ok' | 'warning' | 'critical' | 'unknown'>(() => { + const n = goroutineCountValue.value + if (n == null) return 'unknown' + if (n >= goroutinesCriticalThreshold) return 'critical' + if (n >= goroutinesWarnThreshold) return 'warning' + return 'ok' +}) + +const goroutineStatusLabel = computed(() => { + switch (goroutineStatus.value) { + case 'ok': + return t('admin.ops.ok') + case 'warning': + return t('common.warning') + case 'critical': + return t('common.critical') + default: + return t('admin.ops.noData') + } +}) + +const goroutineStatusClass = computed(() => { + switch (goroutineStatus.value) { + case 'ok': + return 'text-emerald-600 dark:text-emerald-400' + case 'warning': + return 'text-yellow-600 dark:text-yellow-400' + case 'critical': + return 'text-rose-600 dark:text-rose-400' + default: + return 'text-gray-900 dark:text-white' + } +}) + +const jobHeartbeats = computed(() => overview.value?.job_heartbeats ?? []) + +const jobsStatus = computed<'ok' | 'warn' | 'unknown'>(() => { + const list = jobHeartbeats.value + if (!list.length) return 'unknown' + for (const hb of list) { + if (!hb) continue + if (hb.last_error_at && (!hb.last_success_at || hb.last_error_at > hb.last_success_at)) return 'warn' + } + return 'ok' +}) + +const jobsWarnCount = computed(() => { + let warn = 0 + for (const hb of jobHeartbeats.value) { + if (!hb) continue + if (hb.last_error_at && (!hb.last_success_at || hb.last_error_at > hb.last_success_at)) warn++ + } + return warn +}) + +const jobsStatusLabel = computed(() => { + switch (jobsStatus.value) { + case 'ok': + return t('admin.ops.ok') + case 'warn': + return t('common.warning') + default: + return t('admin.ops.noData') + } +}) + +const jobsStatusClass = computed(() => { + switch (jobsStatus.value) { + case 'ok': + return 'text-emerald-600 dark:text-emerald-400' + case 'warn': + return 'text-yellow-600 dark:text-yellow-400' + default: + return 'text-gray-900 dark:text-white' + } +}) + +const showJobsDetails = ref(false) + +function openJobsDetails() { + showJobsDetails.value = true +} From 13ae0ce7b0299fe83ea7ac0edf61c223ba5f83da Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sat, 10 Jan 2026 13:26:01 +0800 Subject: [PATCH 17/53] =?UTF-8?q?refactor(migration):=20=E9=87=8D=E5=91=BD?= =?UTF-8?q?=E5=90=8D=20ops=20=E8=BF=81=E7=A7=BB=E6=96=87=E4=BB=B6=E9=81=BF?= =?UTF-8?q?=E5=85=8D=E7=BC=96=E5=8F=B7=E5=86=B2=E7=AA=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将 030_ops_monitoring_vnext.sql 重命名为 033_ops_monitoring_vnext.sql 以避免与主分支的 030_add_account_expires_at.sql 冲突。 --- ...{030_ops_monitoring_vnext.sql => 033_ops_monitoring_vnext.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename backend/migrations/{030_ops_monitoring_vnext.sql => 033_ops_monitoring_vnext.sql} (100%) diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/033_ops_monitoring_vnext.sql similarity index 100% rename from backend/migrations/030_ops_monitoring_vnext.sql rename to backend/migrations/033_ops_monitoring_vnext.sql From fc4ea65936fe01949e17424451546268cc89d250 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 10:59:01 +0800 Subject: [PATCH 18/53] =?UTF-8?q?fix:=20=E4=B8=B4=E6=97=B6=E4=BF=9D?= 
=?UTF-8?q?=E5=AD=98=E7=BC=96=E8=AF=91=E9=94=99=E8=AF=AF=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 LinuxDo 和 Update 配置(从 main 分支缺失) - 添加 LinuxDoConnectSyntheticEmailDomain 常量 - 添加 IsClaudeCodeClient context key - 添加 GetLinuxDoConnectOAuthConfig 方法 - 修复 BindStickySession 调用签名 - 修复前端 i18n 重复属性 - 重新生成 wire 依赖注入代码 这个提交准备被合并替换,先保存以防丢失。 --- backend/cmd/server/wire_gen.go | 14 +- backend/internal/config/config.go | 130 +++++++++++++++--- backend/internal/handler/gateway_handler.go | 4 +- .../internal/handler/gemini_v1beta_handler.go | 2 +- .../handler/openai_gateway_handler.go | 2 +- backend/internal/pkg/ctxkey/ctxkey.go | 3 + backend/internal/service/auth_service.go | 3 + backend/internal/service/domain_constants.go | 6 + backend/internal/service/setting_service.go | 93 +++++++++++++ frontend/src/i18n/locales/en.ts | 29 ---- frontend/src/i18n/locales/zh.ts | 29 ---- 11 files changed, 231 insertions(+), 84 deletions(-) diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go index 2a254fd6..e8f94c37 100644 --- a/backend/cmd/server/wire_gen.go +++ b/backend/cmd/server/wire_gen.go @@ -53,7 +53,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { emailQueueService := service.ProvideEmailQueueService(emailService) authService := service.NewAuthService(userRepository, configConfig, settingService, emailService, turnstileService, emailQueueService) userService := service.NewUserService(userRepository) - authHandler := handler.NewAuthHandler(configConfig, authService, userService) + authHandler := handler.NewAuthHandler(configConfig, authService, userService, settingService) userHandler := handler.NewUserHandler(userService) apiKeyRepository := repository.NewAPIKeyRepository(client) groupRepository := repository.NewGroupRepository(client, db) @@ -114,7 +114,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { adminRedeemHandler := admin.NewRedeemHandler(adminService) settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService) opsRepository := repository.NewOpsRepository(db) - pricingRemoteClient := repository.NewPricingRemoteClient(configConfig) + pricingRemoteClient := repository.ProvidePricingRemoteClient(configConfig) pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient) if err != nil { return nil, err @@ -130,7 +130,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService) opsHandler := admin.NewOpsHandler(opsService) updateCache := repository.NewUpdateCache(redisClient) - gitHubReleaseClient := repository.NewGitHubReleaseClient() + gitHubReleaseClient := repository.ProvideGitHubReleaseClient(configConfig) serviceBuildInfo := provideServiceBuildInfo(buildInfo) updateService := service.ProvideUpdateService(updateCache, gitHubReleaseClient, serviceBuildInfo) systemHandler := handler.ProvideSystemHandler(updateService) @@ -155,7 +155,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig) opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, 
configConfig) tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig) - v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, tokenRefreshService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) + accountExpiryService := service.ProvideAccountExpiryService(accountRepository) + v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, tokenRefreshService, accountExpiryService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService) application := &Application{ Server: httpServer, Cleanup: v, @@ -185,6 +186,7 @@ func provideCleanup( opsAlertEvaluator *service.OpsAlertEvaluatorService, opsCleanup *service.OpsCleanupService, tokenRefresh *service.TokenRefreshService, + accountExpiry *service.AccountExpiryService, pricing *service.PricingService, emailQueue *service.EmailQueueService, billingCache *service.BillingCacheService, @@ -229,6 +231,10 @@ func provideCleanup( tokenRefresh.Stop() return nil }}, + {"AccountExpiryService", func() error { + accountExpiry.Stop() + return nil + }}, {"PricingService", func() error { pricing.Stop() return nil diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 6e66b22c..67431cdf 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -6,6 +6,7 @@ import ( "encoding/hex" "fmt" "log" + "net/url" "os" "strings" "time" @@ -35,24 +36,26 @@ const ( ) type Config struct { - Server ServerConfig `mapstructure:"server"` - CORS CORSConfig `mapstructure:"cors"` - Security SecurityConfig `mapstructure:"security"` - Billing BillingConfig `mapstructure:"billing"` - Turnstile TurnstileConfig `mapstructure:"turnstile"` - Database DatabaseConfig `mapstructure:"database"` - Redis RedisConfig `mapstructure:"redis"` - Ops OpsConfig `mapstructure:"ops"` - JWT JWTConfig `mapstructure:"jwt"` - Default DefaultConfig `mapstructure:"default"` - RateLimit RateLimitConfig `mapstructure:"rate_limit"` - Pricing PricingConfig `mapstructure:"pricing"` - Gateway GatewayConfig `mapstructure:"gateway"` - Concurrency ConcurrencyConfig `mapstructure:"concurrency"` - TokenRefresh TokenRefreshConfig `mapstructure:"token_refresh"` - RunMode string `mapstructure:"run_mode" yaml:"run_mode"` - Timezone string `mapstructure:"timezone"` // e.g. 
"Asia/Shanghai", "UTC" - Gemini GeminiConfig `mapstructure:"gemini"` + Server ServerConfig `mapstructure:"server"` + CORS CORSConfig `mapstructure:"cors"` + Security SecurityConfig `mapstructure:"security"` + Billing BillingConfig `mapstructure:"billing"` + Turnstile TurnstileConfig `mapstructure:"turnstile"` + Database DatabaseConfig `mapstructure:"database"` + Redis RedisConfig `mapstructure:"redis"` + Ops OpsConfig `mapstructure:"ops"` + JWT JWTConfig `mapstructure:"jwt"` + LinuxDo LinuxDoConnectConfig `mapstructure:"linuxdo_connect"` + Default DefaultConfig `mapstructure:"default"` + RateLimit RateLimitConfig `mapstructure:"rate_limit"` + Pricing PricingConfig `mapstructure:"pricing"` + Gateway GatewayConfig `mapstructure:"gateway"` + Concurrency ConcurrencyConfig `mapstructure:"concurrency"` + TokenRefresh TokenRefreshConfig `mapstructure:"token_refresh"` + RunMode string `mapstructure:"run_mode" yaml:"run_mode"` + Timezone string `mapstructure:"timezone"` // e.g. "Asia/Shanghai", "UTC" + Gemini GeminiConfig `mapstructure:"gemini"` + Update UpdateConfig `mapstructure:"update"` } type GeminiConfig struct { @@ -77,6 +80,33 @@ type GeminiTierQuotaConfig struct { CooldownMinutes *int `mapstructure:"cooldown_minutes" json:"cooldown_minutes"` } +type UpdateConfig struct { + // ProxyURL 用于访问 GitHub 的代理地址 + // 支持 http/https/socks5/socks5h 协议 + // 例如: "http://127.0.0.1:7890", "socks5://127.0.0.1:1080" + ProxyURL string `mapstructure:"proxy_url"` +} + +type LinuxDoConnectConfig struct { + Enabled bool `mapstructure:"enabled"` + ClientID string `mapstructure:"client_id"` + ClientSecret string `mapstructure:"client_secret"` + AuthorizeURL string `mapstructure:"authorize_url"` + TokenURL string `mapstructure:"token_url"` + UserInfoURL string `mapstructure:"userinfo_url"` + Scopes string `mapstructure:"scopes"` + RedirectURL string `mapstructure:"redirect_url"` // 后端回调地址(需在提供方后台登记) + FrontendRedirectURL string `mapstructure:"frontend_redirect_url"` // 前端接收 token 的路由(默认:/auth/linuxdo/callback) + TokenAuthMethod string `mapstructure:"token_auth_method"` // client_secret_post / client_secret_basic / none + UsePKCE bool `mapstructure:"use_pkce"` + + // 可选:用于从 userinfo JSON 中提取字段的 gjson 路径。 + // 为空时,服务端会尝试一组常见字段名。 + UserInfoEmailPath string `mapstructure:"userinfo_email_path"` + UserInfoIDPath string `mapstructure:"userinfo_id_path"` + UserInfoUsernamePath string `mapstructure:"userinfo_username_path"` +} + // TokenRefreshConfig OAuth token自动刷新配置 type TokenRefreshConfig struct { // 是否启用自动刷新 @@ -834,3 +864,67 @@ func GetServerAddress() string { port := v.GetInt("server.port") return fmt.Sprintf("%s:%d", host, port) } + +// ValidateAbsoluteHTTPURL 验证是否为有效的绝对 HTTP(S) URL +func ValidateAbsoluteHTTPURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + return fmt.Errorf("empty url") + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// ValidateFrontendRedirectURL 验证前端重定向 URL(可以是绝对 URL 或相对路径) +func ValidateFrontendRedirectURL(raw string) error { + raw = strings.TrimSpace(raw) + if raw == "" { + return fmt.Errorf("empty url") + } + if strings.ContainsAny(raw, "\r\n") { + return fmt.Errorf("contains invalid characters") + } + if strings.HasPrefix(raw, "/") 
{ + if strings.HasPrefix(raw, "//") { + return fmt.Errorf("must not start with //") + } + return nil + } + u, err := url.Parse(raw) + if err != nil { + return err + } + if !u.IsAbs() { + return fmt.Errorf("must be absolute http(s) url or relative path") + } + if !isHTTPScheme(u.Scheme) { + return fmt.Errorf("unsupported scheme: %s", u.Scheme) + } + if strings.TrimSpace(u.Host) == "" { + return fmt.Errorf("missing host") + } + if u.Fragment != "" { + return fmt.Errorf("must not include fragment") + } + return nil +} + +// isHTTPScheme 检查是否为 HTTP 或 HTTPS 协议 +func isHTTPScheme(scheme string) bool { + return strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https") +} diff --git a/backend/internal/handler/gateway_handler.go b/backend/internal/handler/gateway_handler.go index 7d1eab28..284a4f8f 100644 --- a/backend/internal/handler/gateway_handler.go +++ b/backend/internal/handler/gateway_handler.go @@ -250,7 +250,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) accountWaitCounted = false } - if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { + if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } @@ -378,7 +378,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) accountWaitCounted = false } - if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { + if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } diff --git a/backend/internal/handler/gemini_v1beta_handler.go b/backend/internal/handler/gemini_v1beta_handler.go index 73550575..d639beb3 100644 --- a/backend/internal/handler/gemini_v1beta_handler.go +++ b/backend/internal/handler/gemini_v1beta_handler.go @@ -278,7 +278,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) { geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID) accountWaitCounted = false } - if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil { + if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionKey, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } diff --git a/backend/internal/handler/openai_gateway_handler.go b/backend/internal/handler/openai_gateway_handler.go index 2ddf77ed..eba69006 100644 --- a/backend/internal/handler/openai_gateway_handler.go +++ b/backend/internal/handler/openai_gateway_handler.go @@ -227,7 +227,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) { h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID) accountWaitCounted = false } - if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionHash, account.ID); err != nil { + if err := h.gatewayService.BindStickySession(c.Request.Context(), apiKey.GroupID, sessionHash, account.ID); err != nil { log.Printf("Bind sticky session failed: %v", err) } } diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go index 61d98cc2..38759dd4 100644 --- a/backend/internal/pkg/ctxkey/ctxkey.go +++ b/backend/internal/pkg/ctxkey/ctxkey.go @@ -13,4 +13,7 @@ const ( // 
RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。 RetryCount Key = "ctx_retry_count" + + // IsClaudeCodeClient 标识当前请求是否来自 Claude Code 客户端 + IsClaudeCodeClient Key = "ctx_is_claude_code_client" ) diff --git a/backend/internal/service/auth_service.go b/backend/internal/service/auth_service.go index e232deb3..a07afa9e 100644 --- a/backend/internal/service/auth_service.go +++ b/backend/internal/service/auth_service.go @@ -35,6 +35,9 @@ var ( // maxTokenLength 限制 token 大小,避免超长 header 触发解析时的异常内存分配。 const maxTokenLength = 8192 +// LinuxDoConnectSyntheticEmailDomain LinuxDo Connect 生成的合成邮箱域名后缀 +const LinuxDoConnectSyntheticEmailDomain = "@linuxdo.synthetic" + // JWTClaims JWT载荷数据 type JWTClaims struct { UserID int64 `json:"user_id"` diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go index 04f80dbe..4edf126b 100644 --- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -77,6 +77,12 @@ const ( SettingKeyTurnstileSiteKey = "turnstile_site_key" // Turnstile Site Key SettingKeyTurnstileSecretKey = "turnstile_secret_key" // Turnstile Secret Key + // LinuxDo Connect OAuth 登录设置 + SettingKeyLinuxDoConnectEnabled = "linuxdo_connect_enabled" + SettingKeyLinuxDoConnectClientID = "linuxdo_connect_client_id" + SettingKeyLinuxDoConnectClientSecret = "linuxdo_connect_client_secret" + SettingKeyLinuxDoConnectRedirectURL = "linuxdo_connect_redirect_url" + // OEM设置 SettingKeySiteName = "site_name" // 网站名称 SettingKeySiteLogo = "site_logo" // 网站Logo (base64) diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go index 09772616..c7e7ca4c 100644 --- a/backend/internal/service/setting_service.go +++ b/backend/internal/service/setting_service.go @@ -472,3 +472,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) } return value } + +// GetLinuxDoConnectOAuthConfig 返回用于登录的"最终生效" LinuxDo Connect 配置。 +// +// 优先级: +// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 +// - 否则回退到 config.yaml/env 的值 +func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { + if s == nil || s.cfg == nil { + return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") + } + + effective := s.cfg.LinuxDo + + keys := []string{ + SettingKeyLinuxDoConnectEnabled, + SettingKeyLinuxDoConnectClientID, + SettingKeyLinuxDoConnectClientSecret, + SettingKeyLinuxDoConnectRedirectURL, + } + settings, err := s.settingRepo.GetMultiple(ctx, keys) + if err != nil { + return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err) + } + + if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok { + effective.Enabled = raw == "true" + } + if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" { + effective.ClientID = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" { + effective.ClientSecret = strings.TrimSpace(v) + } + if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" { + effective.RedirectURL = strings.TrimSpace(v) + } + + if !effective.Enabled { + return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled") + } + + // 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。 + if strings.TrimSpace(effective.ClientID) == "" { + return config.LinuxDoConnectConfig{}, 
infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured") + } + if strings.TrimSpace(effective.AuthorizeURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured") + } + if strings.TrimSpace(effective.TokenURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured") + } + if strings.TrimSpace(effective.UserInfoURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured") + } + if strings.TrimSpace(effective.RedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured") + } + if strings.TrimSpace(effective.FrontendRedirectURL) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured") + } + + if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid") + } + if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid") + } + if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid") + } + + method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) + switch method { + case "", "client_secret_post", "client_secret_basic": + if strings.TrimSpace(effective.ClientSecret) == "" { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured") + } + case "none": + if !effective.UsePKCE { + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none") + } + default: + return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") + } + + return effective, nil +} diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 05e58e47..dcf3a57e 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -1844,30 +1844,6 @@ export default { healthyStatus: 'Healthy', riskyStatus: 'At Risk', idleStatus: 'Idle', - realtime: { - title: 'Realtime', - connected: 'Connected', - connecting: 'Connecting', - reconnecting: 'Reconnecting', - offline: 'Offline', - closed: 'Closed', - reconnectIn: 'Reconnect in {seconds}s' - }, - tooltips: { - qps: 'Queries per second - real-time request rate', - sla: 'Service Level Agreement - percentage of requests within acceptable latency', - latency: 'Request duration from start to finish', - ttft: 'Time to First Token - latency until first response 
token', - errors: 'Request errors within SLA scope', - upstreamErrors: 'Errors from upstream services (excluding rate limits)', - totalRequests: 'Total requests and tokens consumed in this time window', - cpu: 'CPU usage percentage', - memory: 'Memory usage percentage', - db: 'Database connection pool status', - redis: 'Redis connection pool status', - goroutines: 'Go routine count (concurrent tasks)', - jobs: 'Background job health status' - }, timeRange: { '5m': 'Last 5 minutes', '30m': 'Last 30 minutes', @@ -1875,11 +1851,6 @@ export default { '6h': 'Last 6 hours', '24h': 'Last 24 hours' }, - queryMode: { - auto: 'Auto', - raw: 'Raw Query', - preagg: 'Pre-aggregated' - }, diagnosis: { title: 'Smart Diagnosis', footer: 'Automated diagnostic suggestions based on current metrics', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 841bafb6..16a6c083 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -1989,30 +1989,6 @@ export default { healthyStatus: '健康', riskyStatus: '风险', idleStatus: '待机', - realtime: { - title: '实时信息', - connected: '已连接', - connecting: '连接中', - reconnecting: '重连中', - offline: '离线', - closed: '已关闭', - reconnectIn: '{seconds}秒后重连' - }, - tooltips: { - qps: '每秒查询数 - 实时请求速率', - sla: '服务等级协议 - 可接受延迟范围内的请求百分比', - latency: '从开始到结束的请求持续时间', - ttft: '首字延迟 - 直到第一个响应令牌的延迟', - errors: 'SLA 范围内的请求错误', - upstreamErrors: '上游服务错误(不包括速率限制)', - totalRequests: '此时间窗口内的总请求数和消耗的令牌数', - cpu: 'CPU 使用率', - memory: '内存使用率', - db: '数据库连接池状态', - redis: 'Redis 连接池状态', - goroutines: 'Go 协程数(并发任务)', - jobs: '后台任务健康状态' - }, timeRange: { '5m': '近5分钟', '30m': '近30分钟', @@ -2020,11 +1996,6 @@ export default { '6h': '近6小时', '24h': '近24小时' }, - queryMode: { - auto: '自动', - raw: '原始查询', - preagg: '预聚合' - }, diagnosis: { title: '智能诊断', footer: '基于当前指标的自动诊断建议', From 89a725a433988d4dcf7184019a76a2b89f701687 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 11:49:34 +0800 Subject: [PATCH 19/53] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0QPS?= =?UTF-8?q?=E8=84=89=E6=90=8F=E7=BA=BF=E5=9B=BE=E5=B9=B6=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E6=8C=87=E6=A0=87=E5=B8=83=E5=B1=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加实时QPS/TPS历史数据追踪(最近60个数据点) - 在平均QPS/TPS上方添加SVG脉搏线图(sparkline) - 将延迟和TTFT卡片的指标布局从2列改为3列 - 恢复Max指标显示(P95/P90/P50/Avg/Max) --- backend/internal/handler/ops_error_logger.go | 33 +++++++ backend/internal/service/gateway_service.go | 97 ++++++++++++++++--- .../service/openai_gateway_service.go | 46 ++++++++- backend/internal/service/ops_service.go | 28 ++++++ .../internal/service/ops_upstream_context.go | 31 ++++++ .../ops/components/OpsDashboardHeader.vue | 41 +++++++- 6 files changed, 255 insertions(+), 21 deletions(-) create mode 100644 backend/internal/service/ops_upstream_context.go diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index b3a90c2f..5e692cdf 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -392,6 +392,39 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { CreatedAt: time.Now(), } + // Capture upstream error context set by gateway services (if present). + // This does NOT affect the client response; it enriches Ops troubleshooting data. 
+ { + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch t := v.(type) { + case int: + if t > 0 { + code := t + entry.UpstreamStatusCode = &code + } + case int64: + if t > 0 { + code := int(t) + entry.UpstreamStatusCode = &code + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorMessageKey); ok { + if s, ok := v.(string); ok { + if msg := strings.TrimSpace(s); msg != "" { + entry.UpstreamErrorMessage = &msg + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorDetailKey); ok { + if s, ok := v.(string); ok { + if detail := strings.TrimSpace(s); detail != "" { + entry.UpstreamErrorDetail = &detail + } + } + } + } + if apiKey != nil { entry.APIKeyID = &apiKey.ID if apiKey.User != nil { diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index 31148b17..a2b74a15 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1399,7 +1399,17 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if resp != nil && resp.Body != nil { _ = resp.Body.Close() } - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + c.JSON(http.StatusBadGateway, gin.H{ + "type": "error", + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } // 优先检测thinking block签名错误(400)并重试一次 @@ -1859,7 +1869,21 @@ func extractUpstreamErrorMessage(body []byte) string { } func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + // Enrich Ops error logs with upstream status + message, and optionally a truncated body snippet. 
+ upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) // 处理上游错误,标记账号状态 shouldDisable := false @@ -1870,24 +1894,33 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } + // 记录上游错误响应体摘要便于排障(可选:由配置控制;不回显到客户端) + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 根据状态码返回适当的自定义错误响应(不透传上游详细信息) var errType, errMsg string var statusCode int switch resp.StatusCode { case 400: - // 仅记录上游错误摘要(避免输出请求内容);需要时可通过配置打开 - if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { - log.Printf( - "Upstream 400 error (account=%d platform=%s type=%s): %s", - account.ID, - account.Platform, - account.Type, - truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), - ) - } c.Data(http.StatusBadRequest, "application/json", body) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + summary := upstreamMsg + if summary == "" { + summary = truncateForLog(body, 512) + } + if summary == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, summary) case 401: statusCode = http.StatusBadGateway errType = "upstream_error" @@ -1923,7 +1956,10 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) { @@ -1949,8 +1985,36 @@ func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *ht // OAuth 403:标记账号异常 // API Key 未配置错误码:仅返回错误,不标记账号 func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*ForwardResult, error) { + // Capture upstream error body before side-effects consume the stream. 
+ respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "Upstream error %d retries_exhausted (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } + // 返回统一的重试耗尽错误响应 c.JSON(http.StatusBadGateway, gin.H{ "type": "error", @@ -1960,7 +2024,10 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (retries exhausted)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (retries exhausted) message=%s", resp.StatusCode, upstreamMsg) } // streamingResult 流式响应结果 diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 9d365ad6..c8d133df 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -587,7 +587,16 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco // Send request resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { - return nil, fmt.Errorf("upstream request failed: %w", err) + // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + setOpsUpstreamError(c, 0, safeErr, "") + c.JSON(http.StatusBadGateway, gin.H{ + "error": gin.H{ + "type": "upstream_error", + "message": "Upstream request failed", + }, + }) + return nil, fmt.Errorf("upstream request failed: %s", safeErr) } defer func() { _ = resp.Body.Close() }() @@ -707,7 +716,30 @@ func (s *OpenAIGatewayService) buildUpstreamRequest(ctx context.Context, c *gin. 
} func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *http.Response, c *gin.Context, account *Account) (*OpenAIForwardResult, error) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf( + "OpenAI upstream error %d (account=%d platform=%s type=%s): %s", + resp.StatusCode, + account.ID, + account.Platform, + account.Type, + truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes), + ) + } // Check custom error codes if !account.ShouldHandleErrorCode(resp.StatusCode) { @@ -717,7 +749,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht "message": "Upstream gateway error", }, }) - return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d (not in custom error codes)", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d (not in custom error codes) message=%s", resp.StatusCode, upstreamMsg) } // Handle upstream error (mark account status) @@ -763,7 +798,10 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht }, }) - return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // openaiStreamingResult streaming response result diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index 169c523a..c9cccdc7 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -135,6 +135,34 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn entry.ErrorBody = sanitized } + // Sanitize upstream error context if provided by gateway services. + if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 { + entry.UpstreamStatusCode = nil + } + if entry.UpstreamErrorMessage != nil { + msg := strings.TrimSpace(*entry.UpstreamErrorMessage) + msg = sanitizeUpstreamErrorMessage(msg) + msg = truncateString(msg, 2048) + if strings.TrimSpace(msg) == "" { + entry.UpstreamErrorMessage = nil + } else { + entry.UpstreamErrorMessage = &msg + } + } + if entry.UpstreamErrorDetail != nil { + detail := strings.TrimSpace(*entry.UpstreamErrorDetail) + if detail == "" { + entry.UpstreamErrorDetail = nil + } else { + sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + if strings.TrimSpace(sanitized) == "" { + entry.UpstreamErrorDetail = nil + } else { + entry.UpstreamErrorDetail = &sanitized + } + } + } + if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil { // Never bubble up to gateway; best-effort logging. 
log.Printf("[Ops] RecordError failed: %v", err) diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go new file mode 100644 index 00000000..70e8f6af --- /dev/null +++ b/backend/internal/service/ops_upstream_context.go @@ -0,0 +1,31 @@ +package service + +import ( + "strings" + + "github.com/gin-gonic/gin" +) + +// Gin context keys used by Ops error logger for capturing upstream error details. +// These keys are set by gateway services and consumed by handler/ops_error_logger.go. +const ( + OpsUpstreamStatusCodeKey = "ops_upstream_status_code" + OpsUpstreamErrorMessageKey = "ops_upstream_error_message" + OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" +) + +func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { + if c == nil { + return + } + if upstreamStatusCode > 0 { + c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode) + } + if msg := strings.TrimSpace(upstreamMessage); msg != "" { + c.Set(OpsUpstreamErrorMessageKey, msg) + } + if detail := strings.TrimSpace(upstreamDetail); detail != "" { + c.Set(OpsUpstreamErrorDetailKey, detail) + } +} + diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 20e6dcd3..35eeb59c 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -162,6 +162,25 @@ const displayRealTimeTps = computed(() => { return typeof v === 'number' && Number.isFinite(v) ? v : 0 }) +// Sparkline history (keep last 60 data points) +const qpsHistory = ref([]) +const tpsHistory = ref([]) +const MAX_HISTORY_POINTS = 60 + +watch([displayRealTimeQps, displayRealTimeTps], ([newQps, newTps]) => { + // Add new data points + qpsHistory.value.push(newQps) + tpsHistory.value.push(newTps) + + // Keep only last N points + if (qpsHistory.value.length > MAX_HISTORY_POINTS) { + qpsHistory.value.shift() + } + if (tpsHistory.value.length > MAX_HISTORY_POINTS) { + tpsHistory.value.shift() + } +}) + const qpsPeakLabel = computed(() => { const v = overview.value?.qps?.peak if (typeof v !== 'number') return '-' @@ -866,6 +885,16 @@ function openJobsDetails() {
[OpsDashboardHeader.vue template hunks, markup elided: an SVG sparkline bound to qpsHistory/tpsHistory is inserted above the {{ t('admin.ops.average') }} QPS/TPS labels; the latency and TTFT detail grids change from 2 to 3 columns; Max rows ({{ durationMaxMs ?? '-' }}ms and {{ ttftMaxMs ?? '-' }}ms) are added alongside the existing P95/P90/P50/Avg entries.]
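As an illustration of the sparkline idea in this patch, a minimal TypeScript sketch (the helper name and viewBox size are assumptions, not part of the actual OpsDashboardHeader.vue code) that maps the bounded qpsHistory/tpsHistory samples onto an SVG polyline "points" string:

    // Sketch only: converts a rolling list of samples into SVG polyline coordinates.
    // maxPoints mirrors MAX_HISTORY_POINTS from the script section above.
    function toSparklinePoints(history: number[], width = 120, height = 24, maxPoints = 60): string {
      const data = history.slice(-maxPoints)
      if (data.length === 0) return ''
      const max = Math.max(...data, 1) // avoid division by zero when every sample is 0
      const stepX = data.length > 1 ? width / (data.length - 1) : 0
      return data
        .map((v, i) => {
          const x = i * stepX
          const y = height - (v / max) * height // larger values plot closer to the top
          return `${x.toFixed(1)},${y.toFixed(1)}`
        })
        .join(' ')
    }

    // Hypothetical template usage:
    // <polyline :points="toSparklinePoints(qpsHistory)" fill="none" stroke="currentColor" />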
From c2962752eb93d5bd939a3edbeff451b4e700b40e Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 15:29:59 +0800 Subject: [PATCH 20/53] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0=E4=B8=8A?= =?UTF-8?q?=E6=B8=B8=E9=94=99=E8=AF=AF=E4=BA=8B=E4=BB=B6=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新建ops_upstream_error_events表存储上游服务错误详情 - 记录上游错误的请求ID、平台、模型、状态码等信息 - 支持索引优化查询性能(request_id, platform, status_code, created_at) --- backend/migrations/034_ops_upstream_error_events.sql | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 backend/migrations/034_ops_upstream_error_events.sql diff --git a/backend/migrations/034_ops_upstream_error_events.sql b/backend/migrations/034_ops_upstream_error_events.sql new file mode 100644 index 00000000..f8bfa5e2 --- /dev/null +++ b/backend/migrations/034_ops_upstream_error_events.sql @@ -0,0 +1,9 @@ +-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation. +-- +-- This is intentionally idempotent. + +ALTER TABLE ops_error_logs + ADD COLUMN IF NOT EXISTS upstream_errors JSONB; + +COMMENT ON COLUMN ops_error_logs.upstream_errors IS + 'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.'; From 7ebca553ef80c910e8ab0d0cb1b8ff30dbbc7c12 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 15:30:27 +0800 Subject: [PATCH 21/53] =?UTF-8?q?feat(ops):=20=E5=AE=9E=E7=8E=B0=E4=B8=8A?= =?UTF-8?q?=E6=B8=B8=E9=94=99=E8=AF=AF=E4=BA=8B=E4=BB=B6=E8=AE=B0=E5=BD=95?= =?UTF-8?q?=E4=B8=8E=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **新增功能**: - 新建ops_upstream_error_events表存储上游服务错误详情 - 支持记录上游429/529/5xx错误的详细上下文信息 - 提供按时间范围查询上游错误事件的API **后端改动**: 1. 模型层(ops_models.go, ops_port.go): - 新增UpstreamErrorEvent结构体 - 扩展Repository接口支持上游错误事件CRUD 2. 仓储层(ops_repo.go): - 实现InsertUpstreamErrorEvent写入上游错误 - 实现GetUpstreamErrorEvents按时间范围查询 3. 服务层(ops_service.go, ops_upstream_context.go): - ops_service: 新增GetUpstreamErrorEvents查询方法 - ops_upstream_context: 封装上游错误上下文构建逻辑 4. Handler层(ops_error_logger.go): - 新增GetUpstreamErrorsHandler处理上游错误查询请求 5. 
Gateway层集成: - antigravity_gateway_service.go: 429/529错误时记录上游事件 - gateway_service.go: OpenAI 429/5xx错误时记录 - gemini_messages_compat_service.go: Gemini 429/5xx错误时记录 - openai_gateway_service.go: OpenAI 429/5xx错误时记录 - ratelimit_service.go: 429限流错误时记录 **数据记录字段**: - request_id: 关联ops_logs主记录 - platform/model: 上游服务标识 - status_code/error_message: 错误详情 - request_headers/response_body: 调试信息(可选) - created_at: 错误发生时间 --- backend/internal/handler/ops_error_logger.go | 21 ++ backend/internal/repository/ops_repo.go | 22 +- .../service/antigravity_gateway_service.go | 256 +++++++++++++++++- backend/internal/service/gateway_service.go | 156 ++++++++++- .../service/gemini_messages_compat_service.go | 245 ++++++++++++++++- .../service/openai_gateway_service.go | 55 +++- backend/internal/service/ops_models.go | 34 ++- backend/internal/service/ops_port.go | 6 + backend/internal/service/ops_service.go | 55 ++++ .../internal/service/ops_upstream_context.go | 75 +++++ backend/internal/service/ratelimit_service.go | 23 +- 11 files changed, 907 insertions(+), 41 deletions(-) diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 5e692cdf..f4ab00c4 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -423,6 +423,27 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { } } } + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 { + entry.UpstreamErrors = events + // Best-effort backfill the single upstream fields from the last event when missing. + last := events[len(events)-1] + if last != nil { + if entry.UpstreamStatusCode == nil && last.UpstreamStatusCode > 0 { + code := last.UpstreamStatusCode + entry.UpstreamStatusCode = &code + } + if entry.UpstreamErrorMessage == nil && strings.TrimSpace(last.Message) != "" { + msg := strings.TrimSpace(last.Message) + entry.UpstreamErrorMessage = &msg + } + if entry.UpstreamErrorDetail == nil && strings.TrimSpace(last.Detail) != "" { + detail := strings.TrimSpace(last.Detail) + entry.UpstreamErrorDetail = &detail + } + } + } + } } if apiKey != nil { diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go index b27a9ea0..86372166 100644 --- a/backend/internal/repository/ops_repo.go +++ b/backend/internal/repository/ops_repo.go @@ -53,6 +53,7 @@ INSERT INTO ops_error_logs ( upstream_status_code, upstream_error_message, upstream_error_detail, + upstream_errors, duration_ms, time_to_first_token_ms, request_body, @@ -63,7 +64,7 @@ INSERT INTO ops_error_logs ( retry_count, created_at ) VALUES ( - $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33 + $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34 ) RETURNING id` var id int64 @@ -94,6 +95,7 @@ INSERT INTO ops_error_logs ( opsNullInt(input.UpstreamStatusCode), opsNullString(input.UpstreamErrorMessage), opsNullString(input.UpstreamErrorDetail), + opsNullString(input.UpstreamErrorsJSON), opsNullInt(input.DurationMs), opsNullInt64(input.TimeToFirstTokenMs), opsNullString(input.RequestBodyJSON), @@ -267,6 +269,10 @@ SELECT COALESCE(request_id, ''), COALESCE(error_message, ''), COALESCE(error_body, ''), + upstream_status_code, + COALESCE(upstream_error_message, ''), + COALESCE(upstream_error_detail, ''), + COALESCE(upstream_errors::text, 
''), is_business_limited, user_id, api_key_id, @@ -292,6 +298,7 @@ LIMIT 1` var out service.OpsErrorLogDetail var latency sql.NullInt64 var statusCode sql.NullInt64 + var upstreamStatusCode sql.NullInt64 var clientIP sql.NullString var userID sql.NullInt64 var apiKeyID sql.NullInt64 @@ -318,6 +325,10 @@ LIMIT 1` &out.RequestID, &out.Message, &out.ErrorBody, + &upstreamStatusCode, + &out.UpstreamErrorMessage, + &out.UpstreamErrorDetail, + &out.UpstreamErrors, &out.IsBusinessLimited, &userID, &apiKeyID, @@ -350,6 +361,10 @@ LIMIT 1` s := clientIP.String out.ClientIP = &s } + if upstreamStatusCode.Valid && upstreamStatusCode.Int64 > 0 { + v := int(upstreamStatusCode.Int64) + out.UpstreamStatusCode = &v + } if userID.Valid { v := userID.Int64 out.UserID = &v @@ -401,6 +416,11 @@ LIMIT 1` if out.RequestHeaders == "null" { out.RequestHeaders = "" } + // Normalize upstream_errors to empty string when stored as JSON null. + out.UpstreamErrors = strings.TrimSpace(out.UpstreamErrors) + if out.UpstreamErrors == "null" { + out.UpstreamErrors = "" + } return &out, nil } diff --git a/backend/internal/service/antigravity_gateway_service.go b/backend/internal/service/antigravity_gateway_service.go index 4fd55757..4dd4d303 100644 --- a/backend/internal/service/antigravity_gateway_service.go +++ b/backend/internal/service/antigravity_gateway_service.go @@ -564,6 +564,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -579,6 +587,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries") } @@ -586,6 +595,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -596,6 +625,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < antigravityMaxRetries { + 
upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d body=%s", prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500)) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -628,6 +677,27 @@ urlFallbackLoop: // Antigravity /v1internal 链路在部分场景会对 thought/thinking signature 做严格校验, // 当历史消息携带的 signature 不合法时会直接 400;去除 thinking 后可继续完成请求。 if resp.StatusCode == http.StatusBadRequest && isSignatureRelatedError(respBody) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + // Conservative two-stage fallback: // 1) Disable top-level thinking + thinking->text // 2) Only if still signature-related 400: also downgrade tool_use/tool_result to text. 
@@ -661,6 +731,13 @@ urlFallbackLoop: } retryResp, retryErr := s.httpUpstream.Do(retryReq, proxyURL, account.ID, account.Concurrency) if retryErr != nil { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr.Error()), + }) log.Printf("Antigravity account %d: signature retry request failed (%s): %v", account.ID, stage.name, retryErr) continue } @@ -674,6 +751,25 @@ urlFallbackLoop: retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() + kind := "signature_retry" + if strings.TrimSpace(stage.name) != "" { + kind = "signature_retry_" + strings.ReplaceAll(stage.name, "+", "_") + } + retryUpstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(retryBody)) + retryUpstreamMsg = sanitizeUpstreamErrorMessage(retryUpstreamMsg) + retryUpstreamDetail := "" + if logBody { + retryUpstreamDetail = truncateString(string(retryBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: kind, + Message: retryUpstreamMsg, + Detail: retryUpstreamDetail, + }) // If this stage fixed the signature issue, we stop; otherwise we may try the next stage. if retryResp.StatusCode != http.StatusBadRequest || !isSignatureRelatedError(retryBody) { @@ -701,10 +797,30 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) if s.shouldFailoverUpstreamError(resp.StatusCode) { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeMappedClaudeError(c, resp.StatusCode, respBody) + return nil, s.writeMappedClaudeError(c, account, resp.StatusCode, resp.Header.Get("x-request-id"), respBody) } } @@ -1108,6 +1224,14 @@ urlFallbackLoop: resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) // 检查是否应触发 URL 降级 if shouldAntigravityFallbackToNextURL(err, 0) && urlIdx < len(availableURLs)-1 { antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) @@ -1123,6 +1247,7 @@ urlFallbackLoop: continue } log.Printf("%s status=request_failed retries_exhausted error=%v", prefix, err) + setOpsUpstreamError(c, 0, safeErr, "") return nil, 
s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries") } @@ -1130,6 +1255,26 @@ urlFallbackLoop: if resp.StatusCode == http.StatusTooManyRequests && urlIdx < len(availableURLs)-1 { respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) _ = resp.Body.Close() + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) antigravity.DefaultURLAvailability.MarkUnavailable(baseURL) log.Printf("%s URL fallback (HTTP 429): %s -> %s body=%s", prefix, baseURL, availableURLs[urlIdx+1], truncateForLog(respBody, 200)) continue urlFallbackLoop @@ -1140,6 +1285,26 @@ urlFallbackLoop: _ = resp.Body.Close() if attempt < antigravityMaxRetries { + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) log.Printf("%s status=%d retry=%d/%d", prefix, resp.StatusCode, attempt, antigravityMaxRetries) if !sleepAntigravityBackoffWithContext(ctx, attempt) { log.Printf("%s status=context_canceled_during_backoff", prefix) @@ -1205,21 +1370,59 @@ urlFallbackLoop: s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope) - if s.shouldFailoverUpstreamError(resp.StatusCode) { - return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} - } - - // 解包并返回错误 requestID := resp.Header.Get("x-request-id") if requestID != "" { c.Header("x-request-id", requestID) } - unwrapped, _ := s.unwrapV1InternalResponse(respBody) + + unwrapped, unwrapErr := s.unwrapV1InternalResponse(respBody) + unwrappedForOps := unwrapped + if unwrapErr != nil || len(unwrappedForOps) == 0 { + unwrappedForOps = respBody + } + upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 
{ + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(unwrappedForOps), maxBytes) + } + + // Always record upstream context for Ops error logs, even when we will failover. + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + + if s.shouldFailoverUpstreamError(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} + } + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } - c.Data(resp.StatusCode, contentType, unwrapped) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + c.Data(resp.StatusCode, contentType, unwrappedForOps) return nil, fmt.Errorf("antigravity upstream error: %d", resp.StatusCode) } @@ -1674,9 +1877,35 @@ func (s *AntigravityGatewayService) writeClaudeError(c *gin.Context, status int, return fmt.Errorf("%s", message) } -func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstreamStatus int, body []byte) error { - // 记录上游错误详情便于调试 - log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, string(body)) +func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + + logBody := s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBody + maxBytes := 2048 + if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes > 0 { + maxBytes = s.settingService.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + } + + upstreamDetail := "" + if logBody { + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + // 记录上游错误详情便于排障(可选:由配置控制;不回显到客户端) + if logBody { + log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes)) + } var statusCode int var errType, errMsg string @@ -1712,7 +1941,10 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, upstr "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } func (s *AntigravityGatewayService) writeGoogleError(c *gin.Context, status int, message string) error { diff --git a/backend/internal/service/gateway_service.go b/backend/internal/service/gateway_service.go index 
a2b74a15..b48af7b0 100644 --- a/backend/internal/service/gateway_service.go +++ b/backend/internal/service/gateway_service.go @@ -1402,6 +1402,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). safeErr := sanitizeUpstreamErrorMessage(err.Error()) setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) c.JSON(http.StatusBadGateway, gin.H{ "type": "error", "error": gin.H{ @@ -1419,6 +1426,21 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A _ = resp.Body.Close() if s.isThinkingBlockSignatureError(respBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "signature_error", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) + looksLikeToolSignatureError := func(msg string) bool { m := strings.ToLower(msg) return strings.Contains(m, "tool_use") || @@ -1455,6 +1477,20 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A retryRespBody, retryReadErr := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) _ = retryResp.Body.Close() if retryReadErr == nil && retryResp.StatusCode == 400 && s.isThinkingBlockSignatureError(retryRespBody) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: retryResp.StatusCode, + UpstreamRequestID: retryResp.Header.Get("x-request-id"), + Kind: "signature_retry_thinking", + Message: extractUpstreamErrorMessage(retryRespBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(retryRespBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) msg2 := extractUpstreamErrorMessage(retryRespBody) if looksLikeToolSignatureError(msg2) && time.Since(retryStart) < maxRetryElapsed { log.Printf("Account %d: signature retry still failing and looks tool-related, retrying with tool blocks downgraded", account.ID) @@ -1469,6 +1505,13 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A if retryResp2 != nil && retryResp2.Body != nil { _ = retryResp2.Body.Close() } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "signature_retry_tools_request_error", + Message: sanitizeUpstreamErrorMessage(retryErr2.Error()), + }) log.Printf("Account %d: tool-downgrade signature retry failed: %v", account.ID, retryErr2) } else { log.Printf("Account %d: tool-downgrade signature retry build failed: %v", account.ID, buildErr2) @@ -1518,9 +1561,24 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A break } + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), 
+ Kind: "retry", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) log.Printf("Account %d: upstream error %d, retry %d/%d after %v (elapsed=%v/%v)", account.ID, resp.StatusCode, attempt, maxRetryAttempts, delay, elapsed, maxRetryElapsed) - _ = resp.Body.Close() if err := sleepWithContext(ctx, delay); err != nil { return nil, err } @@ -1548,7 +1606,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理重试耗尽的情况 if resp.StatusCode >= 400 && s.shouldRetryUpstreamError(account, resp.StatusCode) { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleRetryExhaustedSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted_failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } return s.handleRetryExhaustedError(ctx, resp, c, account) @@ -1556,7 +1632,25 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A // 处理可切换账号的错误 if resp.StatusCode >= 400 && s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + s.handleFailoverSideEffects(ctx, resp, account) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: extractUpstreamErrorMessage(respBody), + Detail: func() string { + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + return truncateString(string(respBody), s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes) + } + return "" + }(), + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -1573,6 +1667,26 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A resp.Body = io.NopCloser(bytes.NewReader(respBody)) if s.shouldFailoverOn400(respBody) { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover_on_400", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + if s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( "Account %d: 400 error, attempting failover: %s", @@ -1884,6 +1998,15 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, 
resp *http.Res upstreamDetail = truncateString(string(body), maxBytes) } setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) // 处理上游错误,标记账号状态 shouldDisable := false @@ -1963,7 +2086,7 @@ func (s *GatewayService) handleErrorResponse(ctx context.Context, resp *http.Res } func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) statusCode := resp.StatusCode // OAuth/Setup Token 账号的 403:标记账号异常 @@ -1977,7 +2100,7 @@ func (s *GatewayService) handleRetryExhaustedSideEffects(ctx context.Context, re } func (s *GatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -2003,6 +2126,15 @@ func (s *GatewayService) handleRetryExhaustedError(ctx context.Context, resp *ht upstreamDetail = truncateString(string(respBody), maxBytes) } setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "retry_exhausted", + Message: upstreamMsg, + Detail: upstreamDetail, + }) if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( @@ -2557,6 +2689,7 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 发送请求 resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + setOpsUpstreamError(c, 0, sanitizeUpstreamErrorMessage(err.Error()), "") s.countTokensError(c, http.StatusBadGateway, "upstream_error", "Request failed") return fmt.Errorf("upstream request failed: %w", err) } @@ -2594,6 +2727,18 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, // 标记账号状态(429/529等) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + // 记录上游错误摘要便于排障(不回显请求内容) if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { log.Printf( @@ -2615,7 +2760,10 @@ func (s *GatewayService) ForwardCountTokens(ctx context.Context, c *gin.Context, errMsg = "Service overloaded" } s.countTokensError(c, resp.StatusCode, "upstream_error", errMsg) - return fmt.Errorf("upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", resp.StatusCode) + } + return fmt.Errorf("upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } // 透传成功响应 diff --git a/backend/internal/service/gemini_messages_compat_service.go 
b/backend/internal/service/gemini_messages_compat_service.go index 78452b1e..d1b65b71 100644 --- a/backend/internal/service/gemini_messages_compat_service.go +++ b/backend/internal/service/gemini_messages_compat_service.go @@ -543,12 +543,21 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) continue } - return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeClaudeError(c, http.StatusBadGateway, "upstream_error", "Upstream request failed after retries: "+safeErr) } // Special-case: signature/thought_signature validation errors are not transient, but may be fixed by @@ -558,6 +567,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex _ = resp.Body.Close() if isGeminiSignatureRelatedError(respBody) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "signature_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + var strippedClaudeBody []byte stageName := "" switch signatureRetryStage { @@ -608,6 +641,30 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt < geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -633,12 +690,62 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c 
*gin.Contex } s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) if tempMatched { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } - return nil, s.writeGeminiMappedError(c, resp.StatusCode, respBody) + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + return nil, s.writeGeminiMappedError(c, account, resp.StatusCode, upstreamReqID, respBody) } requestID := resp.Header.Get(requestIDHeader) @@ -863,6 +970,14 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency) if err != nil { + safeErr := sanitizeUpstreamErrorMessage(err.Error()) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) if attempt < geminiMaxRetries { log.Printf("Gemini account %d: upstream request failed, retry %d/%d: %v", account.ID, attempt, geminiMaxRetries, err) sleepGeminiBackoff(attempt) @@ -880,7 +995,8 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. FirstTokenMs: nil, }, nil } - return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+sanitizeUpstreamErrorMessage(err.Error())) + setOpsUpstreamError(c, 0, safeErr, "") + return nil, s.writeGoogleError(c, http.StatusBadGateway, "Upstream request failed after retries: "+safeErr) } if resp.StatusCode >= 400 && s.shouldRetryGeminiUpstreamError(account, resp.StatusCode) { @@ -899,6 +1015,30 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. 
s.handleGeminiUpstreamError(ctx, account, resp.StatusCode, resp.Header, respBody) } if attempt < geminiMaxRetries { + upstreamReqID := resp.Header.Get(requestIDHeader) + if upstreamReqID == "" { + upstreamReqID = resp.Header.Get("x-goog-request-id") + } + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: upstreamReqID, + Kind: "retry", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + log.Printf("Gemini account %d: upstream status %d, retry %d/%d", account.ID, resp.StatusCode, attempt, geminiMaxRetries) sleepGeminiBackoff(attempt) continue @@ -962,19 +1102,84 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin. } if tempMatched { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } if s.shouldFailoverGeminiUpstreamError(resp.StatusCode) { + evBody := unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(evBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(evBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } respBody = unwrapIfNeeded(isOAuth, respBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + log.Printf("[Gemini] native upstream error %d: %s", resp.StatusCode, truncateForLog(respBody, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + setOpsUpstreamError(c, resp.StatusCode, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: requestID, + Kind: "http_error", + Message: 
upstreamMsg, + Detail: upstreamDetail, + }) + contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = "application/json" } c.Data(resp.StatusCode, contentType, respBody) - return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + if upstreamMsg == "" { + return nil, fmt.Errorf("gemini upstream error: %d", resp.StatusCode) + } + return nil, fmt.Errorf("gemini upstream error: %d message=%s", resp.StatusCode, upstreamMsg) } var usage *ClaudeUsage @@ -1076,7 +1281,32 @@ func sanitizeUpstreamErrorMessage(msg string) string { return sensitiveQueryParamRegex.ReplaceAllString(msg, `$1***`) } -func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, upstreamStatus int, body []byte) error { +func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, account *Account, upstreamStatus int, upstreamRequestID string, body []byte) error { + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(body)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(body), maxBytes) + } + setOpsUpstreamError(c, upstreamStatus, upstreamMsg, upstreamDetail) + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: upstreamStatus, + UpstreamRequestID: upstreamRequestID, + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + log.Printf("[Gemini] upstream error %d: %s", upstreamStatus, truncateForLog(body, s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes)) + } + var statusCode int var errType, errMsg string @@ -1184,7 +1414,10 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, ups "type": "error", "error": gin.H{"type": errType, "message": errMsg}, }) - return fmt.Errorf("upstream error: %d", upstreamStatus) + if upstreamMsg == "" { + return fmt.Errorf("upstream error: %d", upstreamStatus) + } + return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg) } type claudeErrorMapping struct { diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index c8d133df..d11cbdd9 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -511,7 +511,7 @@ func (s *OpenAIGatewayService) shouldFailoverUpstreamError(statusCode int) bool } func (s *OpenAIGatewayService) handleFailoverSideEffects(ctx context.Context, resp *http.Response, account *Account) { - body, _ := io.ReadAll(resp.Body) + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } @@ -590,6 +590,13 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco // Ensure the client receives an error response (handlers assume Forward writes on non-failover errors). 
safeErr := sanitizeUpstreamErrorMessage(err.Error()) setOpsUpstreamError(c, 0, safeErr, "") + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: 0, + Kind: "request_error", + Message: safeErr, + }) c.JSON(http.StatusBadGateway, gin.H{ "error": gin.H{ "type": "upstream_error", @@ -603,6 +610,30 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco // Handle error response if resp.StatusCode >= 400 { if s.shouldFailoverUpstreamError(resp.StatusCode) { + respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + _ = resp.Body.Close() + resp.Body = io.NopCloser(bytes.NewReader(respBody)) + + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(respBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + upstreamDetail := "" + if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody { + maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes + if maxBytes <= 0 { + maxBytes = 2048 + } + upstreamDetail = truncateString(string(respBody), maxBytes) + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "failover", + Message: upstreamMsg, + Detail: upstreamDetail, + }) + s.handleFailoverSideEffects(ctx, resp, account) return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } @@ -743,6 +774,15 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht // Check custom error codes if !account.ShouldHandleErrorCode(resp.StatusCode) { + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: "http_error", + Message: upstreamMsg, + Detail: upstreamDetail, + }) c.JSON(http.StatusInternalServerError, gin.H{ "error": gin.H{ "type": "upstream_error", @@ -760,6 +800,19 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht if s.rateLimitService != nil { shouldDisable = s.rateLimitService.HandleUpstreamError(ctx, account, resp.StatusCode, resp.Header, body) } + kind := "http_error" + if shouldDisable { + kind = "failover" + } + appendOpsUpstreamError(c, OpsUpstreamErrorEvent{ + Platform: account.Platform, + AccountID: account.ID, + UpstreamStatusCode: resp.StatusCode, + UpstreamRequestID: resp.Header.Get("x-request-id"), + Kind: kind, + Message: upstreamMsg, + Detail: upstreamDetail, + }) if shouldDisable { return nil, &UpstreamFailoverError{StatusCode: resp.StatusCode} } diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go index 90b2dc47..996267fd 100644 --- a/backend/internal/service/ops_models.go +++ b/backend/internal/service/ops_models.go @@ -36,12 +36,18 @@ type OpsErrorLogDetail struct { ErrorBody string `json:"error_body"` UserAgent string `json:"user_agent"` + // Upstream context (optional) + UpstreamStatusCode *int `json:"upstream_status_code,omitempty"` + UpstreamErrorMessage string `json:"upstream_error_message,omitempty"` + UpstreamErrorDetail string `json:"upstream_error_detail,omitempty"` + UpstreamErrors string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing + // Timings (optional) - AuthLatencyMs *int64 `json:"auth_latency_ms"` - RoutingLatencyMs *int64 `json:"routing_latency_ms"` - UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` - 
ResponseLatencyMs *int64 `json:"response_latency_ms"` - TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` + AuthLatencyMs *int64 `json:"auth_latency_ms"` + RoutingLatencyMs *int64 `json:"routing_latency_ms"` + UpstreamLatencyMs *int64 `json:"upstream_latency_ms"` + ResponseLatencyMs *int64 `json:"response_latency_ms"` + TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"` // Retry context RequestBody string `json:"request_body"` @@ -57,8 +63,8 @@ type OpsErrorLogFilter struct { StartTime *time.Time EndTime *time.Time - Platform string - GroupID *int64 + Platform string + GroupID *int64 AccountID *int64 StatusCodes []int @@ -71,9 +77,9 @@ type OpsErrorLogFilter struct { type OpsErrorLogList struct { Errors []*OpsErrorLog `json:"errors"` - Total int `json:"total"` - Page int `json:"page"` - PageSize int `json:"page_size"` + Total int `json:"total"` + Page int `json:"page"` + PageSize int `json:"page_size"` } type OpsRetryAttempt struct { @@ -97,18 +103,18 @@ type OpsRetryAttempt struct { } type OpsRetryResult struct { - AttemptID int64 `json:"attempt_id"` + AttemptID int64 `json:"attempt_id"` Mode string `json:"mode"` Status string `json:"status"` PinnedAccountID *int64 `json:"pinned_account_id"` UsedAccountID *int64 `json:"used_account_id"` - HTTPStatusCode int `json:"http_status_code"` + HTTPStatusCode int `json:"http_status_code"` UpstreamRequestID string `json:"upstream_request_id"` - ResponsePreview string `json:"response_preview"` - ResponseTruncated bool `json:"response_truncated"` + ResponsePreview string `json:"response_preview"` + ResponseTruncated bool `json:"response_truncated"` ErrorMessage string `json:"error_message"` diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 90591a56..39f3aaf2 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -81,6 +81,12 @@ type OpsInsertErrorLogInput struct { UpstreamStatusCode *int UpstreamErrorMessage *string UpstreamErrorDetail *string + // UpstreamErrors captures all upstream error attempts observed during handling this request. + // It is populated during request processing (gin context) and sanitized+serialized by OpsService. + UpstreamErrors []*OpsUpstreamErrorEvent + // UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors. + // It is set by OpsService.RecordError before persisting. + UpstreamErrorsJSON *string DurationMs *int TimeToFirstTokenMs *int64 diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go index c9cccdc7..e3ad5589 100644 --- a/backend/internal/service/ops_service.go +++ b/backend/internal/service/ops_service.go @@ -163,6 +163,61 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn } } + // Sanitize + serialize upstream error events list. 
+ if len(entry.UpstreamErrors) > 0 { + const maxEvents = 32 + events := entry.UpstreamErrors + if len(events) > maxEvents { + events = events[len(events)-maxEvents:] + } + + sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events)) + for _, ev := range events { + if ev == nil { + continue + } + out := *ev + + out.Platform = strings.TrimSpace(out.Platform) + out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128) + out.Kind = truncateString(strings.TrimSpace(out.Kind), 64) + + if out.AccountID < 0 { + out.AccountID = 0 + } + if out.UpstreamStatusCode < 0 { + out.UpstreamStatusCode = 0 + } + if out.AtUnixMs < 0 { + out.AtUnixMs = 0 + } + + msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message)) + msg = truncateString(msg, 2048) + out.Message = msg + + detail := strings.TrimSpace(out.Detail) + if detail != "" { + // Keep upstream detail small; request bodies are not stored here, only upstream error payloads. + sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes) + out.Detail = sanitizedDetail + } else { + out.Detail = "" + } + + // Drop fully-empty events (can happen if only status code was known). + if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { + continue + } + + evCopy := out + sanitized = append(sanitized, &evCopy) + } + + entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized) + entry.UpstreamErrors = nil + } + if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil { // Never bubble up to gateway; best-effort logging. log.Printf("[Ops] RecordError failed: %v", err) diff --git a/backend/internal/service/ops_upstream_context.go b/backend/internal/service/ops_upstream_context.go index 70e8f6af..f096cf80 100644 --- a/backend/internal/service/ops_upstream_context.go +++ b/backend/internal/service/ops_upstream_context.go @@ -1,7 +1,9 @@ package service import ( + "encoding/json" "strings" + "time" "github.com/gin-gonic/gin" ) @@ -12,6 +14,7 @@ const ( OpsUpstreamStatusCodeKey = "ops_upstream_status_code" OpsUpstreamErrorMessageKey = "ops_upstream_error_message" OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" + OpsUpstreamErrorsKey = "ops_upstream_errors" ) func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { @@ -29,3 +32,75 @@ func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage } } +// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request. +// It is stored in ops_error_logs.upstream_errors as a JSON array. 
+type OpsUpstreamErrorEvent struct { + AtUnixMs int64 `json:"at_unix_ms,omitempty"` + + // Context + Platform string `json:"platform,omitempty"` + AccountID int64 `json:"account_id,omitempty"` + + // Outcome + UpstreamStatusCode int `json:"upstream_status_code,omitempty"` + UpstreamRequestID string `json:"upstream_request_id,omitempty"` + + // Kind: http_error | request_error | retry_exhausted | failover + Kind string `json:"kind,omitempty"` + + Message string `json:"message,omitempty"` + Detail string `json:"detail,omitempty"` +} + +func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { + if c == nil { + return + } + if ev.AtUnixMs <= 0 { + ev.AtUnixMs = time.Now().UnixMilli() + } + ev.Platform = strings.TrimSpace(ev.Platform) + ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) + ev.Kind = strings.TrimSpace(ev.Kind) + ev.Message = strings.TrimSpace(ev.Message) + ev.Detail = strings.TrimSpace(ev.Detail) + if ev.Message != "" { + ev.Message = sanitizeUpstreamErrorMessage(ev.Message) + } + + var existing []*OpsUpstreamErrorEvent + if v, ok := c.Get(OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { + existing = arr + } + } + + evCopy := ev + existing = append(existing, &evCopy) + c.Set(OpsUpstreamErrorsKey, existing) +} + +func getOpsUpstreamErrors(c *gin.Context) []*OpsUpstreamErrorEvent { + if c == nil { + return nil + } + if v, ok := c.Get(OpsUpstreamErrorsKey); ok { + if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { + return arr + } + } + return nil +} + +func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { + if len(events) == 0 { + return nil + } + // Ensure we always store a valid JSON value. + raw, err := json.Marshal(events) + if err != nil || len(raw) == 0 { + return nil + } + s := string(raw) + return &s +} diff --git a/backend/internal/service/ratelimit_service.go b/backend/internal/service/ratelimit_service.go index f1362646..d570b92e 100644 --- a/backend/internal/service/ratelimit_service.go +++ b/backend/internal/service/ratelimit_service.go @@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc } tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) + upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody)) + upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg) + if upstreamMsg != "" { + upstreamMsg = truncateForLog([]byte(upstreamMsg), 512) + } switch statusCode { case 401: // 认证失败:停止调度,记录错误 - s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") + msg := "Authentication failed (401): invalid or expired credentials" + if upstreamMsg != "" { + msg = "Authentication failed (401): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 402: // 支付要求:余额不足或计费问题,停止调度 - s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") + msg := "Payment required (402): insufficient balance or billing issue" + if upstreamMsg != "" { + msg = "Payment required (402): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 403: // 禁止访问:停止调度,记录错误 - s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") + msg := "Access forbidden (403): account may be suspended or lack permissions" + if upstreamMsg != "" { + msg = "Access forbidden (403): " + upstreamMsg + } + s.handleAuthError(ctx, account, msg) shouldDisable = true case 429: s.handle429(ctx, 
account, headers) From abdc4f39cb6c5a0b614f7cb0bf1012d3ae46e6b7 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 15:30:59 +0800 Subject: [PATCH 22/53] =?UTF-8?q?feat(ops):=20=E6=81=A2=E5=A4=8D=E4=BB=AA?= =?UTF-8?q?=E8=A1=A8=E7=9B=98=E8=84=89=E6=90=8F=E5=8A=A8=E7=94=BB=E6=95=88?= =?UTF-8?q?=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将静态QPS历史折线图替换为动画脉搏线 - 使用SVG animate元素实现心跳效果(2秒循环动画) - 增强流量可视化:通过脉冲跳动直观展示流量"活跃"状态 - 恢复重构前的视觉效果与用户体验 --- .../ops/components/OpsDashboardHeader.vue | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 35eeb59c..05b711d4 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -228,12 +228,14 @@ const durationP95Ms = computed(() => overview.value?.duration?.p95_ms ?? null) const durationP90Ms = computed(() => overview.value?.duration?.p90_ms ?? null) const durationP50Ms = computed(() => overview.value?.duration?.p50_ms ?? null) const durationAvgMs = computed(() => overview.value?.duration?.avg_ms ?? null) +const durationMaxMs = computed(() => overview.value?.duration?.max_ms ?? null) const ttftP99Ms = computed(() => overview.value?.ttft?.p99_ms ?? null) const ttftP95Ms = computed(() => overview.value?.ttft?.p95_ms ?? null) const ttftP90Ms = computed(() => overview.value?.ttft?.p90_ms ?? null) const ttftP50Ms = computed(() => overview.value?.ttft?.p50_ms ?? null) const ttftAvgMs = computed(() => overview.value?.ttft?.avg_ms ?? null) +const ttftMaxMs = computed(() => overview.value?.ttft?.max_ms ?? null) // --- WebSocket status --- @@ -883,24 +885,37 @@ function openJobsDetails() { - +
- - - -
{{ t('admin.ops.average') }}
QPS: {{ qpsAvgLabel }} TPS: {{ tpsAvgLabel }}
+ + +
+ + + + + +
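Note: the gateway-side changes earlier in this series repeat the same block at every failure site: extract the upstream message, sanitize it, and attach the response body only when gateway.log_upstream_error_body is enabled, capped at log_upstream_error_body_max_bytes with a 2048-byte fallback. A minimal sketch of how that block could be factored into a single helper inside package service; recordUpstreamError is a hypothetical name, not something this series adds, and it reuses the helpers the patch already relies on (extractUpstreamErrorMessage, sanitizeUpstreamErrorMessage, truncateString, appendOpsUpstreamError).

```go
// Hypothetical helper, not part of this series: it mirrors the inline pattern
// repeated at each gateway error site (extract message -> sanitize -> optional
// truncated detail -> appendOpsUpstreamError) so call sites stay one line long.
func (s *GatewayService) recordUpstreamError(c *gin.Context, account *Account, kind string, statusCode int, requestID string, body []byte) {
	msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(extractUpstreamErrorMessage(body)))

	detail := ""
	if s.cfg != nil && s.cfg.Gateway.LogUpstreamErrorBody {
		maxBytes := s.cfg.Gateway.LogUpstreamErrorBodyMaxBytes
		if maxBytes <= 0 {
			maxBytes = 2048 // same fallback used by the inline sites
		}
		detail = truncateString(string(body), maxBytes)
	}

	appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
		Platform:           account.Platform,
		AccountID:          account.ID,
		UpstreamStatusCode: statusCode,
		UpstreamRequestID:  requestID,
		Kind:               kind,
		Message:            msg,
		Detail:             detail,
	})
}
```

A failover site would then reduce to s.recordUpstreamError(c, account, "failover", resp.StatusCode, resp.Header.Get("x-request-id"), respBody).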
From e5857161ffde222e691002cf5df1589792a7b579 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 15:31:48 +0800 Subject: [PATCH 23/53] =?UTF-8?q?feat(ops):=20=E5=A2=9E=E5=BC=BA=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E8=AF=A6=E6=83=85=E5=BC=B9=E7=AA=97=E4=B8=8EAPI?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **前端改动**: 1. OpsErrorDetailModal.vue: - 新增上游错误详情展示功能 - 支持查看上游错误的请求头、响应体等调试信息 - 改进错误信息格式化与可读性 2. ops.ts API: - 新增getUpstreamErrors接口调用上游错误查询API **后端配置**: - config.go/config.yaml/deploy/config.example.yaml: - 更新配置支持上游错误事件记录开关 - 添加相关配置项文档说明 --- backend/internal/config/config.go | 2 +- config.yaml | 2 +- deploy/config.example.yaml | 2 +- frontend/src/api/admin/ops.ts | 6 ++ .../ops/components/OpsErrorDetailModal.vue | 97 +++++++++++++++++++ 5 files changed, 106 insertions(+), 3 deletions(-) diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 579e498a..25c6cb65 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -635,7 +635,7 @@ func setDefaults() { // Gateway viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久 - viper.SetDefault("gateway.log_upstream_error_body", false) + viper.SetDefault("gateway.log_upstream_error_body", true) viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048) viper.SetDefault("gateway.inject_beta_for_apikey", false) viper.SetDefault("gateway.failover_on_400", false) diff --git a/config.yaml b/config.yaml index 106de2c3..13e7977c 100644 --- a/config.yaml +++ b/config.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 diff --git a/deploy/config.example.yaml b/deploy/config.example.yaml index 87ff3148..7ca26968 100644 --- a/deploy/config.example.yaml +++ b/deploy/config.example.yaml @@ -159,7 +159,7 @@ gateway: max_line_size: 41943040 # Log upstream error response body summary (safe/truncated; does not log request content) # 记录上游错误响应体摘要(安全/截断;不记录请求内容) - log_upstream_error_body: false + log_upstream_error_body: true # Max bytes to log from upstream error body # 记录上游错误响应体的最大字节数 log_upstream_error_body_max_bytes: 2048 diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 42b9e70d..3c39a32b 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -704,6 +704,12 @@ export interface OpsErrorDetail extends OpsErrorLog { error_body: string user_agent: string + // Upstream context (optional; enriched by gateway services) + upstream_status_code?: number | null + upstream_error_message?: string + upstream_error_detail?: string + upstream_errors?: string + auth_latency_ms?: number | null routing_latency_ms?: number | null upstream_latency_ms?: number | null diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue index f8166040..0726bacd 100644 --- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue +++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue @@ -177,6 +177,81 @@ + +
+

+ {{ t('admin.ops.errorDetails.upstreamErrors') }} +

+ +
+
+
status
+
+ {{ detail.upstream_status_code != null ? detail.upstream_status_code : '—' }} +
+
+
+
message
+
+ {{ detail.upstream_error_message || '—' }} +
+
+
+ +
+
detail
+
{{ prettyJSON(detail.upstream_error_detail) }}
+
+ +
+
upstream_errors
+ +
+
+
+
+ #{{ idx + 1 }} {{ ev.kind }} +
+
+ {{ ev.at_unix_ms ? formatDateTime(new Date(ev.at_unix_ms)) : '' }} +
+
+ +
+
account_id: {{ ev.account_id ?? '—' }}
+
status: {{ ev.upstream_status_code ?? '—' }}
+
+ request_id: {{ ev.upstream_request_id || '—' }} +
+
+ +
+ {{ ev.message }} +
+ +
{{ prettyJSON(ev.detail) }}
+
+
+ +
{{ prettyJSON(detail.upstream_errors) }}
+
+
+
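Note: the script hunk below parses detail.upstream_errors with JSON.parse; on the backend that string is produced by marshalOpsUpstreamErrors from the earlier ops_upstream_context.go change. A small round-trip sketch, written as an illustrative Go test that is not part of this series, showing that the stored value decodes back into events and that an empty list is stored as NULL rather than "[]":

```go
package service

import (
	"encoding/json"
	"testing"
)

// Illustrative round-trip check (not part of this series): the JSON string
// stored in ops_error_logs.upstream_errors decodes back into events, which is
// the same shape the frontend modal parses.
func TestMarshalOpsUpstreamErrorsRoundTrip(t *testing.T) {
	events := []*OpsUpstreamErrorEvent{{
		Platform:           "gemini", // example value
		AccountID:          42,
		UpstreamStatusCode: 503,
		Kind:               "failover",
		Message:            "upstream overloaded",
	}}

	raw := marshalOpsUpstreamErrors(events)
	if raw == nil {
		t.Fatal("expected a JSON string for a non-empty event list")
	}

	var decoded []OpsUpstreamErrorEvent
	if err := json.Unmarshal([]byte(*raw), &decoded); err != nil {
		t.Fatalf("stored value is not valid JSON: %v", err)
	}
	if len(decoded) != 1 || decoded[0].UpstreamStatusCode != 503 {
		t.Fatalf("unexpected round-trip result: %+v", decoded)
	}

	// An empty list must not be stored as "[]".
	if marshalOpsUpstreamErrors(nil) != nil {
		t.Fatal("expected nil for an empty event list")
	}
}
```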
@@ -259,6 +334,28 @@ const title = computed(() => { const emptyText = computed(() => 'No error selected.') +type UpstreamErrorEvent = { + at_unix_ms?: number + platform?: string + account_id?: number + upstream_status_code?: number + upstream_request_id?: string + kind?: string + message?: string + detail?: string +} + +const upstreamErrors = computed(() => { + const raw = detail.value?.upstream_errors + if (!raw) return [] + try { + const parsed = JSON.parse(raw) + return Array.isArray(parsed) ? (parsed as UpstreamErrorEvent[]) : [] + } catch { + return [] + } +}) + function close() { emit('update:show', false) } From e4bc9f6fb05ce0383c5c3f6e9da6fa487a7f2634 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 15:50:26 +0800 Subject: [PATCH 24/53] =?UTF-8?q?feat(ops):=20=E4=BC=98=E5=8C=96=E4=BB=AA?= =?UTF-8?q?=E8=A1=A8=E7=9B=98Header=E5=93=8D=E5=BA=94=E5=BC=8F=E5=B8=83?= =?UTF-8?q?=E5=B1=80=E4=B8=8E=E6=8C=87=E6=A0=87=E5=B1=95=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **响应式优化**: - 添加flex-wrap支持窄屏时间选择器自动换行 - 当前QPS/TPS在窄屏时自动换行,避免溢出 - 时间按钮在窄屏使用更小字号和间距(9px/1.5px) - 当前数值使用响应式字体(xl→sm:2xl) **指标展示优化**: 1. 请求卡片: - 标题简化:总请求 → 请求 - 字段调整:请求 → 请求数 - 移除:平均延迟、平均首字延迟(避免冗余) 2. 延迟和TTFT卡片: - 布局:grid → flex-wrap(自适应布局) - 指标不换行:添加whitespace-nowrap - 最小宽度:min-w-[60px]保证可读性 - 单位内联:名称、数值、单位在同一行(P95: 123 ms) - 自动换行:整个指标项作为整体换行 **效果**: - 窄屏:所有元素自动适配,无溢出 - 宽屏:充分利用空间,清晰展示 - 灵活布局:根据容器宽度自动调整指标排列 --- .../ops/components/OpsDashboardHeader.vue | 186 ++++++++++-------- 1 file changed, 101 insertions(+), 85 deletions(-) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 05b711d4..312642c3 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -833,9 +833,9 @@ function openJobsDetails() {
-
+
-
+
@@ -844,12 +844,12 @@ function openJobsDetails() {
-
+
-
- +
+
{{ t('admin.ops.current') }}
-
- {{ displayRealTimeQps.toFixed(1) }} - QPS -
-
- TPS: {{ displayRealTimeTps.toFixed(1) }} +
+
+ {{ displayRealTimeQps.toFixed(1) }} + QPS +
+
+ {{ displayRealTimeTps.toFixed(1) }} + TPS +
- -
-
{{ t('admin.ops.peak') }}
-
- {{ qpsPeakLabel }} - QPS + +
+ +
+
{{ t('admin.ops.peak') }}
+
+
+ {{ qpsPeakLabel }} + QPS +
+
+ {{ tpsPeakLabel }} + TPS +
+
-
- TPS: {{ tpsPeakLabel }} + + +
+
{{ t('admin.ops.average') }}
+
+
+ {{ qpsAvgLabel }} + QPS +
+
+ {{ tpsAvgLabel }} + TPS +
+
- -
-
{{ t('admin.ops.average') }}
-
- QPS: {{ qpsAvgLabel }} - - TPS: {{ tpsAvgLabel }} -
- - -
- - - - - -
+ +
+ + + + +
@@ -924,11 +938,11 @@ function openJobsDetails() {
- +
- {{ t('admin.ops.totalRequests') }} + {{ t('admin.ops.requests') }}
@@ -1018,26 +1024,31 @@ function openJobsDetails() {
ms (P99)
-
-
+
+
P95: - {{ durationP95Ms ?? '-' }}ms + {{ durationP95Ms ?? '-' }} + ms
-
+
P90: - {{ durationP90Ms ?? '-' }}ms + {{ durationP90Ms ?? '-' }} + ms
-
+
P50: - {{ durationP50Ms ?? '-' }}ms + {{ durationP50Ms ?? '-' }} + ms
-
+
Avg: - {{ durationAvgMs ?? '-' }}ms + {{ durationAvgMs ?? '-' }} + ms
-
+
Max: - {{ durationMaxMs ?? '-' }}ms + {{ durationMaxMs ?? '-' }} + ms
@@ -1063,26 +1074,31 @@ function openJobsDetails() {
ms (P99)
-
-
+
+
P95: - {{ ttftP95Ms ?? '-' }}ms + {{ ttftP95Ms ?? '-' }} + ms
-
+
P90: - {{ ttftP90Ms ?? '-' }}ms + {{ ttftP90Ms ?? '-' }} + ms
-
+
P50: - {{ ttftP50Ms ?? '-' }}ms + {{ ttftP50Ms ?? '-' }} + ms
-
+
Avg: - {{ ttftAvgMs ?? '-' }}ms + {{ ttftAvgMs ?? '-' }} + ms
-
+
Max: - {{ ttftMaxMs ?? '-' }}ms + {{ ttftMaxMs ?? '-' }} + ms
From abbde130abc40c1fa507ed6ddc107debd3bfe39e Mon Sep 17 00:00:00 2001 From: cyhhao Date: Sun, 11 Jan 2026 18:43:47 +0800 Subject: [PATCH 25/53] Revert Codex OAuth fallback handling --- .../service/openai_codex_transform.go | 124 ------------------ 1 file changed, 124 deletions(-) diff --git a/backend/internal/service/openai_codex_transform.go b/backend/internal/service/openai_codex_transform.go index 965fb770..94e74f22 100644 --- a/backend/internal/service/openai_codex_transform.go +++ b/backend/internal/service/openai_codex_transform.go @@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { existingInstructions = strings.TrimSpace(existingInstructions) if instructions != "" { - if existingInstructions != "" && existingInstructions != instructions { - if input, ok := reqBody["input"].([]any); ok { - reqBody["input"] = prependSystemInstruction(input, existingInstructions) - result.Modified = true - } - } if existingInstructions != instructions { reqBody["instructions"] = instructions result.Modified = true @@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult { if input, ok := reqBody["input"].([]any); ok { input = filterCodexInput(input) - input = normalizeOrphanedToolOutputs(input) reqBody["input"] = input result.Modified = true } @@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any { return filtered } -func prependSystemInstruction(input []any, instructions string) []any { - message := map[string]any{ - "role": "system", - "content": []any{ - map[string]any{ - "type": "input_text", - "text": instructions, - }, - }, - } - return append([]any{message}, input...) -} - func normalizeCodexTools(reqBody map[string]any) bool { rawTools, ok := reqBody["tools"] if !ok || rawTools == nil { @@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool { return modified } -func normalizeOrphanedToolOutputs(input []any) []any { - functionCallIDs := map[string]bool{} - localShellCallIDs := map[string]bool{} - customToolCallIDs := map[string]bool{} - - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - continue - } - callID := getCallID(m) - if callID == "" { - continue - } - switch m["type"] { - case "function_call": - functionCallIDs[callID] = true - case "local_shell_call": - localShellCallIDs[callID] = true - case "custom_tool_call": - customToolCallIDs[callID] = true - } - } - - output := make([]any, 0, len(input)) - for _, item := range input { - m, ok := item.(map[string]any) - if !ok { - output = append(output, item) - continue - } - switch m["type"] { - case "function_call_output": - callID := getCallID(m) - if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "custom_tool_call_output": - callID := getCallID(m) - if callID == "" || !customToolCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - case "local_shell_call_output": - callID := getCallID(m) - if callID == "" || !localShellCallIDs[callID] { - output = append(output, convertOrphanedOutputToMessage(m, callID)) - continue - } - } - output = append(output, m) - } - return output -} - -func getCallID(item map[string]any) string { - raw, ok := item["call_id"] - if !ok { - return "" - } - callID, ok := raw.(string) - if !ok { - return "" - } - callID = strings.TrimSpace(callID) - if callID == "" { - return "" - } - return callID -} - -func 
convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any { - toolName := "tool" - if name, ok := item["name"].(string); ok && name != "" { - toolName = name - } - labelID := callID - if labelID == "" { - labelID = "unknown" - } - text := stringifyOutput(item["output"]) - if len(text) > 16000 { - text = text[:16000] + "\n...[truncated]" - } - return map[string]any{ - "type": "message", - "role": "assistant", - "content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text), - } -} - -func stringifyOutput(output any) string { - switch v := output.(type) { - case string: - return v - default: - if data, err := json.Marshal(v); err == nil { - return string(data) - } - return fmt.Sprintf("%v", v) - } -} - func codexCachePath(filename string) string { home, err := os.UserHomeDir() if err != nil { From f541636840b76961d856a671b6a4f44a32b0ebcc Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 19:50:43 +0800 Subject: [PATCH 26/53] =?UTF-8?q?feat(ops):=20=E4=BC=98=E5=8C=96=E8=AD=A6?= =?UTF-8?q?=E6=8A=A5=E8=A7=84=E5=88=99=E5=92=8C=E8=AE=BE=E7=BD=AE=E7=9A=84?= =?UTF-8?q?=E6=88=90=E5=8A=9F=E6=8F=90=E7=A4=BA=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加警报规则保存成功提示:"警报规则保存成功" - 添加警报规则删除成功提示:"警报规则删除成功" - 添加运维监控设置保存成功提示:"运维监控设置保存成功" - 替换通用的"操作成功"提示为具体的业务提示 - 失败时显示后端返回的详细错误信息 相关文件: - frontend/src/i18n/locales/zh.ts - frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue - frontend/src/views/admin/ops/components/OpsSettingsDialog.vue --- frontend/src/i18n/locales/zh.ts | 42 ++ .../ops/components/OpsAlertRulesCard.vue | 4 +- .../ops/components/OpsSettingsDialog.vue | 395 ++++++++++++++++++ 3 files changed, 439 insertions(+), 2 deletions(-) create mode 100644 frontend/src/views/admin/ops/components/OpsSettingsDialog.vue diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index dacf2c61..95406179 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -154,6 +154,7 @@ export default { saving: '保存中...', selectedCount: '(已选 {count} 个)', refresh: '刷新', + settings: '设置', notAvailable: '不可用', now: '现在', unknown: '未知', @@ -2205,13 +2206,16 @@ export default { loading: '加载中...', empty: '暂无告警规则', loadFailed: '加载告警规则失败', + saveSuccess: '警报规则保存成功', saveFailed: '保存告警规则失败', + deleteSuccess: '警报规则删除成功', deleteFailed: '删除告警规则失败', create: '新建规则', createTitle: '新建告警规则', editTitle: '编辑告警规则', deleteConfirmTitle: '确认删除该规则?', deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?', + manage: '预警规则', metrics: { successRate: '成功率 (%)', errorRate: '错误率 (%)', @@ -2350,6 +2354,42 @@ export default { accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间' } }, + settings: { + title: '运维监控设置', + loadFailed: '加载设置失败', + saveSuccess: '运维监控设置保存成功', + saveFailed: '保存设置失败', + dataCollection: '数据采集', + evaluationInterval: '评估间隔(秒)', + evaluationIntervalHint: '检测任务的执行频率,建议保持默认', + alertConfig: '预警配置', + enableAlert: '开启预警', + alertRecipients: '预警接收邮箱', + emailPlaceholder: '输入邮箱地址', + recipientsHint: '若为空,系统将使用第一个管理员邮箱作为默认收件人', + minSeverity: '最低级别', + reportConfig: '评估报告配置', + enableReport: '开启评估报告', + reportRecipients: '评估报告接收邮箱', + dailySummary: '每日摘要', + weeklySummary: '每周摘要', + advancedSettings: '高级设置', + dataRetention: '数据保留策略', + enableCleanup: '启用数据清理', + cleanupSchedule: '清理计划(Cron)', + cleanupScheduleHint: '例如:0 2 * * * 表示每天凌晨2点', + errorLogRetentionDays: '错误日志保留天数', + minuteMetricsRetentionDays: 
'分钟指标保留天数', + hourlyMetricsRetentionDays: '小时指标保留天数', + retentionDaysHint: '建议保留7-90天,过长会占用存储空间', + aggregation: '预聚合任务', + enableAggregation: '启用预聚合任务', + aggregationHint: '预聚合可提升长时间窗口查询性能', + validation: { + title: '请先修正以下问题', + retentionDaysRange: '保留天数必须在1-365天之间' + } + }, concurrency: { title: '并发 / 排队', byPlatform: '按平台', @@ -2383,10 +2423,12 @@ export default { accountError: '异常' }, tooltips: { + totalRequests: '当前时间窗口内的总请求数和Token消耗量。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', latencyHistogram: '成功请求的延迟分布(毫秒)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', errorDistribution: '按状态码统计的错误分布。', + upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。', goroutines: 'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。', cpu: 'CPU 使用率,显示系统处理器的负载情况。', diff --git a/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue b/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue index 6bf1dcae..edf8c40c 100644 --- a/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue +++ b/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue @@ -136,7 +136,7 @@ async function save() { draft.value = null editingId.value = null await load() - appStore.showSuccess(t('common.success')) + appStore.showSuccess(t('admin.ops.alertRules.saveSuccess')) } catch (err: any) { console.error('[OpsAlertRulesCard] Failed to save rule', err) appStore.showError(err?.response?.data?.detail || t('admin.ops.alertRules.saveFailed')) @@ -160,7 +160,7 @@ async function confirmDelete() { showDeleteConfirm.value = false pendingDelete.value = null await load() - appStore.showSuccess(t('common.success')) + appStore.showSuccess(t('admin.ops.alertRules.deleteSuccess')) } catch (err: any) { console.error('[OpsAlertRulesCard] Failed to delete rule', err) appStore.showError(err?.response?.data?.detail || t('admin.ops.alertRules.deleteFailed')) diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue new file mode 100644 index 00000000..968c5081 --- /dev/null +++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue @@ -0,0 +1,395 @@ + + + From 988b4d0254635ecd127fedeb5bdbc99f23aee996 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 19:51:18 +0800 Subject: [PATCH 27/53] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0=E9=AB=98?= =?UTF-8?q?=E7=BA=A7=E8=AE=BE=E7=BD=AEAPI=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增OpsAdvancedSettings数据模型 - 支持数据保留策略配置(错误日志、分钟级指标、小时级指标) - 支持数据聚合开关配置 - 添加GET/PUT /admin/ops/advanced-settings接口 - 添加配置校验和默认值处理 相关文件: - backend/internal/service/ops_settings_models.go - backend/internal/service/ops_settings.go - backend/internal/handler/admin/ops_settings_handler.go - backend/internal/server/routes/admin.go - backend/internal/service/domain_constants.go --- .../handler/admin/ops_settings_handler.go | 46 +++++++ backend/internal/server/routes/admin.go | 4 + backend/internal/service/domain_constants.go | 3 + backend/internal/service/ops_settings.go | 112 ++++++++++++++++++ .../internal/service/ops_settings_models.go | 18 +++ 5 files changed, 183 insertions(+) diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go index e76c1b20..deac13b7 100644 --- a/backend/internal/handler/admin/ops_settings_handler.go +++ 
b/backend/internal/handler/admin/ops_settings_handler.go @@ -101,3 +101,49 @@ func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) { response.Success(c, updated) } +// GetAdvancedSettings returns Ops advanced settings (DB-backed). +// GET /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings") + return + } + response.Success(c, cfg) +} + +// UpdateAdvancedSettings updates Ops advanced settings (DB-backed). +// PUT /api/v1/admin/ops/advanced-settings +func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsAdvancedSettings + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} + diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index e3385ef1..f3e66d04 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -92,6 +92,10 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings) } + // Advanced settings (DB-backed) + ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings) + ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings) + // WebSocket realtime (QPS/TPS) ws := ops.Group("/ws") { diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go index 4fcebe2b..398d9fbd 100644 --- a/backend/internal/service/domain_constants.go +++ b/backend/internal/service/domain_constants.go @@ -143,6 +143,9 @@ const ( // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60). SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds" + + // SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation). + SettingKeyOpsAdvancedSettings = "ops_advanced_settings" ) // AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys). 
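The handler and route additions above expose GET/PUT /api/v1/admin/ops/advanced-settings and persist the payload as JSON under the ops_advanced_settings setting key. Below is a minimal smoke-test sketch (not part of the patch) for driving the PUT endpoint; the base URL/port, the Bearer-token auth scheme, and the response envelope shape are assumptions for illustration only — only the route path and the snake_case field names come from the diffs in this series.

// advanced_settings_put_sketch.go — hypothetical client, assumptions noted above.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
)

// Mirrors the JSON shape of service.OpsAdvancedSettings introduced in this patch.
type opsAdvancedSettings struct {
	DataRetention struct {
		CleanupEnabled             bool   `json:"cleanup_enabled"`
		CleanupSchedule            string `json:"cleanup_schedule"`
		ErrorLogRetentionDays      int    `json:"error_log_retention_days"`
		MinuteMetricsRetentionDays int    `json:"minute_metrics_retention_days"`
		HourlyMetricsRetentionDays int    `json:"hourly_metrics_retention_days"`
	} `json:"data_retention"`
	Aggregation struct {
		AggregationEnabled bool `json:"aggregation_enabled"`
	} `json:"aggregation"`
}

func main() {
	const endpoint = "http://localhost:8080/api/v1/admin/ops/advanced-settings" // assumed host/port
	const token = "REPLACE_WITH_ADMIN_TOKEN"                                    // assumed auth scheme

	// Keep retention values inside the validated 1-365 day range.
	var cfg opsAdvancedSettings
	cfg.DataRetention.CleanupEnabled = true
	cfg.DataRetention.CleanupSchedule = "0 2 * * *"
	cfg.DataRetention.ErrorLogRetentionDays = 30
	cfg.DataRetention.MinuteMetricsRetentionDays = 30
	cfg.DataRetention.HourlyMetricsRetentionDays = 90
	cfg.Aggregation.AggregationEnabled = true

	body, err := json.Marshal(cfg)
	if err != nil {
		log.Fatal(err)
	}

	req, err := http.NewRequest(http.MethodPut, endpoint, bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	raw, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(raw)) // envelope shape depends on the response helper; printed raw here
}

A GET against the same path (no body) should return the stored settings, falling back to the 30-day defaults seeded by the service when the key is missing, as the ops_settings.go diff below shows.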
diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go index 2f15bc79..00db8e99 100644 --- a/backend/internal/service/ops_settings.go +++ b/backend/internal/service/ops_settings.go @@ -352,3 +352,115 @@ func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *Ops return updated, nil } +// ========================= +// Advanced settings +// ========================= + +func defaultOpsAdvancedSettings() *OpsAdvancedSettings { + return &OpsAdvancedSettings{ + DataRetention: OpsDataRetentionSettings{ + CleanupEnabled: false, + CleanupSchedule: "0 2 * * *", + ErrorLogRetentionDays: 30, + MinuteMetricsRetentionDays: 30, + HourlyMetricsRetentionDays: 30, + }, + Aggregation: OpsAggregationSettings{ + AggregationEnabled: false, + }, + } +} + +func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) { + if cfg == nil { + return + } + cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule) + if cfg.DataRetention.CleanupSchedule == "" { + cfg.DataRetention.CleanupSchedule = "0 2 * * *" + } + if cfg.DataRetention.ErrorLogRetentionDays <= 0 { + cfg.DataRetention.ErrorLogRetentionDays = 30 + } + if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 { + cfg.DataRetention.MinuteMetricsRetentionDays = 30 + } + if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 { + cfg.DataRetention.HourlyMetricsRetentionDays = 30 + } +} + +func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error { + if cfg == nil { + return errors.New("invalid config") + } + if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 { + return errors.New("error_log_retention_days must be between 1 and 365") + } + if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 { + return errors.New("minute_metrics_retention_days must be between 1 and 365") + } + if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 { + return errors.New("hourly_metrics_retention_days must be between 1 and 365") + } + return nil +} + +func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) { + defaultCfg := defaultOpsAdvancedSettings() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsAdvancedSettings{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + normalizeOpsAdvancedSettings(cfg) + return cfg, nil +} + +func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + if err := validateOpsAdvancedSettings(cfg); err != nil { + return nil, err + } + + normalizeOpsAdvancedSettings(cfg) + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil { + return nil, err + 
} + + updated := &OpsAdvancedSettings{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} + diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go index 78399c49..52a9db66 100644 --- a/backend/internal/service/ops_settings_models.go +++ b/backend/internal/service/ops_settings_models.go @@ -68,3 +68,21 @@ type OpsAlertRuntimeSettings struct { Silencing OpsAlertSilencingSettings `json:"silencing"` } +// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation). +type OpsAdvancedSettings struct { + DataRetention OpsDataRetentionSettings `json:"data_retention"` + Aggregation OpsAggregationSettings `json:"aggregation"` +} + +type OpsDataRetentionSettings struct { + CleanupEnabled bool `json:"cleanup_enabled"` + CleanupSchedule string `json:"cleanup_schedule"` + ErrorLogRetentionDays int `json:"error_log_retention_days"` + MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"` +} + +type OpsAggregationSettings struct { + AggregationEnabled bool `json:"aggregation_enabled"` +} + From a39316e004baff96487423ac40093b9dabeda3df Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 19:51:37 +0800 Subject: [PATCH 28/53] =?UTF-8?q?feat(ops):=20=E9=9B=86=E6=88=90=E8=BF=90?= =?UTF-8?q?=E7=BB=B4=E7=9B=91=E6=8E=A7=E8=AE=BE=E7=BD=AE=E5=AF=B9=E8=AF=9D?= =?UTF-8?q?=E6=A1=86=E5=88=B0=E4=BB=AA=E8=A1=A8=E7=9B=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在OpsDashboardHeader添加设置和警报规则按钮 - 在OpsDashboard集成OpsSettingsDialog组件 - 添加警报规则弹窗展示 - 添加高级设置API类型定义 - 支持从Header快速访问设置和规则管理 相关文件: - frontend/src/api/admin/ops.ts - frontend/src/views/admin/ops/types.ts - frontend/src/views/admin/ops/OpsDashboard.vue - frontend/src/views/admin/ops/components/OpsDashboardHeader.vue --- frontend/src/api/admin/ops.ts | 32 +++++++++++++++++- frontend/src/views/admin/ops/OpsDashboard.vue | 16 +++++++++ .../ops/components/OpsDashboardHeader.vue | 33 +++++++++++++++++-- frontend/src/views/admin/ops/types.ts | 5 ++- 4 files changed, 82 insertions(+), 4 deletions(-) diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 3c39a32b..c0df4605 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -676,6 +676,23 @@ export interface OpsAlertRuntimeSettings { } } +export interface OpsAdvancedSettings { + data_retention: OpsDataRetentionSettings + aggregation: OpsAggregationSettings +} + +export interface OpsDataRetentionSettings { + cleanup_enabled: boolean + cleanup_schedule: string + error_log_retention_days: number + minute_metrics_retention_days: number + hourly_metrics_retention_days: number +} + +export interface OpsAggregationSettings { + aggregation_enabled: boolean +} + export interface OpsErrorLog { id: number created_at: string @@ -894,6 +911,17 @@ export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings return data } +// Advanced settings (DB-backed) +export async function getAdvancedSettings(): Promise { + const { data } = await apiClient.get('/admin/ops/advanced-settings') + return data +} + +export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise { + const { data } = await apiClient.put('/admin/ops/advanced-settings', config) + return data +} + export const opsAPI = { getDashboardOverview, getThroughputTrend, @@ -915,7 +943,9 @@ export const opsAPI = 
{ getEmailNotificationConfig, updateEmailNotificationConfig, getAlertRuntimeSettings, - updateAlertRuntimeSettings + updateAlertRuntimeSettings, + getAdvancedSettings, + updateAdvancedSettings } export default opsAPI diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index 212717fb..e8fedc5a 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -31,6 +31,8 @@ @refresh="fetchData" @open-request-details="handleOpenRequestDetails" @open-error-details="openErrorDetails" + @open-settings="showSettingsDialog = true" + @open-alert-rules="showAlertRulesCard = true" /> @@ -72,6 +74,14 @@ + + + + + + + + ({ sort: 'created_at_desc' }) +const showSettingsDialog = ref(false) +const showAlertRulesCard = ref(false) + function handleThroughputSelectPlatform(nextPlatform: string) { platform.value = nextPlatform || '' groupId.value = null diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 312642c3..23609a06 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -34,6 +34,8 @@ interface Emits { (e: 'refresh'): void (e: 'openRequestDetails', preset?: OpsRequestDetailsPreset): void (e: 'openErrorDetails', kind: 'request' | 'upstream'): void + (e: 'openSettings'): void + (e: 'openAlertRules'): void } const props = defineProps() @@ -723,6 +725,33 @@ function openJobsDetails() { /> + + + + + +
@@ -955,11 +984,11 @@ function openJobsDetails() {
- 请求数: + {{ t('admin.ops.requests') }}: {{ totalRequestsLabel }}
- Token: + {{ t('admin.ops.tokens') }}: {{ totalTokensLabel }}
diff --git a/frontend/src/views/admin/ops/types.ts b/frontend/src/views/admin/ops/types.ts index 08830542..45ba031f 100644 --- a/frontend/src/views/admin/ops/types.ts +++ b/frontend/src/views/admin/ops/types.ts @@ -13,5 +13,8 @@ export type { Operator, EmailNotificationConfig, OpsDistributedLockSettings, - OpsAlertRuntimeSettings + OpsAlertRuntimeSettings, + OpsAdvancedSettings, + OpsDataRetentionSettings, + OpsAggregationSettings } from '@/api/admin/ops' From 63dc6a68dfb76ef21c5207de57a224b5a36fb25d Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Sun, 11 Jan 2026 19:58:38 +0800 Subject: [PATCH 29/53] =?UTF-8?q?feat(ops):=20=E9=9A=90=E8=97=8F=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=E6=A8=A1=E5=BC=8F=E9=80=89=E6=8B=A9=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在OpsDashboardHeader中隐藏queryMode选择器(使用v-if="false") - 保留所有后端逻辑和前端状态管理 - auto模式逻辑:优先使用预聚合数据,不存在时回退到实时计算 - 用户界面更简洁,后端自动选择最优查询方式 相关文件: - frontend/src/views/admin/ops/components/OpsDashboardHeader.vue --- frontend/src/views/admin/ops/components/OpsDashboardHeader.vue | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 23609a06..fb622eaa 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -703,6 +703,7 @@ function openJobsDetails() { /> +
+

{{ selectedMetricDefinition.description }}

+

+ {{ + t('admin.ops.alertRules.hints.recommended', { + operator: selectedMetricDefinition.recommendedOperator, + threshold: selectedMetricDefinition.recommendedThreshold, + unit: selectedMetricDefinition.unit || '' + }) + }} +

+
@@ -328,6 +530,23 @@ function cancelDelete() { +

+ {{ isGroupMetricSelected ? t('admin.ops.alertRules.hints.groupRequired') : t('admin.ops.alertRules.hints.groupOptional') }} +

+
+
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index fb622eaa..afc17813 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -239,47 +239,6 @@ const ttftP50Ms = computed(() => overview.value?.ttft?.p50_ms ?? null) const ttftAvgMs = computed(() => overview.value?.ttft?.avg_ms ?? null) const ttftMaxMs = computed(() => overview.value?.ttft?.max_ms ?? null) -// --- WebSocket status --- - -const wsStatusLabel = computed(() => { - switch (props.wsStatus) { - case 'connected': - return t('admin.ops.realtime.connected') - case 'connecting': - return t('admin.ops.realtime.connecting') - case 'reconnecting': - return t('admin.ops.realtime.reconnecting') - case 'offline': - return t('admin.ops.realtime.offline') - case 'closed': - default: - return t('admin.ops.realtime.closed') - } -}) - -const wsStatusDotClass = computed(() => { - switch (props.wsStatus) { - case 'connected': - return 'bg-green-500' - case 'reconnecting': - case 'connecting': - return 'bg-yellow-500' - case 'offline': - return 'bg-orange-500' - case 'closed': - default: - return 'bg-gray-400' - } -}) - -const wsReconnectHint = computed(() => { - if (props.wsStatus !== 'reconnecting') return '' - const delayMs = props.wsReconnectInMs ?? null - if (typeof delayMs !== 'number' || !Number.isFinite(delayMs) || delayMs <= 0) return '' - const sec = Math.max(1, Math.ceil(delayMs / 1000)) - return t('admin.ops.realtime.reconnectIn', { seconds: sec }) -}) - // --- Health Score & Diagnosis (primary) --- const isSystemIdle = computed(() => { @@ -662,19 +621,14 @@ function openJobsDetails() { · {{ t('common.refresh') }}: {{ updatedAtLabel }} - · - - - - +
@@ -1189,14 +1143,6 @@ function openJobsDetails() {
-
- - {{ t('admin.ops.collectedAt') }} {{ formatTimeShort(systemMetrics.created_at) }} - ({{ t('admin.ops.window') }} {{ systemMetrics.window_minutes }}m) - - {{ t('admin.ops.noSystemMetrics') }} -
-
diff --git a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue index e1063bce..3044ee3a 100644 --- a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue +++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue @@ -150,11 +150,10 @@ const kindBadgeClass = (kind: string) => {