From 4b9e47cec915f4ca1709e206a31dd43937d2a4af Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:51:41 +0800
Subject: [PATCH 01/53] feat(infrastructure): add base configuration and dependencies for ops monitoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Update .gitignore to exclude temporary files
- Add ops monitoring configuration options to config.yaml
- Update Go dependencies (go.mod/go.sum)
- Extend config.go to support ops monitoring configuration
- Add context key definitions (ClientRequestID, RetryCount)
---
.gitignore | 2 -
backend/go.mod | 4 +-
backend/go.sum | 4 ++
backend/internal/config/config.go | 71 +++++++++++++++++++++++++++
backend/internal/pkg/ctxkey/ctxkey.go | 6 +++
config.yaml | 35 +++++++++++++
6 files changed, 119 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index 93ae19f3..ec218bfa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -123,6 +123,4 @@ backend/cmd/server/server
deploy/docker-compose.override.yml
.gocache/
vite.config.js
-!docs/
docs/*
-!docs/dependency-security.md
diff --git a/backend/go.mod b/backend/go.mod
index 9ac48305..97f599f8 100644
--- a/backend/go.mod
+++ b/backend/go.mod
@@ -8,9 +8,11 @@ require (
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/google/uuid v1.6.0
github.com/google/wire v0.7.0
+ github.com/gorilla/websocket v1.5.3
github.com/imroc/req/v3 v3.57.0
github.com/lib/pq v1.10.9
github.com/redis/go-redis/v9 v9.17.2
+ github.com/shirou/gopsutil/v4 v4.25.6
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.11.1
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
@@ -104,9 +106,9 @@ require (
github.com/quic-go/quic-go v0.57.1 // indirect
github.com/refraction-networking/utls v1.8.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
+ github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
- github.com/shirou/gopsutil/v4 v4.25.6 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
diff --git a/backend/go.sum b/backend/go.sum
index 38e2b53e..0adfa4de 100644
--- a/backend/go.sum
+++ b/backend/go.sum
@@ -113,6 +113,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18=
+github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
+github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
@@ -220,6 +222,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
+github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go
index e49c188b..6e66b22c 100644
--- a/backend/internal/config/config.go
+++ b/backend/internal/config/config.go
@@ -42,6 +42,7 @@ type Config struct {
Turnstile TurnstileConfig `mapstructure:"turnstile"`
Database DatabaseConfig `mapstructure:"database"`
Redis RedisConfig `mapstructure:"redis"`
+ Ops OpsConfig `mapstructure:"ops"`
JWT JWTConfig `mapstructure:"jwt"`
Default DefaultConfig `mapstructure:"default"`
RateLimit RateLimitConfig `mapstructure:"rate_limit"`
@@ -304,6 +305,47 @@ func (r *RedisConfig) Address() string {
return fmt.Sprintf("%s:%d", r.Host, r.Port)
}
+type OpsConfig struct {
+ // Enabled controls whether ops features should run.
+ //
+ // NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off.
+ // This config flag is the "hard switch" for deployments that want to disable ops completely.
+ Enabled bool `mapstructure:"enabled"`
+
+ // UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries.
+ UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"`
+
+ // Cleanup controls periodic deletion of old ops data to prevent unbounded growth.
+ Cleanup OpsCleanupConfig `mapstructure:"cleanup"`
+
+ // MetricsCollectorCache controls Redis caching for expensive per-window collector queries.
+ MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"`
+
+ // Pre-aggregation configuration.
+ Aggregation OpsAggregationConfig `mapstructure:"aggregation"`
+}
+
+type OpsCleanupConfig struct {
+ Enabled bool `mapstructure:"enabled"`
+ Schedule string `mapstructure:"schedule"`
+
+ // Retention days (0 disables that cleanup target).
+ //
+ // vNext requirement: default 30 days across ops datasets.
+ ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"`
+ MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"`
+ HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"`
+}
+
+type OpsAggregationConfig struct {
+ Enabled bool `mapstructure:"enabled"`
+}
+
+type OpsMetricsCollectorCacheConfig struct {
+ Enabled bool `mapstructure:"enabled"`
+ TTL time.Duration `mapstructure:"ttl"`
+}
+
type JWTConfig struct {
Secret string `mapstructure:"secret"`
ExpireHour int `mapstructure:"expire_hour"`
@@ -489,6 +531,20 @@ func setDefaults() {
viper.SetDefault("redis.pool_size", 128)
viper.SetDefault("redis.min_idle_conns", 10)
+ // Ops (vNext)
+ viper.SetDefault("ops.enabled", true)
+ viper.SetDefault("ops.use_preaggregated_tables", false)
+ viper.SetDefault("ops.cleanup.enabled", true)
+ viper.SetDefault("ops.cleanup.schedule", "0 2 * * *")
+ // Retention days: vNext defaults to 30 days across ops datasets.
+ viper.SetDefault("ops.cleanup.error_log_retention_days", 30)
+ viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30)
+ viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30)
+ viper.SetDefault("ops.aggregation.enabled", true)
+ viper.SetDefault("ops.metrics_collector_cache.enabled", true)
+ // TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits.
+ viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second)
+
// JWT
viper.SetDefault("jwt.secret", "")
viper.SetDefault("jwt.expire_hour", 24)
@@ -687,6 +743,21 @@ func (c *Config) Validate() error {
if c.Gateway.Scheduling.SlotCleanupInterval < 0 {
return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative")
}
+ if c.Ops.MetricsCollectorCache.TTL < 0 {
+ return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative")
+ }
+ if c.Ops.Cleanup.ErrorLogRetentionDays < 0 {
+ return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative")
+ }
+ if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 {
+ return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative")
+ }
+ if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 {
+ return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative")
+ }
+ if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" {
+ return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true")
+ }
if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 {
return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds")
}
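
The cleanup schedule above is a standard 5-field cron expression. A minimal sketch of how OpsCleanupConfig could drive a periodic job with robfig/cron/v3 (which this patch pulls into go.mod); the job body and wiring are assumptions, not the repository's actual implementation:

package main

import (
	"log"

	"github.com/robfig/cron/v3"
)

func main() {
	// Hypothetical wiring: schedule taken from ops.cleanup.schedule ("0 2 * * *").
	c := cron.New() // default parser: minute hour dom month dow
	if _, err := c.AddFunc("0 2 * * *", func() {
		// Assumed job body: delete ops rows older than the configured retention days.
		log.Println("ops cleanup tick")
	}); err != nil {
		log.Fatalf("invalid ops.cleanup.schedule: %v", err)
	}
	c.Start()
	select {} // block so the sketch keeps running
}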
diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go
index 8920ea69..61d98cc2 100644
--- a/backend/internal/pkg/ctxkey/ctxkey.go
+++ b/backend/internal/pkg/ctxkey/ctxkey.go
@@ -7,4 +7,10 @@ type Key string
const (
// ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置
ForcePlatform Key = "ctx_force_platform"
+
+	// ClientRequestID is the unique identifier of a client request, used to trace the full request lifecycle (for Ops monitoring and troubleshooting).
+ ClientRequestID Key = "ctx_client_request_id"
+
+	// RetryCount is the number of gateway-level retries for the current request (recorded for Ops troubleshooting).
+ RetryCount Key = "ctx_retry_count"
)
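
A minimal usage sketch for these keys (illustrative only; the helper functions are hypothetical and the import path is inferred from the module path used elsewhere in this series):

package main

import (
	"context"
	"fmt"

	"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
)

// withClientRequestID stores the client request ID on the context (hypothetical helper).
func withClientRequestID(ctx context.Context, id string) context.Context {
	return context.WithValue(ctx, ctxkey.ClientRequestID, id)
}

// clientRequestIDFrom reads it back for ops correlation (hypothetical helper).
func clientRequestIDFrom(ctx context.Context) (string, bool) {
	v, ok := ctx.Value(ctxkey.ClientRequestID).(string)
	return v, ok
}

func main() {
	ctx := withClientRequestID(context.Background(), "req-123")
	if id, ok := clientRequestIDFrom(ctx); ok {
		fmt.Println(id) // would end up in ops_error_logs.client_request_id
	}
}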
diff --git a/config.yaml b/config.yaml
index f43c9c19..0ce796e7 100644
--- a/config.yaml
+++ b/config.yaml
@@ -221,6 +221,41 @@ redis:
# 数据库编号(0-15)
db: 0
+# =============================================================================
+# Ops Monitoring (Optional)
+# 运维监控 (可选)
+# =============================================================================
+ops:
+ # Hard switch: disable all ops background jobs and APIs when false
+ # 硬开关:为 false 时禁用所有 Ops 后台任务与接口
+ enabled: true
+
+ # Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
+ # 优先使用预聚合表(用于长时间窗口查询性能)
+ use_preaggregated_tables: false
+
+ # Data cleanup configuration
+ # 数据清理配置(vNext 默认统一保留 30 天)
+ cleanup:
+ enabled: true
+ # Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
+ # Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
+ schedule: "0 2 * * *"
+ error_log_retention_days: 30
+ minute_metrics_retention_days: 30
+ hourly_metrics_retention_days: 30
+
+ # Pre-aggregation configuration
+ # 预聚合任务配置
+ aggregation:
+ enabled: true
+
+ # OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
+ # 指标采集 Redis 缓存(多副本部署时减少重复计算)
+ metrics_collector_cache:
+ enabled: true
+ ttl: 65s
+
# =============================================================================
# JWT Configuration
# JWT 配置
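
For reference, duration-style values above (for example metrics_collector_cache.ttl: 65s) decode into the time.Duration fields of OpsConfig through viper's standard string-to-duration handling; a small sketch, with the config path hard-coded as an assumption:

package main

import (
	"fmt"
	"time"

	"github.com/spf13/viper"
)

func main() {
	viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second)
	// After viper.ReadInConfig(), a YAML value like "65s" yields the same result.
	ttl := viper.GetDuration("ops.metrics_collector_cache.ttl")
	fmt.Println(ttl) // 1m5s
}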
From d55866d3755fb6c4b109e225fd52d62414a4c0e5 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:52:17 +0800
Subject: [PATCH 02/53] feat(database): add ops monitoring data models and database migration script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add ops monitoring database migration script (table schema definitions)
- Define core data models (ops_models.go)
- Define alert models (ops_alert_models.go)
- Define dashboard data models (ops_dashboard_models.go)
- Define realtime monitoring data models (ops_realtime_models.go)
- Define settings models (ops_settings_models.go)
- Define trend analysis data models (ops_trend_models.go)
---
backend/internal/service/ops_alert_models.go | 75 ++
.../internal/service/ops_dashboard_models.go | 83 ++
backend/internal/service/ops_models.go | 118 +++
.../internal/service/ops_realtime_models.go | 81 ++
.../internal/service/ops_settings_models.go | 70 ++
backend/internal/service/ops_trend_models.go | 65 ++
.../migrations/030_ops_monitoring_vnext.sql | 707 ++++++++++++++++++
7 files changed, 1199 insertions(+)
create mode 100644 backend/internal/service/ops_alert_models.go
create mode 100644 backend/internal/service/ops_dashboard_models.go
create mode 100644 backend/internal/service/ops_models.go
create mode 100644 backend/internal/service/ops_realtime_models.go
create mode 100644 backend/internal/service/ops_settings_models.go
create mode 100644 backend/internal/service/ops_trend_models.go
create mode 100644 backend/migrations/030_ops_monitoring_vnext.sql
diff --git a/backend/internal/service/ops_alert_models.go b/backend/internal/service/ops_alert_models.go
new file mode 100644
index 00000000..783a3d1e
--- /dev/null
+++ b/backend/internal/service/ops_alert_models.go
@@ -0,0 +1,75 @@
+package service
+
+import "time"
+
+// Ops alert rule/event models.
+//
+// NOTE: These are admin-facing DTOs and intentionally keep JSON naming aligned
+// with the existing ops dashboard frontend (backup style).
+
+const (
+ OpsAlertStatusFiring = "firing"
+ OpsAlertStatusResolved = "resolved"
+)
+
+type OpsAlertRule struct {
+ ID int64 `json:"id"`
+ Name string `json:"name"`
+ Description string `json:"description"`
+
+ Enabled bool `json:"enabled"`
+ Severity string `json:"severity"`
+
+ MetricType string `json:"metric_type"`
+ Operator string `json:"operator"`
+ Threshold float64 `json:"threshold"`
+
+ WindowMinutes int `json:"window_minutes"`
+ SustainedMinutes int `json:"sustained_minutes"`
+ CooldownMinutes int `json:"cooldown_minutes"`
+
+ NotifyEmail bool `json:"notify_email"`
+
+ Filters map[string]any `json:"filters,omitempty"`
+
+ LastTriggeredAt *time.Time `json:"last_triggered_at,omitempty"`
+ CreatedAt time.Time `json:"created_at"`
+ UpdatedAt time.Time `json:"updated_at"`
+}
+
+type OpsAlertEvent struct {
+ ID int64 `json:"id"`
+ RuleID int64 `json:"rule_id"`
+ Severity string `json:"severity"`
+ Status string `json:"status"`
+
+ Title string `json:"title"`
+ Description string `json:"description"`
+
+ MetricValue *float64 `json:"metric_value,omitempty"`
+ ThresholdValue *float64 `json:"threshold_value,omitempty"`
+
+ Dimensions map[string]any `json:"dimensions,omitempty"`
+
+ FiredAt time.Time `json:"fired_at"`
+ ResolvedAt *time.Time `json:"resolved_at,omitempty"`
+
+ EmailSent bool `json:"email_sent"`
+ CreatedAt time.Time `json:"created_at"`
+}
+
+type OpsAlertEventFilter struct {
+ Limit int
+
+ // Optional filters.
+ Status string
+ Severity string
+
+ StartTime *time.Time
+ EndTime *time.Time
+
+ // Dimensions filters (best-effort).
+ Platform string
+ GroupID *int64
+}
+
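
As context for the Operator/Threshold pair, a hedged sketch of how a rule might be checked against a sampled metric value (the real evaluator arrives in later patches; the operator set is inferred from the seeded rules):

package service

// breaches reports whether value violates the rule's threshold (illustrative only).
func breaches(rule OpsAlertRule, value float64) bool {
	switch rule.Operator {
	case ">":
		return value > rule.Threshold
	case ">=":
		return value >= rule.Threshold
	case "<":
		return value < rule.Threshold
	case "<=":
		return value <= rule.Threshold
	default:
		return false
	}
}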
diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go
new file mode 100644
index 00000000..51a0b1fb
--- /dev/null
+++ b/backend/internal/service/ops_dashboard_models.go
@@ -0,0 +1,83 @@
+package service
+
+import "time"
+
+type OpsDashboardFilter struct {
+ StartTime time.Time
+ EndTime time.Time
+
+ Platform string
+ GroupID *int64
+
+ // QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
+ // Expected values: auto/raw/preagg (see OpsQueryMode).
+ QueryMode OpsQueryMode
+}
+
+type OpsRateSummary struct {
+ Current float64 `json:"current"`
+ Peak float64 `json:"peak"`
+ Avg float64 `json:"avg"`
+}
+
+type OpsPercentiles struct {
+ P50 *int `json:"p50_ms"`
+ P90 *int `json:"p90_ms"`
+ P95 *int `json:"p95_ms"`
+ P99 *int `json:"p99_ms"`
+ Avg *int `json:"avg_ms"`
+ Max *int `json:"max_ms"`
+}
+
+type OpsDashboardOverview struct {
+ StartTime time.Time `json:"start_time"`
+ EndTime time.Time `json:"end_time"`
+ Platform string `json:"platform"`
+ GroupID *int64 `json:"group_id"`
+
+ // Latest system-level snapshot (window=1m, global).
+ SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
+
+ // Background jobs health (heartbeats).
+ JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
+
+ SuccessCount int64 `json:"success_count"`
+ ErrorCountTotal int64 `json:"error_count_total"`
+ BusinessLimitedCount int64 `json:"business_limited_count"`
+
+ ErrorCountSLA int64 `json:"error_count_sla"`
+ RequestCountTotal int64 `json:"request_count_total"`
+ RequestCountSLA int64 `json:"request_count_sla"`
+
+ TokenConsumed int64 `json:"token_consumed"`
+
+ SLA float64 `json:"sla"`
+ ErrorRate float64 `json:"error_rate"`
+ UpstreamErrorRate float64 `json:"upstream_error_rate"`
+ UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
+ Upstream429Count int64 `json:"upstream_429_count"`
+ Upstream529Count int64 `json:"upstream_529_count"`
+
+ QPS OpsRateSummary `json:"qps"`
+ TPS OpsRateSummary `json:"tps"`
+
+ Duration OpsPercentiles `json:"duration"`
+ TTFT OpsPercentiles `json:"ttft"`
+}
+
+type OpsLatencyHistogramBucket struct {
+ Range string `json:"range"`
+ Count int64 `json:"count"`
+}
+
+// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
+// It is used by the Ops dashboard to quickly identify tail latency regressions.
+type OpsLatencyHistogramResponse struct {
+ StartTime time.Time `json:"start_time"`
+ EndTime time.Time `json:"end_time"`
+ Platform string `json:"platform"`
+ GroupID *int64 `json:"group_id"`
+
+ TotalRequests int64 `json:"total_requests"`
+ Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
+}
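
The overview counters are meant to be combined into the SLA and error-rate figures; one plausible reading is sketched below (the authoritative formulas live in the service layer added later, so treat the SLA-scope division as an assumption):

package service

// slaPercent is an illustrative calculation: errors are counted against SLA scope only,
// i.e. business-limited requests are excluded via RequestCountSLA/ErrorCountSLA.
func slaPercent(o *OpsDashboardOverview) float64 {
	if o.RequestCountSLA == 0 {
		return 100
	}
	ok := o.RequestCountSLA - o.ErrorCountSLA
	return 100 * float64(ok) / float64(o.RequestCountSLA)
}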
diff --git a/backend/internal/service/ops_models.go b/backend/internal/service/ops_models.go
new file mode 100644
index 00000000..90b2dc47
--- /dev/null
+++ b/backend/internal/service/ops_models.go
@@ -0,0 +1,118 @@
+package service
+
+import "time"
+
+type OpsErrorLog struct {
+ ID int64 `json:"id"`
+ CreatedAt time.Time `json:"created_at"`
+
+ Phase string `json:"phase"`
+ Type string `json:"type"`
+ Severity string `json:"severity"`
+
+ StatusCode int `json:"status_code"`
+ Platform string `json:"platform"`
+ Model string `json:"model"`
+
+ LatencyMs *int `json:"latency_ms"`
+
+ ClientRequestID string `json:"client_request_id"`
+ RequestID string `json:"request_id"`
+ Message string `json:"message"`
+
+ UserID *int64 `json:"user_id"`
+ APIKeyID *int64 `json:"api_key_id"`
+ AccountID *int64 `json:"account_id"`
+ GroupID *int64 `json:"group_id"`
+
+ ClientIP *string `json:"client_ip"`
+ RequestPath string `json:"request_path"`
+ Stream bool `json:"stream"`
+}
+
+type OpsErrorLogDetail struct {
+ OpsErrorLog
+
+ ErrorBody string `json:"error_body"`
+ UserAgent string `json:"user_agent"`
+
+ // Timings (optional)
+ AuthLatencyMs *int64 `json:"auth_latency_ms"`
+ RoutingLatencyMs *int64 `json:"routing_latency_ms"`
+ UpstreamLatencyMs *int64 `json:"upstream_latency_ms"`
+ ResponseLatencyMs *int64 `json:"response_latency_ms"`
+ TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
+
+ // Retry context
+ RequestBody string `json:"request_body"`
+ RequestBodyTruncated bool `json:"request_body_truncated"`
+ RequestBodyBytes *int `json:"request_body_bytes"`
+ RequestHeaders string `json:"request_headers,omitempty"`
+
+ // vNext metric semantics
+ IsBusinessLimited bool `json:"is_business_limited"`
+}
+
+type OpsErrorLogFilter struct {
+ StartTime *time.Time
+ EndTime *time.Time
+
+ Platform string
+ GroupID *int64
+ AccountID *int64
+
+ StatusCodes []int
+ Phase string
+ Query string
+
+ Page int
+ PageSize int
+}
+
+type OpsErrorLogList struct {
+ Errors []*OpsErrorLog `json:"errors"`
+ Total int `json:"total"`
+ Page int `json:"page"`
+ PageSize int `json:"page_size"`
+}
+
+type OpsRetryAttempt struct {
+ ID int64 `json:"id"`
+ CreatedAt time.Time `json:"created_at"`
+
+ RequestedByUserID int64 `json:"requested_by_user_id"`
+ SourceErrorID int64 `json:"source_error_id"`
+ Mode string `json:"mode"`
+ PinnedAccountID *int64 `json:"pinned_account_id"`
+
+ Status string `json:"status"`
+ StartedAt *time.Time `json:"started_at"`
+ FinishedAt *time.Time `json:"finished_at"`
+ DurationMs *int64 `json:"duration_ms"`
+
+ ResultRequestID *string `json:"result_request_id"`
+ ResultErrorID *int64 `json:"result_error_id"`
+
+ ErrorMessage *string `json:"error_message"`
+}
+
+type OpsRetryResult struct {
+ AttemptID int64 `json:"attempt_id"`
+ Mode string `json:"mode"`
+ Status string `json:"status"`
+
+ PinnedAccountID *int64 `json:"pinned_account_id"`
+ UsedAccountID *int64 `json:"used_account_id"`
+
+ HTTPStatusCode int `json:"http_status_code"`
+ UpstreamRequestID string `json:"upstream_request_id"`
+
+ ResponsePreview string `json:"response_preview"`
+ ResponseTruncated bool `json:"response_truncated"`
+
+ ErrorMessage string `json:"error_message"`
+
+ StartedAt time.Time `json:"started_at"`
+ FinishedAt time.Time `json:"finished_at"`
+ DurationMs int64 `json:"duration_ms"`
+}
diff --git a/backend/internal/service/ops_realtime_models.go b/backend/internal/service/ops_realtime_models.go
new file mode 100644
index 00000000..f7514a24
--- /dev/null
+++ b/backend/internal/service/ops_realtime_models.go
@@ -0,0 +1,81 @@
+package service
+
+import "time"
+
+// PlatformConcurrencyInfo aggregates concurrency usage by platform.
+type PlatformConcurrencyInfo struct {
+ Platform string `json:"platform"`
+ CurrentInUse int64 `json:"current_in_use"`
+ MaxCapacity int64 `json:"max_capacity"`
+ LoadPercentage float64 `json:"load_percentage"`
+ WaitingInQueue int64 `json:"waiting_in_queue"`
+}
+
+// GroupConcurrencyInfo aggregates concurrency usage by group.
+//
+// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
+type GroupConcurrencyInfo struct {
+ GroupID int64 `json:"group_id"`
+ GroupName string `json:"group_name"`
+ Platform string `json:"platform"`
+ CurrentInUse int64 `json:"current_in_use"`
+ MaxCapacity int64 `json:"max_capacity"`
+ LoadPercentage float64 `json:"load_percentage"`
+ WaitingInQueue int64 `json:"waiting_in_queue"`
+}
+
+// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
+type AccountConcurrencyInfo struct {
+ AccountID int64 `json:"account_id"`
+ AccountName string `json:"account_name"`
+ Platform string `json:"platform"`
+ GroupID int64 `json:"group_id"`
+ GroupName string `json:"group_name"`
+ CurrentInUse int64 `json:"current_in_use"`
+ MaxCapacity int64 `json:"max_capacity"`
+ LoadPercentage float64 `json:"load_percentage"`
+ WaitingInQueue int64 `json:"waiting_in_queue"`
+}
+
+// PlatformAvailability aggregates account availability by platform.
+type PlatformAvailability struct {
+ Platform string `json:"platform"`
+ TotalAccounts int64 `json:"total_accounts"`
+ AvailableCount int64 `json:"available_count"`
+ RateLimitCount int64 `json:"rate_limit_count"`
+ ErrorCount int64 `json:"error_count"`
+}
+
+// GroupAvailability aggregates account availability by group.
+type GroupAvailability struct {
+ GroupID int64 `json:"group_id"`
+ GroupName string `json:"group_name"`
+ Platform string `json:"platform"`
+ TotalAccounts int64 `json:"total_accounts"`
+ AvailableCount int64 `json:"available_count"`
+ RateLimitCount int64 `json:"rate_limit_count"`
+ ErrorCount int64 `json:"error_count"`
+}
+
+// AccountAvailability represents current availability for a single account.
+type AccountAvailability struct {
+ AccountID int64 `json:"account_id"`
+ AccountName string `json:"account_name"`
+ Platform string `json:"platform"`
+ GroupID int64 `json:"group_id"`
+ GroupName string `json:"group_name"`
+
+ Status string `json:"status"`
+
+ IsAvailable bool `json:"is_available"`
+ IsRateLimited bool `json:"is_rate_limited"`
+ IsOverloaded bool `json:"is_overloaded"`
+ HasError bool `json:"has_error"`
+
+ RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
+ RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"`
+ OverloadUntil *time.Time `json:"overload_until"`
+ OverloadRemainingSec *int64 `json:"overload_remaining_sec"`
+ ErrorMessage string `json:"error_message"`
+ TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
+}
diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go
new file mode 100644
index 00000000..78399c49
--- /dev/null
+++ b/backend/internal/service/ops_settings_models.go
@@ -0,0 +1,70 @@
+package service
+
+// Ops settings models stored in DB `settings` table (JSON blobs).
+
+type OpsEmailNotificationConfig struct {
+ Alert OpsEmailAlertConfig `json:"alert"`
+ Report OpsEmailReportConfig `json:"report"`
+}
+
+type OpsEmailAlertConfig struct {
+ Enabled bool `json:"enabled"`
+ Recipients []string `json:"recipients"`
+ MinSeverity string `json:"min_severity"`
+ RateLimitPerHour int `json:"rate_limit_per_hour"`
+ BatchingWindowSeconds int `json:"batching_window_seconds"`
+ IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
+}
+
+type OpsEmailReportConfig struct {
+ Enabled bool `json:"enabled"`
+ Recipients []string `json:"recipients"`
+ DailySummaryEnabled bool `json:"daily_summary_enabled"`
+ DailySummarySchedule string `json:"daily_summary_schedule"`
+ WeeklySummaryEnabled bool `json:"weekly_summary_enabled"`
+ WeeklySummarySchedule string `json:"weekly_summary_schedule"`
+ ErrorDigestEnabled bool `json:"error_digest_enabled"`
+ ErrorDigestSchedule string `json:"error_digest_schedule"`
+ ErrorDigestMinCount int `json:"error_digest_min_count"`
+ AccountHealthEnabled bool `json:"account_health_enabled"`
+ AccountHealthSchedule string `json:"account_health_schedule"`
+ AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
+}
+
+// OpsEmailNotificationConfigUpdateRequest allows partial updates, while the
+// frontend can still send the full config shape.
+type OpsEmailNotificationConfigUpdateRequest struct {
+ Alert *OpsEmailAlertConfig `json:"alert"`
+ Report *OpsEmailReportConfig `json:"report"`
+}
+
+type OpsDistributedLockSettings struct {
+ Enabled bool `json:"enabled"`
+ Key string `json:"key"`
+ TTLSeconds int `json:"ttl_seconds"`
+}
+
+type OpsAlertSilenceEntry struct {
+ RuleID *int64 `json:"rule_id,omitempty"`
+ Severities []string `json:"severities,omitempty"`
+
+ UntilRFC3339 string `json:"until_rfc3339"`
+ Reason string `json:"reason"`
+}
+
+type OpsAlertSilencingSettings struct {
+ Enabled bool `json:"enabled"`
+
+ GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
+ GlobalReason string `json:"global_reason"`
+
+ Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
+}
+
+type OpsAlertRuntimeSettings struct {
+ EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
+
+ DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
+ Silencing OpsAlertSilencingSettings `json:"silencing"`
+}
+
diff --git a/backend/internal/service/ops_trend_models.go b/backend/internal/service/ops_trend_models.go
new file mode 100644
index 00000000..f6d07c14
--- /dev/null
+++ b/backend/internal/service/ops_trend_models.go
@@ -0,0 +1,65 @@
+package service
+
+import "time"
+
+type OpsThroughputTrendPoint struct {
+ BucketStart time.Time `json:"bucket_start"`
+ RequestCount int64 `json:"request_count"`
+ TokenConsumed int64 `json:"token_consumed"`
+ QPS float64 `json:"qps"`
+ TPS float64 `json:"tps"`
+}
+
+type OpsThroughputPlatformBreakdownItem struct {
+ Platform string `json:"platform"`
+ RequestCount int64 `json:"request_count"`
+ TokenConsumed int64 `json:"token_consumed"`
+}
+
+type OpsThroughputGroupBreakdownItem struct {
+ GroupID int64 `json:"group_id"`
+ GroupName string `json:"group_name"`
+ RequestCount int64 `json:"request_count"`
+ TokenConsumed int64 `json:"token_consumed"`
+}
+
+type OpsThroughputTrendResponse struct {
+ Bucket string `json:"bucket"`
+
+ Points []*OpsThroughputTrendPoint `json:"points"`
+
+ // Optional drilldown helpers:
+ // - When no platform/group is selected: returns totals by platform.
+ // - When platform is selected but group is not: returns top groups in that platform.
+ ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
+ TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"`
+}
+
+type OpsErrorTrendPoint struct {
+ BucketStart time.Time `json:"bucket_start"`
+
+ ErrorCountTotal int64 `json:"error_count_total"`
+ BusinessLimitedCount int64 `json:"business_limited_count"`
+ ErrorCountSLA int64 `json:"error_count_sla"`
+
+ UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
+ Upstream429Count int64 `json:"upstream_429_count"`
+ Upstream529Count int64 `json:"upstream_529_count"`
+}
+
+type OpsErrorTrendResponse struct {
+ Bucket string `json:"bucket"`
+ Points []*OpsErrorTrendPoint `json:"points"`
+}
+
+type OpsErrorDistributionItem struct {
+ StatusCode int `json:"status_code"`
+ Total int64 `json:"total"`
+ SLA int64 `json:"sla"`
+ BusinessLimited int64 `json:"business_limited"`
+}
+
+type OpsErrorDistributionResponse struct {
+ Total int64 `json:"total"`
+ Items []*OpsErrorDistributionItem `json:"items"`
+}
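
The per-point QPS/TPS values are presumably rate-normalized counts; a small sketch of that normalization, under the assumption that every point covers a fixed bucket width:

package service

import "time"

// fillRates derives QPS/TPS from the raw counters, assuming each point spans bucketWidth.
// Illustrative only; the actual trend builder may compute these differently.
func fillRates(points []*OpsThroughputTrendPoint, bucketWidth time.Duration) {
	secs := bucketWidth.Seconds()
	if secs <= 0 {
		return
	}
	for _, p := range points {
		p.QPS = float64(p.RequestCount) / secs
		p.TPS = float64(p.TokenConsumed) / secs
	}
}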
diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/030_ops_monitoring_vnext.sql
new file mode 100644
index 00000000..39b19e5d
--- /dev/null
+++ b/backend/migrations/030_ops_monitoring_vnext.sql
@@ -0,0 +1,707 @@
+-- Ops Monitoring (vNext): squashed migration (030)
+--
+-- This repository originally planned Ops vNext as migrations 030-036:
+-- 030 drop legacy ops tables
+-- 031 core schema
+-- 032 pre-aggregation tables
+-- 033 indexes + optional extensions
+-- 034 add avg/max to preagg
+-- 035 add notify_email to alert rules
+-- 036 seed default alert rules
+--
+-- Since these migrations have NOT been applied to any environment yet, we squash them
+-- into a single 030 migration for easier review and a cleaner migration history.
+--
+-- Notes:
+-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
+-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
+
+-- =====================================================================
+-- 030_ops_drop_legacy_ops_tables.sql
+-- =====================================================================
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- Legacy pre-aggregation tables (from 026 and/or previous branches)
+DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
+DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
+
+-- Core ops tables that may exist in some deployments / branches
+DROP TABLE IF EXISTS ops_system_metrics CASCADE;
+DROP TABLE IF EXISTS ops_error_logs CASCADE;
+DROP TABLE IF EXISTS ops_alert_events CASCADE;
+DROP TABLE IF EXISTS ops_alert_rules CASCADE;
+DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
+DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
+
+-- Optional legacy tables (best-effort cleanup)
+DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
+DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
+DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
+
+-- Optional legacy views/indexes
+DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
+
+-- =====================================================================
+-- 031_ops_core_schema.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
+--
+-- Design goals:
+-- - Support global filtering (time/platform/group) across all ops modules.
+-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
+-- - Make ops background jobs observable via job heartbeats.
+-- - Keep schema stable and indexes targeted (high-write tables).
+--
+-- Notes:
+-- - This migration is idempotent.
+-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- ============================================
+-- 1) ops_error_logs: error log details (high-write)
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_error_logs (
+ id BIGSERIAL PRIMARY KEY,
+
+ -- Correlation / identities
+ request_id VARCHAR(64),
+ client_request_id VARCHAR(64),
+ user_id BIGINT,
+ api_key_id BIGINT,
+ account_id BIGINT,
+ group_id BIGINT,
+ client_ip inet,
+
+ -- Dimensions for global filtering
+ platform VARCHAR(32),
+
+ -- Request metadata
+ model VARCHAR(100),
+ request_path VARCHAR(256),
+ stream BOOLEAN NOT NULL DEFAULT false,
+ user_agent TEXT,
+
+ -- Core error classification
+ error_phase VARCHAR(32) NOT NULL,
+ error_type VARCHAR(64) NOT NULL,
+ severity VARCHAR(8) NOT NULL DEFAULT 'P2',
+ status_code INT,
+
+ -- vNext metric semantics
+ is_business_limited BOOLEAN NOT NULL DEFAULT false,
+
+ -- Error details (sanitized/truncated at ingest time)
+ error_message TEXT,
+ error_body TEXT,
+
+ -- Provider/upstream details (optional; useful for trends & account health)
+ error_source VARCHAR(64),
+ error_owner VARCHAR(32),
+ account_status VARCHAR(50),
+ upstream_status_code INT,
+ upstream_error_message TEXT,
+ upstream_error_detail TEXT,
+ provider_error_code VARCHAR(64),
+ provider_error_type VARCHAR(64),
+ network_error_type VARCHAR(50),
+ retry_after_seconds INT,
+
+ -- Timings (ms) - optional
+ duration_ms INT,
+ time_to_first_token_ms BIGINT,
+ auth_latency_ms BIGINT,
+ routing_latency_ms BIGINT,
+ upstream_latency_ms BIGINT,
+ response_latency_ms BIGINT,
+
+ -- Retry context (only stored for error requests)
+ request_body JSONB,
+ request_headers JSONB,
+ request_body_truncated BOOLEAN NOT NULL DEFAULT false,
+ request_body_bytes INT,
+
+ -- Retryability flags (best-effort classification)
+ is_retryable BOOLEAN NOT NULL DEFAULT false,
+ retry_count INT NOT NULL DEFAULT 0,
+
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
+
+-- ============================================
+-- 2) ops_retry_attempts: audit log for retries
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_retry_attempts (
+ id BIGSERIAL PRIMARY KEY,
+
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+
+ requested_by_user_id BIGINT,
+ source_error_id BIGINT,
+
+ -- client|upstream
+ mode VARCHAR(16) NOT NULL,
+ pinned_account_id BIGINT,
+
+ -- queued|running|succeeded|failed
+ status VARCHAR(16) NOT NULL DEFAULT 'queued',
+ started_at TIMESTAMPTZ,
+ finished_at TIMESTAMPTZ,
+ duration_ms BIGINT,
+
+ -- Optional result correlation
+ result_request_id VARCHAR(64),
+ result_error_id BIGINT,
+ result_usage_request_id VARCHAR(64),
+
+ error_message TEXT
+);
+
+COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
+
+-- ============================================
+-- 3) ops_system_metrics: system + request window snapshots
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_system_metrics (
+ id BIGSERIAL PRIMARY KEY,
+
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ window_minutes INT NOT NULL DEFAULT 1,
+
+ -- Optional dimensions (only if collector chooses to write per-dimension snapshots)
+ platform VARCHAR(32),
+ group_id BIGINT,
+
+ -- Core counts
+ success_count BIGINT NOT NULL DEFAULT 0,
+ error_count_total BIGINT NOT NULL DEFAULT 0,
+ business_limited_count BIGINT NOT NULL DEFAULT 0,
+ error_count_sla BIGINT NOT NULL DEFAULT 0,
+
+ upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
+ upstream_429_count BIGINT NOT NULL DEFAULT 0,
+ upstream_529_count BIGINT NOT NULL DEFAULT 0,
+
+ token_consumed BIGINT NOT NULL DEFAULT 0,
+
+ -- Rates
+ qps DOUBLE PRECISION,
+ tps DOUBLE PRECISION,
+
+ -- Duration percentiles (ms) - success requests
+ duration_p50_ms INT,
+ duration_p90_ms INT,
+ duration_p95_ms INT,
+ duration_p99_ms INT,
+ duration_avg_ms DOUBLE PRECISION,
+ duration_max_ms INT,
+
+ -- TTFT percentiles (ms) - success requests (streaming)
+ ttft_p50_ms INT,
+ ttft_p90_ms INT,
+ ttft_p95_ms INT,
+ ttft_p99_ms INT,
+ ttft_avg_ms DOUBLE PRECISION,
+ ttft_max_ms INT,
+
+ -- System resources
+ cpu_usage_percent DOUBLE PRECISION,
+ memory_used_mb BIGINT,
+ memory_total_mb BIGINT,
+ memory_usage_percent DOUBLE PRECISION,
+
+ -- Dependency health (best-effort)
+ db_ok BOOLEAN,
+ redis_ok BOOLEAN,
+
+ -- DB pool & runtime
+ db_conn_active INT,
+ db_conn_idle INT,
+ db_conn_waiting INT,
+ goroutine_count INT,
+
+ -- Queue / concurrency
+ concurrency_queue_depth INT
+);
+
+COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
+
+-- ============================================
+-- 4) ops_job_heartbeats: background jobs health
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
+ job_name VARCHAR(64) PRIMARY KEY,
+
+ last_run_at TIMESTAMPTZ,
+ last_success_at TIMESTAMPTZ,
+ last_error_at TIMESTAMPTZ,
+ last_error TEXT,
+ last_duration_ms BIGINT,
+
+ updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
+
+-- ============================================
+-- 5) ops_alert_rules / ops_alert_events
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_alert_rules (
+ id BIGSERIAL PRIMARY KEY,
+
+ name VARCHAR(128) NOT NULL,
+ description TEXT,
+ enabled BOOLEAN NOT NULL DEFAULT true,
+
+ severity VARCHAR(16) NOT NULL DEFAULT 'warning',
+
+ -- Metric definition
+ metric_type VARCHAR(64) NOT NULL,
+ operator VARCHAR(8) NOT NULL,
+ threshold DOUBLE PRECISION NOT NULL,
+
+ window_minutes INT NOT NULL DEFAULT 5,
+ sustained_minutes INT NOT NULL DEFAULT 5,
+ cooldown_minutes INT NOT NULL DEFAULT 10,
+
+ -- Optional scoping: platform/group filters etc.
+ filters JSONB,
+
+ last_triggered_at TIMESTAMPTZ,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
+ ON ops_alert_rules (name);
+
+CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
+ ON ops_alert_rules (enabled);
+
+CREATE TABLE IF NOT EXISTS ops_alert_events (
+ id BIGSERIAL PRIMARY KEY,
+
+ rule_id BIGINT,
+ severity VARCHAR(16) NOT NULL,
+ status VARCHAR(16) NOT NULL DEFAULT 'firing',
+
+ title VARCHAR(200),
+ description TEXT,
+
+ metric_value DOUBLE PRECISION,
+ threshold_value DOUBLE PRECISION,
+ dimensions JSONB,
+
+ fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ resolved_at TIMESTAMPTZ,
+
+ email_sent BOOLEAN NOT NULL DEFAULT false,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
+ ON ops_alert_events (rule_id, status);
+
+CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
+ ON ops_alert_events (fired_at DESC);
+
+-- =====================================================================
+-- 032_ops_preaggregation_tables.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): pre-aggregation tables
+--
+-- Purpose:
+-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
+-- percentile_cont scans on raw logs for every dashboard refresh.
+-- - Support global filter dimensions: overall / platform / group.
+--
+-- Design note:
+-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
+-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- ============================================
+-- 1) ops_metrics_hourly
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
+ id BIGSERIAL PRIMARY KEY,
+
+ bucket_start TIMESTAMPTZ NOT NULL,
+ platform VARCHAR(32),
+ group_id BIGINT,
+
+ success_count BIGINT NOT NULL DEFAULT 0,
+ error_count_total BIGINT NOT NULL DEFAULT 0,
+ business_limited_count BIGINT NOT NULL DEFAULT 0,
+ error_count_sla BIGINT NOT NULL DEFAULT 0,
+
+ upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
+ upstream_429_count BIGINT NOT NULL DEFAULT 0,
+ upstream_529_count BIGINT NOT NULL DEFAULT 0,
+
+ token_consumed BIGINT NOT NULL DEFAULT 0,
+
+ -- Duration percentiles (ms)
+ duration_p50_ms INT,
+ duration_p90_ms INT,
+ duration_p95_ms INT,
+ duration_p99_ms INT,
+
+ -- TTFT percentiles (ms)
+ ttft_p50_ms INT,
+ ttft_p90_ms INT,
+ ttft_p95_ms INT,
+ ttft_p99_ms INT,
+
+ computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Uniqueness across three “dimension modes” (overall / platform / group).
+-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
+ ON ops_metrics_hourly (
+ bucket_start,
+ COALESCE(platform, ''),
+ COALESCE(group_id, 0)
+ );
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
+ ON ops_metrics_hourly (bucket_start DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
+ ON ops_metrics_hourly (platform, bucket_start DESC)
+ WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
+ ON ops_metrics_hourly (group_id, bucket_start DESC)
+ WHERE group_id IS NOT NULL AND group_id <> 0;
+
+COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
+
+-- ============================================
+-- 2) ops_metrics_daily (optional; for longer windows)
+-- ============================================
+
+CREATE TABLE IF NOT EXISTS ops_metrics_daily (
+ id BIGSERIAL PRIMARY KEY,
+
+ bucket_date DATE NOT NULL,
+ platform VARCHAR(32),
+ group_id BIGINT,
+
+ success_count BIGINT NOT NULL DEFAULT 0,
+ error_count_total BIGINT NOT NULL DEFAULT 0,
+ business_limited_count BIGINT NOT NULL DEFAULT 0,
+ error_count_sla BIGINT NOT NULL DEFAULT 0,
+
+ upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
+ upstream_429_count BIGINT NOT NULL DEFAULT 0,
+ upstream_529_count BIGINT NOT NULL DEFAULT 0,
+
+ token_consumed BIGINT NOT NULL DEFAULT 0,
+
+ duration_p50_ms INT,
+ duration_p90_ms INT,
+ duration_p95_ms INT,
+ duration_p99_ms INT,
+
+ ttft_p50_ms INT,
+ ttft_p90_ms INT,
+ ttft_p95_ms INT,
+ ttft_p99_ms INT,
+
+ computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
+ ON ops_metrics_daily (
+ bucket_date,
+ COALESCE(platform, ''),
+ COALESCE(group_id, 0)
+ );
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
+ ON ops_metrics_daily (bucket_date DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
+ ON ops_metrics_daily (platform, bucket_date DESC)
+ WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
+
+CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
+ ON ops_metrics_daily (group_id, bucket_date DESC)
+ WHERE group_id IS NOT NULL AND group_id <> 0;
+
+COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
+
+-- =====================================================================
+-- 033_ops_indexes_and_extensions.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): indexes and optional extensions
+--
+-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
+-- so environments without extension privileges won't fail the whole migration chain.
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- ============================================
+-- 1) Core btree indexes (always safe)
+-- ============================================
+
+-- ops_error_logs
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
+ ON ops_error_logs (created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
+ ON ops_error_logs (platform, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
+ ON ops_error_logs (group_id, created_at DESC)
+ WHERE group_id IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
+ ON ops_error_logs (account_id, created_at DESC)
+ WHERE account_id IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
+ ON ops_error_logs (status_code, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
+ ON ops_error_logs (error_phase, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
+ ON ops_error_logs (error_type, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
+ ON ops_error_logs (request_id);
+
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
+ ON ops_error_logs (client_request_id);
+
+-- ops_system_metrics
+CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
+ ON ops_system_metrics (created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
+ ON ops_system_metrics (window_minutes, created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
+ ON ops_system_metrics (platform, created_at DESC)
+ WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
+
+CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
+ ON ops_system_metrics (group_id, created_at DESC)
+ WHERE group_id IS NOT NULL;
+
+-- ops_retry_attempts
+CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
+ ON ops_retry_attempts (created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
+ ON ops_retry_attempts (source_error_id, created_at DESC)
+ WHERE source_error_id IS NOT NULL;
+
+-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
+ ON ops_retry_attempts (source_error_id)
+ WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
+
+-- ============================================
+-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
+-- ============================================
+
+DO $$
+BEGIN
+ BEGIN
+ CREATE EXTENSION IF NOT EXISTS pg_trgm;
+ EXCEPTION WHEN OTHERS THEN
+ -- Missing privileges or extension package should not block migrations.
+ RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
+ END;
+
+ IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
+ -- request_id / client_request_id fuzzy search
+ EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
+ ON ops_error_logs USING gin (request_id gin_trgm_ops)';
+ EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
+ ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
+
+ -- error_message fuzzy search
+ EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
+ ON ops_error_logs USING gin (error_message gin_trgm_ops)';
+ END IF;
+END $$;
+
+-- =====================================================================
+-- 034_ops_preaggregation_add_avg_max.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
+--
+-- Why:
+-- - The dashboard overview returns avg/max for duration/TTFT.
+-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
+-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
+--
+-- This migration is idempotent and safe to run multiple times.
+--
+-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
+-- approximate long-window summaries.
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- Hourly table
+ALTER TABLE ops_metrics_hourly
+ ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
+ ADD COLUMN IF NOT EXISTS duration_max_ms INT,
+ ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
+ ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
+
+-- Daily table
+ALTER TABLE ops_metrics_daily
+ ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
+ ADD COLUMN IF NOT EXISTS duration_max_ms INT,
+ ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
+ ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
+
+-- =====================================================================
+-- 035_ops_alert_rules_notify_email.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): alert rule notify settings
+--
+-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
+-- Migration is idempotent.
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+ALTER TABLE ops_alert_rules
+ ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
+
+-- =====================================================================
+-- 036_ops_seed_default_alert_rules.sql
+-- =====================================================================
+
+-- Ops Monitoring (vNext): seed default alert rules (idempotent)
+--
+-- Goal:
+-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
+-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
+--
+-- Notes:
+-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
+-- - Metric semantics follow vNext:
+-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
+-- - upstream_error_rate excludes 429/529.
+
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+
+-- 1) High error rate (P1)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ '错误率过高',
+ '当错误率超过 5% 且持续 5 分钟时触发告警',
+ true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 2) Low success rate (P0)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ '成功率过低',
+ '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
+ true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 3) P99 latency too high (P2)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ 'P99延迟过高',
+ '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
+ true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 4) P95 latency too high (P2)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ 'P95延迟过高',
+ '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
+ true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 5) CPU usage too high (P2)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ 'CPU使用率过高',
+ '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
+ true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 6) Memory usage too high (P1)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ '内存使用率过高',
+ '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
+ true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 7) Concurrency queue buildup (P1)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ '并发队列积压',
+ '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
+ true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
+
+-- 8) Extremely high error rate (P0)
+INSERT INTO ops_alert_rules (
+ name, description, enabled, metric_type, operator, threshold,
+ window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
+ created_at, updated_at
+) VALUES (
+ '错误率极高',
+ '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
+ true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
+) ON CONFLICT (name) DO NOTHING;
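
One consequence of the COALESCE-based unique indexes above is that aggregation writers can upsert a bucket by targeting the same expressions in ON CONFLICT. A hedged sketch in the repository's database/sql style; the column subset and job shape are assumptions:

package repository

import (
	"context"
	"database/sql"
	"time"
)

// upsertHourlyBucket writes one overall (platform/group NULL) hourly row.
// Illustrative only; the real aggregation job lands in later patches.
func upsertHourlyBucket(ctx context.Context, db *sql.DB, bucket time.Time, success, errTotal int64) error {
	const q = `
INSERT INTO ops_metrics_hourly (bucket_start, platform, group_id, success_count, error_count_total)
VALUES ($1, NULL, NULL, $2, $3)
ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0))
DO UPDATE SET
    success_count     = EXCLUDED.success_count,
    error_count_total = EXCLUDED.error_count_total,
    computed_at       = NOW()`
	_, err := db.ExecContext(ctx, q, bucket, success, errTotal)
	return err
}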
From bb5303272b801611361fd317e6a28e2a052b4b9c Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:52:57 +0800
Subject: [PATCH 03/53] feat(repository): implement the ops monitoring data access layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add the main ops repository (ops_repo.go)
- Implement alert data access (ops_repo_alerts.go)
- Implement dashboard data access (ops_repo_dashboard.go)
- Implement histogram data access (ops_repo_histograms.go)
- Implement latency histogram bucket logic (ops_repo_latency_histogram_buckets.go)
- Add latency histogram bucket tests (ops_repo_latency_histogram_buckets_test.go)
- Implement metrics data access (ops_repo_metrics.go)
- Implement pre-aggregation data access (ops_repo_preagg.go)
- Implement request detail data access (ops_repo_request_details.go)
- Implement trend data access (ops_repo_trends.go)
- Implement window stats data access (ops_repo_window_stats.go)
- Update the concurrency cache to support ops scenarios
- Register repository dependency injection
---
.../internal/repository/concurrency_cache.go | 16 +-
backend/internal/repository/ops_repo.go | 676 +++++++++++
.../internal/repository/ops_repo_alerts.go | 689 +++++++++++
.../internal/repository/ops_repo_dashboard.go | 1012 +++++++++++++++++
.../repository/ops_repo_histograms.go | 79 ++
.../ops_repo_latency_histogram_buckets.go | 64 ++
...ops_repo_latency_histogram_buckets_test.go | 14 +
.../internal/repository/ops_repo_metrics.go | 401 +++++++
.../internal/repository/ops_repo_preagg.go | 359 ++++++
.../repository/ops_repo_request_details.go | 285 +++++
.../internal/repository/ops_repo_trends.go | 567 +++++++++
.../repository/ops_repo_window_stats.go | 50 +
backend/internal/repository/wire.go | 1 +
13 files changed, 4203 insertions(+), 10 deletions(-)
create mode 100644 backend/internal/repository/ops_repo.go
create mode 100644 backend/internal/repository/ops_repo_alerts.go
create mode 100644 backend/internal/repository/ops_repo_dashboard.go
create mode 100644 backend/internal/repository/ops_repo_histograms.go
create mode 100644 backend/internal/repository/ops_repo_latency_histogram_buckets.go
create mode 100644 backend/internal/repository/ops_repo_latency_histogram_buckets_test.go
create mode 100644 backend/internal/repository/ops_repo_metrics.go
create mode 100644 backend/internal/repository/ops_repo_preagg.go
create mode 100644 backend/internal/repository/ops_repo_request_details.go
create mode 100644 backend/internal/repository/ops_repo_trends.go
create mode 100644 backend/internal/repository/ops_repo_window_stats.go
diff --git a/backend/internal/repository/concurrency_cache.go b/backend/internal/repository/concurrency_cache.go
index 0831f5eb..b34961e1 100644
--- a/backend/internal/repository/concurrency_cache.go
+++ b/backend/internal/repository/concurrency_cache.go
@@ -93,7 +93,7 @@ var (
return redis.call('ZCARD', key)
`)
- // incrementWaitScript - only sets TTL on first creation to avoid refreshing
+ // incrementWaitScript - refreshes TTL on each increment to keep queue depth accurate
// KEYS[1] = wait queue key
// ARGV[1] = maxWait
// ARGV[2] = TTL in seconds
@@ -111,15 +111,13 @@ var (
local newVal = redis.call('INCR', KEYS[1])
- -- Only set TTL on first creation to avoid refreshing zombie data
- if newVal == 1 then
- redis.call('EXPIRE', KEYS[1], ARGV[2])
- end
+ -- Refresh TTL so long-running traffic doesn't expire active queue counters.
+ redis.call('EXPIRE', KEYS[1], ARGV[2])
return 1
`)
- // incrementAccountWaitScript - account-level wait queue count
+ // incrementAccountWaitScript - account-level wait queue count (refresh TTL on each increment)
incrementAccountWaitScript = redis.NewScript(`
local current = redis.call('GET', KEYS[1])
if current == false then
@@ -134,10 +132,8 @@ var (
local newVal = redis.call('INCR', KEYS[1])
- -- Only set TTL on first creation to avoid refreshing zombie data
- if newVal == 1 then
- redis.call('EXPIRE', KEYS[1], ARGV[2])
- end
+ -- Refresh TTL so long-running traffic doesn't expire active queue counters.
+ redis.call('EXPIRE', KEYS[1], ARGV[2])
return 1
`)
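
For context on how these adjusted Lua scripts are consumed, a minimal go-redis v9 sketch; the key and argument values are hypothetical, but note that every increment now refreshes the TTL passed as ARGV[2]:

package main

import (
	"context"
	"fmt"

	"github.com/redis/go-redis/v9"
)

var incrScript = redis.NewScript(`
local newVal = redis.call('INCR', KEYS[1])
redis.call('EXPIRE', KEYS[1], ARGV[2])
return newVal
`)

func main() {
	ctx := context.Background()
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
	// ARGV[1] (maxWait) is unused in this trimmed script; kept only to mirror the call shape.
	n, err := incrScript.Run(ctx, rdb, []string{"ops:wait:demo"}, 100, 60).Int64()
	if err != nil {
		panic(err)
	}
	fmt.Println("queue depth:", n)
}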
diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go
new file mode 100644
index 00000000..b27a9ea0
--- /dev/null
+++ b/backend/internal/repository/ops_repo.go
@@ -0,0 +1,676 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/lib/pq"
+)
+
+type opsRepository struct {
+ db *sql.DB
+}
+
+func NewOpsRepository(db *sql.DB) service.OpsRepository {
+ return &opsRepository{db: db}
+}
+
+func (r *opsRepository) InsertErrorLog(ctx context.Context, input *service.OpsInsertErrorLogInput) (int64, error) {
+ if r == nil || r.db == nil {
+ return 0, fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return 0, fmt.Errorf("nil input")
+ }
+
+ q := `
+INSERT INTO ops_error_logs (
+ request_id,
+ client_request_id,
+ user_id,
+ api_key_id,
+ account_id,
+ group_id,
+ client_ip,
+ platform,
+ model,
+ request_path,
+ stream,
+ user_agent,
+ error_phase,
+ error_type,
+ severity,
+ status_code,
+ is_business_limited,
+ error_message,
+ error_body,
+ error_source,
+ error_owner,
+ upstream_status_code,
+ upstream_error_message,
+ upstream_error_detail,
+ duration_ms,
+ time_to_first_token_ms,
+ request_body,
+ request_body_truncated,
+ request_body_bytes,
+ request_headers,
+ is_retryable,
+ retry_count,
+ created_at
+) VALUES (
+ $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33
+) RETURNING id`
+
+ var id int64
+ err := r.db.QueryRowContext(
+ ctx,
+ q,
+ opsNullString(input.RequestID),
+ opsNullString(input.ClientRequestID),
+ opsNullInt64(input.UserID),
+ opsNullInt64(input.APIKeyID),
+ opsNullInt64(input.AccountID),
+ opsNullInt64(input.GroupID),
+ opsNullString(input.ClientIP),
+ opsNullString(input.Platform),
+ opsNullString(input.Model),
+ opsNullString(input.RequestPath),
+ input.Stream,
+ opsNullString(input.UserAgent),
+ input.ErrorPhase,
+ input.ErrorType,
+ opsNullString(input.Severity),
+ opsNullInt(input.StatusCode),
+ input.IsBusinessLimited,
+ opsNullString(input.ErrorMessage),
+ opsNullString(input.ErrorBody),
+ opsNullString(input.ErrorSource),
+ opsNullString(input.ErrorOwner),
+ opsNullInt(input.UpstreamStatusCode),
+ opsNullString(input.UpstreamErrorMessage),
+ opsNullString(input.UpstreamErrorDetail),
+ opsNullInt(input.DurationMs),
+ opsNullInt64(input.TimeToFirstTokenMs),
+ opsNullString(input.RequestBodyJSON),
+ input.RequestBodyTruncated,
+ opsNullInt(input.RequestBodyBytes),
+ opsNullString(input.RequestHeadersJSON),
+ input.IsRetryable,
+ input.RetryCount,
+ input.CreatedAt,
+ ).Scan(&id)
+ if err != nil {
+ return 0, err
+ }
+ return id, nil
+}
+
+func (r *opsRepository) ListErrorLogs(ctx context.Context, filter *service.OpsErrorLogFilter) (*service.OpsErrorLogList, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ filter = &service.OpsErrorLogFilter{}
+ }
+
+ page := filter.Page
+ if page <= 0 {
+ page = 1
+ }
+ pageSize := filter.PageSize
+ if pageSize <= 0 {
+ pageSize = 20
+ }
+ if pageSize > 500 {
+ pageSize = 500
+ }
+
+ where, args := buildOpsErrorLogsWhere(filter)
+ countSQL := "SELECT COUNT(*) FROM ops_error_logs " + where
+
+ var total int
+ if err := r.db.QueryRowContext(ctx, countSQL, args...).Scan(&total); err != nil {
+ return nil, err
+ }
+
+ offset := (page - 1) * pageSize
+ argsWithLimit := append(args, pageSize, offset)
+ selectSQL := `
+SELECT
+ id,
+ created_at,
+ error_phase,
+ error_type,
+ severity,
+ COALESCE(status_code, 0),
+ COALESCE(platform, ''),
+ COALESCE(model, ''),
+ duration_ms,
+ COALESCE(client_request_id, ''),
+ COALESCE(request_id, ''),
+ COALESCE(error_message, ''),
+ user_id,
+ api_key_id,
+ account_id,
+ group_id,
+ CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
+ COALESCE(request_path, ''),
+ stream
+FROM ops_error_logs
+` + where + `
+ORDER BY created_at DESC
+LIMIT $` + itoa(len(args)+1) + ` OFFSET $` + itoa(len(args)+2)
+
+ rows, err := r.db.QueryContext(ctx, selectSQL, argsWithLimit...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ out := make([]*service.OpsErrorLog, 0, pageSize)
+ for rows.Next() {
+ var item service.OpsErrorLog
+ var latency sql.NullInt64
+ var statusCode sql.NullInt64
+ var clientIP sql.NullString
+ var userID sql.NullInt64
+ var apiKeyID sql.NullInt64
+ var accountID sql.NullInt64
+ var groupID sql.NullInt64
+ if err := rows.Scan(
+ &item.ID,
+ &item.CreatedAt,
+ &item.Phase,
+ &item.Type,
+ &item.Severity,
+ &statusCode,
+ &item.Platform,
+ &item.Model,
+ &latency,
+ &item.ClientRequestID,
+ &item.RequestID,
+ &item.Message,
+ &userID,
+ &apiKeyID,
+ &accountID,
+ &groupID,
+ &clientIP,
+ &item.RequestPath,
+ &item.Stream,
+ ); err != nil {
+ return nil, err
+ }
+ if latency.Valid {
+ v := int(latency.Int64)
+ item.LatencyMs = &v
+ }
+ item.StatusCode = int(statusCode.Int64)
+ if clientIP.Valid {
+ s := clientIP.String
+ item.ClientIP = &s
+ }
+ if userID.Valid {
+ v := userID.Int64
+ item.UserID = &v
+ }
+ if apiKeyID.Valid {
+ v := apiKeyID.Int64
+ item.APIKeyID = &v
+ }
+ if accountID.Valid {
+ v := accountID.Int64
+ item.AccountID = &v
+ }
+ if groupID.Valid {
+ v := groupID.Int64
+ item.GroupID = &v
+ }
+ out = append(out, &item)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+
+ return &service.OpsErrorLogList{
+ Errors: out,
+ Total: total,
+ Page: page,
+ PageSize: pageSize,
+ }, nil
+}
+
+func (r *opsRepository) GetErrorLogByID(ctx context.Context, id int64) (*service.OpsErrorLogDetail, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if id <= 0 {
+ return nil, fmt.Errorf("invalid id")
+ }
+
+ q := `
+SELECT
+ id,
+ created_at,
+ error_phase,
+ error_type,
+ severity,
+ COALESCE(status_code, 0),
+ COALESCE(platform, ''),
+ COALESCE(model, ''),
+ duration_ms,
+ COALESCE(client_request_id, ''),
+ COALESCE(request_id, ''),
+ COALESCE(error_message, ''),
+ COALESCE(error_body, ''),
+ is_business_limited,
+ user_id,
+ api_key_id,
+ account_id,
+ group_id,
+ CASE WHEN client_ip IS NULL THEN NULL ELSE client_ip::text END,
+ COALESCE(request_path, ''),
+ stream,
+ COALESCE(user_agent, ''),
+ auth_latency_ms,
+ routing_latency_ms,
+ upstream_latency_ms,
+ response_latency_ms,
+ time_to_first_token_ms,
+ COALESCE(request_body::text, ''),
+ request_body_truncated,
+ request_body_bytes,
+ COALESCE(request_headers::text, '')
+FROM ops_error_logs
+WHERE id = $1
+LIMIT 1`
+
+ var out service.OpsErrorLogDetail
+ var latency sql.NullInt64
+ var statusCode sql.NullInt64
+ var clientIP sql.NullString
+ var userID sql.NullInt64
+ var apiKeyID sql.NullInt64
+ var accountID sql.NullInt64
+ var groupID sql.NullInt64
+ var authLatency sql.NullInt64
+ var routingLatency sql.NullInt64
+ var upstreamLatency sql.NullInt64
+ var responseLatency sql.NullInt64
+ var ttft sql.NullInt64
+ var requestBodyBytes sql.NullInt64
+
+ err := r.db.QueryRowContext(ctx, q, id).Scan(
+ &out.ID,
+ &out.CreatedAt,
+ &out.Phase,
+ &out.Type,
+ &out.Severity,
+ &statusCode,
+ &out.Platform,
+ &out.Model,
+ &latency,
+ &out.ClientRequestID,
+ &out.RequestID,
+ &out.Message,
+ &out.ErrorBody,
+ &out.IsBusinessLimited,
+ &userID,
+ &apiKeyID,
+ &accountID,
+ &groupID,
+ &clientIP,
+ &out.RequestPath,
+ &out.Stream,
+ &out.UserAgent,
+ &authLatency,
+ &routingLatency,
+ &upstreamLatency,
+ &responseLatency,
+ &ttft,
+ &out.RequestBody,
+ &out.RequestBodyTruncated,
+ &requestBodyBytes,
+ &out.RequestHeaders,
+ )
+ if err != nil {
+ return nil, err
+ }
+
+ out.StatusCode = int(statusCode.Int64)
+ if latency.Valid {
+ v := int(latency.Int64)
+ out.LatencyMs = &v
+ }
+ if clientIP.Valid {
+ s := clientIP.String
+ out.ClientIP = &s
+ }
+ if userID.Valid {
+ v := userID.Int64
+ out.UserID = &v
+ }
+ if apiKeyID.Valid {
+ v := apiKeyID.Int64
+ out.APIKeyID = &v
+ }
+ if accountID.Valid {
+ v := accountID.Int64
+ out.AccountID = &v
+ }
+ if groupID.Valid {
+ v := groupID.Int64
+ out.GroupID = &v
+ }
+ if authLatency.Valid {
+ v := authLatency.Int64
+ out.AuthLatencyMs = &v
+ }
+ if routingLatency.Valid {
+ v := routingLatency.Int64
+ out.RoutingLatencyMs = &v
+ }
+ if upstreamLatency.Valid {
+ v := upstreamLatency.Int64
+ out.UpstreamLatencyMs = &v
+ }
+ if responseLatency.Valid {
+ v := responseLatency.Int64
+ out.ResponseLatencyMs = &v
+ }
+ if ttft.Valid {
+ v := ttft.Int64
+ out.TimeToFirstTokenMs = &v
+ }
+ if requestBodyBytes.Valid {
+ v := int(requestBodyBytes.Int64)
+ out.RequestBodyBytes = &v
+ }
+
+ // Normalize request_body to empty string when stored as JSON null.
+ out.RequestBody = strings.TrimSpace(out.RequestBody)
+ if out.RequestBody == "null" {
+ out.RequestBody = ""
+ }
+ // Normalize request_headers to empty string when stored as JSON null.
+ out.RequestHeaders = strings.TrimSpace(out.RequestHeaders)
+ if out.RequestHeaders == "null" {
+ out.RequestHeaders = ""
+ }
+
+ return &out, nil
+}
+
+func (r *opsRepository) InsertRetryAttempt(ctx context.Context, input *service.OpsInsertRetryAttemptInput) (int64, error) {
+ if r == nil || r.db == nil {
+ return 0, fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return 0, fmt.Errorf("nil input")
+ }
+ if input.SourceErrorID <= 0 {
+ return 0, fmt.Errorf("invalid source_error_id")
+ }
+ if strings.TrimSpace(input.Mode) == "" {
+ return 0, fmt.Errorf("invalid mode")
+ }
+
+ q := `
+INSERT INTO ops_retry_attempts (
+ requested_by_user_id,
+ source_error_id,
+ mode,
+ pinned_account_id,
+ status,
+ started_at
+) VALUES (
+ $1,$2,$3,$4,$5,$6
+) RETURNING id`
+
+ var id int64
+ err := r.db.QueryRowContext(
+ ctx,
+ q,
+ opsNullInt64(&input.RequestedByUserID),
+ input.SourceErrorID,
+ strings.TrimSpace(input.Mode),
+ opsNullInt64(input.PinnedAccountID),
+ strings.TrimSpace(input.Status),
+ input.StartedAt,
+ ).Scan(&id)
+ if err != nil {
+ return 0, err
+ }
+ return id, nil
+}
+
+func (r *opsRepository) UpdateRetryAttempt(ctx context.Context, input *service.OpsUpdateRetryAttemptInput) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return fmt.Errorf("nil input")
+ }
+ if input.ID <= 0 {
+ return fmt.Errorf("invalid id")
+ }
+
+ q := `
+UPDATE ops_retry_attempts
+SET
+ status = $2,
+ finished_at = $3,
+ duration_ms = $4,
+ result_request_id = $5,
+ result_error_id = $6,
+ error_message = $7
+WHERE id = $1`
+
+ _, err := r.db.ExecContext(
+ ctx,
+ q,
+ input.ID,
+ strings.TrimSpace(input.Status),
+ nullTime(input.FinishedAt),
+ input.DurationMs,
+ opsNullString(input.ResultRequestID),
+ opsNullInt64(input.ResultErrorID),
+ opsNullString(input.ErrorMessage),
+ )
+ return err
+}
+
+func (r *opsRepository) GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*service.OpsRetryAttempt, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if sourceErrorID <= 0 {
+ return nil, fmt.Errorf("invalid source_error_id")
+ }
+
+ q := `
+SELECT
+ id,
+ created_at,
+ COALESCE(requested_by_user_id, 0),
+ source_error_id,
+ COALESCE(mode, ''),
+ pinned_account_id,
+ COALESCE(status, ''),
+ started_at,
+ finished_at,
+ duration_ms,
+ result_request_id,
+ result_error_id,
+ error_message
+FROM ops_retry_attempts
+WHERE source_error_id = $1
+ORDER BY created_at DESC
+LIMIT 1`
+
+ var out service.OpsRetryAttempt
+ var pinnedAccountID sql.NullInt64
+ var requestedBy sql.NullInt64
+ var startedAt sql.NullTime
+ var finishedAt sql.NullTime
+ var durationMs sql.NullInt64
+ var resultRequestID sql.NullString
+ var resultErrorID sql.NullInt64
+ var errorMessage sql.NullString
+
+ err := r.db.QueryRowContext(ctx, q, sourceErrorID).Scan(
+ &out.ID,
+ &out.CreatedAt,
+ &requestedBy,
+ &out.SourceErrorID,
+ &out.Mode,
+ &pinnedAccountID,
+ &out.Status,
+ &startedAt,
+ &finishedAt,
+ &durationMs,
+ &resultRequestID,
+ &resultErrorID,
+ &errorMessage,
+ )
+ if err != nil {
+ return nil, err
+ }
+ out.RequestedByUserID = requestedBy.Int64
+ if pinnedAccountID.Valid {
+ v := pinnedAccountID.Int64
+ out.PinnedAccountID = &v
+ }
+ if startedAt.Valid {
+ t := startedAt.Time
+ out.StartedAt = &t
+ }
+ if finishedAt.Valid {
+ t := finishedAt.Time
+ out.FinishedAt = &t
+ }
+ if durationMs.Valid {
+ v := durationMs.Int64
+ out.DurationMs = &v
+ }
+ if resultRequestID.Valid {
+ s := resultRequestID.String
+ out.ResultRequestID = &s
+ }
+ if resultErrorID.Valid {
+ v := resultErrorID.Int64
+ out.ResultErrorID = &v
+ }
+ if errorMessage.Valid {
+ s := errorMessage.String
+ out.ErrorMessage = &s
+ }
+
+ return &out, nil
+}
+
+func nullTime(t time.Time) sql.NullTime {
+ if t.IsZero() {
+ return sql.NullTime{}
+ }
+ return sql.NullTime{Time: t, Valid: true}
+}
+
+func buildOpsErrorLogsWhere(filter *service.OpsErrorLogFilter) (string, []any) {
+ clauses := make([]string, 0, 8)
+ args := make([]any, 0, 8)
+ clauses = append(clauses, "1=1")
+
+ if filter.StartTime != nil && !filter.StartTime.IsZero() {
+ args = append(args, filter.StartTime.UTC())
+ clauses = append(clauses, "created_at >= $"+itoa(len(args)))
+ }
+ if filter.EndTime != nil && !filter.EndTime.IsZero() {
+ args = append(args, filter.EndTime.UTC())
+ // Keep time-window semantics consistent with other ops queries: [start, end)
+ clauses = append(clauses, "created_at < $"+itoa(len(args)))
+ }
+ if p := strings.TrimSpace(filter.Platform); p != "" {
+ args = append(args, p)
+ clauses = append(clauses, "platform = $"+itoa(len(args)))
+ }
+ if filter.GroupID != nil && *filter.GroupID > 0 {
+ args = append(args, *filter.GroupID)
+ clauses = append(clauses, "group_id = $"+itoa(len(args)))
+ }
+ if filter.AccountID != nil && *filter.AccountID > 0 {
+ args = append(args, *filter.AccountID)
+ clauses = append(clauses, "account_id = $"+itoa(len(args)))
+ }
+ if phase := strings.TrimSpace(filter.Phase); phase != "" {
+ args = append(args, phase)
+ clauses = append(clauses, "error_phase = $"+itoa(len(args)))
+ }
+ if len(filter.StatusCodes) > 0 {
+ args = append(args, pq.Array(filter.StatusCodes))
+ clauses = append(clauses, "status_code = ANY($"+itoa(len(args))+")")
+ }
+ if q := strings.TrimSpace(filter.Query); q != "" {
+ like := "%" + q + "%"
+ args = append(args, like)
+ n := itoa(len(args))
+ clauses = append(clauses, "(request_id ILIKE $"+n+" OR client_request_id ILIKE $"+n+" OR error_message ILIKE $"+n+")")
+ }
+
+ return "WHERE " + strings.Join(clauses, " AND "), args
+}
+
+// Helpers for nullable INSERT/UPDATE args: empty strings and zero numeric values are stored as SQL NULL.
+func opsNullString(v any) any {
+ switch s := v.(type) {
+ case nil:
+ return sql.NullString{}
+ case *string:
+ if s == nil || strings.TrimSpace(*s) == "" {
+ return sql.NullString{}
+ }
+ return sql.NullString{String: strings.TrimSpace(*s), Valid: true}
+ case string:
+ if strings.TrimSpace(s) == "" {
+ return sql.NullString{}
+ }
+ return sql.NullString{String: strings.TrimSpace(s), Valid: true}
+ default:
+ return sql.NullString{}
+ }
+}
+
+func opsNullInt64(v *int64) any {
+ if v == nil || *v == 0 {
+ return sql.NullInt64{}
+ }
+ return sql.NullInt64{Int64: *v, Valid: true}
+}
+
+func opsNullInt(v any) any {
+ switch n := v.(type) {
+ case nil:
+ return sql.NullInt64{}
+ case *int:
+ if n == nil || *n == 0 {
+ return sql.NullInt64{}
+ }
+ return sql.NullInt64{Int64: int64(*n), Valid: true}
+ case *int64:
+ if n == nil || *n == 0 {
+ return sql.NullInt64{}
+ }
+ return sql.NullInt64{Int64: *n, Valid: true}
+ case int:
+ if n == 0 {
+ return sql.NullInt64{}
+ }
+ return sql.NullInt64{Int64: int64(n), Valid: true}
+ default:
+ return sql.NullInt64{}
+ }
+}
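
A small self-contained sketch of the placeholder-numbering pattern used by buildOpsErrorLogsWhere and ListErrorLogs above. strconv.Itoa stands in for the package-level itoa helper (defined elsewhere in this package), and the filter values are hypothetical.

package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

func main() {
	clauses := []string{"1=1"}
	args := []any{}

	// created_at uses [start, end) semantics, matching the other ops queries.
	start := time.Now().Add(-time.Hour).UTC()
	args = append(args, start)
	clauses = append(clauses, "created_at >= $"+strconv.Itoa(len(args)))

	platform := "anthropic" // hypothetical filter value
	args = append(args, platform)
	clauses = append(clauses, "platform = $"+strconv.Itoa(len(args)))

	where := "WHERE " + strings.Join(clauses, " AND ")
	fmt.Println(where) // WHERE 1=1 AND created_at >= $1 AND platform = $2

	// ListErrorLogs then appends page size and offset, which become $3 and $4.
	fmt.Println("LIMIT $"+strconv.Itoa(len(args)+1), "OFFSET $"+strconv.Itoa(len(args)+2))
}
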
diff --git a/backend/internal/repository/ops_repo_alerts.go b/backend/internal/repository/ops_repo_alerts.go
new file mode 100644
index 00000000..ce99e6f7
--- /dev/null
+++ b/backend/internal/repository/ops_repo_alerts.go
@@ -0,0 +1,689 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) ListAlertRules(ctx context.Context) ([]*service.OpsAlertRule, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+
+ q := `
+SELECT
+ id,
+ name,
+ COALESCE(description, ''),
+ enabled,
+ COALESCE(severity, ''),
+ metric_type,
+ operator,
+ threshold,
+ window_minutes,
+ sustained_minutes,
+ cooldown_minutes,
+ COALESCE(notify_email, true),
+ filters,
+ last_triggered_at,
+ created_at,
+ updated_at
+FROM ops_alert_rules
+ORDER BY id DESC`
+
+ rows, err := r.db.QueryContext(ctx, q)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ out := []*service.OpsAlertRule{}
+ for rows.Next() {
+ var rule service.OpsAlertRule
+ var filtersRaw []byte
+ var lastTriggeredAt sql.NullTime
+ if err := rows.Scan(
+ &rule.ID,
+ &rule.Name,
+ &rule.Description,
+ &rule.Enabled,
+ &rule.Severity,
+ &rule.MetricType,
+ &rule.Operator,
+ &rule.Threshold,
+ &rule.WindowMinutes,
+ &rule.SustainedMinutes,
+ &rule.CooldownMinutes,
+ &rule.NotifyEmail,
+ &filtersRaw,
+ &lastTriggeredAt,
+ &rule.CreatedAt,
+ &rule.UpdatedAt,
+ ); err != nil {
+ return nil, err
+ }
+ if lastTriggeredAt.Valid {
+ v := lastTriggeredAt.Time
+ rule.LastTriggeredAt = &v
+ }
+ if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
+ var decoded map[string]any
+ if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
+ rule.Filters = decoded
+ }
+ }
+ out = append(out, &rule)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+func (r *opsRepository) CreateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return nil, fmt.Errorf("nil input")
+ }
+
+ filtersArg, err := opsNullJSONMap(input.Filters)
+ if err != nil {
+ return nil, err
+ }
+
+ q := `
+INSERT INTO ops_alert_rules (
+ name,
+ description,
+ enabled,
+ severity,
+ metric_type,
+ operator,
+ threshold,
+ window_minutes,
+ sustained_minutes,
+ cooldown_minutes,
+ notify_email,
+ filters,
+ created_at,
+ updated_at
+) VALUES (
+ $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,NOW(),NOW()
+)
+RETURNING
+ id,
+ name,
+ COALESCE(description, ''),
+ enabled,
+ COALESCE(severity, ''),
+ metric_type,
+ operator,
+ threshold,
+ window_minutes,
+ sustained_minutes,
+ cooldown_minutes,
+ COALESCE(notify_email, true),
+ filters,
+ last_triggered_at,
+ created_at,
+ updated_at`
+
+ var out service.OpsAlertRule
+ var filtersRaw []byte
+ var lastTriggeredAt sql.NullTime
+
+ if err := r.db.QueryRowContext(
+ ctx,
+ q,
+ strings.TrimSpace(input.Name),
+ strings.TrimSpace(input.Description),
+ input.Enabled,
+ strings.TrimSpace(input.Severity),
+ strings.TrimSpace(input.MetricType),
+ strings.TrimSpace(input.Operator),
+ input.Threshold,
+ input.WindowMinutes,
+ input.SustainedMinutes,
+ input.CooldownMinutes,
+ input.NotifyEmail,
+ filtersArg,
+ ).Scan(
+ &out.ID,
+ &out.Name,
+ &out.Description,
+ &out.Enabled,
+ &out.Severity,
+ &out.MetricType,
+ &out.Operator,
+ &out.Threshold,
+ &out.WindowMinutes,
+ &out.SustainedMinutes,
+ &out.CooldownMinutes,
+ &out.NotifyEmail,
+ &filtersRaw,
+ &lastTriggeredAt,
+ &out.CreatedAt,
+ &out.UpdatedAt,
+ ); err != nil {
+ return nil, err
+ }
+ if lastTriggeredAt.Valid {
+ v := lastTriggeredAt.Time
+ out.LastTriggeredAt = &v
+ }
+ if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
+ var decoded map[string]any
+ if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
+ out.Filters = decoded
+ }
+ }
+
+ return &out, nil
+}
+
+func (r *opsRepository) UpdateAlertRule(ctx context.Context, input *service.OpsAlertRule) (*service.OpsAlertRule, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return nil, fmt.Errorf("nil input")
+ }
+ if input.ID <= 0 {
+ return nil, fmt.Errorf("invalid id")
+ }
+
+ filtersArg, err := opsNullJSONMap(input.Filters)
+ if err != nil {
+ return nil, err
+ }
+
+ q := `
+UPDATE ops_alert_rules
+SET
+ name = $2,
+ description = $3,
+ enabled = $4,
+ severity = $5,
+ metric_type = $6,
+ operator = $7,
+ threshold = $8,
+ window_minutes = $9,
+ sustained_minutes = $10,
+ cooldown_minutes = $11,
+ notify_email = $12,
+ filters = $13,
+ updated_at = NOW()
+WHERE id = $1
+RETURNING
+ id,
+ name,
+ COALESCE(description, ''),
+ enabled,
+ COALESCE(severity, ''),
+ metric_type,
+ operator,
+ threshold,
+ window_minutes,
+ sustained_minutes,
+ cooldown_minutes,
+ COALESCE(notify_email, true),
+ filters,
+ last_triggered_at,
+ created_at,
+ updated_at`
+
+ var out service.OpsAlertRule
+ var filtersRaw []byte
+ var lastTriggeredAt sql.NullTime
+
+ if err := r.db.QueryRowContext(
+ ctx,
+ q,
+ input.ID,
+ strings.TrimSpace(input.Name),
+ strings.TrimSpace(input.Description),
+ input.Enabled,
+ strings.TrimSpace(input.Severity),
+ strings.TrimSpace(input.MetricType),
+ strings.TrimSpace(input.Operator),
+ input.Threshold,
+ input.WindowMinutes,
+ input.SustainedMinutes,
+ input.CooldownMinutes,
+ input.NotifyEmail,
+ filtersArg,
+ ).Scan(
+ &out.ID,
+ &out.Name,
+ &out.Description,
+ &out.Enabled,
+ &out.Severity,
+ &out.MetricType,
+ &out.Operator,
+ &out.Threshold,
+ &out.WindowMinutes,
+ &out.SustainedMinutes,
+ &out.CooldownMinutes,
+ &out.NotifyEmail,
+ &filtersRaw,
+ &lastTriggeredAt,
+ &out.CreatedAt,
+ &out.UpdatedAt,
+ ); err != nil {
+ return nil, err
+ }
+
+ if lastTriggeredAt.Valid {
+ v := lastTriggeredAt.Time
+ out.LastTriggeredAt = &v
+ }
+ if len(filtersRaw) > 0 && string(filtersRaw) != "null" {
+ var decoded map[string]any
+ if err := json.Unmarshal(filtersRaw, &decoded); err == nil {
+ out.Filters = decoded
+ }
+ }
+
+ return &out, nil
+}
+
+func (r *opsRepository) DeleteAlertRule(ctx context.Context, id int64) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if id <= 0 {
+ return fmt.Errorf("invalid id")
+ }
+
+ res, err := r.db.ExecContext(ctx, "DELETE FROM ops_alert_rules WHERE id = $1", id)
+ if err != nil {
+ return err
+ }
+ affected, err := res.RowsAffected()
+ if err != nil {
+ return err
+ }
+ if affected == 0 {
+ return sql.ErrNoRows
+ }
+ return nil
+}
+
+func (r *opsRepository) ListAlertEvents(ctx context.Context, filter *service.OpsAlertEventFilter) ([]*service.OpsAlertEvent, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ filter = &service.OpsAlertEventFilter{}
+ }
+
+ limit := filter.Limit
+ if limit <= 0 {
+ limit = 100
+ }
+ if limit > 500 {
+ limit = 500
+ }
+
+ where, args := buildOpsAlertEventsWhere(filter)
+ args = append(args, limit)
+ limitArg := "$" + itoa(len(args))
+
+ q := `
+SELECT
+ id,
+ COALESCE(rule_id, 0),
+ COALESCE(severity, ''),
+ COALESCE(status, ''),
+ COALESCE(title, ''),
+ COALESCE(description, ''),
+ metric_value,
+ threshold_value,
+ dimensions,
+ fired_at,
+ resolved_at,
+ email_sent,
+ created_at
+FROM ops_alert_events
+` + where + `
+ORDER BY fired_at DESC
+LIMIT ` + limitArg
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ out := []*service.OpsAlertEvent{}
+ for rows.Next() {
+ var ev service.OpsAlertEvent
+ var metricValue sql.NullFloat64
+ var thresholdValue sql.NullFloat64
+ var dimensionsRaw []byte
+ var resolvedAt sql.NullTime
+ if err := rows.Scan(
+ &ev.ID,
+ &ev.RuleID,
+ &ev.Severity,
+ &ev.Status,
+ &ev.Title,
+ &ev.Description,
+ &metricValue,
+ &thresholdValue,
+ &dimensionsRaw,
+ &ev.FiredAt,
+ &resolvedAt,
+ &ev.EmailSent,
+ &ev.CreatedAt,
+ ); err != nil {
+ return nil, err
+ }
+ if metricValue.Valid {
+ v := metricValue.Float64
+ ev.MetricValue = &v
+ }
+ if thresholdValue.Valid {
+ v := thresholdValue.Float64
+ ev.ThresholdValue = &v
+ }
+ if resolvedAt.Valid {
+ v := resolvedAt.Time
+ ev.ResolvedAt = &v
+ }
+ if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
+ var decoded map[string]any
+ if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
+ ev.Dimensions = decoded
+ }
+ }
+ out = append(out, &ev)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+func (r *opsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if ruleID <= 0 {
+ return nil, fmt.Errorf("invalid rule id")
+ }
+
+ q := `
+SELECT
+ id,
+ COALESCE(rule_id, 0),
+ COALESCE(severity, ''),
+ COALESCE(status, ''),
+ COALESCE(title, ''),
+ COALESCE(description, ''),
+ metric_value,
+ threshold_value,
+ dimensions,
+ fired_at,
+ resolved_at,
+ email_sent,
+ created_at
+FROM ops_alert_events
+WHERE rule_id = $1 AND status = $2
+ORDER BY fired_at DESC
+LIMIT 1`
+
+ row := r.db.QueryRowContext(ctx, q, ruleID, service.OpsAlertStatusFiring)
+ ev, err := scanOpsAlertEvent(row)
+ if err != nil {
+ if err == sql.ErrNoRows {
+ return nil, nil
+ }
+ return nil, err
+ }
+ return ev, nil
+}
+
+func (r *opsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if ruleID <= 0 {
+ return nil, fmt.Errorf("invalid rule id")
+ }
+
+ q := `
+SELECT
+ id,
+ COALESCE(rule_id, 0),
+ COALESCE(severity, ''),
+ COALESCE(status, ''),
+ COALESCE(title, ''),
+ COALESCE(description, ''),
+ metric_value,
+ threshold_value,
+ dimensions,
+ fired_at,
+ resolved_at,
+ email_sent,
+ created_at
+FROM ops_alert_events
+WHERE rule_id = $1
+ORDER BY fired_at DESC
+LIMIT 1`
+
+ row := r.db.QueryRowContext(ctx, q, ruleID)
+ ev, err := scanOpsAlertEvent(row)
+ if err != nil {
+ if err == sql.ErrNoRows {
+ return nil, nil
+ }
+ return nil, err
+ }
+ return ev, nil
+}
+
+func (r *opsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) (*service.OpsAlertEvent, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if event == nil {
+ return nil, fmt.Errorf("nil event")
+ }
+
+ dimensionsArg, err := opsNullJSONMap(event.Dimensions)
+ if err != nil {
+ return nil, err
+ }
+
+ q := `
+INSERT INTO ops_alert_events (
+ rule_id,
+ severity,
+ status,
+ title,
+ description,
+ metric_value,
+ threshold_value,
+ dimensions,
+ fired_at,
+ resolved_at,
+ email_sent,
+ created_at
+) VALUES (
+ $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,NOW()
+)
+RETURNING
+ id,
+ COALESCE(rule_id, 0),
+ COALESCE(severity, ''),
+ COALESCE(status, ''),
+ COALESCE(title, ''),
+ COALESCE(description, ''),
+ metric_value,
+ threshold_value,
+ dimensions,
+ fired_at,
+ resolved_at,
+ email_sent,
+ created_at`
+
+ row := r.db.QueryRowContext(
+ ctx,
+ q,
+ opsNullInt64(&event.RuleID),
+ opsNullString(event.Severity),
+ opsNullString(event.Status),
+ opsNullString(event.Title),
+ opsNullString(event.Description),
+ opsNullFloat64(event.MetricValue),
+ opsNullFloat64(event.ThresholdValue),
+ dimensionsArg,
+ event.FiredAt,
+ opsNullTime(event.ResolvedAt),
+ event.EmailSent,
+ )
+ return scanOpsAlertEvent(row)
+}
+
+func (r *opsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if eventID <= 0 {
+ return fmt.Errorf("invalid event id")
+ }
+ if strings.TrimSpace(status) == "" {
+ return fmt.Errorf("invalid status")
+ }
+
+ q := `
+UPDATE ops_alert_events
+SET status = $2,
+ resolved_at = $3
+WHERE id = $1`
+
+ _, err := r.db.ExecContext(ctx, q, eventID, strings.TrimSpace(status), opsNullTime(resolvedAt))
+ return err
+}
+
+func (r *opsRepository) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if eventID <= 0 {
+ return fmt.Errorf("invalid event id")
+ }
+
+ _, err := r.db.ExecContext(ctx, "UPDATE ops_alert_events SET email_sent = $2 WHERE id = $1", eventID, emailSent)
+ return err
+}
+
+type opsAlertEventRow interface {
+ Scan(dest ...any) error
+}
+
+func scanOpsAlertEvent(row opsAlertEventRow) (*service.OpsAlertEvent, error) {
+ var ev service.OpsAlertEvent
+ var metricValue sql.NullFloat64
+ var thresholdValue sql.NullFloat64
+ var dimensionsRaw []byte
+ var resolvedAt sql.NullTime
+
+ if err := row.Scan(
+ &ev.ID,
+ &ev.RuleID,
+ &ev.Severity,
+ &ev.Status,
+ &ev.Title,
+ &ev.Description,
+ &metricValue,
+ &thresholdValue,
+ &dimensionsRaw,
+ &ev.FiredAt,
+ &resolvedAt,
+ &ev.EmailSent,
+ &ev.CreatedAt,
+ ); err != nil {
+ return nil, err
+ }
+ if metricValue.Valid {
+ v := metricValue.Float64
+ ev.MetricValue = &v
+ }
+ if thresholdValue.Valid {
+ v := thresholdValue.Float64
+ ev.ThresholdValue = &v
+ }
+ if resolvedAt.Valid {
+ v := resolvedAt.Time
+ ev.ResolvedAt = &v
+ }
+ if len(dimensionsRaw) > 0 && string(dimensionsRaw) != "null" {
+ var decoded map[string]any
+ if err := json.Unmarshal(dimensionsRaw, &decoded); err == nil {
+ ev.Dimensions = decoded
+ }
+ }
+ return &ev, nil
+}
+
+func buildOpsAlertEventsWhere(filter *service.OpsAlertEventFilter) (string, []any) {
+ clauses := []string{"1=1"}
+ args := []any{}
+
+ if filter == nil {
+ return "WHERE " + strings.Join(clauses, " AND "), args
+ }
+
+ if status := strings.TrimSpace(filter.Status); status != "" {
+ args = append(args, status)
+ clauses = append(clauses, "status = $"+itoa(len(args)))
+ }
+ if severity := strings.TrimSpace(filter.Severity); severity != "" {
+ args = append(args, severity)
+ clauses = append(clauses, "severity = $"+itoa(len(args)))
+ }
+ if filter.StartTime != nil && !filter.StartTime.IsZero() {
+ args = append(args, *filter.StartTime)
+ clauses = append(clauses, "fired_at >= $"+itoa(len(args)))
+ }
+ if filter.EndTime != nil && !filter.EndTime.IsZero() {
+ args = append(args, *filter.EndTime)
+ clauses = append(clauses, "fired_at < $"+itoa(len(args)))
+ }
+
+	// Dimensions are stored as JSONB; filter them best-effort with the ->> text operator so no GIN index is required.
+ if platform := strings.TrimSpace(filter.Platform); platform != "" {
+ args = append(args, platform)
+ clauses = append(clauses, "(dimensions->>'platform') = $"+itoa(len(args)))
+ }
+ if filter.GroupID != nil && *filter.GroupID > 0 {
+ args = append(args, fmt.Sprintf("%d", *filter.GroupID))
+ clauses = append(clauses, "(dimensions->>'group_id') = $"+itoa(len(args)))
+ }
+
+ return "WHERE " + strings.Join(clauses, " AND "), args
+}
+
+func opsNullJSONMap(v map[string]any) (any, error) {
+ if v == nil {
+ return sql.NullString{}, nil
+ }
+ b, err := json.Marshal(v)
+ if err != nil {
+ return nil, err
+ }
+ if len(b) == 0 {
+ return sql.NullString{}, nil
+ }
+ return sql.NullString{String: string(b), Valid: true}, nil
+}
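
For clarity on the JSONB handling in this file, a standalone sketch of the write/read round-trip: opsNullJSONMap-style marshalling on insert, and the ->> text operator on read, as in buildOpsAlertEventsWhere. Column names follow the queries above; the values are illustrative.

package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
)

// Same idea as opsNullJSONMap: a nil map becomes SQL NULL, otherwise a JSON string.
func nullJSONMap(v map[string]any) (any, error) {
	if v == nil {
		return sql.NullString{}, nil
	}
	b, err := json.Marshal(v)
	if err != nil {
		return nil, err
	}
	return sql.NullString{String: string(b), Valid: true}, nil
}

func main() {
	dims := map[string]any{"platform": "anthropic", "group_id": "42"}
	arg, _ := nullJSONMap(dims)
	fmt.Printf("insert arg: %#v\n", arg)

	// Read-side predicate shape; actual parameter numbers depend on the rest of the WHERE clause.
	const predicate = "(dimensions->>'platform') = $1 AND (dimensions->>'group_id') = $2"
	fmt.Println(predicate)
}
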
diff --git a/backend/internal/repository/ops_repo_dashboard.go b/backend/internal/repository/ops_repo_dashboard.go
new file mode 100644
index 00000000..d96efd48
--- /dev/null
+++ b/backend/internal/repository/ops_repo_dashboard.go
@@ -0,0 +1,1012 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "fmt"
+ "math"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) GetDashboardOverview(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ mode := filter.QueryMode
+ if !mode.IsValid() {
+ mode = service.OpsQueryModeRaw
+ }
+
+ switch mode {
+ case service.OpsQueryModePreagg:
+ return r.getDashboardOverviewPreaggregated(ctx, filter)
+ case service.OpsQueryModeAuto:
+ out, err := r.getDashboardOverviewPreaggregated(ctx, filter)
+ if err != nil && errors.Is(err, service.ErrOpsPreaggregatedNotPopulated) {
+ return r.getDashboardOverviewRaw(ctx, filter)
+ }
+ return out, err
+ default:
+ return r.getDashboardOverviewRaw(ctx, filter)
+ }
+}
+
+func (r *opsRepository) getDashboardOverviewRaw(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) {
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+
+ successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ windowSeconds := end.Sub(start).Seconds()
+ if windowSeconds <= 0 {
+ windowSeconds = 1
+ }
+
+ requestCountTotal := successCount + errorTotal
+ requestCountSLA := successCount + errorCountSLA
+
+ sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA))
+ errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA))
+ upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA))
+
+ qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end)
+ if err != nil {
+ return nil, err
+ }
+
+ qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+ tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds)
+ tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds)
+
+ return &service.OpsDashboardOverview{
+ StartTime: start,
+ EndTime: end,
+ Platform: strings.TrimSpace(filter.Platform),
+ GroupID: filter.GroupID,
+
+ SuccessCount: successCount,
+ ErrorCountTotal: errorTotal,
+ BusinessLimitedCount: businessLimited,
+ ErrorCountSLA: errorCountSLA,
+ RequestCountTotal: requestCountTotal,
+ RequestCountSLA: requestCountSLA,
+ TokenConsumed: tokenConsumed,
+
+ SLA: roundTo4DP(sla),
+ ErrorRate: roundTo4DP(errorRate),
+ UpstreamErrorRate: roundTo4DP(upstreamErrorRate),
+ UpstreamErrorCountExcl429529: upstreamExcl,
+ Upstream429Count: upstream429,
+ Upstream529Count: upstream529,
+
+ QPS: service.OpsRateSummary{
+ Current: qpsCurrent,
+ Peak: qpsPeak,
+ Avg: qpsAvg,
+ },
+ TPS: service.OpsRateSummary{
+ Current: tpsCurrent,
+ Peak: tpsPeak,
+ Avg: tpsAvg,
+ },
+
+ Duration: duration,
+ TTFT: ttft,
+ }, nil
+}
+
+type opsDashboardPartial struct {
+ successCount int64
+ errorCountTotal int64
+ businessLimitedCount int64
+ errorCountSLA int64
+
+ upstreamErrorCountExcl429529 int64
+ upstream429Count int64
+ upstream529Count int64
+
+ tokenConsumed int64
+
+ duration service.OpsPercentiles
+ ttft service.OpsPercentiles
+}
+
+func (r *opsRepository) getDashboardOverviewPreaggregated(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsDashboardOverview, error) {
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+
+ // Stable full-hour range covered by pre-aggregation.
+ aggSafeEnd := preaggSafeEnd(end)
+ aggFullStart := utcCeilToHour(start)
+ aggFullEnd := utcFloorToHour(aggSafeEnd)
+
+ // If there are no stable full-hour buckets, use raw directly (short windows).
+ if !aggFullStart.Before(aggFullEnd) {
+ return r.getDashboardOverviewRaw(ctx, filter)
+ }
+
+ // 1) Pre-aggregated stable segment.
+ preaggRows, err := r.listHourlyMetricsRows(ctx, filter, aggFullStart, aggFullEnd)
+ if err != nil {
+ return nil, err
+ }
+ if len(preaggRows) == 0 {
+ // Distinguish "no data" vs "preagg not populated yet".
+ if exists, err := r.rawOpsDataExists(ctx, filter, aggFullStart, aggFullEnd); err == nil && exists {
+ return nil, service.ErrOpsPreaggregatedNotPopulated
+ }
+ }
+ preagg := aggregateHourlyRows(preaggRows)
+
+ // 2) Raw head/tail fragments (at most ~1 hour each).
+ head := opsDashboardPartial{}
+ tail := opsDashboardPartial{}
+
+ if start.Before(aggFullStart) {
+ part, err := r.queryRawPartial(ctx, filter, start, minTime(end, aggFullStart))
+ if err != nil {
+ return nil, err
+ }
+ head = *part
+ }
+ if aggFullEnd.Before(end) {
+ part, err := r.queryRawPartial(ctx, filter, maxTime(start, aggFullEnd), end)
+ if err != nil {
+ return nil, err
+ }
+ tail = *part
+ }
+
+ // Merge counts.
+ successCount := preagg.successCount + head.successCount + tail.successCount
+ errorTotal := preagg.errorCountTotal + head.errorCountTotal + tail.errorCountTotal
+ businessLimited := preagg.businessLimitedCount + head.businessLimitedCount + tail.businessLimitedCount
+ errorCountSLA := preagg.errorCountSLA + head.errorCountSLA + tail.errorCountSLA
+
+ upstreamExcl := preagg.upstreamErrorCountExcl429529 + head.upstreamErrorCountExcl429529 + tail.upstreamErrorCountExcl429529
+ upstream429 := preagg.upstream429Count + head.upstream429Count + tail.upstream429Count
+ upstream529 := preagg.upstream529Count + head.upstream529Count + tail.upstream529Count
+
+ tokenConsumed := preagg.tokenConsumed + head.tokenConsumed + tail.tokenConsumed
+
+ // Approximate percentiles across segments:
+ // - p50/p90/avg: weighted average by success_count
+ // - p95/p99/max: max (conservative tail)
+ duration := combineApproxPercentiles([]opsPercentileSegment{
+ {weight: preagg.successCount, p: preagg.duration},
+ {weight: head.successCount, p: head.duration},
+ {weight: tail.successCount, p: tail.duration},
+ })
+ ttft := combineApproxPercentiles([]opsPercentileSegment{
+ {weight: preagg.successCount, p: preagg.ttft},
+ {weight: head.successCount, p: head.ttft},
+ {weight: tail.successCount, p: tail.ttft},
+ })
+
+ windowSeconds := end.Sub(start).Seconds()
+ if windowSeconds <= 0 {
+ windowSeconds = 1
+ }
+
+ requestCountTotal := successCount + errorTotal
+ requestCountSLA := successCount + errorCountSLA
+
+ sla := safeDivideFloat64(float64(successCount), float64(requestCountSLA))
+ errorRate := safeDivideFloat64(float64(errorCountSLA), float64(requestCountSLA))
+ upstreamErrorRate := safeDivideFloat64(float64(upstreamExcl), float64(requestCountSLA))
+
+ // Keep "current" rates as raw, to preserve realtime semantics.
+ qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end)
+ if err != nil {
+ return nil, err
+ }
+
+	// NOTE: peak QPS/TPS still come from raw logs at minute granularity. A per-minute MAX scan is
+	// typically cheap compared with percentile_cont, and it keeps peak semantics identical across query modes.
+ qpsPeak, err := r.queryPeakQPS(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+ tpsPeak, err := r.queryPeakTPS(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds)
+ tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds)
+
+ return &service.OpsDashboardOverview{
+ StartTime: start,
+ EndTime: end,
+ Platform: strings.TrimSpace(filter.Platform),
+ GroupID: filter.GroupID,
+
+ SuccessCount: successCount,
+ ErrorCountTotal: errorTotal,
+ BusinessLimitedCount: businessLimited,
+ ErrorCountSLA: errorCountSLA,
+ RequestCountTotal: requestCountTotal,
+ RequestCountSLA: requestCountSLA,
+ TokenConsumed: tokenConsumed,
+
+ SLA: roundTo4DP(sla),
+ ErrorRate: roundTo4DP(errorRate),
+ UpstreamErrorRate: roundTo4DP(upstreamErrorRate),
+ UpstreamErrorCountExcl429529: upstreamExcl,
+ Upstream429Count: upstream429,
+ Upstream529Count: upstream529,
+
+ QPS: service.OpsRateSummary{
+ Current: qpsCurrent,
+ Peak: qpsPeak,
+ Avg: qpsAvg,
+ },
+ TPS: service.OpsRateSummary{
+ Current: tpsCurrent,
+ Peak: tpsPeak,
+ Avg: tpsAvg,
+ },
+
+ Duration: duration,
+ TTFT: ttft,
+ }, nil
+}
+
+type opsHourlyMetricsRow struct {
+ bucketStart time.Time
+
+ successCount int64
+ errorCountTotal int64
+ businessLimitedCount int64
+ errorCountSLA int64
+
+ upstreamErrorCountExcl429529 int64
+ upstream429Count int64
+ upstream529Count int64
+
+ tokenConsumed int64
+
+ durationP50 sql.NullInt64
+ durationP90 sql.NullInt64
+ durationP95 sql.NullInt64
+ durationP99 sql.NullInt64
+ durationAvg sql.NullFloat64
+ durationMax sql.NullInt64
+
+ ttftP50 sql.NullInt64
+ ttftP90 sql.NullInt64
+ ttftP95 sql.NullInt64
+ ttftP99 sql.NullInt64
+ ttftAvg sql.NullFloat64
+ ttftMax sql.NullInt64
+}
+
+func (r *opsRepository) listHourlyMetricsRows(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) ([]opsHourlyMetricsRow, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if start.IsZero() || end.IsZero() || !start.Before(end) {
+ return []opsHourlyMetricsRow{}, nil
+ }
+
+ where := "bucket_start >= $1 AND bucket_start < $2"
+ args := []any{start.UTC(), end.UTC()}
+ idx := 3
+
+ platform := ""
+ groupID := (*int64)(nil)
+ if filter != nil {
+ platform = strings.TrimSpace(strings.ToLower(filter.Platform))
+ groupID = filter.GroupID
+ }
+
+ switch {
+ case groupID != nil && *groupID > 0:
+ where += fmt.Sprintf(" AND group_id = $%d", idx)
+ args = append(args, *groupID)
+ idx++
+ if platform != "" {
+ where += fmt.Sprintf(" AND platform = $%d", idx)
+ args = append(args, platform)
+ idx++
+ }
+ case platform != "":
+ where += fmt.Sprintf(" AND platform = $%d AND group_id IS NULL", idx)
+ args = append(args, platform)
+ idx++
+ default:
+ where += " AND platform IS NULL AND group_id IS NULL"
+ }
+
+ q := `
+SELECT
+ bucket_start,
+ success_count,
+ error_count_total,
+ business_limited_count,
+ error_count_sla,
+ upstream_error_count_excl_429_529,
+ upstream_429_count,
+ upstream_529_count,
+ token_consumed,
+ duration_p50_ms,
+ duration_p90_ms,
+ duration_p95_ms,
+ duration_p99_ms,
+ duration_avg_ms,
+ duration_max_ms,
+ ttft_p50_ms,
+ ttft_p90_ms,
+ ttft_p95_ms,
+ ttft_p99_ms,
+ ttft_avg_ms,
+ ttft_max_ms
+FROM ops_metrics_hourly
+WHERE ` + where + `
+ORDER BY bucket_start ASC`
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ out := make([]opsHourlyMetricsRow, 0, 64)
+ for rows.Next() {
+ var row opsHourlyMetricsRow
+ if err := rows.Scan(
+ &row.bucketStart,
+ &row.successCount,
+ &row.errorCountTotal,
+ &row.businessLimitedCount,
+ &row.errorCountSLA,
+ &row.upstreamErrorCountExcl429529,
+ &row.upstream429Count,
+ &row.upstream529Count,
+ &row.tokenConsumed,
+ &row.durationP50,
+ &row.durationP90,
+ &row.durationP95,
+ &row.durationP99,
+ &row.durationAvg,
+ &row.durationMax,
+ &row.ttftP50,
+ &row.ttftP90,
+ &row.ttftP95,
+ &row.ttftP99,
+ &row.ttftAvg,
+ &row.ttftMax,
+ ); err != nil {
+ return nil, err
+ }
+ out = append(out, row)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+func aggregateHourlyRows(rows []opsHourlyMetricsRow) opsDashboardPartial {
+ out := opsDashboardPartial{}
+ if len(rows) == 0 {
+ return out
+ }
+
+ var (
+ p50Sum float64
+ p50W int64
+ p90Sum float64
+ p90W int64
+ avgSum float64
+ avgW int64
+ )
+ var (
+ ttftP50Sum float64
+ ttftP50W int64
+ ttftP90Sum float64
+ ttftP90W int64
+ ttftAvgSum float64
+ ttftAvgW int64
+ )
+
+ var (
+ p95Max *int
+ p99Max *int
+ maxMax *int
+
+ ttftP95Max *int
+ ttftP99Max *int
+ ttftMaxMax *int
+ )
+
+ for _, row := range rows {
+ out.successCount += row.successCount
+ out.errorCountTotal += row.errorCountTotal
+ out.businessLimitedCount += row.businessLimitedCount
+ out.errorCountSLA += row.errorCountSLA
+
+ out.upstreamErrorCountExcl429529 += row.upstreamErrorCountExcl429529
+ out.upstream429Count += row.upstream429Count
+ out.upstream529Count += row.upstream529Count
+
+ out.tokenConsumed += row.tokenConsumed
+
+ if row.successCount > 0 {
+ if row.durationP50.Valid {
+ p50Sum += float64(row.durationP50.Int64) * float64(row.successCount)
+ p50W += row.successCount
+ }
+ if row.durationP90.Valid {
+ p90Sum += float64(row.durationP90.Int64) * float64(row.successCount)
+ p90W += row.successCount
+ }
+ if row.durationAvg.Valid {
+ avgSum += row.durationAvg.Float64 * float64(row.successCount)
+ avgW += row.successCount
+ }
+ if row.ttftP50.Valid {
+ ttftP50Sum += float64(row.ttftP50.Int64) * float64(row.successCount)
+ ttftP50W += row.successCount
+ }
+ if row.ttftP90.Valid {
+ ttftP90Sum += float64(row.ttftP90.Int64) * float64(row.successCount)
+ ttftP90W += row.successCount
+ }
+ if row.ttftAvg.Valid {
+ ttftAvgSum += row.ttftAvg.Float64 * float64(row.successCount)
+ ttftAvgW += row.successCount
+ }
+ }
+
+ if row.durationP95.Valid {
+ v := int(row.durationP95.Int64)
+ if p95Max == nil || v > *p95Max {
+ p95Max = &v
+ }
+ }
+ if row.durationP99.Valid {
+ v := int(row.durationP99.Int64)
+ if p99Max == nil || v > *p99Max {
+ p99Max = &v
+ }
+ }
+ if row.durationMax.Valid {
+ v := int(row.durationMax.Int64)
+ if maxMax == nil || v > *maxMax {
+ maxMax = &v
+ }
+ }
+
+ if row.ttftP95.Valid {
+ v := int(row.ttftP95.Int64)
+ if ttftP95Max == nil || v > *ttftP95Max {
+ ttftP95Max = &v
+ }
+ }
+ if row.ttftP99.Valid {
+ v := int(row.ttftP99.Int64)
+ if ttftP99Max == nil || v > *ttftP99Max {
+ ttftP99Max = &v
+ }
+ }
+ if row.ttftMax.Valid {
+ v := int(row.ttftMax.Int64)
+ if ttftMaxMax == nil || v > *ttftMaxMax {
+ ttftMaxMax = &v
+ }
+ }
+ }
+
+ // duration
+ if p50W > 0 {
+ v := int(math.Round(p50Sum / float64(p50W)))
+ out.duration.P50 = &v
+ }
+ if p90W > 0 {
+ v := int(math.Round(p90Sum / float64(p90W)))
+ out.duration.P90 = &v
+ }
+ out.duration.P95 = p95Max
+ out.duration.P99 = p99Max
+ if avgW > 0 {
+ v := int(math.Round(avgSum / float64(avgW)))
+ out.duration.Avg = &v
+ }
+ out.duration.Max = maxMax
+
+ // ttft
+ if ttftP50W > 0 {
+ v := int(math.Round(ttftP50Sum / float64(ttftP50W)))
+ out.ttft.P50 = &v
+ }
+ if ttftP90W > 0 {
+ v := int(math.Round(ttftP90Sum / float64(ttftP90W)))
+ out.ttft.P90 = &v
+ }
+ out.ttft.P95 = ttftP95Max
+ out.ttft.P99 = ttftP99Max
+ if ttftAvgW > 0 {
+ v := int(math.Round(ttftAvgSum / float64(ttftAvgW)))
+ out.ttft.Avg = &v
+ }
+ out.ttft.Max = ttftMaxMax
+
+ return out
+}
+
+func (r *opsRepository) queryRawPartial(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (*opsDashboardPartial, error) {
+ successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ duration, ttft, err := r.queryUsageLatency(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ errorTotal, businessLimited, errorCountSLA, upstreamExcl, upstream429, upstream529, err := r.queryErrorCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ return &opsDashboardPartial{
+ successCount: successCount,
+ errorCountTotal: errorTotal,
+ businessLimitedCount: businessLimited,
+ errorCountSLA: errorCountSLA,
+ upstreamErrorCountExcl429529: upstreamExcl,
+ upstream429Count: upstream429,
+ upstream529Count: upstream529,
+ tokenConsumed: tokenConsumed,
+ duration: duration,
+ ttft: ttft,
+ }, nil
+}
+
+func (r *opsRepository) rawOpsDataExists(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (bool, error) {
+ {
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+ q := `SELECT EXISTS(SELECT 1 FROM usage_logs ul ` + join + ` ` + where + ` LIMIT 1)`
+ var exists bool
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil {
+ return false, err
+ }
+ if exists {
+ return true, nil
+ }
+ }
+
+ {
+ where, args, _ := buildErrorWhere(filter, start, end, 1)
+ q := `SELECT EXISTS(SELECT 1 FROM ops_error_logs ` + where + ` LIMIT 1)`
+ var exists bool
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&exists); err != nil {
+ return false, err
+ }
+ return exists, nil
+ }
+}
+
+type opsPercentileSegment struct {
+ weight int64
+ p service.OpsPercentiles
+}
+
+func combineApproxPercentiles(segments []opsPercentileSegment) service.OpsPercentiles {
+ weightedInt := func(get func(service.OpsPercentiles) *int) *int {
+ var sum float64
+ var w int64
+ for _, seg := range segments {
+ if seg.weight <= 0 {
+ continue
+ }
+ v := get(seg.p)
+ if v == nil {
+ continue
+ }
+ sum += float64(*v) * float64(seg.weight)
+ w += seg.weight
+ }
+ if w <= 0 {
+ return nil
+ }
+ out := int(math.Round(sum / float64(w)))
+ return &out
+ }
+
+ maxInt := func(get func(service.OpsPercentiles) *int) *int {
+ var max *int
+ for _, seg := range segments {
+ v := get(seg.p)
+ if v == nil {
+ continue
+ }
+ if max == nil || *v > *max {
+ c := *v
+ max = &c
+ }
+ }
+ return max
+ }
+
+ return service.OpsPercentiles{
+ P50: weightedInt(func(p service.OpsPercentiles) *int { return p.P50 }),
+ P90: weightedInt(func(p service.OpsPercentiles) *int { return p.P90 }),
+ P95: maxInt(func(p service.OpsPercentiles) *int { return p.P95 }),
+ P99: maxInt(func(p service.OpsPercentiles) *int { return p.P99 }),
+ Avg: weightedInt(func(p service.OpsPercentiles) *int { return p.Avg }),
+ Max: maxInt(func(p service.OpsPercentiles) *int { return p.Max }),
+ }
+}
+
+func preaggSafeEnd(endTime time.Time) time.Time {
+ now := time.Now().UTC()
+ cutoff := now.Add(-5 * time.Minute)
+ if endTime.After(cutoff) {
+ return cutoff
+ }
+ return endTime
+}
+
+func utcCeilToHour(t time.Time) time.Time {
+ u := t.UTC()
+ f := u.Truncate(time.Hour)
+ if f.Equal(u) {
+ return f
+ }
+ return f.Add(time.Hour)
+}
+
+func utcFloorToHour(t time.Time) time.Time {
+ return t.UTC().Truncate(time.Hour)
+}
+
+func minTime(a, b time.Time) time.Time {
+ if a.Before(b) {
+ return a
+ }
+ return b
+}
+
+func maxTime(a, b time.Time) time.Time {
+ if a.After(b) {
+ return a
+ }
+ return b
+}
+
+func (r *opsRepository) queryUsageCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+
+ q := `
+SELECT
+ COALESCE(COUNT(*), 0) AS success_count,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
+FROM usage_logs ul
+` + join + `
+` + where
+
+ var tokens sql.NullInt64
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&successCount, &tokens); err != nil {
+ return 0, 0, err
+ }
+ if tokens.Valid {
+ tokenConsumed = tokens.Int64
+ }
+ return successCount, tokenConsumed, nil
+}
+
+func (r *opsRepository) queryUsageLatency(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (duration service.OpsPercentiles, ttft service.OpsPercentiles, err error) {
+ {
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+ q := `
+SELECT
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
+ AVG(duration_ms) AS avg_ms,
+ MAX(duration_ms) AS max_ms
+FROM usage_logs ul
+` + join + `
+` + where + `
+AND duration_ms IS NOT NULL`
+
+ var p50, p90, p95, p99 sql.NullFloat64
+ var avg sql.NullFloat64
+ var max sql.NullInt64
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
+ return service.OpsPercentiles{}, service.OpsPercentiles{}, err
+ }
+ duration.P50 = floatToIntPtr(p50)
+ duration.P90 = floatToIntPtr(p90)
+ duration.P95 = floatToIntPtr(p95)
+ duration.P99 = floatToIntPtr(p99)
+ duration.Avg = floatToIntPtr(avg)
+ if max.Valid {
+ v := int(max.Int64)
+ duration.Max = &v
+ }
+ }
+
+ {
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+ q := `
+SELECT
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
+ AVG(first_token_ms) AS avg_ms,
+ MAX(first_token_ms) AS max_ms
+FROM usage_logs ul
+` + join + `
+` + where + `
+AND first_token_ms IS NOT NULL`
+
+ var p50, p90, p95, p99 sql.NullFloat64
+ var avg sql.NullFloat64
+ var max sql.NullInt64
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
+ return service.OpsPercentiles{}, service.OpsPercentiles{}, err
+ }
+ ttft.P50 = floatToIntPtr(p50)
+ ttft.P90 = floatToIntPtr(p90)
+ ttft.P95 = floatToIntPtr(p95)
+ ttft.P99 = floatToIntPtr(p99)
+ ttft.Avg = floatToIntPtr(avg)
+ if max.Valid {
+ v := int(max.Int64)
+ ttft.Max = &v
+ }
+ }
+
+ return duration, ttft, nil
+}
+
+func (r *opsRepository) queryErrorCounts(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (
+ errorTotal int64,
+ businessLimited int64,
+ errorCountSLA int64,
+ upstreamExcl429529 int64,
+ upstream429 int64,
+ upstream529 int64,
+ err error,
+) {
+ where, args, _ := buildErrorWhere(filter, start, end, 1)
+
+ q := `
+SELECT
+ COALESCE(COUNT(*), 0) AS error_total,
+ COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
+ COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529
+FROM ops_error_logs
+` + where
+
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(
+ &errorTotal,
+ &businessLimited,
+ &errorCountSLA,
+ &upstreamExcl429529,
+ &upstream429,
+ &upstream529,
+ ); err != nil {
+ return 0, 0, 0, 0, 0, 0, err
+ }
+ return errorTotal, businessLimited, errorCountSLA, upstreamExcl429529, upstream429, upstream529, nil
+}
+
+func (r *opsRepository) queryCurrentRates(ctx context.Context, filter *service.OpsDashboardFilter, end time.Time) (qpsCurrent float64, tpsCurrent float64, err error) {
+ windowStart := end.Add(-1 * time.Minute)
+
+ successCount1m, token1m, err := r.queryUsageCounts(ctx, filter, windowStart, end)
+ if err != nil {
+ return 0, 0, err
+ }
+ errorCount1m, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, windowStart, end)
+ if err != nil {
+ return 0, 0, err
+ }
+
+ qpsCurrent = roundTo1DP(float64(successCount1m+errorCount1m) / 60.0)
+ tpsCurrent = roundTo1DP(float64(token1m) / 60.0)
+ return qpsCurrent, tpsCurrent, nil
+}
+
+func (r *opsRepository) queryPeakQPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) {
+ usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1)
+ errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next)
+
+ q := `
+WITH usage_buckets AS (
+ SELECT date_trunc('minute', ul.created_at) AS bucket, COUNT(*) AS cnt
+ FROM usage_logs ul
+ ` + usageJoin + `
+ ` + usageWhere + `
+ GROUP BY 1
+),
+error_buckets AS (
+ SELECT date_trunc('minute', created_at) AS bucket, COUNT(*) AS cnt
+ FROM ops_error_logs
+ ` + errorWhere + `
+ GROUP BY 1
+),
+combined AS (
+ SELECT COALESCE(u.bucket, e.bucket) AS bucket,
+ COALESCE(u.cnt, 0) + COALESCE(e.cnt, 0) AS total
+ FROM usage_buckets u
+ FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
+)
+SELECT COALESCE(MAX(total), 0) FROM combined`
+
+ args := append(usageArgs, errorArgs...)
+
+ var maxPerMinute sql.NullInt64
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil {
+ return 0, err
+ }
+ if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 {
+ return 0, nil
+ }
+ return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil
+}
+
+func (r *opsRepository) queryPeakTPS(ctx context.Context, filter *service.OpsDashboardFilter, start, end time.Time) (float64, error) {
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+
+ q := `
+SELECT COALESCE(MAX(tokens_per_min), 0)
+FROM (
+ SELECT
+ date_trunc('minute', ul.created_at) AS bucket,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS tokens_per_min
+ FROM usage_logs ul
+ ` + join + `
+ ` + where + `
+ GROUP BY 1
+) t`
+
+ var maxPerMinute sql.NullInt64
+ if err := r.db.QueryRowContext(ctx, q, args...).Scan(&maxPerMinute); err != nil {
+ return 0, err
+ }
+ if !maxPerMinute.Valid || maxPerMinute.Int64 <= 0 {
+ return 0, nil
+ }
+ return roundTo1DP(float64(maxPerMinute.Int64) / 60.0), nil
+}
+
+func buildUsageWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (join string, where string, args []any, nextIndex int) {
+ platform := ""
+ groupID := (*int64)(nil)
+ if filter != nil {
+ platform = strings.TrimSpace(strings.ToLower(filter.Platform))
+ groupID = filter.GroupID
+ }
+
+ idx := startIndex
+ clauses := make([]string, 0, 4)
+ args = make([]any, 0, 4)
+
+ args = append(args, start)
+ clauses = append(clauses, fmt.Sprintf("ul.created_at >= $%d", idx))
+ idx++
+ args = append(args, end)
+ clauses = append(clauses, fmt.Sprintf("ul.created_at < $%d", idx))
+ idx++
+
+ if groupID != nil && *groupID > 0 {
+ args = append(args, *groupID)
+ clauses = append(clauses, fmt.Sprintf("ul.group_id = $%d", idx))
+ idx++
+ }
+ if platform != "" {
+ // Prefer group.platform when available; fall back to account.platform so we don't
+ // drop rows where group_id is NULL.
+ join = "LEFT JOIN groups g ON g.id = ul.group_id LEFT JOIN accounts a ON a.id = ul.account_id"
+ args = append(args, platform)
+ clauses = append(clauses, fmt.Sprintf("COALESCE(NULLIF(g.platform,''), a.platform) = $%d", idx))
+ idx++
+ }
+
+ where = "WHERE " + strings.Join(clauses, " AND ")
+ return join, where, args, idx
+}
+
+func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, startIndex int) (where string, args []any, nextIndex int) {
+ platform := ""
+ groupID := (*int64)(nil)
+ if filter != nil {
+ platform = strings.TrimSpace(strings.ToLower(filter.Platform))
+ groupID = filter.GroupID
+ }
+
+ idx := startIndex
+ clauses := make([]string, 0, 4)
+ args = make([]any, 0, 4)
+
+ args = append(args, start)
+ clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx))
+ idx++
+ args = append(args, end)
+ clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx))
+ idx++
+
+ if groupID != nil && *groupID > 0 {
+ args = append(args, *groupID)
+ clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx))
+ idx++
+ }
+ if platform != "" {
+ args = append(args, platform)
+ clauses = append(clauses, fmt.Sprintf("platform = $%d", idx))
+ idx++
+ }
+
+ where = "WHERE " + strings.Join(clauses, " AND ")
+ return where, args, idx
+}
+
+func floatToIntPtr(v sql.NullFloat64) *int {
+ if !v.Valid {
+ return nil
+ }
+ n := int(math.Round(v.Float64))
+ return &n
+}
+
+func safeDivideFloat64(numerator float64, denominator float64) float64 {
+ if denominator == 0 {
+ return 0
+ }
+ return numerator / denominator
+}
+
+func roundTo1DP(v float64) float64 {
+ return math.Round(v*10) / 10
+}
+
+func roundTo4DP(v float64) float64 {
+ return math.Round(v*10000) / 10000
+}
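
The clause builders above are designed to be chained: buildUsageWhere starts numbering placeholders at startIndex and returns the next free index, which buildErrorWhere then continues from, so both WHERE fragments can share a single args slice (this is what queryPeakQPS does). The following package-local test is a minimal sketch that pins that contract down; the "claude" platform value is only a placeholder, and the expected strings are derived by hand from the format strings above rather than captured from a run.

package repository

import (
	"testing"
	"time"

	"github.com/Wei-Shaw/sub2api/internal/service"
	"github.com/stretchr/testify/require"
)

func TestBuildWhereClauses_PlaceholderChaining(t *testing.T) {
	start := time.Date(2026, 1, 9, 0, 0, 0, 0, time.UTC)
	end := start.Add(time.Hour)
	filter := &service.OpsDashboardFilter{Platform: "claude"} // placeholder platform value

	// Usage-log clauses consume $1..$3 and report 4 as the next free index.
	join, where, args, next := buildUsageWhere(filter, start, end, 1)
	require.Contains(t, join, "LEFT JOIN groups g ON g.id = ul.group_id")
	require.Equal(t,
		"WHERE ul.created_at >= $1 AND ul.created_at < $2 AND COALESCE(NULLIF(g.platform,''), a.platform) = $3",
		where)
	require.Len(t, args, 3)
	require.Equal(t, 4, next)

	// Error-log clauses pick up at $4, so append(args, errArgs...) lines up with a combined query.
	errWhere, errArgs, _ := buildErrorWhere(filter, start, end, next)
	require.Equal(t, "WHERE created_at >= $4 AND created_at < $5 AND platform = $6", errWhere)
	require.Len(t, errArgs, 3)
}
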
diff --git a/backend/internal/repository/ops_repo_histograms.go b/backend/internal/repository/ops_repo_histograms.go
new file mode 100644
index 00000000..143c7e83
--- /dev/null
+++ b/backend/internal/repository/ops_repo_histograms.go
@@ -0,0 +1,79 @@
+package repository
+
+import (
+ "context"
+ "fmt"
+ "strings"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) GetLatencyHistogram(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsLatencyHistogramResponse, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+
+ join, where, args, _ := buildUsageWhere(filter, start, end, 1)
+ rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms")
+ orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms")
+
+ q := `
+SELECT
+ ` + rangeExpr + ` AS range,
+ COALESCE(COUNT(*), 0) AS count,
+ ` + orderExpr + ` AS ord
+FROM usage_logs ul
+` + join + `
+` + where + `
+AND ul.duration_ms IS NOT NULL
+GROUP BY 1, 3
+ORDER BY 3 ASC`
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ counts := make(map[string]int64, len(latencyHistogramOrderedRanges))
+ var total int64
+ for rows.Next() {
+ var label string
+ var count int64
+ var _ord int
+ if err := rows.Scan(&label, &count, &_ord); err != nil {
+ return nil, err
+ }
+ counts[label] = count
+ total += count
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+
+ buckets := make([]*service.OpsLatencyHistogramBucket, 0, len(latencyHistogramOrderedRanges))
+ for _, label := range latencyHistogramOrderedRanges {
+ buckets = append(buckets, &service.OpsLatencyHistogramBucket{
+ Range: label,
+ Count: counts[label],
+ })
+ }
+
+ return &service.OpsLatencyHistogramResponse{
+ StartTime: start,
+ EndTime: end,
+ Platform: strings.TrimSpace(filter.Platform),
+ GroupID: filter.GroupID,
+ TotalRequests: total,
+ Buckets: buckets,
+ }, nil
+}
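
A short usage sketch for the caller side (package repository, same imports as the file above, assuming a wired *opsRepository): because the response is zero-filled from latencyHistogramOrderedRanges, every bucket label appears exactly once and can safely be re-keyed by range.

func latencyBucketsAsMap(ctx context.Context, r *opsRepository, filter *service.OpsDashboardFilter) (map[string]int64, error) {
	resp, err := r.GetLatencyHistogram(ctx, filter)
	if err != nil {
		return nil, err
	}
	// One entry per configured bucket label; ranges with no rows already carry zero counts.
	out := make(map[string]int64, len(resp.Buckets))
	for _, b := range resp.Buckets {
		out[b.Range] = b.Count
	}
	return out, nil
}
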
diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets.go b/backend/internal/repository/ops_repo_latency_histogram_buckets.go
new file mode 100644
index 00000000..fc085fc6
--- /dev/null
+++ b/backend/internal/repository/ops_repo_latency_histogram_buckets.go
@@ -0,0 +1,64 @@
+package repository
+
+import (
+ "fmt"
+ "strings"
+)
+
+type latencyHistogramBucket struct {
+ upperMs int
+ label string
+}
+
+var latencyHistogramBuckets = []latencyHistogramBucket{
+ {upperMs: 100, label: "0-100ms"},
+ {upperMs: 200, label: "100-200ms"},
+ {upperMs: 500, label: "200-500ms"},
+ {upperMs: 1000, label: "500-1000ms"},
+ {upperMs: 2000, label: "1000-2000ms"},
+ {upperMs: 0, label: "2000ms+"}, // default bucket
+}
+
+var latencyHistogramOrderedRanges = func() []string {
+ out := make([]string, 0, len(latencyHistogramBuckets))
+ for _, b := range latencyHistogramBuckets {
+ out = append(out, b.label)
+ }
+ return out
+}()
+
+func latencyHistogramRangeCaseExpr(column string) string {
+ var sb strings.Builder
+ sb.WriteString("CASE\n")
+
+ for _, b := range latencyHistogramBuckets {
+ if b.upperMs <= 0 {
+ continue
+ }
+ sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN '%s'\n", column, b.upperMs, b.label))
+ }
+
+ // Default bucket.
+ last := latencyHistogramBuckets[len(latencyHistogramBuckets)-1]
+ sb.WriteString(fmt.Sprintf("\tELSE '%s'\n", last.label))
+ sb.WriteString("END")
+ return sb.String()
+}
+
+func latencyHistogramRangeOrderCaseExpr(column string) string {
+ var sb strings.Builder
+ sb.WriteString("CASE\n")
+
+ order := 1
+ for _, b := range latencyHistogramBuckets {
+ if b.upperMs <= 0 {
+ continue
+ }
+ sb.WriteString(fmt.Sprintf("\tWHEN %s < %d THEN %d\n", column, b.upperMs, order))
+ order++
+ }
+
+ sb.WriteString(fmt.Sprintf("\tELSE %d\n", order))
+ sb.WriteString("END")
+ return sb.String()
+}
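
Because both generators skip the open-ended bucket inside the loop and emit it as the ELSE branch, the rendered SQL is easy to pin down in a test. The expected fragments below are copied from the format strings above, not captured from a database.

package repository

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestLatencyHistogramCaseExprs_RenderExpectedSQL(t *testing.T) {
	rangeExpr := latencyHistogramRangeCaseExpr("ul.duration_ms")
	require.True(t, strings.HasPrefix(rangeExpr, "CASE\n"))
	require.Contains(t, rangeExpr, "WHEN ul.duration_ms < 100 THEN '0-100ms'")
	require.Contains(t, rangeExpr, "WHEN ul.duration_ms < 2000 THEN '1000-2000ms'")
	require.Contains(t, rangeExpr, "ELSE '2000ms+'")
	require.True(t, strings.HasSuffix(rangeExpr, "END"))

	orderExpr := latencyHistogramRangeOrderCaseExpr("ul.duration_ms")
	require.Contains(t, orderExpr, "WHEN ul.duration_ms < 100 THEN 1")
	require.Contains(t, orderExpr, "ELSE 6") // five bounded buckets, the default bucket gets 6
}
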
diff --git a/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go
new file mode 100644
index 00000000..dc79f6cc
--- /dev/null
+++ b/backend/internal/repository/ops_repo_latency_histogram_buckets_test.go
@@ -0,0 +1,14 @@
+package repository
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestLatencyHistogramBuckets_AreConsistent(t *testing.T) {
+ require.Equal(t, len(latencyHistogramBuckets), len(latencyHistogramOrderedRanges))
+ for i, b := range latencyHistogramBuckets {
+ require.Equal(t, b.label, latencyHistogramOrderedRanges[i])
+ }
+}
diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go
new file mode 100644
index 00000000..96bad88a
--- /dev/null
+++ b/backend/internal/repository/ops_repo_metrics.go
@@ -0,0 +1,401 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) InsertSystemMetrics(ctx context.Context, input *service.OpsInsertSystemMetricsInput) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return fmt.Errorf("nil input")
+ }
+
+ window := input.WindowMinutes
+ if window <= 0 {
+ window = 1
+ }
+ createdAt := input.CreatedAt
+ if createdAt.IsZero() {
+ createdAt = time.Now().UTC()
+ }
+
+ q := `
+INSERT INTO ops_system_metrics (
+ created_at,
+ window_minutes,
+ platform,
+ group_id,
+
+ success_count,
+ error_count_total,
+ business_limited_count,
+ error_count_sla,
+
+ upstream_error_count_excl_429_529,
+ upstream_429_count,
+ upstream_529_count,
+
+ token_consumed,
+ qps,
+ tps,
+
+ duration_p50_ms,
+ duration_p90_ms,
+ duration_p95_ms,
+ duration_p99_ms,
+ duration_avg_ms,
+ duration_max_ms,
+
+ ttft_p50_ms,
+ ttft_p90_ms,
+ ttft_p95_ms,
+ ttft_p99_ms,
+ ttft_avg_ms,
+ ttft_max_ms,
+
+ cpu_usage_percent,
+ memory_used_mb,
+ memory_total_mb,
+ memory_usage_percent,
+
+ db_ok,
+ redis_ok,
+
+ db_conn_active,
+ db_conn_idle,
+ db_conn_waiting,
+
+ goroutine_count,
+ concurrency_queue_depth
+) VALUES (
+ $1,$2,$3,$4,
+ $5,$6,$7,$8,
+ $9,$10,$11,
+ $12,$13,$14,
+ $15,$16,$17,$18,$19,$20,
+ $21,$22,$23,$24,$25,$26,
+ $27,$28,$29,$30,
+ $31,$32,
+ $33,$34,$35,
+ $36,$37
+)`
+
+ _, err := r.db.ExecContext(
+ ctx,
+ q,
+ createdAt,
+ window,
+ opsNullString(input.Platform),
+ opsNullInt64(input.GroupID),
+
+ input.SuccessCount,
+ input.ErrorCountTotal,
+ input.BusinessLimitedCount,
+ input.ErrorCountSLA,
+
+ input.UpstreamErrorCountExcl429529,
+ input.Upstream429Count,
+ input.Upstream529Count,
+
+ input.TokenConsumed,
+ opsNullFloat64(input.QPS),
+ opsNullFloat64(input.TPS),
+
+ opsNullInt(input.DurationP50Ms),
+ opsNullInt(input.DurationP90Ms),
+ opsNullInt(input.DurationP95Ms),
+ opsNullInt(input.DurationP99Ms),
+ opsNullFloat64(input.DurationAvgMs),
+ opsNullInt(input.DurationMaxMs),
+
+ opsNullInt(input.TTFTP50Ms),
+ opsNullInt(input.TTFTP90Ms),
+ opsNullInt(input.TTFTP95Ms),
+ opsNullInt(input.TTFTP99Ms),
+ opsNullFloat64(input.TTFTAvgMs),
+ opsNullInt(input.TTFTMaxMs),
+
+ opsNullFloat64(input.CPUUsagePercent),
+ opsNullInt(input.MemoryUsedMB),
+ opsNullInt(input.MemoryTotalMB),
+ opsNullFloat64(input.MemoryUsagePercent),
+
+ opsNullBool(input.DBOK),
+ opsNullBool(input.RedisOK),
+
+ opsNullInt(input.DBConnActive),
+ opsNullInt(input.DBConnIdle),
+ opsNullInt(input.DBConnWaiting),
+
+ opsNullInt(input.GoroutineCount),
+ opsNullInt(input.ConcurrencyQueueDepth),
+ )
+ return err
+}
+
+func (r *opsRepository) GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*service.OpsSystemMetricsSnapshot, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if windowMinutes <= 0 {
+ windowMinutes = 1
+ }
+
+ q := `
+SELECT
+ id,
+ created_at,
+ window_minutes,
+
+ cpu_usage_percent,
+ memory_used_mb,
+ memory_total_mb,
+ memory_usage_percent,
+
+ db_ok,
+ redis_ok,
+
+ db_conn_active,
+ db_conn_idle,
+ db_conn_waiting,
+
+ goroutine_count,
+ concurrency_queue_depth
+FROM ops_system_metrics
+WHERE window_minutes = $1
+ AND platform IS NULL
+ AND group_id IS NULL
+ORDER BY created_at DESC
+LIMIT 1`
+
+ var out service.OpsSystemMetricsSnapshot
+ var cpu sql.NullFloat64
+ var memUsed sql.NullInt64
+ var memTotal sql.NullInt64
+ var memPct sql.NullFloat64
+ var dbOK sql.NullBool
+ var redisOK sql.NullBool
+ var dbActive sql.NullInt64
+ var dbIdle sql.NullInt64
+ var dbWaiting sql.NullInt64
+ var goroutines sql.NullInt64
+ var queueDepth sql.NullInt64
+
+ if err := r.db.QueryRowContext(ctx, q, windowMinutes).Scan(
+ &out.ID,
+ &out.CreatedAt,
+ &out.WindowMinutes,
+ &cpu,
+ &memUsed,
+ &memTotal,
+ &memPct,
+ &dbOK,
+ &redisOK,
+ &dbActive,
+ &dbIdle,
+ &dbWaiting,
+ &goroutines,
+ &queueDepth,
+ ); err != nil {
+ return nil, err
+ }
+
+ if cpu.Valid {
+ v := cpu.Float64
+ out.CPUUsagePercent = &v
+ }
+ if memUsed.Valid {
+ v := memUsed.Int64
+ out.MemoryUsedMB = &v
+ }
+ if memTotal.Valid {
+ v := memTotal.Int64
+ out.MemoryTotalMB = &v
+ }
+ if memPct.Valid {
+ v := memPct.Float64
+ out.MemoryUsagePercent = &v
+ }
+ if dbOK.Valid {
+ v := dbOK.Bool
+ out.DBOK = &v
+ }
+ if redisOK.Valid {
+ v := redisOK.Bool
+ out.RedisOK = &v
+ }
+ if dbActive.Valid {
+ v := int(dbActive.Int64)
+ out.DBConnActive = &v
+ }
+ if dbIdle.Valid {
+ v := int(dbIdle.Int64)
+ out.DBConnIdle = &v
+ }
+ if dbWaiting.Valid {
+ v := int(dbWaiting.Int64)
+ out.DBConnWaiting = &v
+ }
+ if goroutines.Valid {
+ v := int(goroutines.Int64)
+ out.GoroutineCount = &v
+ }
+ if queueDepth.Valid {
+ v := int(queueDepth.Int64)
+ out.ConcurrencyQueueDepth = &v
+ }
+
+ return &out, nil
+}
+
+func (r *opsRepository) UpsertJobHeartbeat(ctx context.Context, input *service.OpsUpsertJobHeartbeatInput) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if input == nil {
+ return fmt.Errorf("nil input")
+ }
+ if input.JobName == "" {
+ return fmt.Errorf("job_name required")
+ }
+
+ q := `
+INSERT INTO ops_job_heartbeats (
+ job_name,
+ last_run_at,
+ last_success_at,
+ last_error_at,
+ last_error,
+ last_duration_ms,
+ updated_at
+) VALUES (
+ $1,$2,$3,$4,$5,$6,NOW()
+)
+ON CONFLICT (job_name) DO UPDATE SET
+ last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at),
+ last_success_at = COALESCE(EXCLUDED.last_success_at, ops_job_heartbeats.last_success_at),
+ last_error_at = CASE
+ WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
+ ELSE COALESCE(EXCLUDED.last_error_at, ops_job_heartbeats.last_error_at)
+ END,
+ last_error = CASE
+ WHEN EXCLUDED.last_success_at IS NOT NULL THEN NULL
+ ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error)
+ END,
+ last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms),
+ updated_at = NOW()`
+
+ _, err := r.db.ExecContext(
+ ctx,
+ q,
+ input.JobName,
+ opsNullTime(input.LastRunAt),
+ opsNullTime(input.LastSuccessAt),
+ opsNullTime(input.LastErrorAt),
+ opsNullString(input.LastError),
+ opsNullInt(input.LastDurationMs),
+ )
+ return err
+}
+
+func (r *opsRepository) ListJobHeartbeats(ctx context.Context) ([]*service.OpsJobHeartbeat, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+
+ q := `
+SELECT
+ job_name,
+ last_run_at,
+ last_success_at,
+ last_error_at,
+ last_error,
+ last_duration_ms,
+ updated_at
+FROM ops_job_heartbeats
+ORDER BY job_name ASC`
+
+ rows, err := r.db.QueryContext(ctx, q)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ out := make([]*service.OpsJobHeartbeat, 0, 8)
+ for rows.Next() {
+ var item service.OpsJobHeartbeat
+ var lastRun sql.NullTime
+ var lastSuccess sql.NullTime
+ var lastErrorAt sql.NullTime
+ var lastError sql.NullString
+ var lastDuration sql.NullInt64
+
+ if err := rows.Scan(
+ &item.JobName,
+ &lastRun,
+ &lastSuccess,
+ &lastErrorAt,
+ &lastError,
+ &lastDuration,
+ &item.UpdatedAt,
+ ); err != nil {
+ return nil, err
+ }
+
+ if lastRun.Valid {
+ v := lastRun.Time
+ item.LastRunAt = &v
+ }
+ if lastSuccess.Valid {
+ v := lastSuccess.Time
+ item.LastSuccessAt = &v
+ }
+ if lastErrorAt.Valid {
+ v := lastErrorAt.Time
+ item.LastErrorAt = &v
+ }
+ if lastError.Valid {
+ v := lastError.String
+ item.LastError = &v
+ }
+ if lastDuration.Valid {
+ v := lastDuration.Int64
+ item.LastDurationMs = &v
+ }
+
+ out = append(out, &item)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+func opsNullBool(v *bool) any {
+ if v == nil {
+ return sql.NullBool{}
+ }
+ return sql.NullBool{Bool: *v, Valid: true}
+}
+
+func opsNullFloat64(v *float64) any {
+ if v == nil {
+ return sql.NullFloat64{}
+ }
+ return sql.NullFloat64{Float64: *v, Valid: true}
+}
+
+func opsNullTime(v *time.Time) any {
+ if v == nil || v.IsZero() {
+ return sql.NullTime{}
+ }
+ return sql.NullTime{Time: *v, Valid: true}
+}
+
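
The NULL-mapping convention of the helpers above (nil or zero input becomes an invalid sql.Null* wrapper, which database/sql writes as SQL NULL) can be documented with a tiny package-local test. opsNullString/opsNullInt/opsNullInt64, used by InsertSystemMetrics, are defined elsewhere in this patch series and are assumed to follow the same pattern.

package repository

import (
	"database/sql"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

func TestOpsNullHelpers_MapNilToSQLNull(t *testing.T) {
	// nil and zero times are written as NULL.
	require.Equal(t, sql.NullTime{}, opsNullTime(nil))
	require.Equal(t, sql.NullTime{}, opsNullTime(&time.Time{}))

	// Non-nil pointers keep their value and are marked Valid.
	v := 3.14
	require.Equal(t, sql.NullFloat64{Float64: v, Valid: true}, opsNullFloat64(&v))

	b := true
	require.Equal(t, sql.NullBool{Bool: true, Valid: true}, opsNullBool(&b))
	require.Equal(t, sql.NullBool{}, opsNullBool(nil))
}
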
diff --git a/backend/internal/repository/ops_repo_preagg.go b/backend/internal/repository/ops_repo_preagg.go
new file mode 100644
index 00000000..6a8b9184
--- /dev/null
+++ b/backend/internal/repository/ops_repo_preagg.go
@@ -0,0 +1,359 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "time"
+)
+
+func (r *opsRepository) UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
+ return nil
+ }
+
+ start := startTime.UTC()
+ end := endTime.UTC()
+
+ // NOTE:
+ // - We aggregate usage_logs + ops_error_logs into ops_metrics_hourly.
+ // - We emit three dimension granularities via GROUPING SETS:
+ // 1) overall: (bucket_start)
+ // 2) platform: (bucket_start, platform)
+ // 3) group: (bucket_start, platform, group_id)
+ //
+ // IMPORTANT: Postgres UNIQUE treats NULLs as distinct, so the table uses a COALESCE-based
+ // unique index; our ON CONFLICT target must match that expression set.
+ q := `
+WITH usage_base AS (
+ SELECT
+ date_trunc('hour', ul.created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
+ g.platform AS platform,
+ ul.group_id AS group_id,
+ ul.duration_ms AS duration_ms,
+ ul.first_token_ms AS first_token_ms,
+ (ul.input_tokens + ul.output_tokens + ul.cache_creation_tokens + ul.cache_read_tokens) AS tokens
+ FROM usage_logs ul
+ JOIN groups g ON g.id = ul.group_id
+ WHERE ul.created_at >= $1 AND ul.created_at < $2
+),
+usage_agg AS (
+ SELECT
+ bucket_start,
+ CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
+ CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
+ COUNT(*) AS success_count,
+ COALESCE(SUM(tokens), 0) AS token_consumed,
+
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p50_ms,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p90_ms,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p95_ms,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_p99_ms,
+ AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS duration_avg_ms,
+ MAX(duration_ms) AS duration_max_ms,
+
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p50_ms,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p90_ms,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p95_ms,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_p99_ms,
+ AVG(first_token_ms) FILTER (WHERE first_token_ms IS NOT NULL) AS ttft_avg_ms,
+ MAX(first_token_ms) AS ttft_max_ms
+ FROM usage_base
+ GROUP BY GROUPING SETS (
+ (bucket_start),
+ (bucket_start, platform),
+ (bucket_start, platform, group_id)
+ )
+),
+error_base AS (
+ SELECT
+ date_trunc('hour', created_at AT TIME ZONE 'UTC') AT TIME ZONE 'UTC' AS bucket_start,
+ platform AS platform,
+ group_id AS group_id,
+ is_business_limited AS is_business_limited,
+ error_owner AS error_owner,
+ status_code AS status_code
+ FROM ops_error_logs
+ WHERE created_at >= $1 AND created_at < $2
+),
+error_agg AS (
+ SELECT
+ bucket_start,
+ CASE WHEN GROUPING(platform) = 1 THEN NULL ELSE platform END AS platform,
+ CASE WHEN GROUPING(group_id) = 1 THEN NULL ELSE group_id END AS group_id,
+ COUNT(*) AS error_count_total,
+ COUNT(*) FILTER (WHERE is_business_limited) AS business_limited_count,
+ COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_count_sla,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_error_count_excl_429_529,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429_count,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529_count
+ FROM error_base
+ GROUP BY GROUPING SETS (
+ (bucket_start),
+ (bucket_start, platform),
+ (bucket_start, platform, group_id)
+ )
+ HAVING GROUPING(group_id) = 1 OR group_id IS NOT NULL
+),
+combined AS (
+ SELECT
+ COALESCE(u.bucket_start, e.bucket_start) AS bucket_start,
+ COALESCE(u.platform, e.platform) AS platform,
+ COALESCE(u.group_id, e.group_id) AS group_id,
+
+ COALESCE(u.success_count, 0) AS success_count,
+ COALESCE(e.error_count_total, 0) AS error_count_total,
+ COALESCE(e.business_limited_count, 0) AS business_limited_count,
+ COALESCE(e.error_count_sla, 0) AS error_count_sla,
+ COALESCE(e.upstream_error_count_excl_429_529, 0) AS upstream_error_count_excl_429_529,
+ COALESCE(e.upstream_429_count, 0) AS upstream_429_count,
+ COALESCE(e.upstream_529_count, 0) AS upstream_529_count,
+
+ COALESCE(u.token_consumed, 0) AS token_consumed,
+
+ u.duration_p50_ms,
+ u.duration_p90_ms,
+ u.duration_p95_ms,
+ u.duration_p99_ms,
+ u.duration_avg_ms,
+ u.duration_max_ms,
+
+ u.ttft_p50_ms,
+ u.ttft_p90_ms,
+ u.ttft_p95_ms,
+ u.ttft_p99_ms,
+ u.ttft_avg_ms,
+ u.ttft_max_ms
+ FROM usage_agg u
+ FULL OUTER JOIN error_agg e
+ ON u.bucket_start = e.bucket_start
+ AND COALESCE(u.platform, '') = COALESCE(e.platform, '')
+ AND COALESCE(u.group_id, 0) = COALESCE(e.group_id, 0)
+)
+INSERT INTO ops_metrics_hourly (
+ bucket_start,
+ platform,
+ group_id,
+ success_count,
+ error_count_total,
+ business_limited_count,
+ error_count_sla,
+ upstream_error_count_excl_429_529,
+ upstream_429_count,
+ upstream_529_count,
+ token_consumed,
+ duration_p50_ms,
+ duration_p90_ms,
+ duration_p95_ms,
+ duration_p99_ms,
+ duration_avg_ms,
+ duration_max_ms,
+ ttft_p50_ms,
+ ttft_p90_ms,
+ ttft_p95_ms,
+ ttft_p99_ms,
+ ttft_avg_ms,
+ ttft_max_ms,
+ computed_at
+)
+SELECT
+ bucket_start,
+ NULLIF(platform, '') AS platform,
+ group_id,
+ success_count,
+ error_count_total,
+ business_limited_count,
+ error_count_sla,
+ upstream_error_count_excl_429_529,
+ upstream_429_count,
+ upstream_529_count,
+ token_consumed,
+ duration_p50_ms::int,
+ duration_p90_ms::int,
+ duration_p95_ms::int,
+ duration_p99_ms::int,
+ duration_avg_ms,
+ duration_max_ms::int,
+ ttft_p50_ms::int,
+ ttft_p90_ms::int,
+ ttft_p95_ms::int,
+ ttft_p99_ms::int,
+ ttft_avg_ms,
+ ttft_max_ms::int,
+ NOW()
+FROM combined
+WHERE bucket_start IS NOT NULL
+ AND (platform IS NULL OR platform <> '')
+ON CONFLICT (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
+ success_count = EXCLUDED.success_count,
+ error_count_total = EXCLUDED.error_count_total,
+ business_limited_count = EXCLUDED.business_limited_count,
+ error_count_sla = EXCLUDED.error_count_sla,
+ upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
+ upstream_429_count = EXCLUDED.upstream_429_count,
+ upstream_529_count = EXCLUDED.upstream_529_count,
+ token_consumed = EXCLUDED.token_consumed,
+
+ duration_p50_ms = EXCLUDED.duration_p50_ms,
+ duration_p90_ms = EXCLUDED.duration_p90_ms,
+ duration_p95_ms = EXCLUDED.duration_p95_ms,
+ duration_p99_ms = EXCLUDED.duration_p99_ms,
+ duration_avg_ms = EXCLUDED.duration_avg_ms,
+ duration_max_ms = EXCLUDED.duration_max_ms,
+
+ ttft_p50_ms = EXCLUDED.ttft_p50_ms,
+ ttft_p90_ms = EXCLUDED.ttft_p90_ms,
+ ttft_p95_ms = EXCLUDED.ttft_p95_ms,
+ ttft_p99_ms = EXCLUDED.ttft_p99_ms,
+ ttft_avg_ms = EXCLUDED.ttft_avg_ms,
+ ttft_max_ms = EXCLUDED.ttft_max_ms,
+
+ computed_at = NOW()
+`
+
+ _, err := r.db.ExecContext(ctx, q, start, end)
+ return err
+}
+
+func (r *opsRepository) UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error {
+ if r == nil || r.db == nil {
+ return fmt.Errorf("nil ops repository")
+ }
+ if startTime.IsZero() || endTime.IsZero() || !endTime.After(startTime) {
+ return nil
+ }
+
+ start := startTime.UTC()
+ end := endTime.UTC()
+
+ q := `
+INSERT INTO ops_metrics_daily (
+ bucket_date,
+ platform,
+ group_id,
+ success_count,
+ error_count_total,
+ business_limited_count,
+ error_count_sla,
+ upstream_error_count_excl_429_529,
+ upstream_429_count,
+ upstream_529_count,
+ token_consumed,
+ duration_p50_ms,
+ duration_p90_ms,
+ duration_p95_ms,
+ duration_p99_ms,
+ duration_avg_ms,
+ duration_max_ms,
+ ttft_p50_ms,
+ ttft_p90_ms,
+ ttft_p95_ms,
+ ttft_p99_ms,
+ ttft_avg_ms,
+ ttft_max_ms,
+ computed_at
+)
+SELECT
+ (bucket_start AT TIME ZONE 'UTC')::date AS bucket_date,
+ platform,
+ group_id,
+
+ COALESCE(SUM(success_count), 0) AS success_count,
+ COALESCE(SUM(error_count_total), 0) AS error_count_total,
+ COALESCE(SUM(business_limited_count), 0) AS business_limited_count,
+ COALESCE(SUM(error_count_sla), 0) AS error_count_sla,
+ COALESCE(SUM(upstream_error_count_excl_429_529), 0) AS upstream_error_count_excl_429_529,
+ COALESCE(SUM(upstream_429_count), 0) AS upstream_429_count,
+ COALESCE(SUM(upstream_529_count), 0) AS upstream_529_count,
+ COALESCE(SUM(token_consumed), 0) AS token_consumed,
+
+ -- Approximation: weighted average for p50/p90, max for p95/p99 (conservative tail).
+ ROUND(SUM(duration_p50_ms::double precision * success_count) FILTER (WHERE duration_p50_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE duration_p50_ms IS NOT NULL), 0))::int AS duration_p50_ms,
+ ROUND(SUM(duration_p90_ms::double precision * success_count) FILTER (WHERE duration_p90_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE duration_p90_ms IS NOT NULL), 0))::int AS duration_p90_ms,
+ MAX(duration_p95_ms) AS duration_p95_ms,
+ MAX(duration_p99_ms) AS duration_p99_ms,
+ SUM(duration_avg_ms * success_count) FILTER (WHERE duration_avg_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE duration_avg_ms IS NOT NULL), 0) AS duration_avg_ms,
+ MAX(duration_max_ms) AS duration_max_ms,
+
+ ROUND(SUM(ttft_p50_ms::double precision * success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE ttft_p50_ms IS NOT NULL), 0))::int AS ttft_p50_ms,
+ ROUND(SUM(ttft_p90_ms::double precision * success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE ttft_p90_ms IS NOT NULL), 0))::int AS ttft_p90_ms,
+ MAX(ttft_p95_ms) AS ttft_p95_ms,
+ MAX(ttft_p99_ms) AS ttft_p99_ms,
+ SUM(ttft_avg_ms * success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL)
+ / NULLIF(SUM(success_count) FILTER (WHERE ttft_avg_ms IS NOT NULL), 0) AS ttft_avg_ms,
+ MAX(ttft_max_ms) AS ttft_max_ms,
+
+ NOW()
+FROM ops_metrics_hourly
+WHERE bucket_start >= $1 AND bucket_start < $2
+GROUP BY 1, 2, 3
+ON CONFLICT (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0)) DO UPDATE SET
+ success_count = EXCLUDED.success_count,
+ error_count_total = EXCLUDED.error_count_total,
+ business_limited_count = EXCLUDED.business_limited_count,
+ error_count_sla = EXCLUDED.error_count_sla,
+ upstream_error_count_excl_429_529 = EXCLUDED.upstream_error_count_excl_429_529,
+ upstream_429_count = EXCLUDED.upstream_429_count,
+ upstream_529_count = EXCLUDED.upstream_529_count,
+ token_consumed = EXCLUDED.token_consumed,
+
+ duration_p50_ms = EXCLUDED.duration_p50_ms,
+ duration_p90_ms = EXCLUDED.duration_p90_ms,
+ duration_p95_ms = EXCLUDED.duration_p95_ms,
+ duration_p99_ms = EXCLUDED.duration_p99_ms,
+ duration_avg_ms = EXCLUDED.duration_avg_ms,
+ duration_max_ms = EXCLUDED.duration_max_ms,
+
+ ttft_p50_ms = EXCLUDED.ttft_p50_ms,
+ ttft_p90_ms = EXCLUDED.ttft_p90_ms,
+ ttft_p95_ms = EXCLUDED.ttft_p95_ms,
+ ttft_p99_ms = EXCLUDED.ttft_p99_ms,
+ ttft_avg_ms = EXCLUDED.ttft_avg_ms,
+ ttft_max_ms = EXCLUDED.ttft_max_ms,
+
+ computed_at = NOW()
+`
+
+ _, err := r.db.ExecContext(ctx, q, start, end)
+ return err
+}
+
+func (r *opsRepository) GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error) {
+ if r == nil || r.db == nil {
+ return time.Time{}, false, fmt.Errorf("nil ops repository")
+ }
+
+ var value sql.NullTime
+ if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_start) FROM ops_metrics_hourly`).Scan(&value); err != nil {
+ return time.Time{}, false, err
+ }
+ if !value.Valid {
+ return time.Time{}, false, nil
+ }
+ return value.Time.UTC(), true, nil
+}
+
+func (r *opsRepository) GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error) {
+ if r == nil || r.db == nil {
+ return time.Time{}, false, fmt.Errorf("nil ops repository")
+ }
+
+ var value sql.NullTime
+ if err := r.db.QueryRowContext(ctx, `SELECT MAX(bucket_date) FROM ops_metrics_daily`).Scan(&value); err != nil {
+ return time.Time{}, false, err
+ }
+ if !value.Valid {
+ return time.Time{}, false, nil
+ }
+ t := value.Time.UTC()
+ return time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC), true, nil
+}
+
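
The ON CONFLICT targets above only resolve if ops_metrics_hourly and ops_metrics_daily carry expression-based unique indexes over exactly the same COALESCE expressions. The migration DDL is not part of this excerpt; the constant below is an illustrative sketch of the assumed shape, and the index names are invented here.

package repository

// assumedOpsPreaggIndexes is illustrative only; the authoritative DDL lives in
// the project's migration files, which are not shown in this patch excerpt.
const assumedOpsPreaggIndexes = `
CREATE UNIQUE INDEX IF NOT EXISTS ux_ops_metrics_hourly_bucket_platform_group
    ON ops_metrics_hourly (bucket_start, COALESCE(platform, ''), COALESCE(group_id, 0));

CREATE UNIQUE INDEX IF NOT EXISTS ux_ops_metrics_daily_bucket_platform_group
    ON ops_metrics_daily (bucket_date, COALESCE(platform, ''), COALESCE(group_id, 0));
`
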
diff --git a/backend/internal/repository/ops_repo_request_details.go b/backend/internal/repository/ops_repo_request_details.go
new file mode 100644
index 00000000..57b93b21
--- /dev/null
+++ b/backend/internal/repository/ops_repo_request_details.go
@@ -0,0 +1,285 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) ListRequestDetails(ctx context.Context, filter *service.OpsRequestDetailFilter) ([]*service.OpsRequestDetail, int64, error) {
+ if r == nil || r.db == nil {
+ return nil, 0, fmt.Errorf("nil ops repository")
+ }
+
+ page, pageSize, startTime, endTime := filter.Normalize()
+ offset := (page - 1) * pageSize
+
+ conditions := make([]string, 0, 16)
+ args := make([]any, 0, 24)
+
+ // Placeholders $1/$2 reserved for time window inside the CTE.
+ args = append(args, startTime.UTC(), endTime.UTC())
+
+ addCondition := func(condition string, values ...any) {
+ conditions = append(conditions, condition)
+ args = append(args, values...)
+ }
+
+ if filter != nil {
+ if kind := strings.TrimSpace(strings.ToLower(filter.Kind)); kind != "" && kind != "all" {
+ if kind != string(service.OpsRequestKindSuccess) && kind != string(service.OpsRequestKindError) {
+ return nil, 0, fmt.Errorf("invalid kind")
+ }
+ addCondition(fmt.Sprintf("kind = $%d", len(args)+1), kind)
+ }
+
+ if platform := strings.TrimSpace(strings.ToLower(filter.Platform)); platform != "" {
+ addCondition(fmt.Sprintf("platform = $%d", len(args)+1), platform)
+ }
+ if filter.GroupID != nil && *filter.GroupID > 0 {
+ addCondition(fmt.Sprintf("group_id = $%d", len(args)+1), *filter.GroupID)
+ }
+
+ if filter.UserID != nil && *filter.UserID > 0 {
+ addCondition(fmt.Sprintf("user_id = $%d", len(args)+1), *filter.UserID)
+ }
+ if filter.APIKeyID != nil && *filter.APIKeyID > 0 {
+ addCondition(fmt.Sprintf("api_key_id = $%d", len(args)+1), *filter.APIKeyID)
+ }
+ if filter.AccountID != nil && *filter.AccountID > 0 {
+ addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID)
+ }
+
+ if model := strings.TrimSpace(filter.Model); model != "" {
+ addCondition(fmt.Sprintf("model = $%d", len(args)+1), model)
+ }
+ if requestID := strings.TrimSpace(filter.RequestID); requestID != "" {
+ addCondition(fmt.Sprintf("request_id = $%d", len(args)+1), requestID)
+ }
+ if q := strings.TrimSpace(filter.Query); q != "" {
+ like := "%" + strings.ToLower(q) + "%"
+ startIdx := len(args) + 1
+ addCondition(
+ fmt.Sprintf("(LOWER(COALESCE(request_id,'')) LIKE $%d OR LOWER(COALESCE(model,'')) LIKE $%d OR LOWER(COALESCE(message,'')) LIKE $%d)",
+ startIdx, startIdx+1, startIdx+2,
+ ),
+ like, like, like,
+ )
+ }
+
+ if filter.MinDurationMs != nil {
+ addCondition(fmt.Sprintf("duration_ms >= $%d", len(args)+1), *filter.MinDurationMs)
+ }
+ if filter.MaxDurationMs != nil {
+ addCondition(fmt.Sprintf("duration_ms <= $%d", len(args)+1), *filter.MaxDurationMs)
+ }
+ }
+
+ where := ""
+ if len(conditions) > 0 {
+ where = "WHERE " + strings.Join(conditions, " AND ")
+ }
+
+ cte := `
+WITH combined AS (
+ SELECT
+ 'success'::TEXT AS kind,
+ ul.created_at AS created_at,
+ ul.request_id AS request_id,
+ COALESCE(NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
+ ul.model AS model,
+ ul.duration_ms AS duration_ms,
+ NULL::INT AS status_code,
+ NULL::BIGINT AS error_id,
+ NULL::TEXT AS phase,
+ NULL::TEXT AS severity,
+ NULL::TEXT AS message,
+ ul.user_id AS user_id,
+ ul.api_key_id AS api_key_id,
+ ul.account_id AS account_id,
+ ul.group_id AS group_id,
+ ul.stream AS stream
+ FROM usage_logs ul
+ LEFT JOIN groups g ON g.id = ul.group_id
+ LEFT JOIN accounts a ON a.id = ul.account_id
+ WHERE ul.created_at >= $1 AND ul.created_at < $2
+
+ UNION ALL
+
+ SELECT
+ 'error'::TEXT AS kind,
+ o.created_at AS created_at,
+ COALESCE(NULLIF(o.request_id,''), NULLIF(o.client_request_id,''), '') AS request_id,
+ COALESCE(NULLIF(o.platform, ''), NULLIF(g.platform, ''), NULLIF(a.platform, ''), '') AS platform,
+ o.model AS model,
+ o.duration_ms AS duration_ms,
+ o.status_code AS status_code,
+ o.id AS error_id,
+ o.error_phase AS phase,
+ o.severity AS severity,
+ o.error_message AS message,
+ o.user_id AS user_id,
+ o.api_key_id AS api_key_id,
+ o.account_id AS account_id,
+ o.group_id AS group_id,
+ o.stream AS stream
+ FROM ops_error_logs o
+ LEFT JOIN groups g ON g.id = o.group_id
+ LEFT JOIN accounts a ON a.id = o.account_id
+ WHERE o.created_at >= $1 AND o.created_at < $2
+)
+`
+
+ countQuery := fmt.Sprintf(`%s SELECT COUNT(1) FROM combined %s`, cte, where)
+ var total int64
+ if err := r.db.QueryRowContext(ctx, countQuery, args...).Scan(&total); err != nil {
+ if err == sql.ErrNoRows {
+ total = 0
+ } else {
+ return nil, 0, err
+ }
+ }
+
+ sort := "ORDER BY created_at DESC"
+ if filter != nil {
+ switch strings.TrimSpace(strings.ToLower(filter.Sort)) {
+ case "", "created_at_desc":
+ // default
+ case "duration_desc":
+ sort = "ORDER BY duration_ms DESC NULLS LAST, created_at DESC"
+ default:
+ return nil, 0, fmt.Errorf("invalid sort")
+ }
+ }
+
+ listQuery := fmt.Sprintf(`
+%s
+SELECT
+ kind,
+ created_at,
+ request_id,
+ platform,
+ model,
+ duration_ms,
+ status_code,
+ error_id,
+ phase,
+ severity,
+ message,
+ user_id,
+ api_key_id,
+ account_id,
+ group_id,
+ stream
+FROM combined
+%s
+%s
+LIMIT $%d OFFSET $%d
+`, cte, where, sort, len(args)+1, len(args)+2)
+
+ listArgs := append(append([]any{}, args...), pageSize, offset)
+ rows, err := r.db.QueryContext(ctx, listQuery, listArgs...)
+ if err != nil {
+ return nil, 0, err
+ }
+ defer rows.Close()
+
+ toIntPtr := func(v sql.NullInt64) *int {
+ if !v.Valid {
+ return nil
+ }
+ i := int(v.Int64)
+ return &i
+ }
+ toInt64Ptr := func(v sql.NullInt64) *int64 {
+ if !v.Valid {
+ return nil
+ }
+ i := v.Int64
+ return &i
+ }
+
+ out := make([]*service.OpsRequestDetail, 0, pageSize)
+ for rows.Next() {
+ var (
+ kind string
+ createdAt time.Time
+ requestID sql.NullString
+ platform sql.NullString
+ model sql.NullString
+
+ durationMs sql.NullInt64
+ statusCode sql.NullInt64
+ errorID sql.NullInt64
+
+ phase sql.NullString
+ severity sql.NullString
+ message sql.NullString
+
+ userID sql.NullInt64
+ apiKeyID sql.NullInt64
+ accountID sql.NullInt64
+ groupID sql.NullInt64
+
+ stream bool
+ )
+
+ if err := rows.Scan(
+ &kind,
+ &createdAt,
+ &requestID,
+ &platform,
+ &model,
+ &durationMs,
+ &statusCode,
+ &errorID,
+ &phase,
+ &severity,
+ &message,
+ &userID,
+ &apiKeyID,
+ &accountID,
+ &groupID,
+ &stream,
+ ); err != nil {
+ return nil, 0, err
+ }
+
+ item := &service.OpsRequestDetail{
+ Kind: service.OpsRequestKind(kind),
+ CreatedAt: createdAt,
+ RequestID: strings.TrimSpace(requestID.String),
+ Platform: strings.TrimSpace(platform.String),
+ Model: strings.TrimSpace(model.String),
+
+ DurationMs: toIntPtr(durationMs),
+ StatusCode: toIntPtr(statusCode),
+ ErrorID: toInt64Ptr(errorID),
+ Phase: phase.String,
+ Severity: severity.String,
+ Message: message.String,
+
+ UserID: toInt64Ptr(userID),
+ APIKeyID: toInt64Ptr(apiKeyID),
+ AccountID: toInt64Ptr(accountID),
+ GroupID: toInt64Ptr(groupID),
+
+ Stream: stream,
+ }
+
+ if item.Platform == "" {
+ item.Platform = "unknown"
+ }
+
+ out = append(out, item)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, 0, err
+ }
+
+ return out, total, nil
+}
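
A usage sketch (package repository, same imports as above) showing how a caller might request the slowest errors for one group; Normalize() supplies the paging and time-window defaults, and the sort value is one of the two accepted by the validation above.

func listSlowestErrorsForGroup(ctx context.Context, r *opsRepository, groupID int64) ([]*service.OpsRequestDetail, int64, error) {
	filter := &service.OpsRequestDetailFilter{
		Kind:    string(service.OpsRequestKindError), // "success", "error", or ""/"all"
		GroupID: &groupID,
		Sort:    "duration_desc", // ties fall back to created_at DESC
	}
	return r.ListRequestDetails(ctx, filter)
}
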
diff --git a/backend/internal/repository/ops_repo_trends.go b/backend/internal/repository/ops_repo_trends.go
new file mode 100644
index 00000000..5f32c5d1
--- /dev/null
+++ b/backend/internal/repository/ops_repo_trends.go
@@ -0,0 +1,567 @@
+package repository
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) GetThroughputTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsThroughputTrendResponse, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ if bucketSeconds <= 0 {
+ bucketSeconds = 60
+ }
+ if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
+ // Keep a small, predictable set of supported buckets for now.
+ bucketSeconds = 60
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+
+ usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1)
+ errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next)
+
+ usageBucketExpr := opsBucketExprForUsage(bucketSeconds)
+ errorBucketExpr := opsBucketExprForError(bucketSeconds)
+
+ q := `
+WITH usage_buckets AS (
+ SELECT ` + usageBucketExpr + ` AS bucket,
+ COUNT(*) AS success_count,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
+ FROM usage_logs ul
+ ` + usageJoin + `
+ ` + usageWhere + `
+ GROUP BY 1
+),
+error_buckets AS (
+ SELECT ` + errorBucketExpr + ` AS bucket,
+ COUNT(*) AS error_count
+ FROM ops_error_logs
+ ` + errorWhere + `
+ GROUP BY 1
+),
+combined AS (
+ SELECT COALESCE(u.bucket, e.bucket) AS bucket,
+ COALESCE(u.success_count, 0) AS success_count,
+ COALESCE(e.error_count, 0) AS error_count,
+ COALESCE(u.token_consumed, 0) AS token_consumed
+ FROM usage_buckets u
+ FULL OUTER JOIN error_buckets e ON u.bucket = e.bucket
+)
+SELECT
+ bucket,
+ (success_count + error_count) AS request_count,
+ token_consumed
+FROM combined
+ORDER BY bucket ASC`
+
+ args := append(usageArgs, errorArgs...)
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ points := make([]*service.OpsThroughputTrendPoint, 0, 256)
+ for rows.Next() {
+ var bucket time.Time
+ var requests int64
+ var tokens sql.NullInt64
+ if err := rows.Scan(&bucket, &requests, &tokens); err != nil {
+ return nil, err
+ }
+ tokenConsumed := int64(0)
+ if tokens.Valid {
+ tokenConsumed = tokens.Int64
+ }
+
+ denom := float64(bucketSeconds)
+ if denom <= 0 {
+ denom = 60
+ }
+ qps := roundTo1DP(float64(requests) / denom)
+ tps := roundTo1DP(float64(tokenConsumed) / denom)
+
+ points = append(points, &service.OpsThroughputTrendPoint{
+ BucketStart: bucket.UTC(),
+ RequestCount: requests,
+ TokenConsumed: tokenConsumed,
+ QPS: qps,
+ TPS: tps,
+ })
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+
+ // Fill missing buckets with zeros so charts render continuous timelines.
+ points = fillOpsThroughputBuckets(start, end, bucketSeconds, points)
+
+ var byPlatform []*service.OpsThroughputPlatformBreakdownItem
+ var topGroups []*service.OpsThroughputGroupBreakdownItem
+
+ platform := ""
+ if filter != nil {
+ platform = strings.TrimSpace(strings.ToLower(filter.Platform))
+ }
+ groupID := (*int64)(nil)
+ if filter != nil {
+ groupID = filter.GroupID
+ }
+
+ // Drilldown helpers:
+ // - No platform/group: totals by platform
+ // - Platform selected but no group: top groups in that platform
+ if platform == "" && (groupID == nil || *groupID <= 0) {
+ items, err := r.getThroughputBreakdownByPlatform(ctx, start, end)
+ if err != nil {
+ return nil, err
+ }
+ byPlatform = items
+ } else if platform != "" && (groupID == nil || *groupID <= 0) {
+ items, err := r.getThroughputTopGroupsByPlatform(ctx, start, end, platform, 10)
+ if err != nil {
+ return nil, err
+ }
+ topGroups = items
+ }
+
+ return &service.OpsThroughputTrendResponse{
+ Bucket: opsBucketLabel(bucketSeconds),
+ Points: points,
+
+ ByPlatform: byPlatform,
+ TopGroups: topGroups,
+ }, nil
+}
+
+func (r *opsRepository) getThroughputBreakdownByPlatform(ctx context.Context, start, end time.Time) ([]*service.OpsThroughputPlatformBreakdownItem, error) {
+ q := `
+WITH usage_totals AS (
+ SELECT COALESCE(NULLIF(g.platform,''), a.platform) AS platform,
+ COUNT(*) AS success_count,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
+ FROM usage_logs ul
+ LEFT JOIN groups g ON g.id = ul.group_id
+ LEFT JOIN accounts a ON a.id = ul.account_id
+ WHERE ul.created_at >= $1 AND ul.created_at < $2
+ GROUP BY 1
+),
+error_totals AS (
+ SELECT platform,
+ COUNT(*) AS error_count
+ FROM ops_error_logs
+ WHERE created_at >= $1 AND created_at < $2
+ GROUP BY 1
+),
+combined AS (
+ SELECT COALESCE(u.platform, e.platform) AS platform,
+ COALESCE(u.success_count, 0) AS success_count,
+ COALESCE(e.error_count, 0) AS error_count,
+ COALESCE(u.token_consumed, 0) AS token_consumed
+ FROM usage_totals u
+ FULL OUTER JOIN error_totals e ON u.platform = e.platform
+)
+SELECT platform, (success_count + error_count) AS request_count, token_consumed
+FROM combined
+WHERE platform IS NOT NULL AND platform <> ''
+ORDER BY request_count DESC`
+
+ rows, err := r.db.QueryContext(ctx, q, start, end)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ items := make([]*service.OpsThroughputPlatformBreakdownItem, 0, 8)
+ for rows.Next() {
+ var platform string
+ var requests int64
+ var tokens sql.NullInt64
+ if err := rows.Scan(&platform, &requests, &tokens); err != nil {
+ return nil, err
+ }
+ tokenConsumed := int64(0)
+ if tokens.Valid {
+ tokenConsumed = tokens.Int64
+ }
+ items = append(items, &service.OpsThroughputPlatformBreakdownItem{
+ Platform: platform,
+ RequestCount: requests,
+ TokenConsumed: tokenConsumed,
+ })
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return items, nil
+}
+
+func (r *opsRepository) getThroughputTopGroupsByPlatform(ctx context.Context, start, end time.Time, platform string, limit int) ([]*service.OpsThroughputGroupBreakdownItem, error) {
+ if strings.TrimSpace(platform) == "" {
+ return nil, nil
+ }
+ if limit <= 0 || limit > 100 {
+ limit = 10
+ }
+
+ q := `
+WITH usage_totals AS (
+ SELECT ul.group_id AS group_id,
+ g.name AS group_name,
+ COUNT(*) AS success_count,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
+ FROM usage_logs ul
+ JOIN groups g ON g.id = ul.group_id
+ WHERE ul.created_at >= $1 AND ul.created_at < $2
+ AND g.platform = $3
+ GROUP BY 1, 2
+),
+error_totals AS (
+ SELECT group_id,
+ COUNT(*) AS error_count
+ FROM ops_error_logs
+ WHERE created_at >= $1 AND created_at < $2
+ AND platform = $3
+ AND group_id IS NOT NULL
+ GROUP BY 1
+),
+combined AS (
+ SELECT COALESCE(u.group_id, e.group_id) AS group_id,
+ COALESCE(u.group_name, g2.name, '') AS group_name,
+ COALESCE(u.success_count, 0) AS success_count,
+ COALESCE(e.error_count, 0) AS error_count,
+ COALESCE(u.token_consumed, 0) AS token_consumed
+ FROM usage_totals u
+ FULL OUTER JOIN error_totals e ON u.group_id = e.group_id
+ LEFT JOIN groups g2 ON g2.id = COALESCE(u.group_id, e.group_id)
+)
+SELECT group_id, group_name, (success_count + error_count) AS request_count, token_consumed
+FROM combined
+WHERE group_id IS NOT NULL
+ORDER BY request_count DESC
+LIMIT $4`
+
+ rows, err := r.db.QueryContext(ctx, q, start, end, platform, limit)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ items := make([]*service.OpsThroughputGroupBreakdownItem, 0, limit)
+ for rows.Next() {
+ var groupID int64
+ var groupName sql.NullString
+ var requests int64
+ var tokens sql.NullInt64
+ if err := rows.Scan(&groupID, &groupName, &requests, &tokens); err != nil {
+ return nil, err
+ }
+ tokenConsumed := int64(0)
+ if tokens.Valid {
+ tokenConsumed = tokens.Int64
+ }
+ name := ""
+ if groupName.Valid {
+ name = groupName.String
+ }
+ items = append(items, &service.OpsThroughputGroupBreakdownItem{
+ GroupID: groupID,
+ GroupName: name,
+ RequestCount: requests,
+ TokenConsumed: tokenConsumed,
+ })
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return items, nil
+}
+
+func opsBucketExprForUsage(bucketSeconds int) string {
+ switch bucketSeconds {
+ case 3600:
+ return "date_trunc('hour', ul.created_at)"
+ case 300:
+ // 5-minute buckets in UTC.
+ return "to_timestamp(floor(extract(epoch from ul.created_at) / 300) * 300)"
+ default:
+ return "date_trunc('minute', ul.created_at)"
+ }
+}
+
+func opsBucketExprForError(bucketSeconds int) string {
+ switch bucketSeconds {
+ case 3600:
+ return "date_trunc('hour', created_at)"
+ case 300:
+ return "to_timestamp(floor(extract(epoch from created_at) / 300) * 300)"
+ default:
+ return "date_trunc('minute', created_at)"
+ }
+}
+
+func opsBucketLabel(bucketSeconds int) string {
+ if bucketSeconds <= 0 {
+ return "1m"
+ }
+ if bucketSeconds%3600 == 0 {
+ h := bucketSeconds / 3600
+ if h <= 0 {
+ h = 1
+ }
+ return fmt.Sprintf("%dh", h)
+ }
+ m := bucketSeconds / 60
+ if m <= 0 {
+ m = 1
+ }
+ return fmt.Sprintf("%dm", m)
+}
+
+func opsFloorToBucketStart(t time.Time, bucketSeconds int) time.Time {
+ t = t.UTC()
+ if bucketSeconds <= 0 {
+ bucketSeconds = 60
+ }
+ secs := t.Unix()
+ floored := secs - (secs % int64(bucketSeconds))
+ return time.Unix(floored, 0).UTC()
+}
+
+func fillOpsThroughputBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsThroughputTrendPoint) []*service.OpsThroughputTrendPoint {
+ if bucketSeconds <= 0 {
+ bucketSeconds = 60
+ }
+ if !start.Before(end) {
+ return points
+ }
+
+ endMinus := end.Add(-time.Nanosecond)
+ if endMinus.Before(start) {
+ return points
+ }
+
+ first := opsFloorToBucketStart(start, bucketSeconds)
+ last := opsFloorToBucketStart(endMinus, bucketSeconds)
+ step := time.Duration(bucketSeconds) * time.Second
+
+ existing := make(map[int64]*service.OpsThroughputTrendPoint, len(points))
+ for _, p := range points {
+ if p == nil {
+ continue
+ }
+ existing[p.BucketStart.UTC().Unix()] = p
+ }
+
+ out := make([]*service.OpsThroughputTrendPoint, 0, int(last.Sub(first)/step)+1)
+ for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
+ if p, ok := existing[cursor.Unix()]; ok && p != nil {
+ out = append(out, p)
+ continue
+ }
+ out = append(out, &service.OpsThroughputTrendPoint{
+ BucketStart: cursor,
+ RequestCount: 0,
+ TokenConsumed: 0,
+ QPS: 0,
+ TPS: 0,
+ })
+ }
+ return out
+}
+
+func (r *opsRepository) GetErrorTrend(ctx context.Context, filter *service.OpsDashboardFilter, bucketSeconds int) (*service.OpsErrorTrendResponse, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ if bucketSeconds <= 0 {
+ bucketSeconds = 60
+ }
+ if bucketSeconds != 60 && bucketSeconds != 300 && bucketSeconds != 3600 {
+ bucketSeconds = 60
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+ where, args, _ := buildErrorWhere(filter, start, end, 1)
+ bucketExpr := opsBucketExprForError(bucketSeconds)
+
+ q := `
+SELECT
+ ` + bucketExpr + ` AS bucket,
+ COUNT(*) AS error_total,
+ COUNT(*) FILTER (WHERE is_business_limited) AS business_limited,
+ COUNT(*) FILTER (WHERE NOT is_business_limited) AS error_sla,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)) AS upstream_excl,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429) AS upstream_429,
+ COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529) AS upstream_529
+FROM ops_error_logs
+` + where + `
+GROUP BY 1
+ORDER BY 1 ASC`
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ points := make([]*service.OpsErrorTrendPoint, 0, 256)
+ for rows.Next() {
+ var bucket time.Time
+ var total, businessLimited, sla, upstreamExcl, upstream429, upstream529 int64
+ if err := rows.Scan(&bucket, &total, &businessLimited, &sla, &upstreamExcl, &upstream429, &upstream529); err != nil {
+ return nil, err
+ }
+ points = append(points, &service.OpsErrorTrendPoint{
+ BucketStart: bucket.UTC(),
+
+ ErrorCountTotal: total,
+ BusinessLimitedCount: businessLimited,
+ ErrorCountSLA: sla,
+
+ UpstreamErrorCountExcl429529: upstreamExcl,
+ Upstream429Count: upstream429,
+ Upstream529Count: upstream529,
+ })
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+
+ points = fillOpsErrorTrendBuckets(start, end, bucketSeconds, points)
+
+ return &service.OpsErrorTrendResponse{
+ Bucket: opsBucketLabel(bucketSeconds),
+ Points: points,
+ }, nil
+}
+
+func fillOpsErrorTrendBuckets(start, end time.Time, bucketSeconds int, points []*service.OpsErrorTrendPoint) []*service.OpsErrorTrendPoint {
+ if bucketSeconds <= 0 {
+ bucketSeconds = 60
+ }
+ if !start.Before(end) {
+ return points
+ }
+
+ endMinus := end.Add(-time.Nanosecond)
+ if endMinus.Before(start) {
+ return points
+ }
+
+ first := opsFloorToBucketStart(start, bucketSeconds)
+ last := opsFloorToBucketStart(endMinus, bucketSeconds)
+ step := time.Duration(bucketSeconds) * time.Second
+
+ existing := make(map[int64]*service.OpsErrorTrendPoint, len(points))
+ for _, p := range points {
+ if p == nil {
+ continue
+ }
+ existing[p.BucketStart.UTC().Unix()] = p
+ }
+
+ out := make([]*service.OpsErrorTrendPoint, 0, int(last.Sub(first)/step)+1)
+ for cursor := first; !cursor.After(last); cursor = cursor.Add(step) {
+ if p, ok := existing[cursor.Unix()]; ok && p != nil {
+ out = append(out, p)
+ continue
+ }
+ out = append(out, &service.OpsErrorTrendPoint{
+ BucketStart: cursor,
+
+ ErrorCountTotal: 0,
+ BusinessLimitedCount: 0,
+ ErrorCountSLA: 0,
+
+ UpstreamErrorCountExcl429529: 0,
+ Upstream429Count: 0,
+ Upstream529Count: 0,
+ })
+ }
+ return out
+}
+
+func (r *opsRepository) GetErrorDistribution(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsErrorDistributionResponse, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+ where, args, _ := buildErrorWhere(filter, start, end, 1)
+
+ q := `
+SELECT
+ COALESCE(status_code, 0) AS status_code,
+ COUNT(*) AS total,
+ COUNT(*) FILTER (WHERE NOT is_business_limited) AS sla,
+ COUNT(*) FILTER (WHERE is_business_limited) AS business_limited
+FROM ops_error_logs
+` + where + `
+GROUP BY 1
+ORDER BY total DESC
+LIMIT 20`
+
+ rows, err := r.db.QueryContext(ctx, q, args...)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ items := make([]*service.OpsErrorDistributionItem, 0, 16)
+ var total int64
+ for rows.Next() {
+ var statusCode int
+ var cntTotal, cntSLA, cntBiz int64
+ if err := rows.Scan(&statusCode, &cntTotal, &cntSLA, &cntBiz); err != nil {
+ return nil, err
+ }
+ total += cntTotal
+ items = append(items, &service.OpsErrorDistributionItem{
+ StatusCode: statusCode,
+ Total: cntTotal,
+ SLA: cntSLA,
+ BusinessLimited: cntBiz,
+ })
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+
+ return &service.OpsErrorDistributionResponse{
+ Total: total,
+ Items: items,
+ }, nil
+}
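
The pure bucket helpers at the bottom of this file are easy to pin down; the expectations below are derived from the code above (bucket starts are aligned to the Unix epoch, so 5-minute buckets land on :00/:05/:10 and so on).

package repository

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

func TestOpsBucketHelpers(t *testing.T) {
	require.Equal(t, "1m", opsBucketLabel(60))
	require.Equal(t, "5m", opsBucketLabel(300))
	require.Equal(t, "1h", opsBucketLabel(3600))
	require.Equal(t, "1m", opsBucketLabel(0)) // non-positive input falls back to 1m

	ts := time.Date(2026, 1, 9, 12, 7, 42, 0, time.UTC)
	require.True(t, opsFloorToBucketStart(ts, 300).Equal(time.Date(2026, 1, 9, 12, 5, 0, 0, time.UTC)))
	require.True(t, opsFloorToBucketStart(ts, 3600).Equal(time.Date(2026, 1, 9, 12, 0, 0, 0, time.UTC)))
}
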
diff --git a/backend/internal/repository/ops_repo_window_stats.go b/backend/internal/repository/ops_repo_window_stats.go
new file mode 100644
index 00000000..8221c473
--- /dev/null
+++ b/backend/internal/repository/ops_repo_window_stats.go
@@ -0,0 +1,50 @@
+package repository
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+)
+
+func (r *opsRepository) GetWindowStats(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsWindowStats, error) {
+ if r == nil || r.db == nil {
+ return nil, fmt.Errorf("nil ops repository")
+ }
+ if filter == nil {
+ return nil, fmt.Errorf("nil filter")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, fmt.Errorf("start_time/end_time required")
+ }
+
+ start := filter.StartTime.UTC()
+ end := filter.EndTime.UTC()
+ if start.After(end) {
+ return nil, fmt.Errorf("start_time must be <= end_time")
+ }
+ // Bound excessively large windows to prevent accidental heavy queries.
+ if end.Sub(start) > 24*time.Hour {
+ return nil, fmt.Errorf("window too large")
+ }
+
+ successCount, tokenConsumed, err := r.queryUsageCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ errorTotal, _, _, _, _, _, err := r.queryErrorCounts(ctx, filter, start, end)
+ if err != nil {
+ return nil, err
+ }
+
+ return &service.OpsWindowStats{
+ StartTime: start,
+ EndTime: end,
+
+ SuccessCount: successCount,
+ ErrorCountTotal: errorTotal,
+ TokenConsumed: tokenConsumed,
+ }, nil
+}
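
A small follow-up sketch (package repository) of how a caller could derive an error ratio from the window stats returned above, reusing the rounding helpers defined earlier in this patch; only the SuccessCount and ErrorCountTotal fields shown in the struct literal above are assumed.

func windowErrorRate(ws *service.OpsWindowStats) float64 {
	if ws == nil {
		return 0
	}
	total := float64(ws.SuccessCount) + float64(ws.ErrorCountTotal)
	// Returns 0 for an empty window; otherwise a ratio rounded to four decimal places.
	return roundTo4DP(safeDivideFloat64(float64(ws.ErrorCountTotal), total))
}
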
diff --git a/backend/internal/repository/wire.go b/backend/internal/repository/wire.go
index f7574563..315bc1b6 100644
--- a/backend/internal/repository/wire.go
+++ b/backend/internal/repository/wire.go
@@ -35,6 +35,7 @@ var ProviderSet = wire.NewSet(
NewRedeemCodeRepository,
NewUsageLogRepository,
NewSettingRepository,
+ NewOpsRepository,
NewUserSubscriptionRepository,
NewUserAttributeDefinitionRepository,
NewUserAttributeValueRepository,
From 5baa8b5673e77a5ebb3453094c8b13bef83e71f6 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:53:44 +0800
Subject: [PATCH 04/53] feat(service): implement the ops monitoring business logic layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add the main ops service (ops_service.go) and port definitions (ops_port.go)
- Implement the account availability check service (ops_account_availability.go)
- Implement the data aggregation service (ops_aggregation_service.go)
- Implement the alert evaluation service (ops_alert_evaluator_service.go)
- Implement the alert management service (ops_alerts.go)
- Implement the data cleanup service (ops_cleanup_service.go)
- Implement the concurrency control service (ops_concurrency.go)
- Implement the dashboard service (ops_dashboard.go)
- Implement the error handling service (ops_errors.go)
- Implement the histogram service (ops_histograms.go)
- Implement the metrics collection service (ops_metrics_collector.go)
- Implement the query mode service (ops_query_mode.go)
- Implement the realtime monitoring service (ops_realtime.go)
- Implement the request detail service (ops_request_details.go)
- Implement the retry mechanism service (ops_retry.go)
- Implement the settings management service (ops_settings.go)
- Implement the trend analysis service (ops_trends.go)
- Implement the window statistics service (ops_window_stats.go)
- Add ops-related domain constants
- Register ops services for dependency injection
---
backend/internal/service/domain_constants.go | 22 +
.../service/ops_account_availability.go | 157 ++++
.../service/ops_aggregation_service.go | 434 +++++++++
.../service/ops_alert_evaluator_service.go | 839 +++++++++++++++++
backend/internal/service/ops_alerts.go | 162 ++++
.../internal/service/ops_cleanup_service.go | 361 ++++++++
backend/internal/service/ops_concurrency.go | 257 ++++++
backend/internal/service/ops_dashboard.go | 77 ++
backend/internal/service/ops_errors.go | 45 +
backend/internal/service/ops_histograms.go | 26 +
.../internal/service/ops_metrics_collector.go | 861 ++++++++++++++++++
backend/internal/service/ops_port.go | 226 +++++
backend/internal/service/ops_query_mode.go | 40 +
backend/internal/service/ops_realtime.go | 36 +
.../internal/service/ops_request_details.go | 152 ++++
backend/internal/service/ops_retry.go | 635 +++++++++++++
backend/internal/service/ops_service.go | 451 +++++++++
backend/internal/service/ops_settings.go | 354 +++++++
backend/internal/service/ops_trends.go | 27 +
backend/internal/service/ops_window_stats.go | 24 +
backend/internal/service/wire.go | 58 ++
21 files changed, 5244 insertions(+)
create mode 100644 backend/internal/service/ops_account_availability.go
create mode 100644 backend/internal/service/ops_aggregation_service.go
create mode 100644 backend/internal/service/ops_alert_evaluator_service.go
create mode 100644 backend/internal/service/ops_alerts.go
create mode 100644 backend/internal/service/ops_cleanup_service.go
create mode 100644 backend/internal/service/ops_concurrency.go
create mode 100644 backend/internal/service/ops_dashboard.go
create mode 100644 backend/internal/service/ops_errors.go
create mode 100644 backend/internal/service/ops_histograms.go
create mode 100644 backend/internal/service/ops_metrics_collector.go
create mode 100644 backend/internal/service/ops_port.go
create mode 100644 backend/internal/service/ops_query_mode.go
create mode 100644 backend/internal/service/ops_realtime.go
create mode 100644 backend/internal/service/ops_request_details.go
create mode 100644 backend/internal/service/ops_retry.go
create mode 100644 backend/internal/service/ops_service.go
create mode 100644 backend/internal/service/ops_settings.go
create mode 100644 backend/internal/service/ops_trends.go
create mode 100644 backend/internal/service/ops_window_stats.go
diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go
index 9c61ea2e..04f80dbe 100644
--- a/backend/internal/service/domain_constants.go
+++ b/backend/internal/service/domain_constants.go
@@ -105,6 +105,28 @@ const (
// Request identity patch (Claude -> Gemini systemInstruction injection)
SettingKeyEnableIdentityPatch = "enable_identity_patch"
SettingKeyIdentityPatchPrompt = "identity_patch_prompt"
+
+ // =========================
+ // Ops Monitoring (vNext)
+ // =========================
+
+ // SettingKeyOpsMonitoringEnabled is a DB-backed soft switch to enable/disable ops module at runtime.
+ SettingKeyOpsMonitoringEnabled = "ops_monitoring_enabled"
+
+ // SettingKeyOpsRealtimeMonitoringEnabled controls realtime features (e.g. WS/QPS push).
+ SettingKeyOpsRealtimeMonitoringEnabled = "ops_realtime_monitoring_enabled"
+
+ // SettingKeyOpsQueryModeDefault controls the default query mode for ops dashboard (auto/raw/preagg).
+ SettingKeyOpsQueryModeDefault = "ops_query_mode_default"
+
+ // SettingKeyOpsEmailNotificationConfig stores JSON config for ops email notifications.
+ SettingKeyOpsEmailNotificationConfig = "ops_email_notification_config"
+
+ // SettingKeyOpsAlertRuntimeSettings stores JSON config for ops alert evaluator runtime settings.
+ SettingKeyOpsAlertRuntimeSettings = "ops_alert_runtime_settings"
+
+ // SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
+ SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
)
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
diff --git a/backend/internal/service/ops_account_availability.go b/backend/internal/service/ops_account_availability.go
new file mode 100644
index 00000000..d0cbbe5c
--- /dev/null
+++ b/backend/internal/service/ops_account_availability.go
@@ -0,0 +1,157 @@
+package service
+
+import (
+ "context"
+ "time"
+)
+
+// GetAccountAvailabilityStats returns current account availability stats.
+//
+// Query-level filtering is intentionally limited to platform/group to match the dashboard scope.
+func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFilter string, groupIDFilter *int64) (
+ map[string]*PlatformAvailability,
+ map[int64]*GroupAvailability,
+ map[int64]*AccountAvailability,
+ *time.Time,
+ error,
+) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
+ if err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ if groupIDFilter != nil && *groupIDFilter > 0 {
+ filtered := make([]Account, 0, len(accounts))
+ for _, acc := range accounts {
+ for _, grp := range acc.Groups {
+ if grp != nil && grp.ID == *groupIDFilter {
+ filtered = append(filtered, acc)
+ break
+ }
+ }
+ }
+ accounts = filtered
+ }
+
+ now := time.Now()
+ collectedAt := now
+
+ platform := make(map[string]*PlatformAvailability)
+ group := make(map[int64]*GroupAvailability)
+ account := make(map[int64]*AccountAvailability)
+
+ for _, acc := range accounts {
+ if acc.ID <= 0 {
+ continue
+ }
+
+ isTempUnsched := false
+ if acc.TempUnschedulableUntil != nil && now.Before(*acc.TempUnschedulableUntil) {
+ isTempUnsched = true
+ }
+
+ isRateLimited := acc.RateLimitResetAt != nil && now.Before(*acc.RateLimitResetAt)
+ isOverloaded := acc.OverloadUntil != nil && now.Before(*acc.OverloadUntil)
+ hasError := acc.Status == StatusError
+
+ // Normalize exclusive status flags so the UI doesn't show conflicting badges.
+ if hasError {
+ isRateLimited = false
+ isOverloaded = false
+ }
+
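+		// An account counts as available only when it is active, schedulable, and not rate-limited, overloaded, or temporarily unschedulable.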
+ isAvailable := acc.Status == StatusActive && acc.Schedulable && !isRateLimited && !isOverloaded && !isTempUnsched
+
+ if acc.Platform != "" {
+ if _, ok := platform[acc.Platform]; !ok {
+ platform[acc.Platform] = &PlatformAvailability{
+ Platform: acc.Platform,
+ }
+ }
+ p := platform[acc.Platform]
+ p.TotalAccounts++
+ if isAvailable {
+ p.AvailableCount++
+ }
+ if isRateLimited {
+ p.RateLimitCount++
+ }
+ if hasError {
+ p.ErrorCount++
+ }
+ }
+
+ for _, grp := range acc.Groups {
+ if grp == nil || grp.ID <= 0 {
+ continue
+ }
+ if _, ok := group[grp.ID]; !ok {
+ group[grp.ID] = &GroupAvailability{
+ GroupID: grp.ID,
+ GroupName: grp.Name,
+ Platform: grp.Platform,
+ }
+ }
+ g := group[grp.ID]
+ g.TotalAccounts++
+ if isAvailable {
+ g.AvailableCount++
+ }
+ if isRateLimited {
+ g.RateLimitCount++
+ }
+ if hasError {
+ g.ErrorCount++
+ }
+ }
+
+ displayGroupID := int64(0)
+ displayGroupName := ""
+ if len(acc.Groups) > 0 && acc.Groups[0] != nil {
+ displayGroupID = acc.Groups[0].ID
+ displayGroupName = acc.Groups[0].Name
+ }
+
+ item := &AccountAvailability{
+ AccountID: acc.ID,
+ AccountName: acc.Name,
+ Platform: acc.Platform,
+ GroupID: displayGroupID,
+ GroupName: displayGroupName,
+ Status: acc.Status,
+
+ IsAvailable: isAvailable,
+ IsRateLimited: isRateLimited,
+ IsOverloaded: isOverloaded,
+ HasError: hasError,
+
+ ErrorMessage: acc.ErrorMessage,
+ }
+
+ if isRateLimited && acc.RateLimitResetAt != nil {
+ item.RateLimitResetAt = acc.RateLimitResetAt
+ remainingSec := int64(time.Until(*acc.RateLimitResetAt).Seconds())
+ if remainingSec > 0 {
+ item.RateLimitRemainingSec = &remainingSec
+ }
+ }
+ if isOverloaded && acc.OverloadUntil != nil {
+ item.OverloadUntil = acc.OverloadUntil
+ remainingSec := int64(time.Until(*acc.OverloadUntil).Seconds())
+ if remainingSec > 0 {
+ item.OverloadRemainingSec = &remainingSec
+ }
+ }
+ if isTempUnsched && acc.TempUnschedulableUntil != nil {
+ item.TempUnschedulableUntil = acc.TempUnschedulableUntil
+ }
+
+ account[acc.ID] = item
+ }
+
+ return platform, group, account, &collectedAt, nil
+}
diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go
new file mode 100644
index 00000000..04dbb11b
--- /dev/null
+++ b/backend/internal/service/ops_aggregation_service.go
@@ -0,0 +1,434 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "log"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ "github.com/google/uuid"
+ "github.com/redis/go-redis/v9"
+)
+
+const (
+ opsAggHourlyJobName = "ops_preaggregation_hourly"
+ opsAggDailyJobName = "ops_preaggregation_daily"
+
+ opsAggHourlyInterval = 10 * time.Minute
+ opsAggDailyInterval = 1 * time.Hour
+
+ // Keep in sync with ops retention target (vNext default 30d).
+ opsAggBackfillWindow = 30 * 24 * time.Hour
+
+ // Recompute overlap to absorb late-arriving rows near boundaries.
+ opsAggHourlyOverlap = 2 * time.Hour
+ opsAggDailyOverlap = 48 * time.Hour
+
+ opsAggHourlyChunk = 24 * time.Hour
+ opsAggDailyChunk = 7 * 24 * time.Hour
+
+ // Delay around boundaries (e.g. 10:00..10:05) to avoid aggregating buckets
+ // that may still receive late inserts.
+ opsAggSafeDelay = 5 * time.Minute
+
+ opsAggMaxQueryTimeout = 3 * time.Second
+ opsAggHourlyTimeout = 5 * time.Minute
+ opsAggDailyTimeout = 2 * time.Minute
+
+ opsAggHourlyLeaderLockKey = "ops:aggregation:hourly:leader"
+ opsAggDailyLeaderLockKey = "ops:aggregation:daily:leader"
+
+ opsAggHourlyLeaderLockTTL = 15 * time.Minute
+ opsAggDailyLeaderLockTTL = 10 * time.Minute
+)
+
+// OpsAggregationService periodically backfills ops_metrics_hourly / ops_metrics_daily
+// for stable long-window dashboard queries.
+//
+// It is safe to run in multi-replica deployments when Redis is available (leader lock).
+type OpsAggregationService struct {
+ opsRepo OpsRepository
+ settingRepo SettingRepository
+ cfg *config.Config
+
+ db *sql.DB
+ redisClient *redis.Client
+ instanceID string
+
+ stopCh chan struct{}
+ startOnce sync.Once
+ stopOnce sync.Once
+
+ hourlyMu sync.Mutex
+ dailyMu sync.Mutex
+
+ skipLogMu sync.Mutex
+ skipLogAt time.Time
+}
+
+func NewOpsAggregationService(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsAggregationService {
+ return &OpsAggregationService{
+ opsRepo: opsRepo,
+ settingRepo: settingRepo,
+ cfg: cfg,
+ db: db,
+ redisClient: redisClient,
+ instanceID: uuid.NewString(),
+ }
+}
+
+func (s *OpsAggregationService) Start() {
+ if s == nil {
+ return
+ }
+ s.startOnce.Do(func() {
+ if s.stopCh == nil {
+ s.stopCh = make(chan struct{})
+ }
+ go s.hourlyLoop()
+ go s.dailyLoop()
+ })
+}
+
+func (s *OpsAggregationService) Stop() {
+ if s == nil {
+ return
+ }
+ s.stopOnce.Do(func() {
+ if s.stopCh != nil {
+ close(s.stopCh)
+ }
+ })
+}
+
+func (s *OpsAggregationService) hourlyLoop() {
+ // First run immediately.
+ s.aggregateHourly()
+
+ ticker := time.NewTicker(opsAggHourlyInterval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ s.aggregateHourly()
+ case <-s.stopCh:
+ return
+ }
+ }
+}
+
+func (s *OpsAggregationService) dailyLoop() {
+ // First run immediately.
+ s.aggregateDaily()
+
+ ticker := time.NewTicker(opsAggDailyInterval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ s.aggregateDaily()
+ case <-s.stopCh:
+ return
+ }
+ }
+}
+
+func (s *OpsAggregationService) aggregateHourly() {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ if s.cfg != nil {
+ if !s.cfg.Ops.Enabled {
+ return
+ }
+ if !s.cfg.Ops.Aggregation.Enabled {
+ return
+ }
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), opsAggHourlyTimeout)
+ defer cancel()
+
+ if !s.isMonitoringEnabled(ctx) {
+ return
+ }
+
+ release, ok := s.tryAcquireLeaderLock(ctx, opsAggHourlyLeaderLockKey, opsAggHourlyLeaderLockTTL, "[OpsAggregation][hourly]")
+ if !ok {
+ return
+ }
+ if release != nil {
+ defer release()
+ }
+
+ s.hourlyMu.Lock()
+ defer s.hourlyMu.Unlock()
+
+ startedAt := time.Now().UTC()
+ runAt := startedAt
+
+ // Aggregate stable full hours only.
+ end := utcFloorToHour(time.Now().UTC().Add(-opsAggSafeDelay))
+ start := end.Add(-opsAggBackfillWindow)
+
+ // Resume from the latest bucket with overlap.
+ {
+ ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
+ latest, ok, err := s.opsRepo.GetLatestHourlyBucketStart(ctxMax)
+ cancelMax()
+ if err != nil {
+ log.Printf("[OpsAggregation][hourly] failed to read latest bucket: %v", err)
+ } else if ok {
+ candidate := latest.Add(-opsAggHourlyOverlap)
+ if candidate.After(start) {
+ start = candidate
+ }
+ }
+ }
+
+ start = utcFloorToHour(start)
+ if !start.Before(end) {
+ return
+ }
+
+ var aggErr error
+ for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggHourlyChunk) {
+ chunkEnd := minTime(cursor.Add(opsAggHourlyChunk), end)
+ if err := s.opsRepo.UpsertHourlyMetrics(ctx, cursor, chunkEnd); err != nil {
+ aggErr = err
+ log.Printf("[OpsAggregation][hourly] upsert failed (%s..%s): %v", cursor.Format(time.RFC3339), chunkEnd.Format(time.RFC3339), err)
+ break
+ }
+ }
+
+ finishedAt := time.Now().UTC()
+	dur := finishedAt.Sub(startedAt).Milliseconds()
+
+ if aggErr != nil {
+ msg := truncateString(aggErr.Error(), 2048)
+ errAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer hbCancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAggHourlyJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &errAt,
+ LastError: &msg,
+ LastDurationMs: &dur,
+ })
+ return
+ }
+
+ successAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer hbCancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAggHourlyJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &successAt,
+ LastDurationMs: &dur,
+ })
+}
+
+func (s *OpsAggregationService) aggregateDaily() {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ if s.cfg != nil {
+ if !s.cfg.Ops.Enabled {
+ return
+ }
+ if !s.cfg.Ops.Aggregation.Enabled {
+ return
+ }
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), opsAggDailyTimeout)
+ defer cancel()
+
+ if !s.isMonitoringEnabled(ctx) {
+ return
+ }
+
+ release, ok := s.tryAcquireLeaderLock(ctx, opsAggDailyLeaderLockKey, opsAggDailyLeaderLockTTL, "[OpsAggregation][daily]")
+ if !ok {
+ return
+ }
+ if release != nil {
+ defer release()
+ }
+
+ s.dailyMu.Lock()
+ defer s.dailyMu.Unlock()
+
+ startedAt := time.Now().UTC()
+ runAt := startedAt
+
+ end := utcFloorToDay(time.Now().UTC())
+ start := end.Add(-opsAggBackfillWindow)
+
+ {
+ ctxMax, cancelMax := context.WithTimeout(context.Background(), opsAggMaxQueryTimeout)
+ latest, ok, err := s.opsRepo.GetLatestDailyBucketDate(ctxMax)
+ cancelMax()
+ if err != nil {
+ log.Printf("[OpsAggregation][daily] failed to read latest bucket: %v", err)
+ } else if ok {
+ candidate := latest.Add(-opsAggDailyOverlap)
+ if candidate.After(start) {
+ start = candidate
+ }
+ }
+ }
+
+ start = utcFloorToDay(start)
+ if !start.Before(end) {
+ return
+ }
+
+ var aggErr error
+ for cursor := start; cursor.Before(end); cursor = cursor.Add(opsAggDailyChunk) {
+ chunkEnd := minTime(cursor.Add(opsAggDailyChunk), end)
+ if err := s.opsRepo.UpsertDailyMetrics(ctx, cursor, chunkEnd); err != nil {
+ aggErr = err
+ log.Printf("[OpsAggregation][daily] upsert failed (%s..%s): %v", cursor.Format("2006-01-02"), chunkEnd.Format("2006-01-02"), err)
+ break
+ }
+ }
+
+ finishedAt := time.Now().UTC()
+	dur := finishedAt.Sub(startedAt).Milliseconds()
+
+ if aggErr != nil {
+ msg := truncateString(aggErr.Error(), 2048)
+ errAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer hbCancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAggDailyJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &errAt,
+ LastError: &msg,
+ LastDurationMs: &dur,
+ })
+ return
+ }
+
+ successAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer hbCancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAggDailyJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &successAt,
+ LastDurationMs: &dur,
+ })
+}
+
+func (s *OpsAggregationService) isMonitoringEnabled(ctx context.Context) bool {
+ if s == nil {
+ return false
+ }
+ if s.cfg != nil && !s.cfg.Ops.Enabled {
+ return false
+ }
+ if s.settingRepo == nil {
+ return true
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
+	if err != nil {
+		if errors.Is(err, ErrSettingNotFound) {
+			// Setting not configured yet: treat as enabled.
+			return true
+		}
+		// Fail-open on other read errors so a transient settings failure never pauses aggregation.
+		return true
+	}
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return false
+ default:
+ return true
+ }
+}
+
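+// opsAggReleaseScript deletes the leader lock key only if it is still owned by this instance (compare-and-delete).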
+var opsAggReleaseScript = redis.NewScript(`
+if redis.call("GET", KEYS[1]) == ARGV[1] then
+ return redis.call("DEL", KEYS[1])
+end
+return 0
+`)
+
+func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
+ if s == nil || s.redisClient == nil {
+ return nil, true
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
+ if err != nil {
+ // Fail-open: do not block single-instance deployments.
+ return nil, true
+ }
+ if !ok {
+ s.maybeLogSkip(logPrefix)
+ return nil, false
+ }
+
+ release := func() {
+ ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
+ }
+ return release, true
+}
+
+func (s *OpsAggregationService) maybeLogSkip(prefix string) {
+ s.skipLogMu.Lock()
+ defer s.skipLogMu.Unlock()
+
+ now := time.Now()
+ if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < time.Minute {
+ return
+ }
+ s.skipLogAt = now
+ if prefix == "" {
+ prefix = "[OpsAggregation]"
+ }
+ log.Printf("%s leader lock held by another instance; skipping", prefix)
+}
+
+func utcFloorToHour(t time.Time) time.Time {
+ return t.UTC().Truncate(time.Hour)
+}
+
+func utcFloorToDay(t time.Time) time.Time {
+ u := t.UTC()
+ y, m, d := u.Date()
+ return time.Date(y, m, d, 0, 0, 0, 0, time.UTC)
+}
+
+func minTime(a, b time.Time) time.Time {
+ if a.Before(b) {
+ return a
+ }
+ return b
+}
diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go
new file mode 100644
index 00000000..b970c720
--- /dev/null
+++ b/backend/internal/service/ops_alert_evaluator_service.go
@@ -0,0 +1,839 @@
+package service
+
+import (
+ "context"
+ "fmt"
+ "log"
+ "math"
+ "strconv"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ "github.com/google/uuid"
+ "github.com/redis/go-redis/v9"
+)
+
+const (
+ opsAlertEvaluatorJobName = "ops_alert_evaluator"
+
+ opsAlertEvaluatorTimeout = 45 * time.Second
+ opsAlertEvaluatorLeaderLockKey = "ops:alert:evaluator:leader"
+ opsAlertEvaluatorLeaderLockTTL = 90 * time.Second
+ opsAlertEvaluatorSkipLogInterval = 1 * time.Minute
+)
+
+var opsAlertEvaluatorReleaseScript = redis.NewScript(`
+if redis.call("GET", KEYS[1]) == ARGV[1] then
+ return redis.call("DEL", KEYS[1])
+end
+return 0
+`)
+
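+// OpsAlertEvaluatorService periodically evaluates enabled alert rules against recent metrics,
+// opens/resolves alert events, and sends rate-limited email notifications when configured.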
+type OpsAlertEvaluatorService struct {
+ opsService *OpsService
+ opsRepo OpsRepository
+ emailService *EmailService
+
+ redisClient *redis.Client
+ cfg *config.Config
+ instanceID string
+
+ stopCh chan struct{}
+ startOnce sync.Once
+ stopOnce sync.Once
+ wg sync.WaitGroup
+
+ mu sync.Mutex
+ ruleStates map[int64]*opsAlertRuleState
+
+ emailLimiter *slidingWindowLimiter
+
+ skipLogMu sync.Mutex
+ skipLogAt time.Time
+
+ warnNoRedisOnce sync.Once
+}
+
+type opsAlertRuleState struct {
+ LastEvaluatedAt time.Time
+ ConsecutiveBreaches int
+}
+
+func NewOpsAlertEvaluatorService(
+ opsService *OpsService,
+ opsRepo OpsRepository,
+ emailService *EmailService,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsAlertEvaluatorService {
+ return &OpsAlertEvaluatorService{
+ opsService: opsService,
+ opsRepo: opsRepo,
+ emailService: emailService,
+ redisClient: redisClient,
+ cfg: cfg,
+ instanceID: uuid.NewString(),
+ ruleStates: map[int64]*opsAlertRuleState{},
+ emailLimiter: newSlidingWindowLimiter(0, time.Hour),
+ }
+}
+
+func (s *OpsAlertEvaluatorService) Start() {
+ if s == nil {
+ return
+ }
+ s.startOnce.Do(func() {
+ if s.stopCh == nil {
+ s.stopCh = make(chan struct{})
+ }
+		// Add to the WaitGroup before launching so Stop's Wait cannot race the goroutine start.
+		s.wg.Add(1)
+		go s.run()
+ })
+}
+
+func (s *OpsAlertEvaluatorService) Stop() {
+ if s == nil {
+ return
+ }
+ s.stopOnce.Do(func() {
+ if s.stopCh != nil {
+ close(s.stopCh)
+ }
+ })
+ s.wg.Wait()
+}
+
+func (s *OpsAlertEvaluatorService) run() {
+	defer s.wg.Done()
+
+	// Start immediately to produce early feedback in the ops dashboard.
+ timer := time.NewTimer(0)
+ defer timer.Stop()
+
+ for {
+ select {
+ case <-timer.C:
+ interval := s.getInterval()
+ s.evaluateOnce(interval)
+ timer.Reset(interval)
+ case <-s.stopCh:
+ return
+ }
+ }
+}
+
+func (s *OpsAlertEvaluatorService) getInterval() time.Duration {
+ // Default.
+ interval := 60 * time.Second
+
+ if s == nil || s.opsService == nil {
+ return interval
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+
+ cfg, err := s.opsService.GetOpsAlertRuntimeSettings(ctx)
+ if err != nil || cfg == nil {
+ return interval
+ }
+	if cfg.EvaluationIntervalSeconds < 1 {
+		return interval
+	}
+ if cfg.EvaluationIntervalSeconds > int((24 * time.Hour).Seconds()) {
+ return interval
+ }
+ return time.Duration(cfg.EvaluationIntervalSeconds) * time.Second
+}
+
+func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ if s.cfg != nil && !s.cfg.Ops.Enabled {
+ return
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), opsAlertEvaluatorTimeout)
+ defer cancel()
+
+ if s.opsService != nil && !s.opsService.IsMonitoringEnabled(ctx) {
+ return
+ }
+
+ runtimeCfg := defaultOpsAlertRuntimeSettings()
+ if s.opsService != nil {
+ if loaded, err := s.opsService.GetOpsAlertRuntimeSettings(ctx); err == nil && loaded != nil {
+ runtimeCfg = loaded
+ }
+ }
+
+ release, ok := s.tryAcquireLeaderLock(ctx, runtimeCfg.DistributedLock)
+ if !ok {
+ return
+ }
+ if release != nil {
+ defer release()
+ }
+
+ startedAt := time.Now().UTC()
+ runAt := startedAt
+
+ rules, err := s.opsRepo.ListAlertRules(ctx)
+ if err != nil {
+ s.recordHeartbeatError(runAt, time.Since(startedAt), err)
+ log.Printf("[OpsAlertEvaluator] list rules failed: %v", err)
+ return
+ }
+
+ now := time.Now().UTC()
+ safeEnd := now.Truncate(time.Minute)
+ if safeEnd.IsZero() {
+ safeEnd = now
+ }
+
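+	// Best-effort: system-level metrics are only needed for cpu/memory/queue-depth rules, so errors are ignored here.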
+ systemMetrics, _ := s.opsRepo.GetLatestSystemMetrics(ctx, 1)
+
+ // Cleanup stale state for removed rules.
+ s.pruneRuleStates(rules)
+
+ for _, rule := range rules {
+ if rule == nil || !rule.Enabled || rule.ID <= 0 {
+ continue
+ }
+
+ scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
+
+ windowMinutes := rule.WindowMinutes
+ if windowMinutes <= 0 {
+ windowMinutes = 1
+ }
+ windowStart := safeEnd.Add(-time.Duration(windowMinutes) * time.Minute)
+ windowEnd := safeEnd
+
+ metricValue, ok := s.computeRuleMetric(ctx, rule, systemMetrics, windowStart, windowEnd, scopePlatform, scopeGroupID)
+ if !ok {
+ s.resetRuleState(rule.ID, now)
+ continue
+ }
+
+ breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
+ required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
+ consecutive := s.updateRuleBreaches(rule.ID, now, interval, breachedNow)
+
+ activeEvent, err := s.opsRepo.GetActiveAlertEvent(ctx, rule.ID)
+ if err != nil {
+ log.Printf("[OpsAlertEvaluator] get active event failed (rule=%d): %v", rule.ID, err)
+ continue
+ }
+
+ if breachedNow && consecutive >= required {
+ if activeEvent != nil {
+ continue
+ }
+
+ latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
+ if err != nil {
+ log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)
+ continue
+ }
+ if latestEvent != nil && rule.CooldownMinutes > 0 {
+ cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
+ if now.Sub(latestEvent.FiredAt) < cooldown {
+ continue
+ }
+ }
+
+ firedEvent := &OpsAlertEvent{
+ RuleID: rule.ID,
+ Severity: strings.TrimSpace(rule.Severity),
+ Status: OpsAlertStatusFiring,
+ Title: fmt.Sprintf("%s: %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name)),
+ Description: buildOpsAlertDescription(rule, metricValue, windowMinutes, scopePlatform, scopeGroupID),
+ MetricValue: float64Ptr(metricValue),
+ ThresholdValue: float64Ptr(rule.Threshold),
+ Dimensions: buildOpsAlertDimensions(scopePlatform, scopeGroupID),
+ FiredAt: now,
+ CreatedAt: now,
+ }
+
+ created, err := s.opsRepo.CreateAlertEvent(ctx, firedEvent)
+ if err != nil {
+ log.Printf("[OpsAlertEvaluator] create event failed (rule=%d): %v", rule.ID, err)
+ continue
+ }
+
+ if created != nil && created.ID > 0 {
+ s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
+ }
+ continue
+ }
+
+ // Not breached: resolve active event if present.
+ if activeEvent != nil {
+ resolvedAt := now
+ if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
+ log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
+ }
+ }
+ }
+
+ s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
+}
+
+func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ live := map[int64]struct{}{}
+ for _, r := range rules {
+ if r != nil && r.ID > 0 {
+ live[r.ID] = struct{}{}
+ }
+ }
+ for id := range s.ruleStates {
+ if _, ok := live[id]; !ok {
+ delete(s.ruleStates, id)
+ }
+ }
+}
+
+func (s *OpsAlertEvaluatorService) resetRuleState(ruleID int64, now time.Time) {
+ if ruleID <= 0 {
+ return
+ }
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ state, ok := s.ruleStates[ruleID]
+ if !ok {
+ state = &opsAlertRuleState{}
+ s.ruleStates[ruleID] = state
+ }
+ state.LastEvaluatedAt = now
+ state.ConsecutiveBreaches = 0
+}
+
+func (s *OpsAlertEvaluatorService) updateRuleBreaches(ruleID int64, now time.Time, interval time.Duration, breached bool) int {
+ if ruleID <= 0 {
+ return 0
+ }
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ state, ok := s.ruleStates[ruleID]
+ if !ok {
+ state = &opsAlertRuleState{}
+ s.ruleStates[ruleID] = state
+ }
+
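+	// A long gap since the last evaluation (e.g. restart or lost leadership) resets the breach streak.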
+ if !state.LastEvaluatedAt.IsZero() && interval > 0 {
+ if now.Sub(state.LastEvaluatedAt) > interval*2 {
+ state.ConsecutiveBreaches = 0
+ }
+ }
+
+ state.LastEvaluatedAt = now
+ if breached {
+ state.ConsecutiveBreaches++
+ } else {
+ state.ConsecutiveBreaches = 0
+ }
+ return state.ConsecutiveBreaches
+}
+
+func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
+ if sustainedMinutes <= 0 {
+ return 1
+ }
+ if interval <= 0 {
+ return sustainedMinutes
+ }
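+	// e.g. sustainedMinutes=5 requires 5 consecutive breaches at a 60s interval, or ceil(300/90)=4 at a 90s interval.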
+ required := int(math.Ceil(float64(sustainedMinutes*60) / interval.Seconds()))
+ if required < 1 {
+ return 1
+ }
+ return required
+}
+
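+// parseOpsAlertRuleScope extracts the optional platform/group_id scope from a rule's filters map;
+// group_id may arrive as a JSON number or a numeric string.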
+func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
+ if filters == nil {
+ return "", nil
+ }
+ if v, ok := filters["platform"]; ok {
+ if s, ok := v.(string); ok {
+ platform = strings.TrimSpace(s)
+ }
+ }
+ if v, ok := filters["group_id"]; ok {
+ switch t := v.(type) {
+ case float64:
+ if t > 0 {
+ id := int64(t)
+ groupID = &id
+ }
+ case int64:
+ if t > 0 {
+ id := t
+ groupID = &id
+ }
+ case int:
+ if t > 0 {
+ id := int64(t)
+ groupID = &id
+ }
+ case string:
+ n, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64)
+ if err == nil && n > 0 {
+ groupID = &n
+ }
+ }
+ }
+ return platform, groupID
+}
+
+func (s *OpsAlertEvaluatorService) computeRuleMetric(
+ ctx context.Context,
+ rule *OpsAlertRule,
+ systemMetrics *OpsSystemMetricsSnapshot,
+ start time.Time,
+ end time.Time,
+ platform string,
+ groupID *int64,
+) (float64, bool) {
+ if rule == nil {
+ return 0, false
+ }
+ switch strings.TrimSpace(rule.MetricType) {
+ case "cpu_usage_percent":
+ if systemMetrics != nil && systemMetrics.CPUUsagePercent != nil {
+ return *systemMetrics.CPUUsagePercent, true
+ }
+ return 0, false
+ case "memory_usage_percent":
+ if systemMetrics != nil && systemMetrics.MemoryUsagePercent != nil {
+ return *systemMetrics.MemoryUsagePercent, true
+ }
+ return 0, false
+ case "concurrency_queue_depth":
+ if systemMetrics != nil && systemMetrics.ConcurrencyQueueDepth != nil {
+ return float64(*systemMetrics.ConcurrencyQueueDepth), true
+ }
+ return 0, false
+ }
+
+ overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
+ StartTime: start,
+ EndTime: end,
+ Platform: platform,
+ GroupID: groupID,
+ QueryMode: OpsQueryModeRaw,
+ })
+ if err != nil {
+ return 0, false
+ }
+ if overview == nil {
+ return 0, false
+ }
+
+ switch strings.TrimSpace(rule.MetricType) {
+ case "success_rate":
+ if overview.RequestCountSLA <= 0 {
+ return 0, false
+ }
+ return overview.SLA * 100, true
+ case "error_rate":
+ if overview.RequestCountSLA <= 0 {
+ return 0, false
+ }
+ return overview.ErrorRate * 100, true
+ case "upstream_error_rate":
+ if overview.RequestCountSLA <= 0 {
+ return 0, false
+ }
+ return overview.UpstreamErrorRate * 100, true
+ case "p95_latency_ms":
+ if overview.Duration.P95 == nil {
+ return 0, false
+ }
+ return float64(*overview.Duration.P95), true
+ case "p99_latency_ms":
+ if overview.Duration.P99 == nil {
+ return 0, false
+ }
+ return float64(*overview.Duration.P99), true
+ default:
+ return 0, false
+ }
+}
+
+func compareMetric(value float64, operator string, threshold float64) bool {
+ switch strings.TrimSpace(operator) {
+ case ">":
+ return value > threshold
+ case ">=":
+ return value >= threshold
+ case "<":
+ return value < threshold
+ case "<=":
+ return value <= threshold
+ case "==":
+ return value == threshold
+ case "!=":
+ return value != threshold
+ default:
+ return false
+ }
+}
+
+func buildOpsAlertDimensions(platform string, groupID *int64) map[string]any {
+ dims := map[string]any{}
+ if strings.TrimSpace(platform) != "" {
+ dims["platform"] = strings.TrimSpace(platform)
+ }
+ if groupID != nil && *groupID > 0 {
+ dims["group_id"] = *groupID
+ }
+ if len(dims) == 0 {
+ return nil
+ }
+ return dims
+}
+
+func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes int, platform string, groupID *int64) string {
+ if rule == nil {
+ return ""
+ }
+ scope := "overall"
+ if strings.TrimSpace(platform) != "" {
+ scope = fmt.Sprintf("platform=%s", strings.TrimSpace(platform))
+ }
+ if groupID != nil && *groupID > 0 {
+ scope = fmt.Sprintf("%s group_id=%d", scope, *groupID)
+ }
+ if windowMinutes <= 0 {
+ windowMinutes = 1
+ }
+ return fmt.Sprintf("%s %s %.2f (current %.2f) over last %dm (%s)",
+ strings.TrimSpace(rule.MetricType),
+ strings.TrimSpace(rule.Operator),
+ rule.Threshold,
+ value,
+ windowMinutes,
+ strings.TrimSpace(scope),
+ )
+}
+
+func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
+ if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
+ return
+ }
+ if event.EmailSent {
+ return
+ }
+ if !rule.NotifyEmail {
+ return
+ }
+
+ emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
+ if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
+ return
+ }
+
+ if len(emailCfg.Alert.Recipients) == 0 {
+ return
+ }
+ if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
+ return
+ }
+
+ if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
+ if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
+ return
+ }
+ }
+
+ // Apply/update rate limiter.
+ s.emailLimiter.SetLimit(emailCfg.Alert.RateLimitPerHour)
+
+ subject := fmt.Sprintf("[Ops Alert][%s] %s", strings.TrimSpace(rule.Severity), strings.TrimSpace(rule.Name))
+ body := buildOpsAlertEmailBody(rule, event)
+
+ anySent := false
+ for _, to := range emailCfg.Alert.Recipients {
+ addr := strings.TrimSpace(to)
+ if addr == "" {
+ continue
+ }
+ if !s.emailLimiter.Allow(time.Now().UTC()) {
+ continue
+ }
+ if err := s.emailService.SendEmail(ctx, addr, subject, body); err != nil {
+ // Ignore per-recipient failures; continue best-effort.
+ continue
+ }
+ anySent = true
+ }
+
+ if anySent {
+ _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
+ }
+}
+
+func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
+ if rule == nil || event == nil {
+ return ""
+ }
+ metric := strings.TrimSpace(rule.MetricType)
+ value := "-"
+ threshold := fmt.Sprintf("%.2f", rule.Threshold)
+ if event.MetricValue != nil {
+ value = fmt.Sprintf("%.2f", *event.MetricValue)
+ }
+ if event.ThresholdValue != nil {
+ threshold = fmt.Sprintf("%.2f", *event.ThresholdValue)
+ }
+ return fmt.Sprintf(`
+
+Ops Alert
+Rule: %s
+Severity: %s
+Status: %s
+Metric: %s %s %s
+Fired at: %s
+Description: %s
+`,
+ htmlEscape(rule.Name),
+ htmlEscape(rule.Severity),
+ htmlEscape(event.Status),
+ htmlEscape(metric),
+ htmlEscape(rule.Operator),
+ htmlEscape(fmt.Sprintf("%s (threshold %s)", value, threshold)),
+ event.FiredAt.Format(time.RFC3339),
+ htmlEscape(event.Description),
+ )
+}
+
+func shouldSendOpsAlertEmailByMinSeverity(minSeverity string, ruleSeverity string) bool {
+ minSeverity = strings.ToLower(strings.TrimSpace(minSeverity))
+ if minSeverity == "" {
+ return true
+ }
+
+ eventLevel := opsEmailSeverityForOps(ruleSeverity)
+ minLevel := strings.ToLower(minSeverity)
+
+ rank := func(level string) int {
+ switch level {
+ case "critical":
+ return 3
+ case "warning":
+ return 2
+ case "info":
+ return 1
+ default:
+ return 0
+ }
+ }
+ return rank(eventLevel) >= rank(minLevel)
+}
+
+func opsEmailSeverityForOps(severity string) string {
+ switch strings.ToUpper(strings.TrimSpace(severity)) {
+ case "P0":
+ return "critical"
+ case "P1":
+ return "warning"
+ default:
+ return "info"
+ }
+}
+
+func isOpsAlertSilenced(now time.Time, rule *OpsAlertRule, event *OpsAlertEvent, silencing OpsAlertSilencingSettings) bool {
+ if !silencing.Enabled {
+ return false
+ }
+ if now.IsZero() {
+ now = time.Now().UTC()
+ }
+ if strings.TrimSpace(silencing.GlobalUntilRFC3339) != "" {
+ if t, err := time.Parse(time.RFC3339, strings.TrimSpace(silencing.GlobalUntilRFC3339)); err == nil {
+ if now.Before(t) {
+ return true
+ }
+ }
+ }
+
+ for _, entry := range silencing.Entries {
+ untilRaw := strings.TrimSpace(entry.UntilRFC3339)
+ if untilRaw == "" {
+ continue
+ }
+ until, err := time.Parse(time.RFC3339, untilRaw)
+ if err != nil {
+ continue
+ }
+ if now.After(until) {
+ continue
+ }
+ if entry.RuleID != nil && rule != nil && rule.ID > 0 && *entry.RuleID != rule.ID {
+ continue
+ }
+ if len(entry.Severities) > 0 {
+ match := false
+ for _, s := range entry.Severities {
+ if strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(event.Severity)) || strings.EqualFold(strings.TrimSpace(s), strings.TrimSpace(rule.Severity)) {
+ match = true
+ break
+ }
+ }
+ if !match {
+ continue
+ }
+ }
+ return true
+ }
+
+ return false
+}
+
+func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, lock OpsDistributedLockSettings) (func(), bool) {
+ if !lock.Enabled {
+ return nil, true
+ }
+ if s.redisClient == nil {
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsAlertEvaluator] redis not configured; running without distributed lock")
+ })
+ return nil, true
+ }
+ key := strings.TrimSpace(lock.Key)
+ if key == "" {
+ key = opsAlertEvaluatorLeaderLockKey
+ }
+ ttl := time.Duration(lock.TTLSeconds) * time.Second
+ if ttl <= 0 {
+ ttl = opsAlertEvaluatorLeaderLockTTL
+ }
+
+ ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
+ if err != nil {
+ // Fail-open for single-node environments, but warn.
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
+ })
+ return nil, true
+ }
+ if !ok {
+ s.maybeLogSkip(key)
+ return nil, false
+ }
+ return func() {
+ _, _ = opsAlertEvaluatorReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
+ }, true
+}
+
+func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
+ s.skipLogMu.Lock()
+ defer s.skipLogMu.Unlock()
+
+ now := time.Now()
+ if !s.skipLogAt.IsZero() && now.Sub(s.skipLogAt) < opsAlertEvaluatorSkipLogInterval {
+ return
+ }
+ s.skipLogAt = now
+ log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAlertEvaluatorJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &now,
+ LastDurationMs: &durMs,
+ })
+}
+
+func (s *OpsAlertEvaluatorService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
+ if s == nil || s.opsRepo == nil || err == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ msg := truncateString(err.Error(), 2048)
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsAlertEvaluatorJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &now,
+ LastError: &msg,
+ LastDurationMs: &durMs,
+ })
+}
+
+func htmlEscape(s string) string {
+ replacer := strings.NewReplacer(
+		"&", "&amp;",
+		"<", "&lt;",
+		">", "&gt;",
+		`"`, "&#34;",
+		"'", "&#39;",
+ )
+ return replacer.Replace(s)
+}
+
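+// slidingWindowLimiter caps how many sends are allowed within a rolling window; a limit of 0 (or less) disables the cap.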
+type slidingWindowLimiter struct {
+ mu sync.Mutex
+ limit int
+ window time.Duration
+ sent []time.Time
+}
+
+func newSlidingWindowLimiter(limit int, window time.Duration) *slidingWindowLimiter {
+ if window <= 0 {
+ window = time.Hour
+ }
+ return &slidingWindowLimiter{
+ limit: limit,
+ window: window,
+ sent: []time.Time{},
+ }
+}
+
+func (l *slidingWindowLimiter) SetLimit(limit int) {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+ l.limit = limit
+}
+
+func (l *slidingWindowLimiter) Allow(now time.Time) bool {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ if l.limit <= 0 {
+ return true
+ }
+ cutoff := now.Add(-l.window)
+ keep := l.sent[:0]
+ for _, t := range l.sent {
+ if t.After(cutoff) {
+ keep = append(keep, t)
+ }
+ }
+ l.sent = keep
+ if len(l.sent) >= l.limit {
+ return false
+ }
+ l.sent = append(l.sent, now)
+ return true
+}
diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go
new file mode 100644
index 00000000..b6c3d1c3
--- /dev/null
+++ b/backend/internal/service/ops_alerts.go
@@ -0,0 +1,162 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "strings"
+ "time"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return []*OpsAlertRule{}, nil
+ }
+ return s.opsRepo.ListAlertRules(ctx)
+}
+
+func (s *OpsService) CreateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if rule == nil {
+ return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
+ }
+
+ created, err := s.opsRepo.CreateAlertRule(ctx, rule)
+ if err != nil {
+ return nil, err
+ }
+ return created, nil
+}
+
+func (s *OpsService) UpdateAlertRule(ctx context.Context, rule *OpsAlertRule) (*OpsAlertRule, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if rule == nil || rule.ID <= 0 {
+ return nil, infraerrors.BadRequest("INVALID_RULE", "invalid rule")
+ }
+
+ updated, err := s.opsRepo.UpdateAlertRule(ctx, rule)
+ if err != nil {
+ if errors.Is(err, sql.ErrNoRows) {
+ return nil, infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
+ }
+ return nil, err
+ }
+ return updated, nil
+}
+
+func (s *OpsService) DeleteAlertRule(ctx context.Context, id int64) error {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return err
+ }
+ if s.opsRepo == nil {
+ return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if id <= 0 {
+ return infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
+ }
+ if err := s.opsRepo.DeleteAlertRule(ctx, id); err != nil {
+ if errors.Is(err, sql.ErrNoRows) {
+ return infraerrors.NotFound("OPS_ALERT_RULE_NOT_FOUND", "alert rule not found")
+ }
+ return err
+ }
+ return nil
+}
+
+func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return []*OpsAlertEvent{}, nil
+ }
+ return s.opsRepo.ListAlertEvents(ctx, filter)
+}
+
+func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if ruleID <= 0 {
+ return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
+ }
+ return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
+}
+
+func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if ruleID <= 0 {
+ return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
+ }
+ return s.opsRepo.GetLatestAlertEvent(ctx, ruleID)
+}
+
+func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if event == nil {
+ return nil, infraerrors.BadRequest("INVALID_EVENT", "invalid event")
+ }
+
+ created, err := s.opsRepo.CreateAlertEvent(ctx, event)
+ if err != nil {
+ return nil, err
+ }
+ return created, nil
+}
+
+func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return err
+ }
+ if s.opsRepo == nil {
+ return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if eventID <= 0 {
+ return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
+ }
+ if strings.TrimSpace(status) == "" {
+ return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
+ }
+ return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
+}
+
+func (s *OpsService) UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return err
+ }
+ if s.opsRepo == nil {
+ return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if eventID <= 0 {
+ return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
+ }
+ return s.opsRepo.UpdateAlertEventEmailSent(ctx, eventID, emailSent)
+}
diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go
new file mode 100644
index 00000000..ef825c04
--- /dev/null
+++ b/backend/internal/service/ops_cleanup_service.go
@@ -0,0 +1,361 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "log"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ "github.com/google/uuid"
+ "github.com/redis/go-redis/v9"
+ "github.com/robfig/cron/v3"
+)
+
+const (
+ opsCleanupJobName = "ops_cleanup"
+
+ opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
+ opsCleanupLeaderLockTTLDefault = 30 * time.Minute
+)
+
+var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
+
+var opsCleanupReleaseScript = redis.NewScript(`
+if redis.call("GET", KEYS[1]) == ARGV[1] then
+ return redis.call("DEL", KEYS[1])
+end
+return 0
+`)
+
+// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
+//
+// - Scheduling: 5-field cron spec (minute hour dom month dow).
+// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
+// - Safety: deletes in batches to avoid long transactions.
+type OpsCleanupService struct {
+ opsRepo OpsRepository
+ db *sql.DB
+ redisClient *redis.Client
+ cfg *config.Config
+
+ instanceID string
+
+ cron *cron.Cron
+ entryID cron.EntryID
+
+ startOnce sync.Once
+ stopOnce sync.Once
+
+ warnNoRedisOnce sync.Once
+}
+
+func NewOpsCleanupService(
+ opsRepo OpsRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsCleanupService {
+ return &OpsCleanupService{
+ opsRepo: opsRepo,
+ db: db,
+ redisClient: redisClient,
+ cfg: cfg,
+ instanceID: uuid.NewString(),
+ }
+}
+
+func (s *OpsCleanupService) Start() {
+ if s == nil {
+ return
+ }
+ if s.cfg != nil && !s.cfg.Ops.Enabled {
+ return
+ }
+ if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
+ log.Printf("[OpsCleanup] not started (disabled)")
+ return
+ }
+ if s.opsRepo == nil || s.db == nil {
+ log.Printf("[OpsCleanup] not started (missing deps)")
+ return
+ }
+
+ s.startOnce.Do(func() {
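+		// Default: run cleanup daily at 02:00 in the location resolved below.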
+ schedule := "0 2 * * *"
+ if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
+ schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
+ }
+
+ loc := time.Local
+ if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
+ if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
+ loc = parsed
+ }
+ }
+
+ c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
+ id, err := c.AddFunc(schedule, func() { s.runScheduled() })
+ if err != nil {
+ log.Printf("[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
+ return
+ }
+ s.cron = c
+ s.entryID = id
+ s.cron.Start()
+ log.Printf("[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
+ })
+}
+
+func (s *OpsCleanupService) Stop() {
+ if s == nil {
+ return
+ }
+ s.stopOnce.Do(func() {
+ if s.cron != nil {
+ ctx := s.cron.Stop()
+ select {
+ case <-ctx.Done():
+ case <-time.After(3 * time.Second):
+ log.Printf("[OpsCleanup] cron stop timed out")
+ }
+ }
+ })
+}
+
+func (s *OpsCleanupService) runScheduled() {
+ if s == nil || s.db == nil || s.opsRepo == nil {
+ return
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
+ defer cancel()
+
+ release, ok := s.tryAcquireLeaderLock(ctx)
+ if !ok {
+ return
+ }
+ if release != nil {
+ defer release()
+ }
+
+ startedAt := time.Now().UTC()
+ runAt := startedAt
+
+ counts, err := s.runCleanupOnce(ctx)
+ if err != nil {
+ s.recordHeartbeatError(runAt, time.Since(startedAt), err)
+ log.Printf("[OpsCleanup] cleanup failed: %v", err)
+ return
+ }
+ s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
+ log.Printf("[OpsCleanup] cleanup complete: %s", counts)
+}
+
+type opsCleanupDeletedCounts struct {
+ errorLogs int64
+ retryAttempts int64
+ alertEvents int64
+ systemMetrics int64
+ hourlyPreagg int64
+ dailyPreagg int64
+}
+
+func (c opsCleanupDeletedCounts) String() string {
+ return fmt.Sprintf(
+ "error_logs=%d retry_attempts=%d alert_events=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
+ c.errorLogs,
+ c.retryAttempts,
+ c.alertEvents,
+ c.systemMetrics,
+ c.hourlyPreagg,
+ c.dailyPreagg,
+ )
+}
+
+func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
+ out := opsCleanupDeletedCounts{}
+ if s == nil || s.db == nil || s.cfg == nil {
+ return out, nil
+ }
+
+ batchSize := 5000
+
+ now := time.Now().UTC()
+
+ // Error-like tables: error logs / retry attempts / alert events.
+ if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
+ cutoff := now.AddDate(0, 0, -days)
+ n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
+ if err != nil {
+ return out, err
+ }
+ out.errorLogs = n
+
+ n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
+ if err != nil {
+ return out, err
+ }
+ out.retryAttempts = n
+
+ n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
+ if err != nil {
+ return out, err
+ }
+ out.alertEvents = n
+ }
+
+ // Minute-level metrics snapshots.
+ if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
+ cutoff := now.AddDate(0, 0, -days)
+ n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
+ if err != nil {
+ return out, err
+ }
+ out.systemMetrics = n
+ }
+
+ // Pre-aggregation tables (hourly/daily).
+ if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
+ cutoff := now.AddDate(0, 0, -days)
+ n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
+ if err != nil {
+ return out, err
+ }
+ out.hourlyPreagg = n
+
+ n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
+ if err != nil {
+ return out, err
+ }
+ out.dailyPreagg = n
+ }
+
+ return out, nil
+}
+
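+// deleteOldRowsByID deletes rows older than cutoff in fixed-size batches (SELECT ids, then DELETE)
+// and loops until a batch removes nothing, keeping each statement short.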
+func deleteOldRowsByID(
+ ctx context.Context,
+ db *sql.DB,
+ table string,
+ timeColumn string,
+ cutoff time.Time,
+ batchSize int,
+ castCutoffToDate bool,
+) (int64, error) {
+ if db == nil {
+ return 0, nil
+ }
+ if batchSize <= 0 {
+ batchSize = 5000
+ }
+
+ where := fmt.Sprintf("%s < $1", timeColumn)
+ if castCutoffToDate {
+ where = fmt.Sprintf("%s < $1::date", timeColumn)
+ }
+
+ q := fmt.Sprintf(`
+WITH batch AS (
+ SELECT id FROM %s
+ WHERE %s
+ ORDER BY id
+ LIMIT $2
+)
+DELETE FROM %s
+WHERE id IN (SELECT id FROM batch)
+`, table, where, table)
+
+ var total int64
+ for {
+ res, err := db.ExecContext(ctx, q, cutoff, batchSize)
+ if err != nil {
+ // If ops tables aren't present yet (partial deployments), treat as no-op.
+ if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
+ return total, nil
+ }
+ return total, err
+ }
+ affected, err := res.RowsAffected()
+ if err != nil {
+ return total, err
+ }
+ total += affected
+ if affected == 0 {
+ break
+ }
+ }
+ return total, nil
+}
+
+func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
+ if s == nil {
+ return nil, false
+ }
+ // In simple run mode, assume single instance.
+ if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
+ return nil, true
+ }
+
+ if s.redisClient == nil {
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
+ })
+ return nil, true
+ }
+
+ key := opsCleanupLeaderLockKeyDefault
+ ttl := opsCleanupLeaderLockTTLDefault
+
+ ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
+ if err != nil {
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err)
+ })
+ return nil, true
+ }
+ if !ok {
+ return nil, false
+ }
+
+ return func() {
+ _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
+ }, true
+}
+
+func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
+ if s == nil || s.opsRepo == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsCleanupJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &now,
+ LastDurationMs: &durMs,
+ })
+}
+
+func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
+ if s == nil || s.opsRepo == nil || err == nil {
+ return
+ }
+ now := time.Now().UTC()
+ durMs := duration.Milliseconds()
+ msg := truncateString(err.Error(), 2048)
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsCleanupJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &now,
+ LastError: &msg,
+ LastDurationMs: &durMs,
+ })
+}
diff --git a/backend/internal/service/ops_concurrency.go b/backend/internal/service/ops_concurrency.go
new file mode 100644
index 00000000..c3b7b853
--- /dev/null
+++ b/backend/internal/service/ops_concurrency.go
@@ -0,0 +1,257 @@
+package service
+
+import (
+ "context"
+ "log"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/pagination"
+)
+
+const (
+ opsAccountsPageSize = 100
+ opsConcurrencyBatchChunkSize = 200
+)
+
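+// listAllAccountsForOps pages through all accounts (optionally filtered by platform) in fixed-size pages,
+// with a hard cap on page count as a safety net against runaway loops.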
+func (s *OpsService) listAllAccountsForOps(ctx context.Context, platformFilter string) ([]Account, error) {
+ if s == nil || s.accountRepo == nil {
+ return []Account{}, nil
+ }
+
+ out := make([]Account, 0, 128)
+ page := 1
+ for {
+ accounts, pageInfo, err := s.accountRepo.ListWithFilters(ctx, pagination.PaginationParams{
+ Page: page,
+ PageSize: opsAccountsPageSize,
+ }, platformFilter, "", "", "")
+ if err != nil {
+ return nil, err
+ }
+ if len(accounts) == 0 {
+ break
+ }
+
+ out = append(out, accounts...)
+ if pageInfo != nil && int64(len(out)) >= pageInfo.Total {
+ break
+ }
+ if len(accounts) < opsAccountsPageSize {
+ break
+ }
+
+ page++
+ if page > 10_000 {
+ log.Printf("[Ops] listAllAccountsForOps: aborting after too many pages (platform=%q)", platformFilter)
+ break
+ }
+ }
+
+ return out, nil
+}
+
+func (s *OpsService) getAccountsLoadMapBestEffort(ctx context.Context, accounts []Account) map[int64]*AccountLoadInfo {
+ if s == nil || s.concurrencyService == nil {
+ return map[int64]*AccountLoadInfo{}
+ }
+ if len(accounts) == 0 {
+ return map[int64]*AccountLoadInfo{}
+ }
+
+ // De-duplicate IDs (and keep the max concurrency to avoid under-reporting).
+ unique := make(map[int64]int, len(accounts))
+ for _, acc := range accounts {
+ if acc.ID <= 0 {
+ continue
+ }
+ if prev, ok := unique[acc.ID]; !ok || acc.Concurrency > prev {
+ unique[acc.ID] = acc.Concurrency
+ }
+ }
+
+ batch := make([]AccountWithConcurrency, 0, len(unique))
+ for id, maxConc := range unique {
+ batch = append(batch, AccountWithConcurrency{
+ ID: id,
+ MaxConcurrency: maxConc,
+ })
+ }
+
+ out := make(map[int64]*AccountLoadInfo, len(batch))
+ for i := 0; i < len(batch); i += opsConcurrencyBatchChunkSize {
+ end := i + opsConcurrencyBatchChunkSize
+ if end > len(batch) {
+ end = len(batch)
+ }
+ part, err := s.concurrencyService.GetAccountsLoadBatch(ctx, batch[i:end])
+ if err != nil {
+ // Best-effort: return zeros rather than failing the ops UI.
+ log.Printf("[Ops] GetAccountsLoadBatch failed: %v", err)
+ continue
+ }
+ for k, v := range part {
+ out[k] = v
+ }
+ }
+
+ return out
+}
+
+// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
+//
+// Optional filters:
+//   - platformFilter: only include accounts on that platform (applied at the account repository query to reduce DB load)
+// - groupIDFilter: only include accounts that belong to that group
+func (s *OpsService) GetConcurrencyStats(
+ ctx context.Context,
+ platformFilter string,
+ groupIDFilter *int64,
+) (map[string]*PlatformConcurrencyInfo, map[int64]*GroupConcurrencyInfo, map[int64]*AccountConcurrencyInfo, *time.Time, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ accounts, err := s.listAllAccountsForOps(ctx, platformFilter)
+ if err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ collectedAt := time.Now()
+ loadMap := s.getAccountsLoadMapBestEffort(ctx, accounts)
+
+ platform := make(map[string]*PlatformConcurrencyInfo)
+ group := make(map[int64]*GroupConcurrencyInfo)
+ account := make(map[int64]*AccountConcurrencyInfo)
+
+ for _, acc := range accounts {
+ if acc.ID <= 0 {
+ continue
+ }
+
+ var matchedGroup *Group
+ if groupIDFilter != nil && *groupIDFilter > 0 {
+ for _, grp := range acc.Groups {
+ if grp == nil || grp.ID <= 0 {
+ continue
+ }
+ if grp.ID == *groupIDFilter {
+ matchedGroup = grp
+ break
+ }
+ }
+ // Group filter provided: skip accounts not in that group.
+ if matchedGroup == nil {
+ continue
+ }
+ }
+
+ load := loadMap[acc.ID]
+ currentInUse := int64(0)
+ waiting := int64(0)
+ if load != nil {
+ currentInUse = int64(load.CurrentConcurrency)
+ waiting = int64(load.WaitingCount)
+ }
+
+ // Account-level view picks one display group (the first group).
+ displayGroupID := int64(0)
+ displayGroupName := ""
+ if matchedGroup != nil {
+ displayGroupID = matchedGroup.ID
+ displayGroupName = matchedGroup.Name
+ } else if len(acc.Groups) > 0 && acc.Groups[0] != nil {
+ displayGroupID = acc.Groups[0].ID
+ displayGroupName = acc.Groups[0].Name
+ }
+
+ if _, ok := account[acc.ID]; !ok {
+ info := &AccountConcurrencyInfo{
+ AccountID: acc.ID,
+ AccountName: acc.Name,
+ Platform: acc.Platform,
+ GroupID: displayGroupID,
+ GroupName: displayGroupName,
+ CurrentInUse: currentInUse,
+ MaxCapacity: int64(acc.Concurrency),
+ WaitingInQueue: waiting,
+ }
+ if info.MaxCapacity > 0 {
+ info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
+ }
+ account[acc.ID] = info
+ }
+
+ // Platform aggregation.
+ if acc.Platform != "" {
+ if _, ok := platform[acc.Platform]; !ok {
+ platform[acc.Platform] = &PlatformConcurrencyInfo{
+ Platform: acc.Platform,
+ }
+ }
+ p := platform[acc.Platform]
+ p.MaxCapacity += int64(acc.Concurrency)
+ p.CurrentInUse += currentInUse
+ p.WaitingInQueue += waiting
+ }
+
+ // Group aggregation (one account may contribute to multiple groups).
+ if matchedGroup != nil {
+ grp := matchedGroup
+ if _, ok := group[grp.ID]; !ok {
+ group[grp.ID] = &GroupConcurrencyInfo{
+ GroupID: grp.ID,
+ GroupName: grp.Name,
+ Platform: grp.Platform,
+ }
+ }
+ g := group[grp.ID]
+ if g.GroupName == "" && grp.Name != "" {
+ g.GroupName = grp.Name
+ }
+ if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
+ // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
+ g.Platform = ""
+ }
+ g.MaxCapacity += int64(acc.Concurrency)
+ g.CurrentInUse += currentInUse
+ g.WaitingInQueue += waiting
+ } else {
+ for _, grp := range acc.Groups {
+ if grp == nil || grp.ID <= 0 {
+ continue
+ }
+ if _, ok := group[grp.ID]; !ok {
+ group[grp.ID] = &GroupConcurrencyInfo{
+ GroupID: grp.ID,
+ GroupName: grp.Name,
+ Platform: grp.Platform,
+ }
+ }
+ g := group[grp.ID]
+ if g.GroupName == "" && grp.Name != "" {
+ g.GroupName = grp.Name
+ }
+ if g.Platform != "" && grp.Platform != "" && g.Platform != grp.Platform {
+ // Groups are expected to be platform-scoped. If mismatch is observed, avoid misleading labels.
+ g.Platform = ""
+ }
+ g.MaxCapacity += int64(acc.Concurrency)
+ g.CurrentInUse += currentInUse
+ g.WaitingInQueue += waiting
+ }
+ }
+ }
+
+ for _, info := range platform {
+ if info.MaxCapacity > 0 {
+ info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
+ }
+ }
+ for _, info := range group {
+ if info.MaxCapacity > 0 {
+ info.LoadPercentage = float64(info.CurrentInUse) / float64(info.MaxCapacity) * 100
+ }
+ }
+
+ return platform, group, account, &collectedAt, nil
+}
diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go
new file mode 100644
index 00000000..23d6d82f
--- /dev/null
+++ b/backend/internal/service/ops_dashboard.go
@@ -0,0 +1,77 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "log"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if filter == nil {
+ return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
+ }
+ if filter.StartTime.After(filter.EndTime) {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
+ }
+
+ // Resolve query mode (requested via query param, or DB default).
+ filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
+
+ overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
+ if err != nil {
+ if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
+ return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
+ }
+ return nil, err
+ }
+
+ // Best-effort system health + jobs; dashboard metrics should still render if these are missing.
+ if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
+ overview.SystemMetrics = metrics
+	} else if !errors.Is(err, sql.ErrNoRows) {
+ log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
+ }
+
+ if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
+ overview.JobHeartbeats = heartbeats
+ } else {
+ log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
+ }
+
+ return overview, nil
+}
+
+func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
+ if requested.IsValid() {
+ // Allow "auto" to be disabled via config until preagg is proven stable in production.
+ // Forced `preagg` via query param still works.
+ if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
+ return OpsQueryModeRaw
+ }
+ return requested
+ }
+
+ mode := OpsQueryModeAuto
+ if s != nil && s.settingRepo != nil {
+ if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
+ mode = ParseOpsQueryMode(raw)
+ }
+ }
+
+ if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
+ return OpsQueryModeRaw
+ }
+ return mode
+}
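+
+// Resolution examples for the logic above (illustrative, assuming
+// cfg.Ops.UsePreaggregatedTables is false and no DB default is configured):
+//
+//	resolveOpsQueryMode(ctx, OpsQueryModeAuto)   // auto is downgraded to raw by config
+//	resolveOpsQueryMode(ctx, OpsQueryModePreagg) // forced preagg via query param still wins
+//	resolveOpsQueryMode(ctx, OpsQueryMode("x"))  // invalid -> DB default, then the same auto/raw rule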
diff --git a/backend/internal/service/ops_errors.go b/backend/internal/service/ops_errors.go
new file mode 100644
index 00000000..76b5ce8b
--- /dev/null
+++ b/backend/internal/service/ops_errors.go
@@ -0,0 +1,45 @@
+package service
+
+import (
+ "context"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if filter == nil {
+ return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
+ }
+ if filter.StartTime.After(filter.EndTime) {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
+ }
+ return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
+}
+
+func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if filter == nil {
+ return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
+ }
+ if filter.StartTime.After(filter.EndTime) {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
+ }
+ return s.opsRepo.GetErrorDistribution(ctx, filter)
+}
diff --git a/backend/internal/service/ops_histograms.go b/backend/internal/service/ops_histograms.go
new file mode 100644
index 00000000..9f5b514f
--- /dev/null
+++ b/backend/internal/service/ops_histograms.go
@@ -0,0 +1,26 @@
+package service
+
+import (
+ "context"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if filter == nil {
+ return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
+ }
+ if filter.StartTime.After(filter.EndTime) {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
+ }
+ return s.opsRepo.GetLatencyHistogram(ctx, filter)
+}
diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go
new file mode 100644
index 00000000..cd90e1bd
--- /dev/null
+++ b/backend/internal/service/ops_metrics_collector.go
@@ -0,0 +1,861 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "fmt"
+ "hash/fnv"
+ "log"
+ "math"
+ "os"
+ "runtime"
+ "strconv"
+ "strings"
+ "sync"
+ "time"
+ "unicode/utf8"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ "github.com/google/uuid"
+ "github.com/redis/go-redis/v9"
+ "github.com/shirou/gopsutil/v4/cpu"
+ "github.com/shirou/gopsutil/v4/mem"
+)
+
+const (
+ opsMetricsCollectorJobName = "ops_metrics_collector"
+ opsMetricsCollectorMinInterval = 60 * time.Second
+ opsMetricsCollectorMaxInterval = 1 * time.Hour
+
+ opsMetricsCollectorTimeout = 10 * time.Second
+
+ opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
+ opsMetricsCollectorLeaderLockTTL = 90 * time.Second
+
+ opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
+
+ bytesPerMB = 1024 * 1024
+)
+
+var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
+
+type OpsMetricsCollector struct {
+ opsRepo OpsRepository
+ settingRepo SettingRepository
+ cfg *config.Config
+
+ db *sql.DB
+ redisClient *redis.Client
+ instanceID string
+
+ lastCgroupCPUUsageNanos uint64
+ lastCgroupCPUSampleAt time.Time
+
+ stopCh chan struct{}
+ startOnce sync.Once
+ stopOnce sync.Once
+
+ skipLogMu sync.Mutex
+ skipLogAt time.Time
+}
+
+func NewOpsMetricsCollector(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsMetricsCollector {
+ return &OpsMetricsCollector{
+ opsRepo: opsRepo,
+ settingRepo: settingRepo,
+ cfg: cfg,
+ db: db,
+ redisClient: redisClient,
+ instanceID: uuid.NewString(),
+ }
+}
+
+func (c *OpsMetricsCollector) Start() {
+ if c == nil {
+ return
+ }
+ c.startOnce.Do(func() {
+ if c.stopCh == nil {
+ c.stopCh = make(chan struct{})
+ }
+ go c.run()
+ })
+}
+
+func (c *OpsMetricsCollector) Stop() {
+ if c == nil {
+ return
+ }
+ c.stopOnce.Do(func() {
+ if c.stopCh != nil {
+ close(c.stopCh)
+ }
+ })
+}
+
+func (c *OpsMetricsCollector) run() {
+	// Run one collection immediately so the dashboard has data soon after startup.
+ c.collectOnce()
+
+ for {
+ interval := c.getInterval()
+ timer := time.NewTimer(interval)
+ select {
+ case <-timer.C:
+ c.collectOnce()
+ case <-c.stopCh:
+ timer.Stop()
+ return
+ }
+ }
+}
+
+func (c *OpsMetricsCollector) getInterval() time.Duration {
+ interval := opsMetricsCollectorMinInterval
+
+ if c.settingRepo == nil {
+ return interval
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+
+ raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
+ if err != nil {
+ return interval
+ }
+ raw = strings.TrimSpace(raw)
+ if raw == "" {
+ return interval
+ }
+
+ seconds, err := strconv.Atoi(raw)
+ if err != nil {
+ return interval
+ }
+ if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
+ seconds = int(opsMetricsCollectorMinInterval.Seconds())
+ }
+ if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
+ seconds = int(opsMetricsCollectorMaxInterval.Seconds())
+ }
+ return time.Duration(seconds) * time.Second
+}
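+
+// Clamping examples for the interval setting (illustrative values):
+//
+//	"" or unreadable -> 60s (default)
+//	"15"             -> 60s (raised to the 1-minute floor)
+//	"300"            -> 5m
+//	"86400"          -> 1h (capped at the 1-hour ceiling)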
+
+func (c *OpsMetricsCollector) collectOnce() {
+ if c == nil {
+ return
+ }
+ if c.cfg != nil && !c.cfg.Ops.Enabled {
+ return
+ }
+ if c.opsRepo == nil {
+ return
+ }
+ if c.db == nil {
+ return
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
+ defer cancel()
+
+ if !c.isMonitoringEnabled(ctx) {
+ return
+ }
+
+ release, ok := c.tryAcquireLeaderLock(ctx)
+ if !ok {
+ return
+ }
+ if release != nil {
+ defer release()
+ }
+
+ startedAt := time.Now().UTC()
+ err := c.collectAndPersist(ctx)
+ finishedAt := time.Now().UTC()
+
+ durationMs := finishedAt.Sub(startedAt).Milliseconds()
+ dur := durationMs
+ runAt := startedAt
+
+ if err != nil {
+ msg := truncateString(err.Error(), 2048)
+ errAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
+ defer hbCancel()
+ _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsMetricsCollectorJobName,
+ LastRunAt: &runAt,
+ LastErrorAt: &errAt,
+ LastError: &msg,
+ LastDurationMs: &dur,
+ })
+ log.Printf("[OpsMetricsCollector] collect failed: %v", err)
+ return
+ }
+
+ successAt := finishedAt
+ hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
+ defer hbCancel()
+ _ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
+ JobName: opsMetricsCollectorJobName,
+ LastRunAt: &runAt,
+ LastSuccessAt: &successAt,
+ LastDurationMs: &dur,
+ })
+}
+
+func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
+ if c == nil {
+ return false
+ }
+ if c.cfg != nil && !c.cfg.Ops.Enabled {
+ return false
+ }
+ if c.settingRepo == nil {
+ return true
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
+ if err != nil {
+ if errors.Is(err, ErrSettingNotFound) {
+ return true
+ }
+ // Fail-open: collector should not become a hard dependency.
+ return true
+ }
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return false
+ default:
+ return true
+ }
+}
+
+func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ // Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
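+	// e.g. now = 12:34:56.789Z -> windowStart = 12:33:00Z, windowEnd = 12:34:00Z,
+	// so each run persists the most recent complete minute.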
+ now := time.Now().UTC()
+ windowEnd := now.Truncate(time.Minute)
+ windowStart := windowEnd.Add(-1 * time.Minute)
+
+ sys, err := c.collectSystemStats(ctx)
+ if err != nil {
+ // Continue; system stats are best-effort.
+ log.Printf("[OpsMetricsCollector] system stats error: %v", err)
+ }
+
+ dbOK := c.checkDB(ctx)
+ redisOK := c.checkRedis(ctx)
+ active, idle := c.dbPoolStats()
+
+ successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
+ if err != nil {
+ return fmt.Errorf("query usage counts: %w", err)
+ }
+
+ duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
+ if err != nil {
+ return fmt.Errorf("query usage latency: %w", err)
+ }
+
+ errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
+ if err != nil {
+ return fmt.Errorf("query error counts: %w", err)
+ }
+
+ windowSeconds := windowEnd.Sub(windowStart).Seconds()
+ if windowSeconds <= 0 {
+ windowSeconds = 60
+ }
+ requestTotal := successCount + errorTotal
+ qps := float64(requestTotal) / windowSeconds
+ tps := float64(tokenConsumed) / windowSeconds
+
+ goroutines := runtime.NumGoroutine()
+
+ input := &OpsInsertSystemMetricsInput{
+ CreatedAt: windowEnd,
+ WindowMinutes: 1,
+
+ SuccessCount: successCount,
+ ErrorCountTotal: errorTotal,
+ BusinessLimitedCount: businessLimited,
+ ErrorCountSLA: errorSLA,
+
+ UpstreamErrorCountExcl429529: upstreamExcl,
+ Upstream429Count: upstream429,
+ Upstream529Count: upstream529,
+
+ TokenConsumed: tokenConsumed,
+ QPS: float64Ptr(roundTo1DP(qps)),
+ TPS: float64Ptr(roundTo1DP(tps)),
+
+ DurationP50Ms: duration.p50,
+ DurationP90Ms: duration.p90,
+ DurationP95Ms: duration.p95,
+ DurationP99Ms: duration.p99,
+ DurationAvgMs: duration.avg,
+ DurationMaxMs: duration.max,
+
+ TTFTP50Ms: ttft.p50,
+ TTFTP90Ms: ttft.p90,
+ TTFTP95Ms: ttft.p95,
+ TTFTP99Ms: ttft.p99,
+ TTFTAvgMs: ttft.avg,
+ TTFTMaxMs: ttft.max,
+
+ CPUUsagePercent: sys.cpuUsagePercent,
+ MemoryUsedMB: sys.memoryUsedMB,
+ MemoryTotalMB: sys.memoryTotalMB,
+ MemoryUsagePercent: sys.memoryUsagePercent,
+
+ DBOK: boolPtr(dbOK),
+ RedisOK: boolPtr(redisOK),
+
+ DBConnActive: intPtr(active),
+ DBConnIdle: intPtr(idle),
+ GoroutineCount: intPtr(goroutines),
+ }
+
+ return c.opsRepo.InsertSystemMetrics(ctx, input)
+}
+
+type opsCollectedPercentiles struct {
+ p50 *int
+ p90 *int
+ p95 *int
+ p99 *int
+ avg *float64
+ max *int
+}
+
+func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
+ q := `
+SELECT
+ COALESCE(COUNT(*), 0) AS success_count,
+ COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
+FROM usage_logs
+WHERE created_at >= $1 AND created_at < $2`
+
+ var tokens sql.NullInt64
+ if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
+ return 0, 0, err
+ }
+ if tokens.Valid {
+ tokenConsumed = tokens.Int64
+ }
+ return successCount, tokenConsumed, nil
+}
+
+func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
+ {
+ q := `
+SELECT
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
+ AVG(duration_ms) AS avg_ms,
+ MAX(duration_ms) AS max_ms
+FROM usage_logs
+WHERE created_at >= $1 AND created_at < $2
+ AND duration_ms IS NOT NULL`
+
+ var p50, p90, p95, p99 sql.NullFloat64
+ var avg sql.NullFloat64
+ var max sql.NullInt64
+ if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
+ return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
+ }
+ duration.p50 = floatToIntPtr(p50)
+ duration.p90 = floatToIntPtr(p90)
+ duration.p95 = floatToIntPtr(p95)
+ duration.p99 = floatToIntPtr(p99)
+ if avg.Valid {
+ v := roundTo1DP(avg.Float64)
+ duration.avg = &v
+ }
+ if max.Valid {
+ v := int(max.Int64)
+ duration.max = &v
+ }
+ }
+
+ {
+ q := `
+SELECT
+ percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
+ percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
+ percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
+ percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
+ AVG(first_token_ms) AS avg_ms,
+ MAX(first_token_ms) AS max_ms
+FROM usage_logs
+WHERE created_at >= $1 AND created_at < $2
+ AND first_token_ms IS NOT NULL`
+
+ var p50, p90, p95, p99 sql.NullFloat64
+ var avg sql.NullFloat64
+ var max sql.NullInt64
+ if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
+ return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
+ }
+ ttft.p50 = floatToIntPtr(p50)
+ ttft.p90 = floatToIntPtr(p90)
+ ttft.p95 = floatToIntPtr(p95)
+ ttft.p99 = floatToIntPtr(p99)
+ if avg.Valid {
+ v := roundTo1DP(avg.Float64)
+ ttft.avg = &v
+ }
+ if max.Valid {
+ v := int(max.Int64)
+ ttft.max = &v
+ }
+ }
+
+ return duration, ttft, nil
+}
+
+func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
+ errorTotal int64,
+ businessLimited int64,
+ errorSLA int64,
+ upstreamExcl429529 int64,
+ upstream429 int64,
+ upstream529 int64,
+ err error,
+) {
+ q := `
+SELECT
+ COALESCE(COUNT(*), 0) AS error_total,
+ COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
+ COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429,
+ COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529
+FROM ops_error_logs
+WHERE created_at >= $1 AND created_at < $2`
+
+ if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
+ &errorTotal,
+ &businessLimited,
+ &errorSLA,
+ &upstreamExcl429529,
+ &upstream429,
+ &upstream529,
+ ); err != nil {
+ return 0, 0, 0, 0, 0, 0, err
+ }
+ return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
+}
+
+type opsCollectedSystemStats struct {
+ cpuUsagePercent *float64
+ memoryUsedMB *int64
+ memoryTotalMB *int64
+ memoryUsagePercent *float64
+}
+
+func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
+ out := &opsCollectedSystemStats{}
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ sampleAt := time.Now().UTC()
+
+ // Prefer cgroup (container) metrics when available.
+ if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
+ out.cpuUsagePercent = cpuPct
+ }
+
+ cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
+ if cgroupOK {
+ usedMB := int64(cgroupUsed / bytesPerMB)
+ out.memoryUsedMB = &usedMB
+ if cgroupTotal > 0 {
+ totalMB := int64(cgroupTotal / bytesPerMB)
+ out.memoryTotalMB = &totalMB
+ pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
+ out.memoryUsagePercent = &pct
+ }
+ }
+
+ // Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
+ if out.cpuUsagePercent == nil {
+ if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
+ v := roundTo1DP(cpuPercents[0])
+ out.cpuUsagePercent = &v
+ }
+ }
+
+	// Fill any memory fields the cgroup could not provide (e.g. memory.max = "max" leaves the total unknown) from host metrics.
+ if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
+ if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
+ if out.memoryUsedMB == nil {
+ usedMB := int64(vm.Used / bytesPerMB)
+ out.memoryUsedMB = &usedMB
+ }
+ if out.memoryTotalMB == nil {
+ totalMB := int64(vm.Total / bytesPerMB)
+ out.memoryTotalMB = &totalMB
+ }
+ if out.memoryUsagePercent == nil {
+ if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
+ pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
+ out.memoryUsagePercent = &pct
+ } else {
+ pct := roundTo1DP(vm.UsedPercent)
+ out.memoryUsagePercent = &pct
+ }
+ }
+ }
+ }
+
+ return out, nil
+}
+
+func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
+ usageNanos, ok := readCgroupCPUUsageNanos()
+ if !ok {
+ return nil
+ }
+
+ // Initialize baseline sample.
+ if c.lastCgroupCPUSampleAt.IsZero() {
+ c.lastCgroupCPUUsageNanos = usageNanos
+ c.lastCgroupCPUSampleAt = now
+ return nil
+ }
+
+ elapsed := now.Sub(c.lastCgroupCPUSampleAt)
+ if elapsed <= 0 {
+ c.lastCgroupCPUUsageNanos = usageNanos
+ c.lastCgroupCPUSampleAt = now
+ return nil
+ }
+
+ prev := c.lastCgroupCPUUsageNanos
+ c.lastCgroupCPUUsageNanos = usageNanos
+ c.lastCgroupCPUSampleAt = now
+
+ if usageNanos < prev {
+ // Counter reset (container restarted).
+ return nil
+ }
+
+ deltaUsageSec := float64(usageNanos-prev) / 1e9
+ elapsedSec := elapsed.Seconds()
+ if elapsedSec <= 0 {
+ return nil
+ }
+
+ cores := readCgroupCPULimitCores()
+ if cores <= 0 {
+ // Can't reliably normalize; skip and fall back to gopsutil.
+ return nil
+ }
+
+ pct := (deltaUsageSec / (elapsedSec * cores)) * 100
+ if pct < 0 {
+ pct = 0
+ }
+ // Clamp to avoid noise/jitter showing impossible values.
+ if pct > 100 {
+ pct = 100
+ }
+ v := roundTo1DP(pct)
+ return &v
+}
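+
+// Worked example for the percentage above (illustrative numbers): with a 2-core
+// limit (cores = 2.0), a 60s gap between samples and 30s of CPU time consumed,
+// pct = (30 / (60 * 2)) * 100 = 25.0.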
+
+func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
+ // cgroup v2 (most common in modern containers)
+ if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
+ usedBytes = used
+ rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
+ if err == nil {
+ s := strings.TrimSpace(string(rawMax))
+ if s != "" && s != "max" {
+ if v, err := strconv.ParseUint(s, 10, 64); err == nil {
+ totalBytes = v
+ }
+ }
+ }
+ return usedBytes, totalBytes, true
+ }
+
+ // cgroup v1 fallback
+ if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
+ usedBytes = used
+ if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
+ // Some environments report a very large number when unlimited.
+ if limit > 0 && limit < (1<<60) {
+ totalBytes = limit
+ }
+ }
+ return usedBytes, totalBytes, true
+ }
+
+ return 0, 0, false
+}
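+
+// Example cgroup v2 readings (illustrative): memory.current = "268435456" and
+// memory.max = "536870912" yield used = 256 MiB, total = 512 MiB. When
+// memory.max is "max" (unlimited), totalBytes stays 0 and the caller fills the
+// total from host metrics instead.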
+
+func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
+ // cgroup v2: cpu.stat has usage_usec
+ if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
+ lines := strings.Split(string(raw), "\n")
+ for _, line := range lines {
+ fields := strings.Fields(line)
+ if len(fields) != 2 {
+ continue
+ }
+ if fields[0] != "usage_usec" {
+ continue
+ }
+ v, err := strconv.ParseUint(fields[1], 10, 64)
+ if err != nil {
+ continue
+ }
+ return v * 1000, true
+ }
+ }
+
+ // cgroup v1: cpuacct.usage is in nanoseconds
+ if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
+ return v, true
+ }
+
+ return 0, false
+}
+
+func readCgroupCPULimitCores() float64 {
+	// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
+ if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
+ fields := strings.Fields(string(raw))
+ if len(fields) >= 2 && fields[0] != "max" {
+ quota, err1 := strconv.ParseFloat(fields[0], 64)
+ period, err2 := strconv.ParseFloat(fields[1], 64)
+ if err1 == nil && err2 == nil && quota > 0 && period > 0 {
+ return quota / period
+ }
+ }
+ }
+
+ // cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
+ quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
+ period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
+ if okQuota && okPeriod && quota > 0 && period > 0 {
+ return float64(quota) / float64(period)
+ }
+
+ return 0
+}
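+
+// Example cpu.max contents (illustrative): "200000 100000" -> 2.0 cores,
+// "50000 100000" -> 0.5 cores, "max 100000" -> 0 (no limit), in which case the
+// caller skips cgroup CPU accounting and falls back to gopsutil host usage.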
+
+func readUintFile(path string) (uint64, bool) {
+ raw, err := os.ReadFile(path)
+ if err != nil {
+ return 0, false
+ }
+ s := strings.TrimSpace(string(raw))
+ if s == "" {
+ return 0, false
+ }
+ v, err := strconv.ParseUint(s, 10, 64)
+ if err != nil {
+ return 0, false
+ }
+ return v, true
+}
+
+func readIntFile(path string) (int64, bool) {
+ raw, err := os.ReadFile(path)
+ if err != nil {
+ return 0, false
+ }
+ s := strings.TrimSpace(string(raw))
+ if s == "" {
+ return 0, false
+ }
+ v, err := strconv.ParseInt(s, 10, 64)
+ if err != nil {
+ return 0, false
+ }
+ return v, true
+}
+
+func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
+ if c == nil || c.db == nil {
+ return false
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ var one int
+ if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
+ return false
+ }
+ return one == 1
+}
+
+func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
+ if c == nil || c.redisClient == nil {
+ return false
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ return c.redisClient.Ping(ctx).Err() == nil
+}
+
+func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
+ if c == nil || c.db == nil {
+ return 0, 0
+ }
+ stats := c.db.Stats()
+ return stats.InUse, stats.Idle
+}
+
+var opsMetricsCollectorReleaseScript = redis.NewScript(`
+if redis.call("GET", KEYS[1]) == ARGV[1] then
+ return redis.call("DEL", KEYS[1])
+end
+return 0
+`)
+
+func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
+ if c == nil || c.redisClient == nil {
+ return nil, true
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
+ if err != nil {
+		// Redis is configured but unreachable: fall back to a Postgres advisory lock;
+		// if that also fails, fail closed (skip this cycle) rather than letting every
+		// instance stampede the database.
+ release, ok := c.tryAcquireDBAdvisoryLock(ctx)
+ if !ok {
+ c.maybeLogSkip()
+ return nil, false
+ }
+ return release, true
+ }
+ if !ok {
+ c.maybeLogSkip()
+ return nil, false
+ }
+
+ release := func() {
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
+ }
+ return release, true
+}
+
+func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
+ if c == nil || c.db == nil {
+ return nil, false
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ conn, err := c.db.Conn(ctx)
+ if err != nil {
+ return nil, false
+ }
+
+ acquired := false
+ if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
+ _ = conn.Close()
+ return nil, false
+ }
+ if !acquired {
+ _ = conn.Close()
+ return nil, false
+ }
+
+ release := func() {
+ unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
+ _ = conn.Close()
+ }
+ return release, true
+}
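+
+// Leader-election sketch (using the constants above): the happy path is a single
+// Redis command per cycle,
+//
+//	SET ops:metrics:collector:leader <instance-id> NX EX 90
+//
+// released by the compare-and-delete Lua script. Only when Redis itself errors
+// does the collector fall back to pg_try_advisory_lock on a 64-bit FNV-1a hash
+// of the same key, and it skips the cycle entirely if neither lock is available.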
+
+func (c *OpsMetricsCollector) maybeLogSkip() {
+ c.skipLogMu.Lock()
+ defer c.skipLogMu.Unlock()
+
+ now := time.Now()
+ if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
+ return
+ }
+ c.skipLogAt = now
+ log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
+}
+
+func floatToIntPtr(v sql.NullFloat64) *int {
+ if !v.Valid {
+ return nil
+ }
+ n := int(math.Round(v.Float64))
+ return &n
+}
+
+func roundTo1DP(v float64) float64 {
+ return math.Round(v*10) / 10
+}
+
+func truncateString(s string, max int) string {
+ if max <= 0 {
+ return ""
+ }
+ if len(s) <= max {
+ return s
+ }
+	cut := s[:max]
+	// Trim any trailing partial rune so the truncated string stays valid UTF-8.
+	for len(cut) > 0 && !utf8.ValidString(cut) {
+		cut = cut[:len(cut)-1]
+	}
+ return cut
+}
+
+func boolPtr(v bool) *bool {
+ out := v
+ return &out
+}
+
+func intPtr(v int) *int {
+ out := v
+ return &out
+}
+
+func float64Ptr(v float64) *float64 {
+ out := v
+ return &out
+}
+
+func hashAdvisoryLockID(s string) int64 {
+ h := fnv.New64a()
+ _, _ = h.Write([]byte(s))
+ return int64(h.Sum64())
+}
diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go
new file mode 100644
index 00000000..a3d847e0
--- /dev/null
+++ b/backend/internal/service/ops_port.go
@@ -0,0 +1,226 @@
+package service
+
+import (
+ "context"
+ "time"
+)
+
+type OpsRepository interface {
+ InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
+ ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
+ GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
+ ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
+
+ InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
+ UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
+ GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
+
+ // Lightweight window stats (for realtime WS / quick sampling).
+ GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
+
+ GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
+ GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
+ GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
+ GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
+ GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
+
+ InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
+ GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
+
+ UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
+ ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
+
+ // Alerts (rules + events)
+ ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
+ CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
+ UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
+ DeleteAlertRule(ctx context.Context, id int64) error
+
+ ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
+ GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
+ GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
+ CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
+ UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
+ UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
+
+ // Pre-aggregation (hourly/daily) used for long-window dashboard performance.
+ UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
+ UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
+ GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
+ GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
+}
+
+type OpsInsertErrorLogInput struct {
+ RequestID string
+ ClientRequestID string
+
+ UserID *int64
+ APIKeyID *int64
+ AccountID *int64
+ GroupID *int64
+ ClientIP *string
+
+ Platform string
+ Model string
+ RequestPath string
+ Stream bool
+ UserAgent string
+
+ ErrorPhase string
+ ErrorType string
+ Severity string
+ StatusCode int
+ IsBusinessLimited bool
+
+ ErrorMessage string
+ ErrorBody string
+
+ ErrorSource string
+ ErrorOwner string
+
+ UpstreamStatusCode *int
+ UpstreamErrorMessage *string
+ UpstreamErrorDetail *string
+
+ DurationMs *int
+ TimeToFirstTokenMs *int64
+
+ RequestBodyJSON *string // sanitized json string (not raw bytes)
+ RequestBodyTruncated bool
+ RequestBodyBytes *int
+ RequestHeadersJSON *string // optional json string
+
+ IsRetryable bool
+ RetryCount int
+
+ CreatedAt time.Time
+}
+
+type OpsInsertRetryAttemptInput struct {
+ RequestedByUserID int64
+ SourceErrorID int64
+ Mode string
+ PinnedAccountID *int64
+
+ // running|queued etc.
+ Status string
+ StartedAt time.Time
+}
+
+type OpsUpdateRetryAttemptInput struct {
+ ID int64
+
+ // succeeded|failed
+ Status string
+ FinishedAt time.Time
+ DurationMs int64
+
+ // Optional correlation
+ ResultRequestID *string
+ ResultErrorID *int64
+
+ ErrorMessage *string
+}
+
+type OpsInsertSystemMetricsInput struct {
+ CreatedAt time.Time
+ WindowMinutes int
+
+ Platform *string
+ GroupID *int64
+
+ SuccessCount int64
+ ErrorCountTotal int64
+ BusinessLimitedCount int64
+ ErrorCountSLA int64
+
+ UpstreamErrorCountExcl429529 int64
+ Upstream429Count int64
+ Upstream529Count int64
+
+ TokenConsumed int64
+
+ QPS *float64
+ TPS *float64
+
+ DurationP50Ms *int
+ DurationP90Ms *int
+ DurationP95Ms *int
+ DurationP99Ms *int
+ DurationAvgMs *float64
+ DurationMaxMs *int
+
+ TTFTP50Ms *int
+ TTFTP90Ms *int
+ TTFTP95Ms *int
+ TTFTP99Ms *int
+ TTFTAvgMs *float64
+ TTFTMaxMs *int
+
+ CPUUsagePercent *float64
+ MemoryUsedMB *int64
+ MemoryTotalMB *int64
+ MemoryUsagePercent *float64
+
+ DBOK *bool
+ RedisOK *bool
+
+ DBConnActive *int
+ DBConnIdle *int
+ DBConnWaiting *int
+
+ GoroutineCount *int
+ ConcurrencyQueueDepth *int
+}
+
+type OpsSystemMetricsSnapshot struct {
+ ID int64 `json:"id"`
+ CreatedAt time.Time `json:"created_at"`
+ WindowMinutes int `json:"window_minutes"`
+
+ CPUUsagePercent *float64 `json:"cpu_usage_percent"`
+ MemoryUsedMB *int64 `json:"memory_used_mb"`
+ MemoryTotalMB *int64 `json:"memory_total_mb"`
+ MemoryUsagePercent *float64 `json:"memory_usage_percent"`
+
+ DBOK *bool `json:"db_ok"`
+ RedisOK *bool `json:"redis_ok"`
+
+ DBConnActive *int `json:"db_conn_active"`
+ DBConnIdle *int `json:"db_conn_idle"`
+ DBConnWaiting *int `json:"db_conn_waiting"`
+
+ GoroutineCount *int `json:"goroutine_count"`
+ ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
+}
+
+type OpsUpsertJobHeartbeatInput struct {
+ JobName string
+
+ LastRunAt *time.Time
+ LastSuccessAt *time.Time
+ LastErrorAt *time.Time
+ LastError *string
+ LastDurationMs *int64
+}
+
+type OpsJobHeartbeat struct {
+ JobName string `json:"job_name"`
+
+ LastRunAt *time.Time `json:"last_run_at"`
+ LastSuccessAt *time.Time `json:"last_success_at"`
+ LastErrorAt *time.Time `json:"last_error_at"`
+ LastError *string `json:"last_error"`
+ LastDurationMs *int64 `json:"last_duration_ms"`
+
+ UpdatedAt time.Time `json:"updated_at"`
+}
+
+type OpsWindowStats struct {
+ StartTime time.Time `json:"start_time"`
+ EndTime time.Time `json:"end_time"`
+
+ SuccessCount int64 `json:"success_count"`
+ ErrorCountTotal int64 `json:"error_count_total"`
+ TokenConsumed int64 `json:"token_consumed"`
+}
diff --git a/backend/internal/service/ops_query_mode.go b/backend/internal/service/ops_query_mode.go
new file mode 100644
index 00000000..e6fa9c1e
--- /dev/null
+++ b/backend/internal/service/ops_query_mode.go
@@ -0,0 +1,40 @@
+package service
+
+import (
+ "errors"
+ "strings"
+)
+
+type OpsQueryMode string
+
+const (
+ OpsQueryModeAuto OpsQueryMode = "auto"
+ OpsQueryModeRaw OpsQueryMode = "raw"
+ OpsQueryModePreagg OpsQueryMode = "preagg"
+)
+
+// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
+// pre-aggregation tables are not populated yet. This is primarily used to implement
+// the forced `preagg` mode UX.
+var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")
+
+func ParseOpsQueryMode(raw string) OpsQueryMode {
+ v := strings.ToLower(strings.TrimSpace(raw))
+ switch v {
+ case string(OpsQueryModeRaw):
+ return OpsQueryModeRaw
+ case string(OpsQueryModePreagg):
+ return OpsQueryModePreagg
+ default:
+ return OpsQueryModeAuto
+ }
+}
+
+func (m OpsQueryMode) IsValid() bool {
+ switch m {
+ case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg:
+ return true
+ default:
+ return false
+ }
+}
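+
+// Usage sketch (illustrative):
+//
+//	ParseOpsQueryMode(" PREAGG ")  // OpsQueryModePreagg
+//	ParseOpsQueryMode("anything")  // OpsQueryModeAuto (unknown values degrade to auto)
+//	OpsQueryMode("raw").IsValid()  // true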
diff --git a/backend/internal/service/ops_realtime.go b/backend/internal/service/ops_realtime.go
new file mode 100644
index 00000000..479b9482
--- /dev/null
+++ b/backend/internal/service/ops_realtime.go
@@ -0,0 +1,36 @@
+package service
+
+import (
+ "context"
+ "errors"
+ "strings"
+)
+
+// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
+//
+// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`;
+// it is additionally gated by the overall ops monitoring switches (the config hard
+// switch and the DB soft switch checked by IsMonitoringEnabled).
+func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
+ if !s.IsMonitoringEnabled(ctx) {
+ return false
+ }
+ if s.settingRepo == nil {
+ return true
+ }
+
+ value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
+ if err != nil {
+ // Default enabled when key is missing; fail-open on transient errors.
+ if errors.Is(err, ErrSettingNotFound) {
+ return true
+ }
+ return true
+ }
+
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return false
+ default:
+ return true
+ }
+}
diff --git a/backend/internal/service/ops_request_details.go b/backend/internal/service/ops_request_details.go
new file mode 100644
index 00000000..e33e6f38
--- /dev/null
+++ b/backend/internal/service/ops_request_details.go
@@ -0,0 +1,152 @@
+package service
+
+import (
+ "context"
+ "time"
+)
+
+type OpsRequestKind string
+
+const (
+ OpsRequestKindSuccess OpsRequestKind = "success"
+ OpsRequestKindError OpsRequestKind = "error"
+)
+
+// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
+// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
+type OpsRequestDetail struct {
+ Kind OpsRequestKind `json:"kind"`
+ CreatedAt time.Time `json:"created_at"`
+ RequestID string `json:"request_id"`
+
+ Platform string `json:"platform,omitempty"`
+ Model string `json:"model,omitempty"`
+
+ DurationMs *int `json:"duration_ms,omitempty"`
+ StatusCode *int `json:"status_code,omitempty"`
+
+ // When Kind == "error", ErrorID links to /admin/ops/errors/:id.
+ ErrorID *int64 `json:"error_id,omitempty"`
+
+ Phase string `json:"phase,omitempty"`
+ Severity string `json:"severity,omitempty"`
+ Message string `json:"message,omitempty"`
+
+ UserID *int64 `json:"user_id,omitempty"`
+ APIKeyID *int64 `json:"api_key_id,omitempty"`
+ AccountID *int64 `json:"account_id,omitempty"`
+ GroupID *int64 `json:"group_id,omitempty"`
+
+ Stream bool `json:"stream"`
+}
+
+type OpsRequestDetailFilter struct {
+ StartTime *time.Time
+ EndTime *time.Time
+
+ // kind: success|error|all
+ Kind string
+
+ Platform string
+ GroupID *int64
+
+ UserID *int64
+ APIKeyID *int64
+ AccountID *int64
+
+ Model string
+ RequestID string
+ Query string
+
+ MinDurationMs *int
+ MaxDurationMs *int
+
+ // Sort: created_at_desc (default) or duration_desc.
+ Sort string
+
+ Page int
+ PageSize int
+}
+
+func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
+ page = 1
+ pageSize = 50
+ endTime = time.Now()
+ startTime = endTime.Add(-1 * time.Hour)
+
+ if f == nil {
+ return page, pageSize, startTime, endTime
+ }
+
+ if f.Page > 0 {
+ page = f.Page
+ }
+ if f.PageSize > 0 {
+ pageSize = f.PageSize
+ }
+ if pageSize > 100 {
+ pageSize = 100
+ }
+
+ if f.EndTime != nil {
+ endTime = *f.EndTime
+ }
+ if f.StartTime != nil {
+ startTime = *f.StartTime
+ } else if f.EndTime != nil {
+ startTime = endTime.Add(-1 * time.Hour)
+ }
+
+ if startTime.After(endTime) {
+ startTime, endTime = endTime, startTime
+ }
+
+ return page, pageSize, startTime, endTime
+}
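+
+// Normalization examples (illustrative): a nil filter yields page=1, pageSize=50
+// and the trailing hour ending at time.Now(); PageSize=500 is capped to 100; a
+// reversed range (StartTime after EndTime) is swapped rather than rejected.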
+
+type OpsRequestDetailList struct {
+ Items []*OpsRequestDetail `json:"items"`
+ Total int64 `json:"total"`
+ Page int `json:"page"`
+ PageSize int `json:"page_size"`
+}
+
+func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return &OpsRequestDetailList{
+ Items: []*OpsRequestDetail{},
+ Total: 0,
+ Page: 1,
+ PageSize: 50,
+ }, nil
+ }
+
+ page, pageSize, startTime, endTime := filter.Normalize()
+ filterCopy := &OpsRequestDetailFilter{}
+ if filter != nil {
+ *filterCopy = *filter
+ }
+ filterCopy.Page = page
+ filterCopy.PageSize = pageSize
+ filterCopy.StartTime = &startTime
+ filterCopy.EndTime = &endTime
+
+ items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
+ if err != nil {
+ return nil, err
+ }
+ if items == nil {
+ items = []*OpsRequestDetail{}
+ }
+
+ return &OpsRequestDetailList{
+ Items: items,
+ Total: total,
+ Page: page,
+ PageSize: pageSize,
+ }, nil
+}
+
diff --git a/backend/internal/service/ops_retry.go b/backend/internal/service/ops_retry.go
new file mode 100644
index 00000000..3232e708
--- /dev/null
+++ b/backend/internal/service/ops_retry.go
@@ -0,0 +1,635 @@
+package service
+
+import (
+ "bytes"
+ "context"
+ "database/sql"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "log"
+ "net/http"
+ "strings"
+ "time"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+ "github.com/gin-gonic/gin"
+ "github.com/lib/pq"
+)
+
+const (
+ OpsRetryModeClient = "client"
+ OpsRetryModeUpstream = "upstream"
+)
+
+const (
+ opsRetryStatusRunning = "running"
+ opsRetryStatusSucceeded = "succeeded"
+ opsRetryStatusFailed = "failed"
+)
+
+const (
+ opsRetryTimeout = 60 * time.Second
+ opsRetryCaptureBytesLimit = 64 * 1024
+ opsRetryResponsePreviewMax = 8 * 1024
+ opsRetryMinIntervalPerError = 10 * time.Second
+ opsRetryMaxAccountSwitches = 3
+)
+
+var opsRetryRequestHeaderAllowlist = map[string]bool{
+ "anthropic-beta": true,
+ "anthropic-version": true,
+}
+
+type opsRetryRequestType string
+
+const (
+ opsRetryTypeMessages opsRetryRequestType = "messages"
+ opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
+ opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
+)
+
+type limitedResponseWriter struct {
+ header http.Header
+ status int
+ wroteHeader bool
+
+ limit int
+ totalWritten int64
+ buf bytes.Buffer
+}
+
+func newLimitedResponseWriter(limit int) *limitedResponseWriter {
+ if limit <= 0 {
+ limit = 1
+ }
+ return &limitedResponseWriter{
+ header: make(http.Header),
+ status: http.StatusOK,
+ limit: limit,
+ }
+}
+
+func (w *limitedResponseWriter) Header() http.Header {
+ return w.header
+}
+
+func (w *limitedResponseWriter) WriteHeader(statusCode int) {
+ if w.wroteHeader {
+ return
+ }
+ w.wroteHeader = true
+ w.status = statusCode
+}
+
+func (w *limitedResponseWriter) Write(p []byte) (int, error) {
+ if !w.wroteHeader {
+ w.WriteHeader(http.StatusOK)
+ }
+ w.totalWritten += int64(len(p))
+
+ if w.buf.Len() < w.limit {
+ remaining := w.limit - w.buf.Len()
+ if len(p) > remaining {
+ _, _ = w.buf.Write(p[:remaining])
+ } else {
+ _, _ = w.buf.Write(p)
+ }
+ }
+
+ // Pretend we wrote everything to avoid upstream/client code treating it as an error.
+ return len(p), nil
+}
+
+func (w *limitedResponseWriter) Flush() {}
+
+func (w *limitedResponseWriter) bodyBytes() []byte {
+ return w.buf.Bytes()
+}
+
+func (w *limitedResponseWriter) truncated() bool {
+ return w.totalWritten > int64(w.limit)
+}
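+
+// Capture sketch (illustrative): with limit=8, writes totalling 20 bytes still
+// report their full lengths to the caller, bodyBytes() returns only the first 8
+// bytes, and truncated() is true because totalWritten exceeds the limit.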
+
+func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+
+ mode = strings.ToLower(strings.TrimSpace(mode))
+ switch mode {
+ case OpsRetryModeClient, OpsRetryModeUpstream:
+ default:
+ return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
+ }
+
+ latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
+ if err != nil && !errors.Is(err, sql.ErrNoRows) {
+ return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
+ }
+ if latest != nil {
+ if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
+ return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
+ }
+
+ lastAttemptAt := latest.CreatedAt
+ if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
+ lastAttemptAt = *latest.FinishedAt
+ } else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
+ lastAttemptAt = *latest.StartedAt
+ }
+
+ if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
+ return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
+ }
+ }
+
+ errorLog, err := s.GetErrorLogByID(ctx, errorID)
+ if err != nil {
+ return nil, err
+ }
+ if strings.TrimSpace(errorLog.RequestBody) == "" {
+ return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
+ }
+
+ var pinned *int64
+ if mode == OpsRetryModeUpstream {
+ if pinnedAccountID != nil && *pinnedAccountID > 0 {
+ pinned = pinnedAccountID
+ } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
+ pinned = errorLog.AccountID
+ } else {
+ return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
+ }
+ }
+
+ startedAt := time.Now()
+ attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
+ RequestedByUserID: requestedByUserID,
+ SourceErrorID: errorID,
+ Mode: mode,
+ PinnedAccountID: pinned,
+ Status: opsRetryStatusRunning,
+ StartedAt: startedAt,
+ })
+ if err != nil {
+ var pqErr *pq.Error
+ if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
+ return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
+ }
+ return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
+ }
+
+ result := &OpsRetryResult{
+ AttemptID: attemptID,
+ Mode: mode,
+ Status: opsRetryStatusFailed,
+ PinnedAccountID: pinned,
+ HTTPStatusCode: 0,
+ UpstreamRequestID: "",
+ ResponsePreview: "",
+ ResponseTruncated: false,
+ ErrorMessage: "",
+ StartedAt: startedAt,
+ }
+
+ execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
+ defer cancel()
+
+ execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
+
+ finishedAt := time.Now()
+ result.FinishedAt = finishedAt
+ result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
+
+ if execRes != nil {
+ result.Status = execRes.status
+ result.UsedAccountID = execRes.usedAccountID
+ result.HTTPStatusCode = execRes.httpStatusCode
+ result.UpstreamRequestID = execRes.upstreamRequestID
+ result.ResponsePreview = execRes.responsePreview
+ result.ResponseTruncated = execRes.responseTruncated
+ result.ErrorMessage = execRes.errorMessage
+ }
+
+ updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
+ defer updateCancel()
+
+ var updateErrMsg *string
+ if strings.TrimSpace(result.ErrorMessage) != "" {
+ msg := result.ErrorMessage
+ updateErrMsg = &msg
+ }
+ var resultRequestID *string
+ if strings.TrimSpace(result.UpstreamRequestID) != "" {
+ v := result.UpstreamRequestID
+ resultRequestID = &v
+ }
+
+ finalStatus := result.Status
+ if strings.TrimSpace(finalStatus) == "" {
+ finalStatus = opsRetryStatusFailed
+ }
+
+ if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
+ ID: attemptID,
+ Status: finalStatus,
+ FinishedAt: finishedAt,
+ DurationMs: result.DurationMs,
+ ResultRequestID: resultRequestID,
+ ErrorMessage: updateErrMsg,
+ }); err != nil {
+ // Best-effort: retry itself already executed; do not fail the API response.
+ log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
+ }
+
+ return result, nil
+}
+
+type opsRetryExecution struct {
+ status string
+
+ usedAccountID *int64
+ httpStatusCode int
+ upstreamRequestID string
+
+ responsePreview string
+ responseTruncated bool
+
+ errorMessage string
+}
+
+func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
+ if errorLog == nil {
+ return &opsRetryExecution{
+ status: opsRetryStatusFailed,
+ errorMessage: "missing error log",
+ }
+ }
+
+ reqType := detectOpsRetryType(errorLog.RequestPath)
+ bodyBytes := []byte(errorLog.RequestBody)
+
+ switch reqType {
+ case opsRetryTypeMessages:
+ bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
+ case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
+		// No request-body rewriting needed for these request types.
+ }
+
+ switch strings.ToLower(strings.TrimSpace(mode)) {
+ case OpsRetryModeUpstream:
+ if pinnedAccountID == nil || *pinnedAccountID <= 0 {
+ return &opsRetryExecution{
+ status: opsRetryStatusFailed,
+ errorMessage: "pinned_account_id required for upstream retry",
+ }
+ }
+ return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
+ case OpsRetryModeClient:
+ return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
+ default:
+ return &opsRetryExecution{
+ status: opsRetryStatusFailed,
+ errorMessage: "invalid retry mode",
+ }
+ }
+}
+
+func detectOpsRetryType(path string) opsRetryRequestType {
+ p := strings.ToLower(strings.TrimSpace(path))
+ switch {
+ case strings.Contains(p, "/responses"):
+ return opsRetryTypeOpenAI
+ case strings.Contains(p, "/v1beta/"):
+ return opsRetryTypeGeminiV1B
+ default:
+ return opsRetryTypeMessages
+ }
+}
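+
+// Routing examples (illustrative paths): "/v1/responses" -> openai_responses,
+// "/v1beta/models/gemini-pro:generateContent" -> gemini_v1beta, and anything
+// else (e.g. "/v1/messages") -> messages, the default.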
+
+func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
+ if s.accountRepo == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
+ }
+
+ account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
+ if err != nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
+ }
+ if account == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
+ }
+ if !account.IsSchedulable() {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
+ }
+ if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
+ if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
+ }
+ }
+
+ var release func()
+ if s.concurrencyService != nil {
+ acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
+ if err != nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
+ }
+ if acq == nil || !acq.Acquired {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
+ }
+ release = acq.ReleaseFunc
+ }
+ if release != nil {
+ defer release()
+ }
+
+ usedID := account.ID
+ exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
+ exec.usedAccountID = &usedID
+ if exec.status == "" {
+ exec.status = opsRetryStatusFailed
+ }
+ return exec
+}
+
+func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
+ groupID := errorLog.GroupID
+ if groupID == nil || *groupID <= 0 {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
+ }
+
+ model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
+ if parsedErr != nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
+ }
+	_ = stream // account selection below only needs the model
+
+ excluded := make(map[int64]struct{})
+ switches := 0
+
+ for {
+ if switches >= opsRetryMaxAccountSwitches {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
+ }
+
+ selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
+ if selErr != nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
+ }
+ if selection == nil || selection.Account == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
+ }
+
+ account := selection.Account
+ if !selection.Acquired || selection.ReleaseFunc == nil {
+ excluded[account.ID] = struct{}{}
+ switches++
+ continue
+ }
+
+ exec := func() *opsRetryExecution {
+ defer selection.ReleaseFunc()
+ return s.executeWithAccount(ctx, reqType, errorLog, body, account)
+ }()
+
+ if exec != nil {
+ if exec.status == opsRetryStatusSucceeded {
+ usedID := account.ID
+ exec.usedAccountID = &usedID
+ return exec
+ }
+			// If the gateway reports a failover-eligible upstream error, try another account.
+ if s.isFailoverError(exec.errorMessage) {
+ excluded[account.ID] = struct{}{}
+ switches++
+ continue
+ }
+ usedID := account.ID
+ exec.usedAccountID = &usedID
+ return exec
+ }
+
+ excluded[account.ID] = struct{}{}
+ switches++
+ }
+}
+
+func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
+ switch reqType {
+ case opsRetryTypeOpenAI:
+ if s.openAIGatewayService == nil {
+ return nil, fmt.Errorf("openai gateway service not available")
+ }
+ return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
+ case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
+ if s.gatewayService == nil {
+ return nil, fmt.Errorf("gateway service not available")
+ }
+ return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
+ default:
+ return nil, fmt.Errorf("unsupported retry type: %s", reqType)
+ }
+}
+
+func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
+ switch reqType {
+ case opsRetryTypeMessages:
+ parsed, parseErr := ParseGatewayRequest(body)
+ if parseErr != nil {
+ return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
+ }
+ return parsed.Model, parsed.Stream, nil
+ case opsRetryTypeOpenAI:
+ var v struct {
+ Model string `json:"model"`
+ Stream bool `json:"stream"`
+ }
+ if err := json.Unmarshal(body, &v); err != nil {
+ return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
+ }
+ return strings.TrimSpace(v.Model), v.Stream, nil
+ case opsRetryTypeGeminiV1B:
+ if strings.TrimSpace(errorLog.Model) == "" {
+ return "", false, fmt.Errorf("missing model for gemini v1beta retry")
+ }
+ return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
+ default:
+ return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
+ }
+}
+
+func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
+ if account == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
+ }
+
+ c, w := newOpsRetryContext(ctx, errorLog)
+
+ var err error
+ switch reqType {
+ case opsRetryTypeOpenAI:
+ if s.openAIGatewayService == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
+ }
+ _, err = s.openAIGatewayService.Forward(ctx, c, account, body)
+ case opsRetryTypeGeminiV1B:
+ if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
+ }
+ modelName := strings.TrimSpace(errorLog.Model)
+ action := "generateContent"
+ if errorLog.Stream {
+ action = "streamGenerateContent"
+ }
+ if account.Platform == PlatformAntigravity {
+ _, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
+ } else {
+ _, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
+ }
+ case opsRetryTypeMessages:
+ switch account.Platform {
+ case PlatformAntigravity:
+ if s.antigravityGatewayService == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
+ }
+ _, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
+ case PlatformGemini:
+ if s.geminiCompatService == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
+ }
+ _, err = s.geminiCompatService.Forward(ctx, c, account, body)
+ default:
+ if s.gatewayService == nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
+ }
+ parsedReq, parseErr := ParseGatewayRequest(body)
+ if parseErr != nil {
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
+ }
+ _, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
+ }
+ default:
+ return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
+ }
+
+ statusCode := http.StatusOK
+ if c != nil && c.Writer != nil {
+ statusCode = c.Writer.Status()
+ }
+
+ upstreamReqID := extractUpstreamRequestID(c)
+ preview, truncated := extractResponsePreview(w)
+
+ exec := &opsRetryExecution{
+ status: opsRetryStatusFailed,
+ httpStatusCode: statusCode,
+ upstreamRequestID: upstreamReqID,
+ responsePreview: preview,
+ responseTruncated: truncated,
+ errorMessage: "",
+ }
+
+ if err == nil && statusCode < 400 {
+ exec.status = opsRetryStatusSucceeded
+ return exec
+ }
+
+ if err != nil {
+ exec.errorMessage = err.Error()
+ } else {
+ exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
+ }
+
+ return exec
+}
+
+func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
+ w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
+ c, _ := gin.CreateTestContext(w)
+
+ path := "/"
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
+ path = errorLog.RequestPath
+ }
+
+ req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
+ req.Header.Set("content-type", "application/json")
+ if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
+ req.Header.Set("user-agent", errorLog.UserAgent)
+ }
+ // Restore a minimal, whitelisted subset of request headers to improve retry fidelity
+ // (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
+ if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
+ var stored map[string]string
+ if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
+ for k, v := range stored {
+ key := strings.TrimSpace(k)
+ if key == "" {
+ continue
+ }
+ if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
+ continue
+ }
+ val := strings.TrimSpace(v)
+ if val == "" {
+ continue
+ }
+ req.Header.Set(key, val)
+ }
+ }
+ }
+
+ c.Request = req
+ return c, w
+}
+
+func extractUpstreamRequestID(c *gin.Context) string {
+ if c == nil || c.Writer == nil {
+ return ""
+ }
+ h := c.Writer.Header()
+ if h == nil {
+ return ""
+ }
+	// http.Header.Get canonicalizes the key, so a single lookup covers every casing of x-request-id.
+	if v := strings.TrimSpace(h.Get("X-Request-Id")); v != "" {
+		return v
+	}
+ return ""
+}
+
+func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
+ if w == nil {
+ return "", false
+ }
+ b := bytes.TrimSpace(w.bodyBytes())
+ if len(b) == 0 {
+ return "", w.truncated()
+ }
+ if len(b) > opsRetryResponsePreviewMax {
+ return string(b[:opsRetryResponsePreviewMax]), true
+ }
+ return string(b), w.truncated()
+}
+
+func containsInt64(items []int64, needle int64) bool {
+ for _, v := range items {
+ if v == needle {
+ return true
+ }
+ }
+ return false
+}
+
+func (s *OpsService) isFailoverError(message string) bool {
+ msg := strings.ToLower(strings.TrimSpace(message))
+ if msg == "" {
+ return false
+ }
+ return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
+}
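
The header replay in newOpsRetryContext above is allowlist-only: anything not explicitly listed (credentials in particular) is never copied onto the retried request. A minimal, self-contained sketch of that filtering pattern, using a hypothetical allowlist rather than the patch's opsRetryRequestHeaderAllowlist:

package main

import (
	"fmt"
	"net/http"
	"strings"
)

// Hypothetical allowlist; the real opsRetryRequestHeaderAllowlist is defined elsewhere in this patch.
var headerAllowlist = map[string]bool{
	"anthropic-beta":    true,
	"anthropic-version": true,
}

// restoreAllowedHeaders copies only allowlisted, non-empty headers onto req.
func restoreAllowedHeaders(req *http.Request, stored map[string]string) {
	for k, v := range stored {
		key := strings.TrimSpace(k)
		val := strings.TrimSpace(v)
		if key == "" || val == "" || !headerAllowlist[strings.ToLower(key)] {
			continue
		}
		req.Header.Set(key, val)
	}
}

func main() {
	req, _ := http.NewRequest(http.MethodPost, "http://localhost/v1/messages", nil)
	restoreAllowedHeaders(req, map[string]string{
		"anthropic-version": "2023-06-01",
		"Authorization":     "Bearer sk-redacted", // dropped: not in the allowlist
	})
	fmt.Println(req.Header) // map[Anthropic-Version:[2023-06-01]]
}
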
diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go
new file mode 100644
index 00000000..169c523a
--- /dev/null
+++ b/backend/internal/service/ops_service.go
@@ -0,0 +1,451 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "errors"
+ "log"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/config"
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
+
+const (
+ opsMaxStoredRequestBodyBytes = 10 * 1024
+ opsMaxStoredErrorBodyBytes = 20 * 1024
+)
+
+// OpsService provides ingestion and query APIs for the Ops monitoring module.
+type OpsService struct {
+ opsRepo OpsRepository
+ settingRepo SettingRepository
+ cfg *config.Config
+
+ accountRepo AccountRepository
+
+ concurrencyService *ConcurrencyService
+ gatewayService *GatewayService
+ openAIGatewayService *OpenAIGatewayService
+ geminiCompatService *GeminiMessagesCompatService
+ antigravityGatewayService *AntigravityGatewayService
+}
+
+func NewOpsService(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ cfg *config.Config,
+ accountRepo AccountRepository,
+ concurrencyService *ConcurrencyService,
+ gatewayService *GatewayService,
+ openAIGatewayService *OpenAIGatewayService,
+ geminiCompatService *GeminiMessagesCompatService,
+ antigravityGatewayService *AntigravityGatewayService,
+) *OpsService {
+ return &OpsService{
+ opsRepo: opsRepo,
+ settingRepo: settingRepo,
+ cfg: cfg,
+
+ accountRepo: accountRepo,
+
+ concurrencyService: concurrencyService,
+ gatewayService: gatewayService,
+ openAIGatewayService: openAIGatewayService,
+ geminiCompatService: geminiCompatService,
+ antigravityGatewayService: antigravityGatewayService,
+ }
+}
+
+func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
+ if s.IsMonitoringEnabled(ctx) {
+ return nil
+ }
+ return ErrOpsDisabled
+}
+
+func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
+ // Hard switch: disable ops entirely.
+ if s.cfg != nil && !s.cfg.Ops.Enabled {
+ return false
+ }
+ if s.settingRepo == nil {
+ return true
+ }
+ value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
+ if err != nil {
+		// Default to enabled when the key is missing, and fail open on transient
+		// errors (ops should never block gateway traffic).
+ if errors.Is(err, ErrSettingNotFound) {
+ return true
+ }
+ return true
+ }
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return false
+ default:
+ return true
+ }
+}
+
+func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
+ if entry == nil {
+ return nil
+ }
+ if !s.IsMonitoringEnabled(ctx) {
+ return nil
+ }
+ if s.opsRepo == nil {
+ return nil
+ }
+
+ // Ensure timestamps are always populated.
+ if entry.CreatedAt.IsZero() {
+ entry.CreatedAt = time.Now()
+ }
+
+ // Ensure required fields exist (DB has NOT NULL constraints).
+ entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
+ entry.ErrorType = strings.TrimSpace(entry.ErrorType)
+ if entry.ErrorPhase == "" {
+ entry.ErrorPhase = "internal"
+ }
+ if entry.ErrorType == "" {
+ entry.ErrorType = "api_error"
+ }
+
+ // Sanitize + trim request body (errors only).
+ if len(rawRequestBody) > 0 {
+ sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
+ if sanitized != "" {
+ entry.RequestBodyJSON = &sanitized
+ }
+ entry.RequestBodyTruncated = truncated
+ entry.RequestBodyBytes = &bytesLen
+ }
+
+ // Sanitize + truncate error_body to avoid storing sensitive data.
+ if strings.TrimSpace(entry.ErrorBody) != "" {
+ sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
+ entry.ErrorBody = sanitized
+ }
+
+ if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
+		// Best-effort persistence: log it here; callers must not let this error block gateway traffic.
+ log.Printf("[Ops] RecordError failed: %v", err)
+ return err
+ }
+ return nil
+}
+
+func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
+ }
+ return s.opsRepo.ListErrorLogs(ctx, filter)
+}
+
+func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
+ }
+ detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
+ if err != nil {
+ if errors.Is(err, sql.ErrNoRows) {
+ return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
+ }
+ return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
+ }
+ return detail, nil
+}
+
+func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
+ bytesLen = len(raw)
+ if len(raw) == 0 {
+ return "", false, 0
+ }
+
+ var decoded any
+ if err := json.Unmarshal(raw, &decoded); err != nil {
+ // If it's not valid JSON, don't store (retry would not be reliable anyway).
+ return "", false, bytesLen
+ }
+
+ decoded = redactSensitiveJSON(decoded)
+
+ encoded, err := json.Marshal(decoded)
+ if err != nil {
+ return "", false, bytesLen
+ }
+ if len(encoded) <= maxBytes {
+ return string(encoded), false, bytesLen
+ }
+
+ // Trim conversation history to keep the most recent context.
+ if root, ok := decoded.(map[string]any); ok {
+ if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
+ encoded2, err2 := json.Marshal(trimmed)
+ if err2 == nil && len(encoded2) <= maxBytes {
+ return string(encoded2), true, bytesLen
+ }
+ // Fallthrough: keep shrinking.
+ decoded = trimmed
+ }
+
+ essential := shrinkToEssentials(root)
+ encoded3, err3 := json.Marshal(essential)
+ if err3 == nil && len(encoded3) <= maxBytes {
+ return string(encoded3), true, bytesLen
+ }
+ }
+
+ // Last resort: store a minimal placeholder (still valid JSON).
+ placeholder := map[string]any{
+ "request_body_truncated": true,
+ }
+ if model := extractString(decoded, "model"); model != "" {
+ placeholder["model"] = model
+ }
+ encoded4, err4 := json.Marshal(placeholder)
+ if err4 != nil {
+ return "", true, bytesLen
+ }
+ return string(encoded4), true, bytesLen
+}
+
+func redactSensitiveJSON(v any) any {
+ switch t := v.(type) {
+ case map[string]any:
+ out := make(map[string]any, len(t))
+ for k, vv := range t {
+ if isSensitiveKey(k) {
+ out[k] = "[REDACTED]"
+ continue
+ }
+ out[k] = redactSensitiveJSON(vv)
+ }
+ return out
+ case []any:
+ out := make([]any, 0, len(t))
+ for _, vv := range t {
+ out = append(out, redactSensitiveJSON(vv))
+ }
+ return out
+ default:
+ return v
+ }
+}
+
+func isSensitiveKey(key string) bool {
+ k := strings.ToLower(strings.TrimSpace(key))
+ if k == "" {
+ return false
+ }
+
+ // Exact matches (common credential fields).
+ switch k {
+ case "authorization",
+ "proxy-authorization",
+ "x-api-key",
+ "api_key",
+ "apikey",
+ "access_token",
+ "refresh_token",
+ "id_token",
+ "session_token",
+ "token",
+ "password",
+ "passwd",
+ "passphrase",
+ "secret",
+ "client_secret",
+ "private_key",
+ "jwt",
+ "signature",
+ "accesskeyid",
+ "secretaccesskey":
+ return true
+ }
+
+ // Suffix matches.
+ for _, suffix := range []string{
+ "_secret",
+ "_token",
+ "_id_token",
+ "_session_token",
+ "_password",
+ "_passwd",
+ "_passphrase",
+ "_key",
+ "secret_key",
+ "private_key",
+ } {
+ if strings.HasSuffix(k, suffix) {
+ return true
+ }
+ }
+
+	// Substring matches (intentionally broad; err on the side of redacting too much).
+ for _, sub := range []string{
+ "secret",
+ "token",
+ "password",
+ "passwd",
+ "passphrase",
+ "privatekey",
+ "private_key",
+ "apikey",
+ "api_key",
+ "accesskeyid",
+ "secretaccesskey",
+ "bearer",
+ "cookie",
+ "credential",
+ "session",
+ "jwt",
+ "signature",
+ } {
+ if strings.Contains(k, sub) {
+ return true
+ }
+ }
+
+ return false
+}
+
+func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
+ // Supported: anthropic/openai: messages; gemini: contents.
+ if out, ok := trimArrayField(root, "messages", maxBytes); ok {
+ return out, true
+ }
+ if out, ok := trimArrayField(root, "contents", maxBytes); ok {
+ return out, true
+ }
+ return root, false
+}
+
+func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
+ raw, ok := root[field]
+ if !ok {
+ return nil, false
+ }
+ arr, ok := raw.([]any)
+ if !ok || len(arr) == 0 {
+ return nil, false
+ }
+
+ // Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
+ // We are dropping from the *front* of the array (oldest context first).
+ lo := 0
+ hi := len(arr) - 1 // inclusive; hi ensures at least one item remains
+
+ var best map[string]any
+ found := false
+
+ for lo <= hi {
+ mid := (lo + hi) / 2
+ candidateArr := arr[mid:]
+ if len(candidateArr) == 0 {
+ lo = mid + 1
+ continue
+ }
+
+ next := shallowCopyMap(root)
+ next[field] = candidateArr
+ encoded, err := json.Marshal(next)
+ if err != nil {
+ // If marshal fails, try dropping more.
+ lo = mid + 1
+ continue
+ }
+
+ if len(encoded) <= maxBytes {
+ best = next
+ found = true
+ // Try to keep more context by dropping fewer items.
+ hi = mid - 1
+ continue
+ }
+
+ // Need to drop more.
+ lo = mid + 1
+ }
+
+ if found {
+ return best, true
+ }
+
+ // Nothing fit (even with only one element); return the smallest slice and let the
+ // caller fall back to shrinkToEssentials().
+ next := shallowCopyMap(root)
+ next[field] = arr[len(arr)-1:]
+ return next, true
+}
+
+func shrinkToEssentials(root map[string]any) map[string]any {
+ out := make(map[string]any)
+ for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
+ if v, ok := root[key]; ok {
+ out[key] = v
+ }
+ }
+
+ // Keep only the last element of the conversation array.
+ if v, ok := root["messages"]; ok {
+ if arr, ok := v.([]any); ok && len(arr) > 0 {
+ out["messages"] = []any{arr[len(arr)-1]}
+ }
+ }
+ if v, ok := root["contents"]; ok {
+ if arr, ok := v.([]any); ok && len(arr) > 0 {
+ out["contents"] = []any{arr[len(arr)-1]}
+ }
+ }
+ return out
+}
+
+func shallowCopyMap(m map[string]any) map[string]any {
+ out := make(map[string]any, len(m))
+ for k, v := range m {
+ out[k] = v
+ }
+ return out
+}
+
+func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
+ raw = strings.TrimSpace(raw)
+ if raw == "" {
+ return "", false
+ }
+
+ // Prefer JSON-safe sanitization when possible.
+ if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
+ return out, trunc
+ }
+
+ // Non-JSON: best-effort truncate.
+ if maxBytes > 0 && len(raw) > maxBytes {
+ return truncateString(raw, maxBytes), true
+ }
+ return raw, false
+}
+
+func extractString(v any, key string) string {
+ root, ok := v.(map[string]any)
+ if !ok {
+ return ""
+ }
+ s, _ := root[key].(string)
+ return strings.TrimSpace(s)
+}
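
sanitizeAndTrimRequestBody and redactSensitiveJSON above decode the body, recursively mask credential-looking keys, re-encode, and then shrink the conversation until the result fits the 10 KiB budget. The redaction walk on its own can be shown with a much smaller standalone sketch (the key test here is deliberately simplified compared to isSensitiveKey):

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// redact walks a decoded JSON value and masks keys that look like credentials.
func redact(v any) any {
	switch t := v.(type) {
	case map[string]any:
		out := make(map[string]any, len(t))
		for k, vv := range t {
			lk := strings.ToLower(k)
			if strings.Contains(lk, "key") || strings.Contains(lk, "token") || strings.Contains(lk, "secret") {
				out[k] = "[REDACTED]"
				continue
			}
			out[k] = redact(vv)
		}
		return out
	case []any:
		for i, vv := range t {
			t[i] = redact(vv)
		}
		return t
	default:
		return v
	}
}

func main() {
	var body any
	_ = json.Unmarshal([]byte(`{"model":"claude-3","api_key":"sk-secret","messages":[{"role":"user"}]}`), &body)
	out, _ := json.Marshal(redact(body))
	fmt.Println(string(out)) // {"api_key":"[REDACTED]","messages":[{"role":"user"}],"model":"claude-3"}
}
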
diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go
new file mode 100644
index 00000000..2f15bc79
--- /dev/null
+++ b/backend/internal/service/ops_settings.go
@@ -0,0 +1,354 @@
+package service
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "strings"
+ "time"
+)
+
+const (
+ opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
+ opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
+)
+
+// =========================
+// Email notification config
+// =========================
+
+func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
+ defaultCfg := defaultOpsEmailNotificationConfig()
+ if s == nil || s.settingRepo == nil {
+ return defaultCfg, nil
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
+ if err != nil {
+ if errors.Is(err, ErrSettingNotFound) {
+ // Initialize defaults on first read (best-effort).
+ if b, mErr := json.Marshal(defaultCfg); mErr == nil {
+ _ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
+ }
+ return defaultCfg, nil
+ }
+ return nil, err
+ }
+
+ cfg := &OpsEmailNotificationConfig{}
+ if err := json.Unmarshal([]byte(raw), cfg); err != nil {
+ // Corrupted JSON should not break ops UI; fall back to defaults.
+ return defaultCfg, nil
+ }
+ normalizeOpsEmailNotificationConfig(cfg)
+ return cfg, nil
+}
+
+func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
+ if s == nil || s.settingRepo == nil {
+ return nil, errors.New("setting repository not initialized")
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ if req == nil {
+ return nil, errors.New("invalid request")
+ }
+
+ cfg, err := s.GetEmailNotificationConfig(ctx)
+ if err != nil {
+ return nil, err
+ }
+
+ if req.Alert != nil {
+ cfg.Alert.Enabled = req.Alert.Enabled
+ if req.Alert.Recipients != nil {
+ cfg.Alert.Recipients = req.Alert.Recipients
+ }
+ cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
+ cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
+ cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
+ cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
+ }
+
+ if req.Report != nil {
+ cfg.Report.Enabled = req.Report.Enabled
+ if req.Report.Recipients != nil {
+ cfg.Report.Recipients = req.Report.Recipients
+ }
+ cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
+ cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
+ cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
+ cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
+ cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
+ cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
+ cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
+ cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
+ cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
+ cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
+ }
+
+ if err := validateOpsEmailNotificationConfig(cfg); err != nil {
+ return nil, err
+ }
+
+ normalizeOpsEmailNotificationConfig(cfg)
+ raw, err := json.Marshal(cfg)
+ if err != nil {
+ return nil, err
+ }
+ if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
+ return nil, err
+ }
+ return cfg, nil
+}
+
+func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
+ return &OpsEmailNotificationConfig{
+ Alert: OpsEmailAlertConfig{
+ Enabled: true,
+ Recipients: []string{},
+ MinSeverity: "",
+ RateLimitPerHour: 0,
+ BatchingWindowSeconds: 0,
+ IncludeResolvedAlerts: false,
+ },
+ Report: OpsEmailReportConfig{
+ Enabled: false,
+ Recipients: []string{},
+ DailySummaryEnabled: false,
+ DailySummarySchedule: "0 9 * * *",
+ WeeklySummaryEnabled: false,
+ WeeklySummarySchedule: "0 9 * * 1",
+ ErrorDigestEnabled: false,
+ ErrorDigestSchedule: "0 9 * * *",
+ ErrorDigestMinCount: 10,
+ AccountHealthEnabled: false,
+ AccountHealthSchedule: "0 9 * * *",
+ AccountHealthErrorRateThreshold: 10.0,
+ },
+ }
+}
+
+func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
+ if cfg == nil {
+ return
+ }
+ if cfg.Alert.Recipients == nil {
+ cfg.Alert.Recipients = []string{}
+ }
+ if cfg.Report.Recipients == nil {
+ cfg.Report.Recipients = []string{}
+ }
+
+ cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
+ cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
+ cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
+ cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
+ cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
+
+ // Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
+ if cfg.Report.DailySummarySchedule == "" {
+ cfg.Report.DailySummarySchedule = "0 9 * * *"
+ }
+ if cfg.Report.WeeklySummarySchedule == "" {
+ cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
+ }
+ if cfg.Report.ErrorDigestSchedule == "" {
+ cfg.Report.ErrorDigestSchedule = "0 9 * * *"
+ }
+ if cfg.Report.AccountHealthSchedule == "" {
+ cfg.Report.AccountHealthSchedule = "0 9 * * *"
+ }
+}
+
+func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
+ if cfg == nil {
+ return errors.New("invalid config")
+ }
+
+ if cfg.Alert.RateLimitPerHour < 0 {
+ return errors.New("alert.rate_limit_per_hour must be >= 0")
+ }
+ if cfg.Alert.BatchingWindowSeconds < 0 {
+ return errors.New("alert.batching_window_seconds must be >= 0")
+ }
+ switch strings.TrimSpace(cfg.Alert.MinSeverity) {
+ case "", "critical", "warning", "info":
+ default:
+ return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
+ }
+
+ if cfg.Report.ErrorDigestMinCount < 0 {
+ return errors.New("report.error_digest_min_count must be >= 0")
+ }
+ if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
+ return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
+ }
+ return nil
+}
+
+// =========================
+// Alert runtime settings
+// =========================
+
+func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
+ return &OpsAlertRuntimeSettings{
+ EvaluationIntervalSeconds: 60,
+ DistributedLock: OpsDistributedLockSettings{
+ Enabled: true,
+ Key: opsAlertEvaluatorLeaderLockKeyDefault,
+ TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
+ },
+ Silencing: OpsAlertSilencingSettings{
+ Enabled: false,
+ GlobalUntilRFC3339: "",
+ GlobalReason: "",
+ Entries: []OpsAlertSilenceEntry{},
+ },
+ }
+}
+
+func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
+ if s == nil {
+ return
+ }
+ s.Key = strings.TrimSpace(s.Key)
+ if s.Key == "" {
+ s.Key = defaultKey
+ }
+ if s.TTLSeconds <= 0 {
+ s.TTLSeconds = defaultTTLSeconds
+ }
+}
+
+func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
+ if s == nil {
+ return
+ }
+ s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
+ s.GlobalReason = strings.TrimSpace(s.GlobalReason)
+ if s.Entries == nil {
+ s.Entries = []OpsAlertSilenceEntry{}
+ }
+ for i := range s.Entries {
+ s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
+ s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
+ }
+}
+
+func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
+ if strings.TrimSpace(s.Key) == "" {
+ return errors.New("distributed_lock.key is required")
+ }
+ if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
+ return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
+ }
+ return nil
+}
+
+func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
+ parse := func(raw string) error {
+ if strings.TrimSpace(raw) == "" {
+ return nil
+ }
+ if _, err := time.Parse(time.RFC3339, raw); err != nil {
+ return errors.New("silencing time must be RFC3339")
+ }
+ return nil
+ }
+
+ if err := parse(s.GlobalUntilRFC3339); err != nil {
+ return err
+ }
+ for _, entry := range s.Entries {
+ if strings.TrimSpace(entry.UntilRFC3339) == "" {
+ return errors.New("silencing.entries.until_rfc3339 is required")
+ }
+ if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
+ return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
+ }
+ }
+ return nil
+}
+
+func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
+ defaultCfg := defaultOpsAlertRuntimeSettings()
+ if s == nil || s.settingRepo == nil {
+ return defaultCfg, nil
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
+ if err != nil {
+ if errors.Is(err, ErrSettingNotFound) {
+ if b, mErr := json.Marshal(defaultCfg); mErr == nil {
+ _ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
+ }
+ return defaultCfg, nil
+ }
+ return nil, err
+ }
+
+ cfg := &OpsAlertRuntimeSettings{}
+ if err := json.Unmarshal([]byte(raw), cfg); err != nil {
+ return defaultCfg, nil
+ }
+
+ if cfg.EvaluationIntervalSeconds <= 0 {
+ cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
+ }
+ normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
+ normalizeOpsAlertSilencingSettings(&cfg.Silencing)
+
+ return cfg, nil
+}
+
+func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
+ if s == nil || s.settingRepo == nil {
+ return nil, errors.New("setting repository not initialized")
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ if cfg == nil {
+ return nil, errors.New("invalid config")
+ }
+
+ if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
+ return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
+ }
+ if cfg.DistributedLock.Enabled {
+ if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
+ return nil, err
+ }
+ }
+ if cfg.Silencing.Enabled {
+ if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
+ return nil, err
+ }
+ }
+
+ defaultCfg := defaultOpsAlertRuntimeSettings()
+ normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
+ normalizeOpsAlertSilencingSettings(&cfg.Silencing)
+
+ raw, err := json.Marshal(cfg)
+ if err != nil {
+ return nil, err
+ }
+ if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
+ return nil, err
+ }
+
+ // Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
+ updated := &OpsAlertRuntimeSettings{}
+ _ = json.Unmarshal(raw, updated)
+ return updated, nil
+}
+
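
The report schedules above are five-field cron expressions ("0 9 * * *" is 09:00 daily, "0 9 * * 1" is 09:00 every Monday). Since robfig/cron/v3 is already pulled in via go.mod, a validator along the following lines could reject malformed specs before they are persisted; this is a sketch, not something the patch itself adds:

package main

import (
	"fmt"

	"github.com/robfig/cron/v3"
)

// validateSchedule parses a standard five-field cron spec (minute hour dom month dow).
func validateSchedule(spec string) error {
	_, err := cron.ParseStandard(spec)
	return err
}

func main() {
	for _, spec := range []string{"0 9 * * *", "0 9 * * 1", "not-a-cron"} {
		if err := validateSchedule(spec); err != nil {
			fmt.Printf("%q invalid: %v\n", spec, err)
			continue
		}
		fmt.Printf("%q ok\n", spec)
	}
}
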
diff --git a/backend/internal/service/ops_trends.go b/backend/internal/service/ops_trends.go
new file mode 100644
index 00000000..9237544c
--- /dev/null
+++ b/backend/internal/service/ops_trends.go
@@ -0,0 +1,27 @@
+package service
+
+import (
+ "context"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ if filter == nil {
+ return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
+ }
+ if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
+ }
+ if filter.StartTime.After(filter.EndTime) {
+ return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
+ }
+ return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
+}
+
diff --git a/backend/internal/service/ops_window_stats.go b/backend/internal/service/ops_window_stats.go
new file mode 100644
index 00000000..71021d15
--- /dev/null
+++ b/backend/internal/service/ops_window_stats.go
@@ -0,0 +1,24 @@
+package service
+
+import (
+ "context"
+ "time"
+
+ infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
+)
+
+// GetWindowStats returns lightweight request/token counts for the provided window.
+// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing percentiles/peaks.
+func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
+ if err := s.RequireMonitoringEnabled(ctx); err != nil {
+ return nil, err
+ }
+ if s.opsRepo == nil {
+ return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+ }
+ filter := &OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ }
+ return s.opsRepo.GetWindowStats(ctx, filter)
+}
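
GetWindowStats is the realtime sampling primitive referenced in its doc comment (WebSocket QPS push). One plausible sampling loop on top of it is sketched below; the one-second tick, the ten-second trailing window, and the push callback are illustrative assumptions, not part of this patch:

package opsdemo

import (
	"context"
	"time"

	"github.com/Wei-Shaw/sub2api/internal/service"
)

// sampleQPS polls a short trailing window once per second and hands each result
// to push (for example a WebSocket broadcast). svc is the *OpsService added by this patch.
func sampleQPS(ctx context.Context, svc *service.OpsService, push func(*service.OpsWindowStats)) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case now := <-ticker.C:
			stats, err := svc.GetWindowStats(ctx, now.Add(-10*time.Second), now)
			if err != nil {
				continue // best-effort: a failed sample should never break the caller
			}
			push(stats)
		}
	}
}
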
diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go
index d4b984d6..bf78601f 100644
--- a/backend/internal/service/wire.go
+++ b/backend/internal/service/wire.go
@@ -1,10 +1,12 @@
package service
import (
+ "database/sql"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire"
+ "github.com/redis/go-redis/v9"
)
// BuildInfo contains build information
@@ -70,6 +72,57 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc
}
+// ProvideOpsMetricsCollector creates and starts OpsMetricsCollector.
+func ProvideOpsMetricsCollector(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsMetricsCollector {
+ collector := NewOpsMetricsCollector(opsRepo, settingRepo, db, redisClient, cfg)
+ collector.Start()
+ return collector
+}
+
+// ProvideOpsAggregationService creates and starts OpsAggregationService (hourly/daily pre-aggregation).
+func ProvideOpsAggregationService(
+ opsRepo OpsRepository,
+ settingRepo SettingRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsAggregationService {
+ svc := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
+ svc.Start()
+ return svc
+}
+
+// ProvideOpsAlertEvaluatorService creates and starts OpsAlertEvaluatorService.
+func ProvideOpsAlertEvaluatorService(
+ opsService *OpsService,
+ opsRepo OpsRepository,
+ emailService *EmailService,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsAlertEvaluatorService {
+ svc := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
+ svc.Start()
+ return svc
+}
+
+// ProvideOpsCleanupService creates and starts OpsCleanupService (cron scheduled).
+func ProvideOpsCleanupService(
+ opsRepo OpsRepository,
+ db *sql.DB,
+ redisClient *redis.Client,
+ cfg *config.Config,
+) *OpsCleanupService {
+ svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
+ svc.Start()
+ return svc
+}
+
// ProviderSet is the Wire provider set for all services
var ProviderSet = wire.NewSet(
// Core services
@@ -101,6 +154,11 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService,
NewAccountTestService,
NewSettingService,
+ NewOpsService,
+ ProvideOpsMetricsCollector,
+ ProvideOpsAggregationService,
+ ProvideOpsAlertEvaluatorService,
+ ProvideOpsCleanupService,
NewEmailService,
ProvideEmailQueueService,
NewTurnstileService,
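
Because each Provide* function above both constructs and Starts its background service, an injector only has to reference ProviderSet together with the infrastructure bindings. A hypothetical wireinject stub (the project's real injector is not part of this patch) would look roughly like:

//go:build wireinject

package di

import (
	"database/sql"

	"github.com/Wei-Shaw/sub2api/internal/config"
	"github.com/Wei-Shaw/sub2api/internal/service"
	"github.com/google/wire"
	"github.com/redis/go-redis/v9"
)

// initOpsCollector shows how wire would resolve the new ops providers; the commented-out
// sets stand in for whatever repository/setting bindings the real injector supplies.
func initOpsCollector(db *sql.DB, rdb *redis.Client, cfg *config.Config) *service.OpsMetricsCollector {
	wire.Build(service.ProviderSet /*, repository.ProviderSet, ... */)
	return nil
}
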
From f3ed95d4dea643e54417d0b4e6b8ccd318e0631d Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:54:26 +0800
Subject: [PATCH 05/53] =?UTF-8?q?feat(handler):=20=E5=AE=9E=E7=8E=B0?=
=?UTF-8?q?=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=20API=20=E5=A4=84=E7=90=86?=
=?UTF-8?q?=E5=99=A8=E5=92=8C=E4=B8=AD=E9=97=B4=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add ops error logger (ops_error_logger.go)
- Add main ops handler (ops_handler.go)
- Add alert management handler (ops_alerts_handler.go)
- Add dashboard handler (ops_dashboard_handler.go)
- Add realtime monitoring handler (ops_realtime_handler.go)
- Add settings management handler (ops_settings_handler.go)
- Add WebSocket handler (ops_ws_handler.go)
- Extend settings DTO to support ops configuration
- Add client request ID middleware (client_request_id.go)
- Add WebSocket query-token auth middleware (ws_query_token_auth.go)
- Update admin auth middleware to support ops routes
- Register handlers for dependency injection
---
.../handler/admin/ops_alerts_handler.go | 433 ++++++++++
.../handler/admin/ops_dashboard_handler.go | 243 ++++++
backend/internal/handler/admin/ops_handler.go | 364 +++++++++
.../handler/admin/ops_realtime_handler.go | 120 +++
.../handler/admin/ops_settings_handler.go | 103 +++
.../internal/handler/admin/ops_ws_handler.go | 765 ++++++++++++++++++
backend/internal/handler/dto/settings.go | 5 +
backend/internal/handler/ops_error_logger.go | 681 ++++++++++++++++
backend/internal/handler/wire.go | 3 +
.../internal/server/middleware/admin_auth.go | 52 ++
.../server/middleware/client_request_id.go | 31 +
.../server/middleware/ws_query_token_auth.go | 54 ++
12 files changed, 2854 insertions(+)
create mode 100644 backend/internal/handler/admin/ops_alerts_handler.go
create mode 100644 backend/internal/handler/admin/ops_dashboard_handler.go
create mode 100644 backend/internal/handler/admin/ops_handler.go
create mode 100644 backend/internal/handler/admin/ops_realtime_handler.go
create mode 100644 backend/internal/handler/admin/ops_settings_handler.go
create mode 100644 backend/internal/handler/admin/ops_ws_handler.go
create mode 100644 backend/internal/handler/ops_error_logger.go
create mode 100644 backend/internal/server/middleware/client_request_id.go
create mode 100644 backend/internal/server/middleware/ws_query_token_auth.go
diff --git a/backend/internal/handler/admin/ops_alerts_handler.go b/backend/internal/handler/admin/ops_alerts_handler.go
new file mode 100644
index 00000000..19d9d870
--- /dev/null
+++ b/backend/internal/handler/admin/ops_alerts_handler.go
@@ -0,0 +1,433 @@
+package admin
+
+import (
+ "encoding/json"
+ "fmt"
+ "math"
+ "net/http"
+ "strconv"
+ "strings"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/response"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+ "github.com/gin-gonic/gin/binding"
+)
+
+var validOpsAlertMetricTypes = []string{
+ "success_rate",
+ "error_rate",
+ "upstream_error_rate",
+ "p95_latency_ms",
+ "p99_latency_ms",
+ "cpu_usage_percent",
+ "memory_usage_percent",
+ "concurrency_queue_depth",
+}
+
+var validOpsAlertMetricTypeSet = func() map[string]struct{} {
+ set := make(map[string]struct{}, len(validOpsAlertMetricTypes))
+ for _, v := range validOpsAlertMetricTypes {
+ set[v] = struct{}{}
+ }
+ return set
+}()
+
+var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="}
+
+var validOpsAlertOperatorSet = func() map[string]struct{} {
+ set := make(map[string]struct{}, len(validOpsAlertOperators))
+ for _, v := range validOpsAlertOperators {
+ set[v] = struct{}{}
+ }
+ return set
+}()
+
+var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"}
+
+var validOpsAlertSeveritySet = func() map[string]struct{} {
+ set := make(map[string]struct{}, len(validOpsAlertSeverities))
+ for _, v := range validOpsAlertSeverities {
+ set[v] = struct{}{}
+ }
+ return set
+}()
+
+type opsAlertRuleValidatedInput struct {
+ Name string
+ MetricType string
+ Operator string
+ Threshold float64
+
+ Severity string
+
+ WindowMinutes int
+ SustainedMinutes int
+ CooldownMinutes int
+
+ Enabled bool
+ NotifyEmail bool
+
+ WindowProvided bool
+ SustainedProvided bool
+ CooldownProvided bool
+ SeverityProvided bool
+ EnabledProvided bool
+ NotifyProvided bool
+}
+
+func isPercentOrRateMetric(metricType string) bool {
+ switch metricType {
+ case "success_rate",
+ "error_rate",
+ "upstream_error_rate",
+ "cpu_usage_percent",
+ "memory_usage_percent":
+ return true
+ default:
+ return false
+ }
+}
+
+func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) {
+ if raw == nil {
+ return nil, fmt.Errorf("invalid request body")
+ }
+
+ requiredFields := []string{"name", "metric_type", "operator", "threshold"}
+ for _, field := range requiredFields {
+ if _, ok := raw[field]; !ok {
+ return nil, fmt.Errorf("%s is required", field)
+ }
+ }
+
+ var name string
+ if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" {
+ return nil, fmt.Errorf("name is required")
+ }
+ name = strings.TrimSpace(name)
+
+ var metricType string
+ if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" {
+ return nil, fmt.Errorf("metric_type is required")
+ }
+ metricType = strings.TrimSpace(metricType)
+ if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok {
+ return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", "))
+ }
+
+ var operator string
+ if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" {
+ return nil, fmt.Errorf("operator is required")
+ }
+ operator = strings.TrimSpace(operator)
+ if _, ok := validOpsAlertOperatorSet[operator]; !ok {
+ return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", "))
+ }
+
+ var threshold float64
+ if err := json.Unmarshal(raw["threshold"], &threshold); err != nil {
+ return nil, fmt.Errorf("threshold must be a number")
+ }
+ if math.IsNaN(threshold) || math.IsInf(threshold, 0) {
+ return nil, fmt.Errorf("threshold must be a finite number")
+ }
+ if isPercentOrRateMetric(metricType) {
+ if threshold < 0 || threshold > 100 {
+ return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType)
+ }
+ } else if threshold < 0 {
+ return nil, fmt.Errorf("threshold must be >= 0")
+ }
+
+ validated := &opsAlertRuleValidatedInput{
+ Name: name,
+ MetricType: metricType,
+ Operator: operator,
+ Threshold: threshold,
+ }
+
+ if v, ok := raw["severity"]; ok {
+ validated.SeverityProvided = true
+ var sev string
+ if err := json.Unmarshal(v, &sev); err != nil {
+ return nil, fmt.Errorf("severity must be a string")
+ }
+ sev = strings.ToUpper(strings.TrimSpace(sev))
+ if sev != "" {
+ if _, ok := validOpsAlertSeveritySet[sev]; !ok {
+ return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", "))
+ }
+ validated.Severity = sev
+ }
+ }
+ if validated.Severity == "" {
+ validated.Severity = "P2"
+ }
+
+ if v, ok := raw["enabled"]; ok {
+ validated.EnabledProvided = true
+ if err := json.Unmarshal(v, &validated.Enabled); err != nil {
+ return nil, fmt.Errorf("enabled must be a boolean")
+ }
+ } else {
+ validated.Enabled = true
+ }
+
+ if v, ok := raw["notify_email"]; ok {
+ validated.NotifyProvided = true
+ if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil {
+ return nil, fmt.Errorf("notify_email must be a boolean")
+ }
+ } else {
+ validated.NotifyEmail = true
+ }
+
+ if v, ok := raw["window_minutes"]; ok {
+ validated.WindowProvided = true
+ if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil {
+ return nil, fmt.Errorf("window_minutes must be an integer")
+ }
+ switch validated.WindowMinutes {
+ case 1, 5, 60:
+ default:
+ return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60")
+ }
+ } else {
+ validated.WindowMinutes = 1
+ }
+
+ if v, ok := raw["sustained_minutes"]; ok {
+ validated.SustainedProvided = true
+ if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil {
+ return nil, fmt.Errorf("sustained_minutes must be an integer")
+ }
+ if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 {
+ return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440")
+ }
+ } else {
+ validated.SustainedMinutes = 1
+ }
+
+ if v, ok := raw["cooldown_minutes"]; ok {
+ validated.CooldownProvided = true
+ if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil {
+ return nil, fmt.Errorf("cooldown_minutes must be an integer")
+ }
+ if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 {
+ return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440")
+ }
+ } else {
+ validated.CooldownMinutes = 0
+ }
+
+ return validated, nil
+}
+
+// ListAlertRules returns all ops alert rules.
+// GET /api/v1/admin/ops/alert-rules
+func (h *OpsHandler) ListAlertRules(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ rules, err := h.opsService.ListAlertRules(c.Request.Context())
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, rules)
+}
+
+// CreateAlertRule creates an ops alert rule.
+// POST /api/v1/admin/ops/alert-rules
+func (h *OpsHandler) CreateAlertRule(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ var raw map[string]json.RawMessage
+ if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+ validated, err := validateOpsAlertRulePayload(raw)
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ var rule service.OpsAlertRule
+ if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+
+ rule.Name = validated.Name
+ rule.MetricType = validated.MetricType
+ rule.Operator = validated.Operator
+ rule.Threshold = validated.Threshold
+ rule.WindowMinutes = validated.WindowMinutes
+ rule.SustainedMinutes = validated.SustainedMinutes
+ rule.CooldownMinutes = validated.CooldownMinutes
+ rule.Severity = validated.Severity
+ rule.Enabled = validated.Enabled
+ rule.NotifyEmail = validated.NotifyEmail
+
+ created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, created)
+}
+
+// UpdateAlertRule updates an existing ops alert rule.
+// PUT /api/v1/admin/ops/alert-rules/:id
+func (h *OpsHandler) UpdateAlertRule(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ id, err := strconv.ParseInt(c.Param("id"), 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid rule ID")
+ return
+ }
+
+ var raw map[string]json.RawMessage
+ if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+ validated, err := validateOpsAlertRulePayload(raw)
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ var rule service.OpsAlertRule
+ if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+
+ rule.ID = id
+ rule.Name = validated.Name
+ rule.MetricType = validated.MetricType
+ rule.Operator = validated.Operator
+ rule.Threshold = validated.Threshold
+ rule.WindowMinutes = validated.WindowMinutes
+ rule.SustainedMinutes = validated.SustainedMinutes
+ rule.CooldownMinutes = validated.CooldownMinutes
+ rule.Severity = validated.Severity
+ rule.Enabled = validated.Enabled
+ rule.NotifyEmail = validated.NotifyEmail
+
+ updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, updated)
+}
+
+// DeleteAlertRule deletes an ops alert rule.
+// DELETE /api/v1/admin/ops/alert-rules/:id
+func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ id, err := strconv.ParseInt(c.Param("id"), 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid rule ID")
+ return
+ }
+
+ if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, gin.H{"deleted": true})
+}
+
+// ListAlertEvents lists recent ops alert events.
+// GET /api/v1/admin/ops/alert-events
+func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ limit := 100
+ if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
+ n, err := strconv.Atoi(raw)
+ if err != nil || n <= 0 {
+ response.BadRequest(c, "Invalid limit")
+ return
+ }
+ limit = n
+ }
+
+ filter := &service.OpsAlertEventFilter{
+ Limit: limit,
+ Status: strings.TrimSpace(c.Query("status")),
+ Severity: strings.TrimSpace(c.Query("severity")),
+ }
+
+ // Optional global filter support (platform/group/time range).
+ if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
+ filter.Platform = platform
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+ if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil {
+ // Only apply when explicitly provided to avoid surprising default narrowing.
+ if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" {
+ filter.StartTime = &startTime
+ filter.EndTime = &endTime
+ }
+ } else {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, events)
+}
+
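
CreateAlertRule and UpdateAlertRule above bind the body twice: first into map[string]json.RawMessage so validateOpsAlertRulePayload can tell which optional fields the client actually sent, then into the typed OpsAlertRule. The presence-detection idea in isolation, as a small standalone sketch:

package main

import (
	"encoding/json"
	"fmt"
)

type rulePatch struct {
	Enabled bool `json:"enabled"`
}

func main() {
	body := []byte(`{"name":"high error rate"}`)

	// Pass 1: a raw map records which keys were present at all.
	var raw map[string]json.RawMessage
	_ = json.Unmarshal(body, &raw)
	_, enabledProvided := raw["enabled"]

	// Pass 2: the typed decode; absent fields keep their zero values.
	var patch rulePatch
	_ = json.Unmarshal(body, &patch)

	if !enabledProvided {
		patch.Enabled = true // apply the server-side default only when the field was omitted
	}
	fmt.Println(enabledProvided, patch.Enabled) // false true
}
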
diff --git a/backend/internal/handler/admin/ops_dashboard_handler.go b/backend/internal/handler/admin/ops_dashboard_handler.go
new file mode 100644
index 00000000..2c87f734
--- /dev/null
+++ b/backend/internal/handler/admin/ops_dashboard_handler.go
@@ -0,0 +1,243 @@
+package admin
+
+import (
+ "net/http"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/response"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+)
+
+// GetDashboardOverview returns vNext ops dashboard overview (raw path).
+// GET /api/v1/admin/ops/dashboard/overview
+func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ Platform: strings.TrimSpace(c.Query("platform")),
+ QueryMode: parseOpsQueryMode(c),
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, data)
+}
+
+// GetDashboardThroughputTrend returns throughput time series (raw path).
+// GET /api/v1/admin/ops/dashboard/throughput-trend
+func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ Platform: strings.TrimSpace(c.Query("platform")),
+ QueryMode: parseOpsQueryMode(c),
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
+ data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, data)
+}
+
+// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests).
+// GET /api/v1/admin/ops/dashboard/latency-histogram
+func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ Platform: strings.TrimSpace(c.Query("platform")),
+ QueryMode: parseOpsQueryMode(c),
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, data)
+}
+
+// GetDashboardErrorTrend returns error counts time series (raw path).
+// GET /api/v1/admin/ops/dashboard/error-trend
+func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ Platform: strings.TrimSpace(c.Query("platform")),
+ QueryMode: parseOpsQueryMode(c),
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
+ data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, data)
+}
+
+// GetDashboardErrorDistribution returns error distribution by status code (raw path).
+// GET /api/v1/admin/ops/dashboard/error-distribution
+func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsDashboardFilter{
+ StartTime: startTime,
+ EndTime: endTime,
+ Platform: strings.TrimSpace(c.Query("platform")),
+ QueryMode: parseOpsQueryMode(c),
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+ response.Success(c, data)
+}
+
+func pickThroughputBucketSeconds(window time.Duration) int {
+ // Keep buckets predictable and avoid huge responses.
+ switch {
+ case window <= 2*time.Hour:
+ return 60
+ case window <= 24*time.Hour:
+ return 300
+ default:
+ return 3600
+ }
+}
+
+func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode {
+ if c == nil {
+ return ""
+ }
+ raw := strings.TrimSpace(c.Query("mode"))
+ if raw == "" {
+ // Empty means "use server default" (DB setting ops_query_mode_default).
+ return ""
+ }
+ return service.ParseOpsQueryMode(raw)
+}
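
pickThroughputBucketSeconds keeps trend payloads bounded: windows up to 2h get 60-second buckets, up to 24h get 5-minute buckets, and anything longer gets hourly buckets. A quick illustrative check of the resulting point counts:

package main

import (
	"fmt"
	"time"
)

// pickBucketSeconds mirrors the selection logic in ops_dashboard_handler.go.
func pickBucketSeconds(window time.Duration) int {
	switch {
	case window <= 2*time.Hour:
		return 60
	case window <= 24*time.Hour:
		return 300
	default:
		return 3600
	}
}

func main() {
	for _, w := range []time.Duration{time.Hour, 6 * time.Hour, 72 * time.Hour} {
		b := pickBucketSeconds(w)
		fmt.Printf("window=%v bucket=%ds points=%d\n", w, b, int(w.Seconds())/b)
	}
}
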
diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go
new file mode 100644
index 00000000..bff7426a
--- /dev/null
+++ b/backend/internal/handler/admin/ops_handler.go
@@ -0,0 +1,364 @@
+package admin
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "net/http"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/response"
+ "github.com/Wei-Shaw/sub2api/internal/server/middleware"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+)
+
+type OpsHandler struct {
+ opsService *service.OpsService
+}
+
+func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
+ return &OpsHandler{opsService: opsService}
+}
+
+// GetErrorLogs lists ops error logs.
+// GET /api/v1/admin/ops/errors
+func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ page, pageSize := response.ParsePagination(c)
+ // Ops list can be larger than standard admin tables.
+ if pageSize > 500 {
+ pageSize = 500
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsErrorLogFilter{
+ Page: page,
+ PageSize: pageSize,
+ }
+ if !startTime.IsZero() {
+ filter.StartTime = &startTime
+ }
+ if !endTime.IsZero() {
+ filter.EndTime = &endTime
+ }
+
+ if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
+ filter.Platform = platform
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+ if v := strings.TrimSpace(c.Query("account_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid account_id")
+ return
+ }
+ filter.AccountID = &id
+ }
+ if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
+ filter.Phase = phase
+ }
+ if q := strings.TrimSpace(c.Query("q")); q != "" {
+ filter.Query = q
+ }
+ if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
+ parts := strings.Split(statusCodesStr, ",")
+ out := make([]int, 0, len(parts))
+ for _, part := range parts {
+ p := strings.TrimSpace(part)
+ if p == "" {
+ continue
+ }
+ n, err := strconv.Atoi(p)
+ if err != nil || n < 0 {
+ response.BadRequest(c, "Invalid status_codes")
+ return
+ }
+ out = append(out, n)
+ }
+ filter.StatusCodes = out
+ }
+
+ result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
+}
+
+// GetErrorLogByID returns a single error log detail.
+// GET /api/v1/admin/ops/errors/:id
+func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ idStr := strings.TrimSpace(c.Param("id"))
+ id, err := strconv.ParseInt(idStr, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid error id")
+ return
+ }
+
+ detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ response.Success(c, detail)
+}
+
+// ListRequestDetails returns a request-level list (success + error) for drill-down.
+// GET /api/v1/admin/ops/requests
+func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ page, pageSize := response.ParsePagination(c)
+ if pageSize > 100 {
+ pageSize = 100
+ }
+
+ startTime, endTime, err := parseOpsTimeRange(c, "1h")
+ if err != nil {
+ response.BadRequest(c, err.Error())
+ return
+ }
+
+ filter := &service.OpsRequestDetailFilter{
+ Page: page,
+ PageSize: pageSize,
+ StartTime: &startTime,
+ EndTime: &endTime,
+ }
+
+ filter.Kind = strings.TrimSpace(c.Query("kind"))
+ filter.Platform = strings.TrimSpace(c.Query("platform"))
+ filter.Model = strings.TrimSpace(c.Query("model"))
+ filter.RequestID = strings.TrimSpace(c.Query("request_id"))
+ filter.Query = strings.TrimSpace(c.Query("q"))
+ filter.Sort = strings.TrimSpace(c.Query("sort"))
+
+ if v := strings.TrimSpace(c.Query("user_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid user_id")
+ return
+ }
+ filter.UserID = &id
+ }
+ if v := strings.TrimSpace(c.Query("api_key_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid api_key_id")
+ return
+ }
+ filter.APIKeyID = &id
+ }
+ if v := strings.TrimSpace(c.Query("account_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid account_id")
+ return
+ }
+ filter.AccountID = &id
+ }
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ filter.GroupID = &id
+ }
+
+ if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" {
+ parsed, err := strconv.Atoi(v)
+ if err != nil || parsed < 0 {
+ response.BadRequest(c, "Invalid min_duration_ms")
+ return
+ }
+ filter.MinDurationMs = &parsed
+ }
+ if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" {
+ parsed, err := strconv.Atoi(v)
+ if err != nil || parsed < 0 {
+ response.BadRequest(c, "Invalid max_duration_ms")
+ return
+ }
+ filter.MaxDurationMs = &parsed
+ }
+
+ out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter)
+ if err != nil {
+		// Invalid sort/kind/platform values should surface as a bad request; keep the check simple.
+ if strings.Contains(strings.ToLower(err.Error()), "invalid") {
+ response.BadRequest(c, err.Error())
+ return
+ }
+ response.Error(c, http.StatusInternalServerError, "Failed to list request details")
+ return
+ }
+
+ response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize)
+}
+
+type opsRetryRequest struct {
+ Mode string `json:"mode"`
+ PinnedAccountID *int64 `json:"pinned_account_id"`
+}
+
+// RetryErrorRequest retries a failed request using stored request_body.
+// POST /api/v1/admin/ops/errors/:id/retry
+func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ subject, ok := middleware.GetAuthSubjectFromContext(c)
+ if !ok || subject.UserID <= 0 {
+ response.Error(c, http.StatusUnauthorized, "Unauthorized")
+ return
+ }
+
+ idStr := strings.TrimSpace(c.Param("id"))
+ id, err := strconv.ParseInt(idStr, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid error id")
+ return
+ }
+
+ req := opsRetryRequest{Mode: service.OpsRetryModeClient}
+ if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) {
+ response.BadRequest(c, "Invalid request: "+err.Error())
+ return
+ }
+ if strings.TrimSpace(req.Mode) == "" {
+ req.Mode = service.OpsRetryModeClient
+ }
+
+ result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ response.Success(c, result)
+}
+
+func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
+ startStr := strings.TrimSpace(c.Query("start_time"))
+ endStr := strings.TrimSpace(c.Query("end_time"))
+
+ parseTS := func(s string) (time.Time, error) {
+ if s == "" {
+ return time.Time{}, nil
+ }
+ if t, err := time.Parse(time.RFC3339Nano, s); err == nil {
+ return t, nil
+ }
+ return time.Parse(time.RFC3339, s)
+ }
+
+ start, err := parseTS(startStr)
+ if err != nil {
+ return time.Time{}, time.Time{}, err
+ }
+ end, err := parseTS(endStr)
+ if err != nil {
+ return time.Time{}, time.Time{}, err
+ }
+
+ // start/end explicitly provided (even partially)
+ if startStr != "" || endStr != "" {
+ if end.IsZero() {
+ end = time.Now()
+ }
+ if start.IsZero() {
+ dur, _ := parseOpsDuration(defaultRange)
+ start = end.Add(-dur)
+ }
+ if start.After(end) {
+ return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time")
+ }
+ if end.Sub(start) > 30*24*time.Hour {
+ return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
+ }
+ return start, end, nil
+ }
+
+ // time_range fallback
+ tr := strings.TrimSpace(c.Query("time_range"))
+ if tr == "" {
+ tr = defaultRange
+ }
+ dur, ok := parseOpsDuration(tr)
+ if !ok {
+ dur, _ = parseOpsDuration(defaultRange)
+ }
+
+ end = time.Now()
+ start = end.Add(-dur)
+ if end.Sub(start) > 30*24*time.Hour {
+ return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
+ }
+ return start, end, nil
+}
+
+func parseOpsDuration(v string) (time.Duration, bool) {
+ switch strings.TrimSpace(v) {
+ case "5m":
+ return 5 * time.Minute, true
+ case "30m":
+ return 30 * time.Minute, true
+ case "1h":
+ return time.Hour, true
+ case "6h":
+ return 6 * time.Hour, true
+ case "24h":
+ return 24 * time.Hour, true
+ default:
+ return 0, false
+ }
+}
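
The handlers above resolve the query window from either explicit RFC3339 start_time/end_time values or a coarse time_range token ("5m", "30m", "1h", "6h", "24h"), clamped to a 30-day maximum. A minimal sketch of how the fallback path could be unit-tested; the package, file and test names are assumptions, not part of this patch:

package admin

import (
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/gin-gonic/gin"
)

// parseOpsTimeRange should fall back to the time_range token when no explicit
// start_time/end_time is supplied.
func TestParseOpsTimeRangeTokenFallback(t *testing.T) {
	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)
	c.Request = httptest.NewRequest(http.MethodGet, "/api/v1/admin/ops/errors?time_range=6h", nil)

	start, end, err := parseOpsTimeRange(c, "1h")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got := end.Sub(start); got != 6*time.Hour {
		t.Fatalf("expected a 6h window, got %s", got)
	}
}
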
diff --git a/backend/internal/handler/admin/ops_realtime_handler.go b/backend/internal/handler/admin/ops_realtime_handler.go
new file mode 100644
index 00000000..0c23c13b
--- /dev/null
+++ b/backend/internal/handler/admin/ops_realtime_handler.go
@@ -0,0 +1,120 @@
+package admin
+
+import (
+ "net/http"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/response"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+)
+
+// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
+// GET /api/v1/admin/ops/concurrency
+func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
+ response.Success(c, gin.H{
+ "enabled": false,
+ "platform": map[string]*service.PlatformConcurrencyInfo{},
+ "group": map[int64]*service.GroupConcurrencyInfo{},
+ "account": map[int64]*service.AccountConcurrencyInfo{},
+ "timestamp": time.Now().UTC(),
+ })
+ return
+ }
+
+ platformFilter := strings.TrimSpace(c.Query("platform"))
+ var groupID *int64
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ groupID = &id
+ }
+
+ platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ payload := gin.H{
+ "enabled": true,
+ "platform": platform,
+ "group": group,
+ "account": account,
+ }
+ if collectedAt != nil {
+ payload["timestamp"] = collectedAt.UTC()
+ }
+ response.Success(c, payload)
+}
+
+// GetAccountAvailability returns account availability statistics.
+// GET /api/v1/admin/ops/account-availability
+//
+// Query params:
+// - platform: optional
+// - group_id: optional
+func (h *OpsHandler) GetAccountAvailability(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
+ response.Success(c, gin.H{
+ "enabled": false,
+ "platform": map[string]*service.PlatformAvailability{},
+ "group": map[int64]*service.GroupAvailability{},
+ "account": map[int64]*service.AccountAvailability{},
+ "timestamp": time.Now().UTC(),
+ })
+ return
+ }
+
+ platform := strings.TrimSpace(c.Query("platform"))
+ var groupID *int64
+ if v := strings.TrimSpace(c.Query("group_id")); v != "" {
+ id, err := strconv.ParseInt(v, 10, 64)
+ if err != nil || id <= 0 {
+ response.BadRequest(c, "Invalid group_id")
+ return
+ }
+ groupID = &id
+ }
+
+ platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID)
+ if err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ payload := gin.H{
+ "enabled": true,
+ "platform": platformStats,
+ "group": groupStats,
+ "account": accountStats,
+ }
+ if collectedAt != nil {
+ payload["timestamp"] = collectedAt.UTC()
+ }
+ response.Success(c, payload)
+}
diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go
new file mode 100644
index 00000000..e76c1b20
--- /dev/null
+++ b/backend/internal/handler/admin/ops_settings_handler.go
@@ -0,0 +1,103 @@
+package admin
+
+import (
+ "net/http"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/response"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+)
+
+// GetEmailNotificationConfig returns Ops email notification config (DB-backed).
+// GET /api/v1/admin/ops/email-notification/config
+func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context())
+ if err != nil {
+ response.Error(c, http.StatusInternalServerError, "Failed to get email notification config")
+ return
+ }
+ response.Success(c, cfg)
+}
+
+// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed).
+// PUT /api/v1/admin/ops/email-notification/config
+func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ var req service.OpsEmailNotificationConfigUpdateRequest
+ if err := c.ShouldBindJSON(&req); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+
+ updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req)
+ if err != nil {
+		// Most failures here are validation errors from the request payload; treat them as 400.
+ response.Error(c, http.StatusBadRequest, err.Error())
+ return
+ }
+ response.Success(c, updated)
+}
+
+// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed).
+// GET /api/v1/admin/ops/runtime/alert
+func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context())
+ if err != nil {
+ response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings")
+ return
+ }
+ response.Success(c, cfg)
+}
+
+// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed).
+// PUT /api/v1/admin/ops/runtime/alert
+func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ var req service.OpsAlertRuntimeSettings
+ if err := c.ShouldBindJSON(&req); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+
+ updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req)
+ if err != nil {
+ response.Error(c, http.StatusBadRequest, err.Error())
+ return
+ }
+ response.Success(c, updated)
+}
+
diff --git a/backend/internal/handler/admin/ops_ws_handler.go b/backend/internal/handler/admin/ops_ws_handler.go
new file mode 100644
index 00000000..4bbd9055
--- /dev/null
+++ b/backend/internal/handler/admin/ops_ws_handler.go
@@ -0,0 +1,765 @@
+package admin
+
+import (
+ "context"
+ "encoding/json"
+ "log"
+ "math"
+ "net"
+ "net/http"
+ "net/netip"
+ "net/url"
+ "os"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+ "github.com/gorilla/websocket"
+)
+
+type OpsWSProxyConfig struct {
+ TrustProxy bool
+ TrustedProxies []netip.Prefix
+ OriginPolicy string
+}
+
+const (
+ envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY"
+ envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
+ envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY"
+ envOpsWSMaxConns = "OPS_WS_MAX_CONNS"
+ envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP"
+)
+
+const (
+ OriginPolicyStrict = "strict"
+ OriginPolicyPermissive = "permissive"
+)
+
+var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()
+
+var upgrader = websocket.Upgrader{
+ CheckOrigin: func(r *http.Request) bool {
+ return isAllowedOpsWSOrigin(r)
+ },
+ // Subprotocol negotiation:
+	// - The frontend passes ["sub2api-admin", "jwt.<token>"].
+ // - We always select "sub2api-admin" so the token is never echoed back in the handshake response.
+ Subprotocols: []string{"sub2api-admin"},
+}
+
+const (
+ qpsWSPushInterval = 2 * time.Second
+ qpsWSRefreshInterval = 5 * time.Second
+ qpsWSRequestCountWindow = 1 * time.Minute
+
+ defaultMaxWSConns = 100
+ defaultMaxWSConnsPerIP = 20
+)
+
+var wsConnCount atomic.Int32
+var wsConnCountByIP sync.Map // map[string]*atomic.Int32
+
+const qpsWSIdleStopDelay = 30 * time.Second
+
+const (
+ opsWSCloseRealtimeDisabled = 4001
+)
+
+var qpsWSIdleStopMu sync.Mutex
+var qpsWSIdleStopTimer *time.Timer
+
+func cancelQPSWSIdleStop() {
+ qpsWSIdleStopMu.Lock()
+ if qpsWSIdleStopTimer != nil {
+ qpsWSIdleStopTimer.Stop()
+ qpsWSIdleStopTimer = nil
+ }
+ qpsWSIdleStopMu.Unlock()
+}
+
+func scheduleQPSWSIdleStop() {
+ qpsWSIdleStopMu.Lock()
+ if qpsWSIdleStopTimer != nil {
+ qpsWSIdleStopMu.Unlock()
+ return
+ }
+ qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() {
+ // Only stop if truly idle at fire time.
+ if wsConnCount.Load() == 0 {
+ qpsWSCache.Stop()
+ }
+ qpsWSIdleStopMu.Lock()
+ qpsWSIdleStopTimer = nil
+ qpsWSIdleStopMu.Unlock()
+ })
+ qpsWSIdleStopMu.Unlock()
+}
+
+type opsWSRuntimeLimits struct {
+ MaxConns int32
+ MaxConnsPerIP int32
+}
+
+var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv()
+
+const (
+ qpsWSWriteTimeout = 10 * time.Second
+ qpsWSPongWait = 60 * time.Second
+ qpsWSPingInterval = 30 * time.Second
+
+ // We don't expect clients to send application messages; we only read to process control frames (Pong/Close).
+ qpsWSMaxReadBytes = 1024
+)
+
+type opsWSQPSCache struct {
+ refreshInterval time.Duration
+ requestCountWindow time.Duration
+
+ lastUpdatedUnixNano atomic.Int64
+ payload atomic.Value // []byte
+
+ opsService *service.OpsService
+ cancel context.CancelFunc
+ done chan struct{}
+
+ mu sync.Mutex
+ running bool
+}
+
+var qpsWSCache = &opsWSQPSCache{
+ refreshInterval: qpsWSRefreshInterval,
+ requestCountWindow: qpsWSRequestCountWindow,
+}
+
+func (c *opsWSQPSCache) start(opsService *service.OpsService) {
+ if c == nil || opsService == nil {
+ return
+ }
+
+ for {
+ c.mu.Lock()
+ if c.running {
+ c.mu.Unlock()
+ return
+ }
+
+ // If a previous refresh loop is currently stopping, wait for it to fully exit.
+ done := c.done
+ if done != nil {
+ c.mu.Unlock()
+ <-done
+
+ c.mu.Lock()
+ if c.done == done && !c.running {
+ c.done = nil
+ }
+ c.mu.Unlock()
+ continue
+ }
+
+ c.opsService = opsService
+ ctx, cancel := context.WithCancel(context.Background())
+ c.cancel = cancel
+ c.done = make(chan struct{})
+ done = c.done
+ c.running = true
+ c.mu.Unlock()
+
+ go func() {
+ defer close(done)
+ c.refreshLoop(ctx)
+ }()
+ return
+ }
+}
+
+// Stop stops the background refresh loop.
+// It is safe to call multiple times.
+func (c *opsWSQPSCache) Stop() {
+ if c == nil {
+ return
+ }
+
+ c.mu.Lock()
+ if !c.running {
+ done := c.done
+ c.mu.Unlock()
+ if done != nil {
+ <-done
+ }
+ return
+ }
+ cancel := c.cancel
+ c.cancel = nil
+ c.running = false
+ c.opsService = nil
+ done := c.done
+ c.mu.Unlock()
+
+ if cancel != nil {
+ cancel()
+ }
+ if done != nil {
+ <-done
+ }
+
+ c.mu.Lock()
+ if c.done == done && !c.running {
+ c.done = nil
+ }
+ c.mu.Unlock()
+}
+
+func (c *opsWSQPSCache) refreshLoop(ctx context.Context) {
+ ticker := time.NewTicker(c.refreshInterval)
+ defer ticker.Stop()
+
+ c.refresh(ctx)
+ for {
+ select {
+ case <-ticker.C:
+ c.refresh(ctx)
+ case <-ctx.Done():
+ return
+ }
+ }
+}
+
+func (c *opsWSQPSCache) refresh(parentCtx context.Context) {
+ if c == nil {
+ return
+ }
+
+ c.mu.Lock()
+ opsService := c.opsService
+ c.mu.Unlock()
+ if opsService == nil {
+ return
+ }
+
+ if parentCtx == nil {
+ parentCtx = context.Background()
+ }
+ ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second)
+ defer cancel()
+
+ now := time.Now().UTC()
+ stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now)
+ if err != nil || stats == nil {
+ if err != nil {
+ log.Printf("[OpsWS] refresh: get window stats failed: %v", err)
+ }
+ return
+ }
+
+ requestCount := stats.SuccessCount + stats.ErrorCountTotal
+ qps := 0.0
+ tps := 0.0
+ if c.requestCountWindow > 0 {
+ seconds := c.requestCountWindow.Seconds()
+ qps = roundTo1DP(float64(requestCount) / seconds)
+ tps = roundTo1DP(float64(stats.TokenConsumed) / seconds)
+ }
+
+ payload := gin.H{
+ "type": "qps_update",
+ "timestamp": now.Format(time.RFC3339),
+ "data": gin.H{
+ "qps": qps,
+ "tps": tps,
+ "request_count": requestCount,
+ },
+ }
+
+ msg, err := json.Marshal(payload)
+ if err != nil {
+ log.Printf("[OpsWS] refresh: marshal payload failed: %v", err)
+ return
+ }
+
+ c.payload.Store(msg)
+ c.lastUpdatedUnixNano.Store(now.UnixNano())
+}
+
+func roundTo1DP(v float64) float64 {
+ return math.Round(v*10) / 10
+}
+
+func (c *opsWSQPSCache) getPayload() []byte {
+ if c == nil {
+ return nil
+ }
+ if cached, ok := c.payload.Load().([]byte); ok && cached != nil {
+ return cached
+ }
+ return nil
+}
+
+func closeWS(conn *websocket.Conn, code int, reason string) {
+ if conn == nil {
+ return
+ }
+ msg := websocket.FormatCloseMessage(code, reason)
+ _ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout))
+ _ = conn.Close()
+}
+
+// QPSWSHandler handles realtime QPS push via WebSocket.
+// GET /api/v1/admin/ops/ws/qps
+func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
+ clientIP := requestClientIP(c.Request)
+
+ if h == nil || h.opsService == nil {
+ c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"})
+ return
+ }
+
+ // If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close
+ // with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops.
+ if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
+ conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
+ if err != nil {
+ c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"})
+ return
+ }
+ closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled")
+ return
+ }
+
+ cancelQPSWSIdleStop()
+ // Lazily start the background refresh loop so unit tests that never hit the
+ // websocket route don't spawn goroutines that depend on DB/Redis stubs.
+ qpsWSCache.start(h.opsService)
+
+ // Reserve a global slot before upgrading the connection to keep the limit strict.
+ if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) {
+ log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns)
+ c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
+ return
+ }
+ defer func() {
+ if wsConnCount.Add(-1) == 0 {
+ scheduleQPSWSIdleStop()
+ }
+ }()
+
+ if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" {
+ if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) {
+ log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP)
+ c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
+ return
+ }
+ defer releaseOpsWSIPSlot(clientIP)
+ }
+
+ conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
+ if err != nil {
+ log.Printf("[OpsWS] upgrade failed: %v", err)
+ return
+ }
+
+ defer func() {
+ _ = conn.Close()
+ }()
+
+ handleQPSWebSocket(c.Request.Context(), conn)
+}
+
+func tryAcquireOpsWSTotalSlot(limit int32) bool {
+ if limit <= 0 {
+ return true
+ }
+ for {
+ current := wsConnCount.Load()
+ if current >= limit {
+ return false
+ }
+ if wsConnCount.CompareAndSwap(current, current+1) {
+ return true
+ }
+ }
+}
+
+func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool {
+ if strings.TrimSpace(clientIP) == "" || limit <= 0 {
+ return true
+ }
+
+ v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{})
+ counter := v.(*atomic.Int32)
+
+ for {
+ current := counter.Load()
+ if current >= limit {
+ return false
+ }
+ if counter.CompareAndSwap(current, current+1) {
+ return true
+ }
+ }
+}
+
+func releaseOpsWSIPSlot(clientIP string) {
+ if strings.TrimSpace(clientIP) == "" {
+ return
+ }
+
+ v, ok := wsConnCountByIP.Load(clientIP)
+ if !ok {
+ return
+ }
+ counter := v.(*atomic.Int32)
+ next := counter.Add(-1)
+ if next <= 0 {
+ // Best-effort cleanup; safe even if a new slot was acquired concurrently.
+ wsConnCountByIP.Delete(clientIP)
+ }
+}
+
+func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) {
+ if conn == nil {
+ return
+ }
+
+ ctx, cancel := context.WithCancel(parentCtx)
+ defer cancel()
+
+ var closeOnce sync.Once
+ closeConn := func() {
+ closeOnce.Do(func() {
+ _ = conn.Close()
+ })
+ }
+
+ closeFrameCh := make(chan []byte, 1)
+
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ defer cancel()
+
+ conn.SetReadLimit(qpsWSMaxReadBytes)
+ if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil {
+ log.Printf("[OpsWS] set read deadline failed: %v", err)
+ return
+ }
+ conn.SetPongHandler(func(string) error {
+ return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait))
+ })
+ conn.SetCloseHandler(func(code int, text string) error {
+ select {
+ case closeFrameCh <- websocket.FormatCloseMessage(code, text):
+ default:
+ }
+ cancel()
+ return nil
+ })
+
+ for {
+ _, _, err := conn.ReadMessage()
+ if err != nil {
+ if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) {
+ log.Printf("[OpsWS] read failed: %v", err)
+ }
+ return
+ }
+ }
+ }()
+
+ // Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval).
+ pushTicker := time.NewTicker(qpsWSPushInterval)
+ defer pushTicker.Stop()
+
+ // Heartbeat ping every 30 seconds.
+ pingTicker := time.NewTicker(qpsWSPingInterval)
+ defer pingTicker.Stop()
+
+ writeWithTimeout := func(messageType int, data []byte) error {
+ if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil {
+ return err
+ }
+ return conn.WriteMessage(messageType, data)
+ }
+
+ sendClose := func(closeFrame []byte) {
+ if closeFrame == nil {
+ closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
+ }
+ _ = writeWithTimeout(websocket.CloseMessage, closeFrame)
+ }
+
+ for {
+ select {
+ case <-pushTicker.C:
+ msg := qpsWSCache.getPayload()
+ if msg == nil {
+ continue
+ }
+ if err := writeWithTimeout(websocket.TextMessage, msg); err != nil {
+ log.Printf("[OpsWS] write failed: %v", err)
+ cancel()
+ closeConn()
+ wg.Wait()
+ return
+ }
+
+ case <-pingTicker.C:
+ if err := writeWithTimeout(websocket.PingMessage, nil); err != nil {
+ log.Printf("[OpsWS] ping failed: %v", err)
+ cancel()
+ closeConn()
+ wg.Wait()
+ return
+ }
+
+ case closeFrame := <-closeFrameCh:
+ sendClose(closeFrame)
+ closeConn()
+ wg.Wait()
+ return
+
+ case <-ctx.Done():
+ var closeFrame []byte
+ select {
+ case closeFrame = <-closeFrameCh:
+ default:
+ }
+ sendClose(closeFrame)
+
+ closeConn()
+ wg.Wait()
+ return
+ }
+ }
+}
+
+func isAllowedOpsWSOrigin(r *http.Request) bool {
+ if r == nil {
+ return false
+ }
+ origin := strings.TrimSpace(r.Header.Get("Origin"))
+ if origin == "" {
+ switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) {
+ case OriginPolicyStrict:
+ return false
+ case OriginPolicyPermissive, "":
+ return true
+ default:
+ return true
+ }
+ }
+ parsed, err := url.Parse(origin)
+ if err != nil || parsed.Hostname() == "" {
+ return false
+ }
+ originHost := strings.ToLower(parsed.Hostname())
+
+ trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
+ reqHost := hostWithoutPort(r.Host)
+ if trustProxyHeaders {
+ xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host"))
+ if xfHost != "" {
+ xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0])
+ if xfHost != "" {
+ reqHost = hostWithoutPort(xfHost)
+ }
+ }
+ }
+ reqHost = strings.ToLower(reqHost)
+ if reqHost == "" {
+ return false
+ }
+ return originHost == reqHost
+}
+
+func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
+ if r == nil {
+ return false
+ }
+ if !opsWSProxyConfig.TrustProxy {
+ return false
+ }
+ peerIP, ok := requestPeerIP(r)
+ if !ok {
+ return false
+ }
+ return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies)
+}
+
+func requestPeerIP(r *http.Request) (netip.Addr, bool) {
+ if r == nil {
+ return netip.Addr{}, false
+ }
+ host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
+ if err != nil {
+ host = strings.TrimSpace(r.RemoteAddr)
+ }
+ host = strings.TrimPrefix(host, "[")
+ host = strings.TrimSuffix(host, "]")
+ if host == "" {
+ return netip.Addr{}, false
+ }
+ addr, err := netip.ParseAddr(host)
+ if err != nil {
+ return netip.Addr{}, false
+ }
+ return addr.Unmap(), true
+}
+
+func requestClientIP(r *http.Request) string {
+ if r == nil {
+ return ""
+ }
+
+ trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
+ if trustProxyHeaders {
+ xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For"))
+ if xff != "" {
+ // Use the left-most entry (original client). If multiple proxies add values, they are comma-separated.
+ xff = strings.TrimSpace(strings.Split(xff, ",")[0])
+ xff = strings.TrimPrefix(xff, "[")
+ xff = strings.TrimSuffix(xff, "]")
+ if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() {
+ return addr.Unmap().String()
+ }
+ }
+ }
+
+ if peer, ok := requestPeerIP(r); ok && peer.IsValid() {
+ return peer.String()
+ }
+ return ""
+}
+
+func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
+ if !addr.IsValid() {
+ return false
+ }
+ for _, p := range trusted {
+ if p.Contains(addr) {
+ return true
+ }
+ }
+ return false
+}
+
+func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
+ cfg := OpsWSProxyConfig{
+ TrustProxy: true,
+ TrustedProxies: defaultTrustedProxies(),
+ OriginPolicy: OriginPolicyPermissive,
+ }
+
+ if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" {
+ if parsed, err := strconv.ParseBool(v); err == nil {
+ cfg.TrustProxy = parsed
+ } else {
+ log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy)
+ }
+ }
+
+ if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
+ prefixes, invalid := parseTrustedProxyList(raw)
+ if len(invalid) > 0 {
+ log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
+ }
+ cfg.TrustedProxies = prefixes
+ }
+
+ if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" {
+ normalized := strings.ToLower(v)
+ switch normalized {
+ case OriginPolicyStrict, OriginPolicyPermissive:
+ cfg.OriginPolicy = normalized
+ default:
+ log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
+ }
+ }
+
+ return cfg
+}
+
+func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits {
+ cfg := opsWSRuntimeLimits{
+ MaxConns: defaultMaxWSConns,
+ MaxConnsPerIP: defaultMaxWSConnsPerIP,
+ }
+
+ if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" {
+ if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
+ cfg.MaxConns = int32(parsed)
+ } else {
+ log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns)
+ }
+ }
+ if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" {
+ if parsed, err := strconv.Atoi(v); err == nil && parsed >= 0 {
+ cfg.MaxConnsPerIP = int32(parsed)
+ } else {
+ log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP)
+ }
+ }
+ return cfg
+}
+
+func defaultTrustedProxies() []netip.Prefix {
+ prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
+ return prefixes
+}
+
+func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
+ for _, token := range strings.Split(raw, ",") {
+ item := strings.TrimSpace(token)
+ if item == "" {
+ continue
+ }
+
+ var (
+ p netip.Prefix
+ err error
+ )
+ if strings.Contains(item, "/") {
+ p, err = netip.ParsePrefix(item)
+ } else {
+ var addr netip.Addr
+ addr, err = netip.ParseAddr(item)
+ if err == nil {
+ addr = addr.Unmap()
+ bits := 128
+ if addr.Is4() {
+ bits = 32
+ }
+ p = netip.PrefixFrom(addr, bits)
+ }
+ }
+
+ if err != nil || !p.IsValid() {
+ invalid = append(invalid, item)
+ continue
+ }
+
+ prefixes = append(prefixes, p.Masked())
+ }
+ return prefixes, invalid
+}
+
+func hostWithoutPort(hostport string) string {
+ hostport = strings.TrimSpace(hostport)
+ if hostport == "" {
+ return ""
+ }
+ if host, _, err := net.SplitHostPort(hostport); err == nil {
+ return host
+ }
+ if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
+ return strings.Trim(hostport, "[]")
+ }
+ parts := strings.Split(hostport, ":")
+ return parts[0]
+}
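
Because browsers cannot attach an Authorization header to a WebSocket handshake, the QPS endpoint accepts the admin JWT either as a "jwt.<token>" subprotocol item (handled by the admin auth middleware further below) or as a "?token=" query parameter. A minimal sketch of a Go client using the subprotocol convention; the host is an assumption and the token placeholder must be replaced with a real admin JWT:

package main

import (
	"log"

	"github.com/gorilla/websocket"
)

func main() {
	token := "..." // admin JWT obtained via the normal login flow
	dialer := websocket.Dialer{
		// The server always selects "sub2api-admin"; the "jwt.<token>" item only
		// carries the credential and is never echoed back in the handshake.
		Subprotocols: []string{"sub2api-admin", "jwt." + token},
	}
	conn, _, err := dialer.Dial("wss://ops.example.internal/api/v1/admin/ops/ws/qps", nil)
	if err != nil {
		log.Fatalf("dial failed: %v", err)
	}
	defer conn.Close()

	// Close code 4001 signals that realtime monitoring is disabled server-side.
	for {
		_, msg, err := conn.ReadMessage()
		if err != nil {
			log.Printf("connection closed: %v", err)
			return
		}
		log.Printf("qps_update: %s", msg)
	}
}
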
diff --git a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go
index 4c50cedf..6fd53b26 100644
--- a/backend/internal/handler/dto/settings.go
+++ b/backend/internal/handler/dto/settings.go
@@ -37,6 +37,11 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
+
+ // Ops monitoring (vNext)
+ OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
+ OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
+ OpsQueryModeDefault string `json:"ops_query_mode_default"`
}
type PublicSettings struct {
diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go
new file mode 100644
index 00000000..b3a90c2f
--- /dev/null
+++ b/backend/internal/handler/ops_error_logger.go
@@ -0,0 +1,681 @@
+package handler
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "log"
+ "runtime"
+ "runtime/debug"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+ "unicode/utf8"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
+ middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
+ "github.com/Wei-Shaw/sub2api/internal/service"
+ "github.com/gin-gonic/gin"
+)
+
+const (
+ opsModelKey = "ops_model"
+ opsStreamKey = "ops_stream"
+ opsRequestBodyKey = "ops_request_body"
+ opsAccountIDKey = "ops_account_id"
+)
+
+const (
+ opsErrorLogTimeout = 5 * time.Second
+ opsErrorLogDrainTimeout = 10 * time.Second
+
+ opsErrorLogMinWorkerCount = 4
+ opsErrorLogMaxWorkerCount = 32
+
+ opsErrorLogQueueSizePerWorker = 128
+ opsErrorLogMinQueueSize = 256
+ opsErrorLogMaxQueueSize = 8192
+)
+
+type opsErrorLogJob struct {
+ ops *service.OpsService
+ entry *service.OpsInsertErrorLogInput
+ requestBody []byte
+}
+
+var (
+ opsErrorLogOnce sync.Once
+ opsErrorLogQueue chan opsErrorLogJob
+
+ opsErrorLogStopOnce sync.Once
+ opsErrorLogWorkersWg sync.WaitGroup
+ opsErrorLogMu sync.RWMutex
+ opsErrorLogStopping bool
+ opsErrorLogQueueLen atomic.Int64
+ opsErrorLogEnqueued atomic.Int64
+ opsErrorLogDropped atomic.Int64
+ opsErrorLogProcessed atomic.Int64
+
+ opsErrorLogLastDropLogAt atomic.Int64
+
+ opsErrorLogShutdownCh = make(chan struct{})
+ opsErrorLogShutdownOnce sync.Once
+ opsErrorLogDrained atomic.Bool
+)
+
+func startOpsErrorLogWorkers() {
+ opsErrorLogMu.Lock()
+ defer opsErrorLogMu.Unlock()
+
+ if opsErrorLogStopping {
+ return
+ }
+
+ workerCount, queueSize := opsErrorLogConfig()
+ opsErrorLogQueue = make(chan opsErrorLogJob, queueSize)
+ opsErrorLogQueueLen.Store(0)
+
+ opsErrorLogWorkersWg.Add(workerCount)
+ for i := 0; i < workerCount; i++ {
+ go func() {
+ defer opsErrorLogWorkersWg.Done()
+ for job := range opsErrorLogQueue {
+ opsErrorLogQueueLen.Add(-1)
+ if job.ops == nil || job.entry == nil {
+ continue
+ }
+ func() {
+ defer func() {
+ if r := recover(); r != nil {
+ log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack())
+ }
+ }()
+ ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
+ _ = job.ops.RecordError(ctx, job.entry, job.requestBody)
+ cancel()
+ opsErrorLogProcessed.Add(1)
+ }()
+ }
+ }()
+ }
+}
+
+func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) {
+ if ops == nil || entry == nil {
+ return
+ }
+ select {
+ case <-opsErrorLogShutdownCh:
+ return
+ default:
+ }
+
+ opsErrorLogMu.RLock()
+ stopping := opsErrorLogStopping
+ opsErrorLogMu.RUnlock()
+ if stopping {
+ return
+ }
+
+ opsErrorLogOnce.Do(startOpsErrorLogWorkers)
+
+ opsErrorLogMu.RLock()
+ defer opsErrorLogMu.RUnlock()
+ if opsErrorLogStopping || opsErrorLogQueue == nil {
+ return
+ }
+
+ select {
+ case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}:
+ opsErrorLogQueueLen.Add(1)
+ opsErrorLogEnqueued.Add(1)
+ default:
+ // Queue is full; drop to avoid blocking request handling.
+ opsErrorLogDropped.Add(1)
+ maybeLogOpsErrorLogDrop()
+ }
+}
+
+func StopOpsErrorLogWorkers() bool {
+ opsErrorLogStopOnce.Do(func() {
+ opsErrorLogShutdownOnce.Do(func() {
+ close(opsErrorLogShutdownCh)
+ })
+ opsErrorLogDrained.Store(stopOpsErrorLogWorkers())
+ })
+ return opsErrorLogDrained.Load()
+}
+
+func stopOpsErrorLogWorkers() bool {
+ opsErrorLogMu.Lock()
+ opsErrorLogStopping = true
+ ch := opsErrorLogQueue
+ if ch != nil {
+ close(ch)
+ }
+ opsErrorLogQueue = nil
+ opsErrorLogMu.Unlock()
+
+ if ch == nil {
+ opsErrorLogQueueLen.Store(0)
+ return true
+ }
+
+ done := make(chan struct{})
+ go func() {
+ opsErrorLogWorkersWg.Wait()
+ close(done)
+ }()
+
+ select {
+ case <-done:
+ opsErrorLogQueueLen.Store(0)
+ return true
+ case <-time.After(opsErrorLogDrainTimeout):
+ return false
+ }
+}
+
+func OpsErrorLogQueueLength() int64 {
+ return opsErrorLogQueueLen.Load()
+}
+
+func OpsErrorLogQueueCapacity() int {
+ opsErrorLogMu.RLock()
+ ch := opsErrorLogQueue
+ opsErrorLogMu.RUnlock()
+ if ch == nil {
+ return 0
+ }
+ return cap(ch)
+}
+
+func OpsErrorLogDroppedTotal() int64 {
+ return opsErrorLogDropped.Load()
+}
+
+func OpsErrorLogEnqueuedTotal() int64 {
+ return opsErrorLogEnqueued.Load()
+}
+
+func OpsErrorLogProcessedTotal() int64 {
+ return opsErrorLogProcessed.Load()
+}
+
+func maybeLogOpsErrorLogDrop() {
+ now := time.Now().Unix()
+
+ for {
+ last := opsErrorLogLastDropLogAt.Load()
+ if last != 0 && now-last < 60 {
+ return
+ }
+ if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) {
+ break
+ }
+ }
+
+ queued := opsErrorLogQueueLen.Load()
+ queueCap := OpsErrorLogQueueCapacity()
+
+ log.Printf(
+ "[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)",
+ queued,
+ queueCap,
+ opsErrorLogEnqueued.Load(),
+ opsErrorLogDropped.Load(),
+ opsErrorLogProcessed.Load(),
+ )
+}
+
+func opsErrorLogConfig() (workerCount int, queueSize int) {
+ workerCount = runtime.GOMAXPROCS(0) * 2
+ if workerCount < opsErrorLogMinWorkerCount {
+ workerCount = opsErrorLogMinWorkerCount
+ }
+ if workerCount > opsErrorLogMaxWorkerCount {
+ workerCount = opsErrorLogMaxWorkerCount
+ }
+
+ queueSize = workerCount * opsErrorLogQueueSizePerWorker
+ if queueSize < opsErrorLogMinQueueSize {
+ queueSize = opsErrorLogMinQueueSize
+ }
+ if queueSize > opsErrorLogMaxQueueSize {
+ queueSize = opsErrorLogMaxQueueSize
+ }
+
+ return workerCount, queueSize
+}
+
+func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) {
+ if c == nil {
+ return
+ }
+ c.Set(opsModelKey, model)
+ c.Set(opsStreamKey, stream)
+ if len(requestBody) > 0 {
+ c.Set(opsRequestBodyKey, requestBody)
+ }
+}
+
+func setOpsSelectedAccount(c *gin.Context, accountID int64) {
+ if c == nil || accountID <= 0 {
+ return
+ }
+ c.Set(opsAccountIDKey, accountID)
+}
+
+type opsCaptureWriter struct {
+ gin.ResponseWriter
+ limit int
+ buf bytes.Buffer
+}
+
+func (w *opsCaptureWriter) Write(b []byte) (int, error) {
+ if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
+ remaining := w.limit - w.buf.Len()
+ if len(b) > remaining {
+ _, _ = w.buf.Write(b[:remaining])
+ } else {
+ _, _ = w.buf.Write(b)
+ }
+ }
+ return w.ResponseWriter.Write(b)
+}
+
+func (w *opsCaptureWriter) WriteString(s string) (int, error) {
+ if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
+ remaining := w.limit - w.buf.Len()
+ if len(s) > remaining {
+ _, _ = w.buf.WriteString(s[:remaining])
+ } else {
+ _, _ = w.buf.WriteString(s)
+ }
+ }
+ return w.ResponseWriter.WriteString(s)
+}
+
+// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs.
+//
+// Notes:
+// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic.
+// - Streaming errors after the response has started (SSE) may still need explicit logging.
+func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
+ return func(c *gin.Context) {
+ w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024}
+ c.Writer = w
+ c.Next()
+
+ status := c.Writer.Status()
+ if status < 400 {
+ return
+ }
+ if ops == nil {
+ return
+ }
+ if !ops.IsMonitoringEnabled(c.Request.Context()) {
+ return
+ }
+
+ body := w.buf.Bytes()
+ parsed := parseOpsErrorResponse(body)
+
+ apiKey, _ := middleware2.GetAPIKeyFromContext(c)
+
+ clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
+
+ model, _ := c.Get(opsModelKey)
+ streamV, _ := c.Get(opsStreamKey)
+ accountIDV, _ := c.Get(opsAccountIDKey)
+
+ var modelName string
+ if s, ok := model.(string); ok {
+ modelName = s
+ }
+ stream := false
+ if b, ok := streamV.(bool); ok {
+ stream = b
+ }
+ var accountID *int64
+ if v, ok := accountIDV.(int64); ok && v > 0 {
+ accountID = &v
+ }
+
+ fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
+ platform := resolveOpsPlatform(apiKey, fallbackPlatform)
+
+ requestID := c.Writer.Header().Get("X-Request-Id")
+ if requestID == "" {
+ requestID = c.Writer.Header().Get("x-request-id")
+ }
+
+ phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code)
+ isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message)
+
+ errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
+ errorSource := classifyOpsErrorSource(phase, parsed.Message)
+
+ entry := &service.OpsInsertErrorLogInput{
+ RequestID: requestID,
+ ClientRequestID: clientRequestID,
+
+ AccountID: accountID,
+ Platform: platform,
+ Model: modelName,
+ RequestPath: func() string {
+ if c.Request != nil && c.Request.URL != nil {
+ return c.Request.URL.Path
+ }
+ return ""
+ }(),
+ Stream: stream,
+ UserAgent: c.GetHeader("User-Agent"),
+
+ ErrorPhase: phase,
+ ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code),
+ Severity: classifyOpsSeverity(parsed.ErrorType, status),
+ StatusCode: status,
+ IsBusinessLimited: isBusinessLimited,
+
+ ErrorMessage: parsed.Message,
+ // Keep the full captured error body (capture is already capped at 64KB) so the
+ // service layer can sanitize JSON before truncating for storage.
+ ErrorBody: string(body),
+ ErrorSource: errorSource,
+ ErrorOwner: errorOwner,
+
+ IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status),
+ RetryCount: 0,
+ CreatedAt: time.Now(),
+ }
+
+ if apiKey != nil {
+ entry.APIKeyID = &apiKey.ID
+ if apiKey.User != nil {
+ entry.UserID = &apiKey.User.ID
+ }
+ if apiKey.GroupID != nil {
+ entry.GroupID = apiKey.GroupID
+ }
+ // Prefer group platform if present (more stable than inferring from path).
+ if apiKey.Group != nil && apiKey.Group.Platform != "" {
+ entry.Platform = apiKey.Group.Platform
+ }
+ }
+
+ var clientIP string
+ if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
+ clientIP = ip
+ entry.ClientIP = &clientIP
+ }
+
+ var requestBody []byte
+ if v, ok := c.Get(opsRequestBodyKey); ok {
+ if b, ok := v.([]byte); ok && len(b) > 0 {
+ requestBody = b
+ }
+ }
+		// Persist only a minimal, allow-listed set of request headers to improve retry fidelity.
+ // Do NOT store Authorization/Cookie/etc.
+ entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
+
+ enqueueOpsErrorLog(ops, entry, requestBody)
+ }
+}
+
+var opsRetryRequestHeaderAllowlist = []string{
+ "anthropic-beta",
+ "anthropic-version",
+}
+
+func extractOpsRetryRequestHeaders(c *gin.Context) *string {
+ if c == nil || c.Request == nil {
+ return nil
+ }
+
+ headers := make(map[string]string, 4)
+ for _, key := range opsRetryRequestHeaderAllowlist {
+ v := strings.TrimSpace(c.GetHeader(key))
+ if v == "" {
+ continue
+ }
+ // Keep headers small even if a client sends something unexpected.
+ headers[key] = truncateString(v, 512)
+ }
+ if len(headers) == 0 {
+ return nil
+ }
+
+ raw, err := json.Marshal(headers)
+ if err != nil {
+ return nil
+ }
+ s := string(raw)
+ return &s
+}
+
+type parsedOpsError struct {
+ ErrorType string
+ Message string
+ Code string
+}
+
+func parseOpsErrorResponse(body []byte) parsedOpsError {
+ if len(body) == 0 {
+ return parsedOpsError{}
+ }
+
+ // Fast path: attempt to decode into a generic map.
+ var m map[string]any
+ if err := json.Unmarshal(body, &m); err != nil {
+ return parsedOpsError{Message: truncateString(string(body), 1024)}
+ }
+
+ // Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } }
+ if errObj, ok := m["error"].(map[string]any); ok {
+ t, _ := errObj["type"].(string)
+ msg, _ := errObj["message"].(string)
+ // Gemini googleError also uses "error": { code, message, status }
+ if msg == "" {
+ if v, ok := errObj["message"]; ok {
+ msg, _ = v.(string)
+ }
+ }
+ if t == "" {
+			// Gemini errors do not have a "type" field.
+ t = "api_error"
+ }
+		// For Gemini errors, capture the numeric code as a string for business-limited mapping if needed.
+ var code string
+ if v, ok := errObj["code"]; ok {
+ switch n := v.(type) {
+ case float64:
+ code = strconvItoa(int(n))
+ case int:
+ code = strconvItoa(n)
+ }
+ }
+ return parsedOpsError{ErrorType: t, Message: msg, Code: code}
+ }
+
+ // APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." }
+ code, _ := m["code"].(string)
+ msg, _ := m["message"].(string)
+ if code != "" || msg != "" {
+ return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code}
+ }
+
+ return parsedOpsError{Message: truncateString(string(body), 1024)}
+}
+
+func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
+ if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" {
+ return apiKey.Group.Platform
+ }
+ return fallback
+}
+
+func guessPlatformFromPath(path string) string {
+ p := strings.ToLower(path)
+ switch {
+ case strings.HasPrefix(p, "/antigravity/"):
+ return service.PlatformAntigravity
+ case strings.HasPrefix(p, "/v1beta/"):
+ return service.PlatformGemini
+ case strings.Contains(p, "/responses"):
+ return service.PlatformOpenAI
+ default:
+ return ""
+ }
+}
+
+func normalizeOpsErrorType(errType string, code string) string {
+ if errType != "" {
+ return errType
+ }
+ switch strings.TrimSpace(code) {
+ case "INSUFFICIENT_BALANCE":
+ return "billing_error"
+ case "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
+ return "subscription_error"
+ default:
+ return "api_error"
+ }
+}
+
+func classifyOpsPhase(errType, message, code string) string {
+ msg := strings.ToLower(message)
+ switch strings.TrimSpace(code) {
+ case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
+ return "billing"
+ }
+
+ switch errType {
+ case "authentication_error":
+ return "auth"
+ case "billing_error", "subscription_error":
+ return "billing"
+ case "rate_limit_error":
+ if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") {
+ return "concurrency"
+ }
+ return "upstream"
+ case "invalid_request_error":
+ return "response"
+ case "upstream_error", "overloaded_error":
+ return "upstream"
+ case "api_error":
+ if strings.Contains(msg, "no available accounts") {
+ return "scheduling"
+ }
+ return "internal"
+ default:
+ return "internal"
+ }
+}
+
+func classifyOpsSeverity(errType string, status int) string {
+ switch errType {
+ case "invalid_request_error", "authentication_error", "billing_error", "subscription_error":
+ return "P3"
+ }
+ if status >= 500 {
+ return "P1"
+ }
+ if status == 429 {
+ return "P1"
+ }
+ if status >= 400 {
+ return "P2"
+ }
+ return "P3"
+}
+
+func classifyOpsIsRetryable(errType string, statusCode int) bool {
+ switch errType {
+ case "authentication_error", "invalid_request_error":
+ return false
+ case "timeout_error":
+ return true
+ case "rate_limit_error":
+ // May be transient (upstream or queue); retry can help.
+ return true
+ case "billing_error", "subscription_error":
+ return false
+ case "upstream_error", "overloaded_error":
+ return statusCode >= 500 || statusCode == 429 || statusCode == 529
+ default:
+ return statusCode >= 500
+ }
+}
+
+func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
+ switch strings.TrimSpace(code) {
+ case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
+ return true
+ }
+ if phase == "billing" || phase == "concurrency" {
+		// SLA / error-rate metrics exclude user-level business limits.
+ return true
+ }
+ // Avoid treating upstream rate limits as business-limited.
+ if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") {
+ return false
+ }
+ _ = status
+ return false
+}
+
+func classifyOpsErrorOwner(phase string, message string) string {
+ switch phase {
+ case "upstream", "network":
+ return "provider"
+ case "billing", "concurrency", "auth", "response":
+ return "client"
+ default:
+ if strings.Contains(strings.ToLower(message), "upstream") {
+ return "provider"
+ }
+ return "sub2api"
+ }
+}
+
+func classifyOpsErrorSource(phase string, message string) string {
+ switch phase {
+ case "upstream":
+ return "upstream_http"
+ case "network":
+ return "upstream_network"
+ case "billing":
+ return "billing"
+ case "concurrency":
+ return "concurrency"
+ default:
+ if strings.Contains(strings.ToLower(message), "upstream") {
+ return "upstream_http"
+ }
+ return "internal"
+ }
+}
+
+func truncateString(s string, max int) string {
+ if max <= 0 {
+ return ""
+ }
+ if len(s) <= max {
+ return s
+ }
+ cut := s[:max]
+ // Ensure truncation does not split multi-byte characters.
+ for len(cut) > 0 && !utf8.ValidString(cut) {
+ cut = cut[:len(cut)-1]
+ }
+ return cut
+}
+
+func strconvItoa(v int) string {
+ return strconv.Itoa(v)
+}
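
The classification helpers above turn a captured error body into phase/severity/retryability signals without consulting upstream state. A sketch of how a Claude-style 429 body would be classified; the test name and payload are illustrative only:

package handler

import "testing"

func TestClassifyRateLimitErrorBody(t *testing.T) {
	body := []byte(`{"type":"error","error":{"type":"rate_limit_error","message":"too many pending requests in queue"}}`)

	parsed := parseOpsErrorResponse(body)
	if parsed.ErrorType != "rate_limit_error" {
		t.Fatalf("unexpected error type: %q", parsed.ErrorType)
	}

	// "pending"/"queue" in the message routes the error to the concurrency phase,
	// which is treated as business-limited rather than an upstream failure.
	if phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code); phase != "concurrency" {
		t.Fatalf("unexpected phase: %q", phase)
	}
	if sev := classifyOpsSeverity(parsed.ErrorType, 429); sev != "P1" {
		t.Fatalf("unexpected severity: %q", sev)
	}
	if !classifyOpsIsRetryable(parsed.ErrorType, 429) {
		t.Fatal("expected rate_limit_error to be retryable")
	}
}
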
diff --git a/backend/internal/handler/wire.go b/backend/internal/handler/wire.go
index 1695f8a9..e5d8d077 100644
--- a/backend/internal/handler/wire.go
+++ b/backend/internal/handler/wire.go
@@ -20,6 +20,7 @@ func ProvideAdminHandlers(
proxyHandler *admin.ProxyHandler,
redeemHandler *admin.RedeemHandler,
settingHandler *admin.SettingHandler,
+ opsHandler *admin.OpsHandler,
systemHandler *admin.SystemHandler,
subscriptionHandler *admin.SubscriptionHandler,
usageHandler *admin.UsageHandler,
@@ -37,6 +38,7 @@ func ProvideAdminHandlers(
Proxy: proxyHandler,
Redeem: redeemHandler,
Setting: settingHandler,
+ Ops: opsHandler,
System: systemHandler,
Subscription: subscriptionHandler,
Usage: usageHandler,
@@ -106,6 +108,7 @@ var ProviderSet = wire.NewSet(
admin.NewProxyHandler,
admin.NewRedeemHandler,
admin.NewSettingHandler,
+ admin.NewOpsHandler,
ProvideSystemHandler,
admin.NewSubscriptionHandler,
admin.NewUsageHandler,
diff --git a/backend/internal/server/middleware/admin_auth.go b/backend/internal/server/middleware/admin_auth.go
index e02a7b0a..8f30107c 100644
--- a/backend/internal/server/middleware/admin_auth.go
+++ b/backend/internal/server/middleware/admin_auth.go
@@ -30,6 +30,20 @@ func adminAuth(
settingService *service.SettingService,
) gin.HandlerFunc {
return func(c *gin.Context) {
+ // WebSocket upgrade requests cannot set Authorization headers in browsers.
+ // For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via
+ // Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item:
+		//     Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>
+ if isWebSocketUpgradeRequest(c) {
+ if token := extractJWTFromWebSocketSubprotocol(c); token != "" {
+ if !validateJWTForAdmin(c, token, authService, userService) {
+ return
+ }
+ c.Next()
+ return
+ }
+ }
+
// 检查 x-api-key header(Admin API Key 认证)
apiKey := c.GetHeader("x-api-key")
if apiKey != "" {
@@ -58,6 +72,44 @@ func adminAuth(
}
}
+func isWebSocketUpgradeRequest(c *gin.Context) bool {
+ if c == nil || c.Request == nil {
+ return false
+ }
+ // RFC6455 handshake uses:
+ // Connection: Upgrade
+ // Upgrade: websocket
+ upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade")))
+ if upgrade != "websocket" {
+ return false
+ }
+ connection := strings.ToLower(c.GetHeader("Connection"))
+ return strings.Contains(connection, "upgrade")
+}
+
+func extractJWTFromWebSocketSubprotocol(c *gin.Context) string {
+ if c == nil {
+ return ""
+ }
+ raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol"))
+ if raw == "" {
+ return ""
+ }
+
+ // The header is a comma-separated list of tokens. We reserve the prefix "jwt."
+ // for carrying the admin JWT.
+ for _, part := range strings.Split(raw, ",") {
+ p := strings.TrimSpace(part)
+ if strings.HasPrefix(p, "jwt.") {
+ token := strings.TrimSpace(strings.TrimPrefix(p, "jwt."))
+ if token != "" {
+ return token
+ }
+ }
+ }
+ return ""
+}
+
// validateAdminAPIKey 验证管理员 API Key
func validateAdminAPIKey(
c *gin.Context,
diff --git a/backend/internal/server/middleware/client_request_id.go b/backend/internal/server/middleware/client_request_id.go
new file mode 100644
index 00000000..60d444ce
--- /dev/null
+++ b/backend/internal/server/middleware/client_request_id.go
@@ -0,0 +1,31 @@
+package middleware
+
+import (
+ "context"
+
+ "github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
+ "github.com/gin-gonic/gin"
+ "github.com/google/uuid"
+)
+
+// ClientRequestID ensures every request has a unique client_request_id in request.Context().
+//
+// This is used by the Ops monitoring module for end-to-end request correlation.
+func ClientRequestID() gin.HandlerFunc {
+ return func(c *gin.Context) {
+ if c.Request == nil {
+ c.Next()
+ return
+ }
+
+ if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil {
+ c.Next()
+ return
+ }
+
+ id := uuid.New().String()
+ c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id))
+ c.Next()
+ }
+}
+
diff --git a/backend/internal/server/middleware/ws_query_token_auth.go b/backend/internal/server/middleware/ws_query_token_auth.go
new file mode 100644
index 00000000..3b8d086a
--- /dev/null
+++ b/backend/internal/server/middleware/ws_query_token_auth.go
@@ -0,0 +1,54 @@
+package middleware
+
+import (
+ "net/http"
+ "strings"
+
+ "github.com/gin-gonic/gin"
+)
+
+// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header
+// for WebSocket handshake requests on a small allow-list of endpoints.
+//
+// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes
+// are protected by header-based auth. This keeps the token support scoped to WS only.
+func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc {
+ return func(c *gin.Context) {
+ if c == nil || c.Request == nil {
+ if c != nil {
+ c.Next()
+ }
+ return
+ }
+
+ // Only GET websocket upgrades.
+ if c.Request.Method != http.MethodGet {
+ c.Next()
+ return
+ }
+ if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") {
+ c.Next()
+ return
+ }
+
+ // If caller already supplied auth headers, don't override.
+ if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" {
+ c.Next()
+ return
+ }
+
+ // Allow-list ops websocket endpoints.
+ path := strings.TrimSpace(c.Request.URL.Path)
+ if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") {
+ c.Next()
+ return
+ }
+
+ token := strings.TrimSpace(c.Query("token"))
+ if token != "" {
+ c.Request.Header.Set("Authorization", "Bearer "+token)
+ }
+
+ c.Next()
+ }
+}
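
For clients that use the query-parameter route instead, the handshake URL only needs a token value on the allow-listed /api/v1/admin/ops/ws/ prefix; the middleware then rewrites it into a standard Bearer header before admin auth runs. A minimal sketch of composing such a URL; the host is an assumption:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	token := "..." // admin JWT
	u := url.URL{
		Scheme:   "wss",
		Host:     "ops.example.internal",
		Path:     "/api/v1/admin/ops/ws/qps",
		RawQuery: url.Values{"token": []string{token}}.Encode(),
	}
	// InjectBearerTokenFromQueryForWebSocket copies ?token= into
	// "Authorization: Bearer <token>" for this path, so the usual admin auth applies.
	fmt.Println(u.String())
}
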
From e0d12b46d895dd9d9fa2d3bda35c40630398ae19 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:55:12 +0800
Subject: [PATCH 06/53] =?UTF-8?q?feat(=E8=B7=AF=E7=94=B1):=20=E9=9B=86?=
=?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E8=B7=AF=E7=94=B1?=
=?UTF-8?q?=E5=88=B0=E6=9C=8D=E5=8A=A1=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Register ops monitoring routes in the router
- Add ops admin routes (dashboard, alerts, realtime, settings, ws)
- Update gateway routes to support request tracing
- Wire the ops service into the HTTP server
---
backend/internal/server/http.go | 3 +-
backend/internal/server/router.go | 8 +++-
backend/internal/server/routes/admin.go | 51 +++++++++++++++++++++++
backend/internal/server/routes/gateway.go | 13 +++++-
4 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/backend/internal/server/http.go b/backend/internal/server/http.go
index a8740ecc..7b273771 100644
--- a/backend/internal/server/http.go
+++ b/backend/internal/server/http.go
@@ -30,6 +30,7 @@ func ProvideRouter(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
+ opsService *service.OpsService,
) *gin.Engine {
if cfg.Server.Mode == "release" {
gin.SetMode(gin.ReleaseMode)
@@ -47,7 +48,7 @@ func ProvideRouter(
}
}
- return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg)
+ return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
}
// ProvideHTTPServer 提供 HTTP 服务器
diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go
index 15a1b325..85df99bd 100644
--- a/backend/internal/server/router.go
+++ b/backend/internal/server/router.go
@@ -20,10 +20,13 @@ func SetupRouter(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
+ opsService *service.OpsService,
cfg *config.Config,
) *gin.Engine {
// 应用中间件
r.Use(middleware2.Logger())
+ // WebSocket handshake auth helper (token via query param, WS endpoints only).
+ r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket())
r.Use(middleware2.CORS(cfg.CORS))
r.Use(middleware2.SecurityHeaders(cfg.Security.CSP))
@@ -33,7 +36,7 @@ func SetupRouter(
}
// 注册路由
- registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, cfg)
+ registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
return r
}
@@ -47,6 +50,7 @@ func registerRoutes(
apiKeyAuth middleware2.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
+ opsService *service.OpsService,
cfg *config.Config,
) {
// 通用路由(健康检查、状态等)
@@ -59,5 +63,5 @@ func registerRoutes(
routes.RegisterAuthRoutes(v1, h, jwtAuth)
routes.RegisterUserRoutes(v1, h, jwtAuth)
routes.RegisterAdminRoutes(v1, h, adminAuth)
- routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, cfg)
+ routes.RegisterGatewayRoutes(r, h, apiKeyAuth, apiKeyService, subscriptionService, opsService, cfg)
}
diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go
index 663c2d02..e69b1eb8 100644
--- a/backend/internal/server/routes/admin.go
+++ b/backend/internal/server/routes/admin.go
@@ -47,6 +47,9 @@ func RegisterAdminRoutes(
// 系统设置
registerSettingsRoutes(admin, h)
+ // 运维监控(Ops)
+ registerOpsRoutes(admin, h)
+
// 系统管理
registerSystemRoutes(admin, h)
@@ -61,6 +64,54 @@ func RegisterAdminRoutes(
}
}
+func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
+ ops := admin.Group("/ops")
+ {
+ // Realtime ops signals
+ ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats)
+ ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability)
+
+ // Alerts (rules + events)
+ ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules)
+ ops.POST("/alert-rules", h.Admin.Ops.CreateAlertRule)
+ ops.PUT("/alert-rules/:id", h.Admin.Ops.UpdateAlertRule)
+ ops.DELETE("/alert-rules/:id", h.Admin.Ops.DeleteAlertRule)
+ ops.GET("/alert-events", h.Admin.Ops.ListAlertEvents)
+
+ // Email notification config (DB-backed)
+ ops.GET("/email-notification/config", h.Admin.Ops.GetEmailNotificationConfig)
+ ops.PUT("/email-notification/config", h.Admin.Ops.UpdateEmailNotificationConfig)
+
+ // Runtime settings (DB-backed)
+ runtime := ops.Group("/runtime")
+ {
+ runtime.GET("/alert", h.Admin.Ops.GetAlertRuntimeSettings)
+ runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings)
+ }
+
+ // WebSocket realtime (QPS/TPS)
+ ws := ops.Group("/ws")
+ {
+ ws.GET("/qps", h.Admin.Ops.QPSWSHandler)
+ }
+
+ // Error logs (MVP-1)
+ ops.GET("/errors", h.Admin.Ops.GetErrorLogs)
+ ops.GET("/errors/:id", h.Admin.Ops.GetErrorLogByID)
+ ops.POST("/errors/:id/retry", h.Admin.Ops.RetryErrorRequest)
+
+ // Request drilldown (success + error)
+ ops.GET("/requests", h.Admin.Ops.ListRequestDetails)
+
+ // Dashboard (vNext - raw path for MVP)
+ ops.GET("/dashboard/overview", h.Admin.Ops.GetDashboardOverview)
+ ops.GET("/dashboard/throughput-trend", h.Admin.Ops.GetDashboardThroughputTrend)
+ ops.GET("/dashboard/latency-histogram", h.Admin.Ops.GetDashboardLatencyHistogram)
+ ops.GET("/dashboard/error-trend", h.Admin.Ops.GetDashboardErrorTrend)
+ ops.GET("/dashboard/error-distribution", h.Admin.Ops.GetDashboardErrorDistribution)
+ }
+}
+
func registerDashboardRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
dashboard := admin.Group("/dashboard")
{
diff --git a/backend/internal/server/routes/gateway.go b/backend/internal/server/routes/gateway.go
index 0b62185e..bf019ce3 100644
--- a/backend/internal/server/routes/gateway.go
+++ b/backend/internal/server/routes/gateway.go
@@ -16,13 +16,18 @@ func RegisterGatewayRoutes(
apiKeyAuth middleware.APIKeyAuthMiddleware,
apiKeyService *service.APIKeyService,
subscriptionService *service.SubscriptionService,
+ opsService *service.OpsService,
cfg *config.Config,
) {
bodyLimit := middleware.RequestBodyLimit(cfg.Gateway.MaxBodySize)
+ clientRequestID := middleware.ClientRequestID()
+ opsErrorLogger := handler.OpsErrorLoggerMiddleware(opsService)
// API网关(Claude API兼容)
gateway := r.Group("/v1")
gateway.Use(bodyLimit)
+ gateway.Use(clientRequestID)
+ gateway.Use(opsErrorLogger)
gateway.Use(gin.HandlerFunc(apiKeyAuth))
{
gateway.POST("/messages", h.Gateway.Messages)
@@ -36,6 +41,8 @@ func RegisterGatewayRoutes(
// Gemini 原生 API 兼容层(Gemini SDK/CLI 直连)
gemini := r.Group("/v1beta")
gemini.Use(bodyLimit)
+ gemini.Use(clientRequestID)
+ gemini.Use(opsErrorLogger)
gemini.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
{
gemini.GET("/models", h.Gateway.GeminiV1BetaListModels)
@@ -45,7 +52,7 @@ func RegisterGatewayRoutes(
}
// OpenAI Responses API(不带v1前缀的别名)
- r.POST("/responses", bodyLimit, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
+ r.POST("/responses", bodyLimit, clientRequestID, opsErrorLogger, gin.HandlerFunc(apiKeyAuth), h.OpenAIGateway.Responses)
// Antigravity 模型列表
r.GET("/antigravity/models", gin.HandlerFunc(apiKeyAuth), h.Gateway.AntigravityModels)
@@ -53,6 +60,8 @@ func RegisterGatewayRoutes(
// Antigravity 专用路由(仅使用 antigravity 账户,不混合调度)
antigravityV1 := r.Group("/antigravity/v1")
antigravityV1.Use(bodyLimit)
+ antigravityV1.Use(clientRequestID)
+ antigravityV1.Use(opsErrorLogger)
antigravityV1.Use(middleware.ForcePlatform(service.PlatformAntigravity))
antigravityV1.Use(gin.HandlerFunc(apiKeyAuth))
{
@@ -64,6 +73,8 @@ func RegisterGatewayRoutes(
antigravityV1Beta := r.Group("/antigravity/v1beta")
antigravityV1Beta.Use(bodyLimit)
+ antigravityV1Beta.Use(clientRequestID)
+ antigravityV1Beta.Use(opsErrorLogger)
antigravityV1Beta.Use(middleware.ForcePlatform(service.PlatformAntigravity))
antigravityV1Beta.Use(middleware.APIKeyAuthWithSubscriptionGoogle(apiKeyService, subscriptionService, cfg))
{
From d55dd56fd22732014738dcf4f91d740c17ba016c Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:55:52 +0800
Subject: [PATCH 07/53] =?UTF-8?q?feat(=E4=BE=9D=E8=B5=96=E6=B3=A8=E5=85=A5?=
=?UTF-8?q?):=20=E9=9B=86=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?=
=?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=B3=A8=E5=85=A5=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Update wire.go to add dependency-injection providers for the ops services (provider-set sketch below)
- Regenerate wire_gen.go with the complete dependency-injection graph
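
For orientation, a sketch of how the ops providers added in this commit could be grouped into a google/wire provider set; the real wire.go may arrange them differently, and the wire_gen.go hunk below is generated output rather than hand-written code:

    //go:build wireinject

    package main

    import (
        "github.com/google/wire"

        "github.com/Wei-Shaw/sub2api/internal/handler/admin"
        "github.com/Wei-Shaw/sub2api/internal/repository"
        "github.com/Wei-Shaw/sub2api/internal/service"
    )

    // opsSet bundles the providers that back the /admin/ops endpoints and the
    // background collectors whose Stop() hooks are registered in provideCleanup.
    var opsSet = wire.NewSet(
        repository.NewOpsRepository,
        service.NewOpsService,
        service.ProvideOpsMetricsCollector,
        service.ProvideOpsAggregationService,
        service.ProvideOpsAlertEvaluatorService,
        service.ProvideOpsCleanupService,
        admin.NewOpsHandler,
    )

Adding such a set to wire.Build and re-running the wire code generator is what produces the regenerated wire_gen.go in this commit.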
---
backend/cmd/server/wire.go | 28 ++++++++++++++
backend/cmd/server/wire_gen.go | 71 +++++++++++++++++++++++++---------
2 files changed, 81 insertions(+), 18 deletions(-)
diff --git a/backend/cmd/server/wire.go b/backend/cmd/server/wire.go
index ff6ab4e6..11c202f0 100644
--- a/backend/cmd/server/wire.go
+++ b/backend/cmd/server/wire.go
@@ -62,6 +62,10 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
func provideCleanup(
entClient *ent.Client,
rdb *redis.Client,
+ opsMetricsCollector *service.OpsMetricsCollector,
+ opsAggregation *service.OpsAggregationService,
+ opsAlertEvaluator *service.OpsAlertEvaluatorService,
+ opsCleanup *service.OpsCleanupService,
tokenRefresh *service.TokenRefreshService,
pricing *service.PricingService,
emailQueue *service.EmailQueueService,
@@ -80,6 +84,30 @@ func provideCleanup(
name string
fn func() error
}{
+ {"OpsCleanupService", func() error {
+ if opsCleanup != nil {
+ opsCleanup.Stop()
+ }
+ return nil
+ }},
+ {"OpsAlertEvaluatorService", func() error {
+ if opsAlertEvaluator != nil {
+ opsAlertEvaluator.Stop()
+ }
+ return nil
+ }},
+ {"OpsAggregationService", func() error {
+ if opsAggregation != nil {
+ opsAggregation.Stop()
+ }
+ return nil
+ }},
+ {"OpsMetricsCollector", func() error {
+ if opsMetricsCollector != nil {
+ opsMetricsCollector.Stop()
+ }
+ return nil
+ }},
{"TokenRefreshService", func() error {
tokenRefresh.Stop()
return nil
diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go
index 768254f9..2a254fd6 100644
--- a/backend/cmd/server/wire_gen.go
+++ b/backend/cmd/server/wire_gen.go
@@ -87,6 +87,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
geminiOAuthClient := repository.NewGeminiOAuthClient(configConfig)
geminiCliCodeAssistClient := repository.NewGeminiCliCodeAssistClient()
geminiOAuthService := service.NewGeminiOAuthService(proxyRepository, geminiOAuthClient, geminiCliCodeAssistClient, configConfig)
+ antigravityOAuthService := service.NewAntigravityOAuthService(proxyRepository)
geminiQuotaService := service.NewGeminiQuotaService(configConfig, settingRepository)
tempUnschedCache := repository.NewTempUnschedCache(redisClient)
rateLimitService := service.NewRateLimitService(accountRepository, usageLogRepository, configConfig, geminiQuotaService, tempUnschedCache)
@@ -97,13 +98,12 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
geminiTokenCache := repository.NewGeminiTokenCache(redisClient)
geminiTokenProvider := service.NewGeminiTokenProvider(accountRepository, geminiTokenCache, geminiOAuthService)
gatewayCache := repository.NewGatewayCache(redisClient)
- antigravityOAuthService := service.NewAntigravityOAuthService(proxyRepository)
antigravityTokenProvider := service.NewAntigravityTokenProvider(accountRepository, geminiTokenCache, antigravityOAuthService)
httpUpstream := repository.NewHTTPUpstream(configConfig)
antigravityGatewayService := service.NewAntigravityGatewayService(accountRepository, gatewayCache, antigravityTokenProvider, rateLimitService, httpUpstream, settingService)
accountTestService := service.NewAccountTestService(accountRepository, geminiTokenProvider, antigravityGatewayService, httpUpstream, configConfig)
concurrencyCache := repository.ProvideConcurrencyCache(redisClient, configConfig)
- concurrencyService := service.NewConcurrencyService(concurrencyCache)
+ concurrencyService := service.ProvideConcurrencyService(concurrencyCache, accountRepository, configConfig)
crsSyncService := service.NewCRSSyncService(accountRepository, proxyRepository, oAuthService, openAIOAuthService, geminiOAuthService, configConfig)
accountHandler := admin.NewAccountHandler(adminService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, rateLimitService, accountUsageService, accountTestService, concurrencyService, crsSyncService)
oAuthHandler := admin.NewOAuthHandler(oAuthService)
@@ -113,18 +113,7 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
proxyHandler := admin.NewProxyHandler(adminService)
adminRedeemHandler := admin.NewRedeemHandler(adminService)
settingHandler := admin.NewSettingHandler(settingService, emailService, turnstileService)
- updateCache := repository.NewUpdateCache(redisClient)
- gitHubReleaseClient := repository.NewGitHubReleaseClient()
- serviceBuildInfo := provideServiceBuildInfo(buildInfo)
- updateService := service.ProvideUpdateService(updateCache, gitHubReleaseClient, serviceBuildInfo)
- systemHandler := handler.ProvideSystemHandler(updateService)
- adminSubscriptionHandler := admin.NewSubscriptionHandler(subscriptionService)
- adminUsageHandler := admin.NewUsageHandler(usageService, apiKeyService, adminService)
- userAttributeDefinitionRepository := repository.NewUserAttributeDefinitionRepository(client)
- userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
- userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
- userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
- adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, settingHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
+ opsRepository := repository.NewOpsRepository(db)
pricingRemoteClient := repository.NewPricingRemoteClient(configConfig)
pricingService, err := service.ProvidePricingService(configConfig, pricingRemoteClient)
if err != nil {
@@ -136,19 +125,37 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
timingWheelService := service.ProvideTimingWheelService()
deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService)
- geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
- gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService)
+ geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)
+ opsService := service.NewOpsService(opsRepository, settingRepository, configConfig, accountRepository, concurrencyService, gatewayService, openAIGatewayService, geminiMessagesCompatService, antigravityGatewayService)
+ opsHandler := admin.NewOpsHandler(opsService)
+ updateCache := repository.NewUpdateCache(redisClient)
+ gitHubReleaseClient := repository.NewGitHubReleaseClient()
+ serviceBuildInfo := provideServiceBuildInfo(buildInfo)
+ updateService := service.ProvideUpdateService(updateCache, gitHubReleaseClient, serviceBuildInfo)
+ systemHandler := handler.ProvideSystemHandler(updateService)
+ adminSubscriptionHandler := admin.NewSubscriptionHandler(subscriptionService)
+ adminUsageHandler := admin.NewUsageHandler(usageService, apiKeyService, adminService)
+ userAttributeDefinitionRepository := repository.NewUserAttributeDefinitionRepository(client)
+ userAttributeValueRepository := repository.NewUserAttributeValueRepository(client)
+ userAttributeService := service.NewUserAttributeService(userAttributeDefinitionRepository, userAttributeValueRepository)
+ userAttributeHandler := admin.NewUserAttributeHandler(userAttributeService)
+ adminHandlers := handler.ProvideAdminHandlers(dashboardHandler, adminUserHandler, groupHandler, accountHandler, oAuthHandler, openAIOAuthHandler, geminiOAuthHandler, antigravityOAuthHandler, proxyHandler, adminRedeemHandler, settingHandler, opsHandler, systemHandler, adminSubscriptionHandler, adminUsageHandler, userAttributeHandler)
+ gatewayHandler := handler.NewGatewayHandler(gatewayService, geminiMessagesCompatService, antigravityGatewayService, userService, concurrencyService, billingCacheService, configConfig)
openAIGatewayHandler := handler.NewOpenAIGatewayHandler(openAIGatewayService, concurrencyService, billingCacheService, configConfig)
handlerSettingHandler := handler.ProvideSettingHandler(settingService, buildInfo)
handlers := handler.ProvideHandlers(authHandler, userHandler, apiKeyHandler, usageHandler, redeemHandler, subscriptionHandler, adminHandlers, gatewayHandler, openAIGatewayHandler, handlerSettingHandler)
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
- engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService)
+ engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService)
httpServer := server.ProvideHTTPServer(configConfig, engine)
+ opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, db, redisClient, configConfig)
+ opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)
+ opsAlertEvaluatorService := service.ProvideOpsAlertEvaluatorService(opsService, opsRepository, emailService, redisClient, configConfig)
+ opsCleanupService := service.ProvideOpsCleanupService(opsRepository, db, redisClient, configConfig)
tokenRefreshService := service.ProvideTokenRefreshService(accountRepository, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService, configConfig)
- v := provideCleanup(client, redisClient, tokenRefreshService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
+ v := provideCleanup(client, redisClient, opsMetricsCollector, opsAggregationService, opsAlertEvaluatorService, opsCleanupService, tokenRefreshService, pricingService, emailQueueService, billingCacheService, oAuthService, openAIOAuthService, geminiOAuthService, antigravityOAuthService)
application := &Application{
Server: httpServer,
Cleanup: v,
@@ -173,6 +180,10 @@ func provideServiceBuildInfo(buildInfo handler.BuildInfo) service.BuildInfo {
func provideCleanup(
entClient *ent.Client,
rdb *redis.Client,
+ opsMetricsCollector *service.OpsMetricsCollector,
+ opsAggregation *service.OpsAggregationService,
+ opsAlertEvaluator *service.OpsAlertEvaluatorService,
+ opsCleanup *service.OpsCleanupService,
tokenRefresh *service.TokenRefreshService,
pricing *service.PricingService,
emailQueue *service.EmailQueueService,
@@ -190,6 +201,30 @@ func provideCleanup(
name string
fn func() error
}{
+ {"OpsCleanupService", func() error {
+ if opsCleanup != nil {
+ opsCleanup.Stop()
+ }
+ return nil
+ }},
+ {"OpsAlertEvaluatorService", func() error {
+ if opsAlertEvaluator != nil {
+ opsAlertEvaluator.Stop()
+ }
+ return nil
+ }},
+ {"OpsAggregationService", func() error {
+ if opsAggregation != nil {
+ opsAggregation.Stop()
+ }
+ return nil
+ }},
+ {"OpsMetricsCollector", func() error {
+ if opsMetricsCollector != nil {
+ opsMetricsCollector.Stop()
+ }
+ return nil
+ }},
{"TokenRefreshService", func() error {
tokenRefresh.Stop()
return nil
From fcdf839b6bb6defd344d26d07d8597110019b958 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:56:37 +0800
Subject: [PATCH 08/53] =?UTF-8?q?feat(=E7=BD=91=E5=85=B3):=20=E9=9B=86?=
=?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=88=B0=20API=20?=
=?UTF-8?q?=E7=BD=91=E5=85=B3=E5=A4=84=E7=90=86=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add request monitoring and error tracking to gateway_handler
- Integrate ops metrics collection into openai_gateway_handler
- Integrate ops metrics collection into gemini_v1beta_handler
- Update the base handler struct to support ops error logging (helper sketch below)
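
The hunks below call setOpsRequestContext and setOpsSelectedAccount, which are introduced elsewhere in this series. A minimal sketch of what they are assumed to do (stash request metadata on the gin.Context so the ops error-logger middleware can attach it to error records); the context keys, signatures and account ID type here are illustrative only:

    package handler

    import "github.com/gin-gonic/gin"

    // Illustrative context keys; the real series may use typed keys from
    // internal/pkg/ctxkey rather than raw strings.
    const (
        opsCtxModel     = "ops_model"
        opsCtxStream    = "ops_stream"
        opsCtxBodySize  = "ops_body_size"
        opsCtxAccountID = "ops_account_id"
    )

    // setOpsRequestContext stashes the parsed model, stream flag and request body
    // size on the gin.Context so OpsErrorLoggerMiddleware can attach them to any
    // error record written for this request.
    func setOpsRequestContext(c *gin.Context, model string, stream bool, body []byte) {
        if model != "" {
            c.Set(opsCtxModel, model)
        }
        c.Set(opsCtxStream, stream)
        c.Set(opsCtxBodySize, len(body))
    }

    // setOpsSelectedAccount records which upstream account served the request
    // (the int64 ID type is an assumption).
    func setOpsSelectedAccount(c *gin.Context, accountID int64) {
        c.Set(opsCtxAccountID, accountID)
    }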
---
backend/internal/handler/gateway_handler.go | 78 ++++++++++++-------
.../internal/handler/gemini_v1beta_handler.go | 41 ++++++----
backend/internal/handler/handler.go | 1 +
.../handler/openai_gateway_handler.go | 45 +++++++----
4 files changed, 112 insertions(+), 53 deletions(-)
diff --git a/backend/internal/handler/gateway_handler.go b/backend/internal/handler/gateway_handler.go
index de3cbad9..7d1eab28 100644
--- a/backend/internal/handler/gateway_handler.go
+++ b/backend/internal/handler/gateway_handler.go
@@ -88,6 +88,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
+ setOpsRequestContext(c, "", false, body)
+
parsedReq, err := service.ParseGatewayRequest(body)
if err != nil {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
@@ -96,6 +98,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
reqModel := parsedReq.Model
reqStream := parsedReq.Stream
+ setOpsRequestContext(c, reqModel, reqStream, body)
+
// 验证 model 必填
if reqModel == "" {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "model is required")
@@ -111,6 +115,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 0. 检查wait队列是否已满
maxWait := service.CalculateMaxWait(subject.Concurrency)
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
+ waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
// On error, allow request to proceed
@@ -118,8 +123,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
return
}
- // 确保在函数退出时减少wait计数
- defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ if err == nil && canWait {
+ waitCounted = true
+ }
+ // Ensure we decrement if we exit before acquiring the user slot.
+ defer func() {
+ if waitCounted {
+ h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ }
+ }()
// 1. 首先获取用户并发槽位
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
@@ -128,6 +140,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
h.handleConcurrencyError(c, err, "user", streamStarted)
return
}
+ // User slot acquired: no longer waiting in the queue.
+ if waitCounted {
+ h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ waitCounted = false
+ }
// 在请求结束或 Context 取消时确保释放槽位,避免客户端断开造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -174,6 +191,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
account := selection.Account
+ setOpsSelectedAccount(c, account.ID)
// 检查预热请求拦截(在账号选择后、转发前检查)
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
@@ -190,12 +208,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 3. 获取账号并发槽位
accountReleaseFunc := selection.ReleaseFunc
- var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
+ accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -203,12 +221,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
- } else {
- // Only set release function if increment succeeded
- accountWaitRelease = func() {
+ }
+ if err == nil && canWait {
+ accountWaitCounted = true
+ }
+ // Ensure the wait counter is decremented if we exit before acquiring the slot.
+ defer func() {
+ if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
- }
+ }()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -219,20 +241,21 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
&streamStarted,
)
if err != nil {
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
+ // Slot acquired: no longer waiting in queue.
+ if accountWaitCounted {
+ h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
+ accountWaitCounted = false
+ }
if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
- accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 转发请求 - 根据账号平台分流
var result *service.ForwardResult
@@ -244,9 +267,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -301,6 +321,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
return
}
account := selection.Account
+ setOpsSelectedAccount(c, account.ID)
// 检查预热请求拦截(在账号选择后、转发前检查)
if account.IsInterceptWarmupEnabled() && isWarmupRequest(body) {
@@ -317,12 +338,12 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
// 3. 获取账号并发槽位
accountReleaseFunc := selection.ReleaseFunc
- var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
+ accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -330,12 +351,15 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
- } else {
- // Only set release function if increment succeeded
- accountWaitRelease = func() {
+ }
+ if err == nil && canWait {
+ accountWaitCounted = true
+ }
+ defer func() {
+ if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
- }
+ }()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -346,20 +370,20 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
&streamStarted,
)
if err != nil {
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
+ if accountWaitCounted {
+ h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
+ accountWaitCounted = false
+ }
if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
- accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 转发请求 - 根据账号平台分流
var result *service.ForwardResult
@@ -371,9 +395,6 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
@@ -672,6 +693,8 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
return
}
+ setOpsRequestContext(c, "", false, body)
+
parsedReq, err := service.ParseGatewayRequest(body)
if err != nil {
h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
@@ -684,6 +707,8 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
return
}
+ setOpsRequestContext(c, parsedReq.Model, parsedReq.Stream, body)
+
// 获取订阅信息(可能为nil)
subscription, _ := middleware2.GetSubscriptionFromContext(c)
@@ -704,6 +729,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
h.errorResponse(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error())
return
}
+ setOpsSelectedAccount(c, account.ID)
// 转发请求(不记录使用量)
if err := h.gatewayService.ForwardCountTokens(c.Request.Context(), c, account, parsedReq); err != nil {
diff --git a/backend/internal/handler/gemini_v1beta_handler.go b/backend/internal/handler/gemini_v1beta_handler.go
index aaf651e9..73550575 100644
--- a/backend/internal/handler/gemini_v1beta_handler.go
+++ b/backend/internal/handler/gemini_v1beta_handler.go
@@ -161,6 +161,8 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
return
}
+ setOpsRequestContext(c, modelName, stream, body)
+
// Get subscription (may be nil)
subscription, _ := middleware.GetSubscriptionFromContext(c)
@@ -170,13 +172,21 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
// 0) wait queue check
maxWait := service.CalculateMaxWait(authSubject.Concurrency)
canWait, err := geminiConcurrency.IncrementWaitCount(c.Request.Context(), authSubject.UserID, maxWait)
+ waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
} else if !canWait {
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
return
}
- defer geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
+ if err == nil && canWait {
+ waitCounted = true
+ }
+ defer func() {
+ if waitCounted {
+ geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
+ }
+ }()
// 1) user concurrency slot
streamStarted := false
@@ -185,6 +195,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
googleError(c, http.StatusTooManyRequests, err.Error())
return
}
+ if waitCounted {
+ geminiConcurrency.DecrementWaitCount(c.Request.Context(), authSubject.UserID)
+ waitCounted = false
+ }
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -221,15 +235,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
return
}
account := selection.Account
+ setOpsSelectedAccount(c, account.ID)
// 4) account concurrency slot
accountReleaseFunc := selection.ReleaseFunc
- var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts")
return
}
+ accountWaitCounted := false
canWait, err := geminiConcurrency.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -237,12 +252,15 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
googleError(c, http.StatusTooManyRequests, "Too many pending requests, please retry later")
return
- } else {
- // Only set release function if increment succeeded
- accountWaitRelease = func() {
+ }
+ if err == nil && canWait {
+ accountWaitCounted = true
+ }
+ defer func() {
+ if accountWaitCounted {
geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
- }
+ }()
accountReleaseFunc, err = geminiConcurrency.AcquireAccountSlotWithWaitTimeout(
c,
@@ -253,19 +271,19 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
&streamStarted,
)
if err != nil {
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
googleError(c, http.StatusTooManyRequests, err.Error())
return
}
+ if accountWaitCounted {
+ geminiConcurrency.DecrementAccountWaitCount(c.Request.Context(), account.ID)
+ accountWaitCounted = false
+ }
if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionKey, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
- accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// 5) forward (根据平台分流)
var result *service.ForwardResult
@@ -277,9 +295,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
if accountReleaseFunc != nil {
accountReleaseFunc()
}
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
diff --git a/backend/internal/handler/handler.go b/backend/internal/handler/handler.go
index 817b71d3..030ebd68 100644
--- a/backend/internal/handler/handler.go
+++ b/backend/internal/handler/handler.go
@@ -17,6 +17,7 @@ type AdminHandlers struct {
Proxy *admin.ProxyHandler
Redeem *admin.RedeemHandler
Setting *admin.SettingHandler
+ Ops *admin.OpsHandler
System *admin.SystemHandler
Subscription *admin.SubscriptionHandler
Usage *admin.UsageHandler
diff --git a/backend/internal/handler/openai_gateway_handler.go b/backend/internal/handler/openai_gateway_handler.go
index 04d268a5..2ddf77ed 100644
--- a/backend/internal/handler/openai_gateway_handler.go
+++ b/backend/internal/handler/openai_gateway_handler.go
@@ -75,6 +75,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
return
}
+ setOpsRequestContext(c, "", false, body)
+
// Parse request body to map for potential modification
var reqBody map[string]any
if err := json.Unmarshal(body, &reqBody); err != nil {
@@ -104,6 +106,8 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
}
+ setOpsRequestContext(c, reqModel, reqStream, body)
+
// Track if we've started streaming (for error handling)
streamStarted := false
@@ -113,6 +117,7 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
// 0. Check if wait queue is full
maxWait := service.CalculateMaxWait(subject.Concurrency)
canWait, err := h.concurrencyHelper.IncrementWaitCount(c.Request.Context(), subject.UserID, maxWait)
+ waitCounted := false
if err != nil {
log.Printf("Increment wait count failed: %v", err)
// On error, allow request to proceed
@@ -120,8 +125,14 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
h.errorResponse(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later")
return
}
- // Ensure wait count is decremented when function exits
- defer h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ if err == nil && canWait {
+ waitCounted = true
+ }
+ defer func() {
+ if waitCounted {
+ h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ }
+ }()
// 1. First acquire user concurrency slot
userReleaseFunc, err := h.concurrencyHelper.AcquireUserSlotWithWait(c, subject.UserID, subject.Concurrency, reqStream, &streamStarted)
@@ -130,6 +141,11 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
h.handleConcurrencyError(c, err, "user", streamStarted)
return
}
+ // User slot acquired: no longer waiting.
+ if waitCounted {
+ h.concurrencyHelper.DecrementWaitCount(c.Request.Context(), subject.UserID)
+ waitCounted = false
+ }
// 确保请求取消时也会释放槽位,避免长连接被动中断造成泄漏
userReleaseFunc = wrapReleaseOnDone(c.Request.Context(), userReleaseFunc)
if userReleaseFunc != nil {
@@ -167,15 +183,16 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
}
account := selection.Account
log.Printf("[OpenAI Handler] Selected account: id=%d name=%s", account.ID, account.Name)
+ setOpsSelectedAccount(c, account.ID)
// 3. Acquire account concurrency slot
accountReleaseFunc := selection.ReleaseFunc
- var accountWaitRelease func()
if !selection.Acquired {
if selection.WaitPlan == nil {
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts", streamStarted)
return
}
+ accountWaitCounted := false
canWait, err := h.concurrencyHelper.IncrementAccountWaitCount(c.Request.Context(), account.ID, selection.WaitPlan.MaxWaiting)
if err != nil {
log.Printf("Increment account wait count failed: %v", err)
@@ -183,12 +200,15 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
log.Printf("Account wait queue full: account=%d", account.ID)
h.handleStreamingAwareError(c, http.StatusTooManyRequests, "rate_limit_error", "Too many pending requests, please retry later", streamStarted)
return
- } else {
- // Only set release function if increment succeeded
- accountWaitRelease = func() {
+ }
+ if err == nil && canWait {
+ accountWaitCounted = true
+ }
+ defer func() {
+ if accountWaitCounted {
h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
}
- }
+ }()
accountReleaseFunc, err = h.concurrencyHelper.AcquireAccountSlotWithWaitTimeout(
c,
@@ -199,29 +219,26 @@ func (h *OpenAIGatewayHandler) Responses(c *gin.Context) {
&streamStarted,
)
if err != nil {
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
log.Printf("Account concurrency acquire failed: %v", err)
h.handleConcurrencyError(c, err, "account", streamStarted)
return
}
+ if accountWaitCounted {
+ h.concurrencyHelper.DecrementAccountWaitCount(c.Request.Context(), account.ID)
+ accountWaitCounted = false
+ }
if err := h.gatewayService.BindStickySession(c.Request.Context(), sessionHash, account.ID); err != nil {
log.Printf("Bind sticky session failed: %v", err)
}
}
// 账号槽位/等待计数需要在超时或断开时安全回收
accountReleaseFunc = wrapReleaseOnDone(c.Request.Context(), accountReleaseFunc)
- accountWaitRelease = wrapReleaseOnDone(c.Request.Context(), accountWaitRelease)
// Forward request
result, err := h.gatewayService.Forward(c.Request.Context(), c, account, body)
if accountReleaseFunc != nil {
accountReleaseFunc()
}
- if accountWaitRelease != nil {
- accountWaitRelease()
- }
if err != nil {
var failoverErr *service.UpstreamFailoverError
if errors.As(err, &failoverErr) {
From 2d123a11ad208aef42b982655c825e5347c8b7f9 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:57:32 +0800
Subject: [PATCH 09/53] =?UTF-8?q?feat(=E8=AE=BE=E7=BD=AE):=20=E9=9B=86?=
=?UTF-8?q?=E6=88=90=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E9=85=8D=E7=BD=AE?=
=?UTF-8?q?=E5=88=B0=E7=B3=BB=E7=BB=9F=E8=AE=BE=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Extend setting_handler to manage the ops configuration (partial-update fallback sketch below)
- Extend setting_service to persist the ops configuration
- Update settings_view to include the ops configuration fields
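
The new ops fields on UpdateSettingsRequest are pointers so that clients which omit them keep the stored values; the patch expresses the fallback with inline closures. An equivalent generic helper, shown only as a sketch and not part of this commit, would read:

    package admin

    // orPrevious returns the submitted value when the client actually sent the
    // field (non-nil pointer) and otherwise keeps the previous setting. It is a
    // sketch of the pattern the inline closures in UpdateSettings implement.
    func orPrevious[T any](submitted *T, previous T) T {
        if submitted != nil {
            return *submitted
        }
        return previous
    }

    // Usage sketch:
    //   OpsMonitoringEnabled: orPrevious(req.OpsMonitoringEnabled, previousSettings.OpsMonitoringEnabled),
    //   OpsQueryModeDefault:  orPrevious(req.OpsQueryModeDefault, previousSettings.OpsQueryModeDefault),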
---
.../internal/handler/admin/setting_handler.go | 38 +++++++++++++++++++
backend/internal/service/setting_service.go | 25 ++++++++++++
backend/internal/service/settings_view.go | 5 +++
3 files changed, 68 insertions(+)
diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go
index 743c4268..4d4d5639 100644
--- a/backend/internal/handler/admin/setting_handler.go
+++ b/backend/internal/handler/admin/setting_handler.go
@@ -65,6 +65,9 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
FallbackModelAntigravity: settings.FallbackModelAntigravity,
EnableIdentityPatch: settings.EnableIdentityPatch,
IdentityPatchPrompt: settings.IdentityPatchPrompt,
+ OpsMonitoringEnabled: settings.OpsMonitoringEnabled,
+ OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
+ OpsQueryModeDefault: settings.OpsQueryModeDefault,
})
}
@@ -110,6 +113,11 @@ type UpdateSettingsRequest struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
+
+ // Ops monitoring (vNext)
+ OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
+ OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
+ OpsQueryModeDefault *string `json:"ops_query_mode_default"`
}
// UpdateSettings 更新系统设置
@@ -193,6 +201,24 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
FallbackModelAntigravity: req.FallbackModelAntigravity,
EnableIdentityPatch: req.EnableIdentityPatch,
IdentityPatchPrompt: req.IdentityPatchPrompt,
+ OpsMonitoringEnabled: func() bool {
+ if req.OpsMonitoringEnabled != nil {
+ return *req.OpsMonitoringEnabled
+ }
+ return previousSettings.OpsMonitoringEnabled
+ }(),
+ OpsRealtimeMonitoringEnabled: func() bool {
+ if req.OpsRealtimeMonitoringEnabled != nil {
+ return *req.OpsRealtimeMonitoringEnabled
+ }
+ return previousSettings.OpsRealtimeMonitoringEnabled
+ }(),
+ OpsQueryModeDefault: func() string {
+ if req.OpsQueryModeDefault != nil {
+ return *req.OpsQueryModeDefault
+ }
+ return previousSettings.OpsQueryModeDefault
+ }(),
}
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
@@ -237,6 +263,9 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
FallbackModelAntigravity: updatedSettings.FallbackModelAntigravity,
EnableIdentityPatch: updatedSettings.EnableIdentityPatch,
IdentityPatchPrompt: updatedSettings.IdentityPatchPrompt,
+ OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
+ OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
+ OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
})
}
@@ -337,6 +366,15 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
if before.FallbackModelAntigravity != after.FallbackModelAntigravity {
changed = append(changed, "fallback_model_antigravity")
}
+ if before.OpsMonitoringEnabled != after.OpsMonitoringEnabled {
+ changed = append(changed, "ops_monitoring_enabled")
+ }
+ if before.OpsRealtimeMonitoringEnabled != after.OpsRealtimeMonitoringEnabled {
+ changed = append(changed, "ops_realtime_monitoring_enabled")
+ }
+ if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
+ changed = append(changed, "ops_query_mode_default")
+ }
return changed
}
diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go
index 6ce8ba2b..1aea32be 100644
--- a/backend/internal/service/setting_service.go
+++ b/backend/internal/service/setting_service.go
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"strconv"
+ "strings"
"github.com/Wei-Shaw/sub2api/internal/config"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
@@ -134,6 +135,11 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
+ // Ops monitoring (vNext)
+ updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
+ updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
+ updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
+
return s.settingRepo.SetMultiple(ctx, updates)
}
@@ -220,6 +226,11 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults
SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "",
+
+ // Ops monitoring defaults (vNext)
+ SettingKeyOpsMonitoringEnabled: "true",
+ SettingKeyOpsRealtimeMonitoringEnabled: "true",
+ SettingKeyOpsQueryModeDefault: "auto",
}
return s.settingRepo.SetMultiple(ctx, defaults)
@@ -286,9 +297,23 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
}
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
+ // Ops monitoring settings (default: enabled, fail-open)
+ result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
+ result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
+ result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
+
return result
}
+func isFalseSettingValue(value string) bool {
+ switch strings.ToLower(strings.TrimSpace(value)) {
+ case "false", "0", "off", "disabled":
+ return true
+ default:
+ return false
+ }
+}
+
// getStringOrDefault 获取字符串值或默认值
func (s *SettingService) getStringOrDefault(settings map[string]string, key, defaultValue string) string {
if value, ok := settings[key]; ok && value != "" {
diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go
index de0331f7..e9d07bca 100644
--- a/backend/internal/service/settings_view.go
+++ b/backend/internal/service/settings_view.go
@@ -38,6 +38,11 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
+
+ // Ops monitoring (vNext)
+ OpsMonitoringEnabled bool
+ OpsRealtimeMonitoringEnabled bool
+ OpsQueryModeDefault string
}
type PublicSettings struct {
From e846458009c525e045b232ffdcc483e702d23153 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:58:01 +0800
Subject: [PATCH 10/53] =?UTF-8?q?test(=E5=90=8E=E7=AB=AF):=20=E6=9B=B4?=
=?UTF-8?q?=E6=96=B0=20API=20=E5=A5=91=E7=BA=A6=E6=B5=8B=E8=AF=95=E6=94=AF?=
=?UTF-8?q?=E6=8C=81=20ops=20=E7=9B=91=E6=8E=A7=E7=AB=AF=E7=82=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Update api_contract_test.go to include the ops-related endpoint contract
---
backend/internal/server/api_contract_test.go | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go
index f98ebc59..23cab19c 100644
--- a/backend/internal/server/api_contract_test.go
+++ b/backend/internal/server/api_contract_test.go
@@ -317,7 +317,9 @@ func TestAPIContracts(t *testing.T) {
"fallback_model_gemini": "gemini-2.5-pro",
"fallback_model_openai": "gpt-4o",
"enable_identity_patch": true,
- "identity_patch_prompt": ""
+ "identity_patch_prompt": "",
+ "ops_monitoring_enabled": true,
+ "ops_realtime_monitoring_enabled": true
}
}`,
},
From 11d063e3c4b9fcc146ca318fe41a47c5bbf55530 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:58:33 +0800
Subject: [PATCH 11/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AFAPI):=20=E5=AE=9E?=
=?UTF-8?q?=E7=8E=B0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=20API=20=E5=AE=A2?=
=?UTF-8?q?=E6=88=B7=E7=AB=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add the ops API client (ops.ts)
- Extend the settings API to support the ops configuration
- Update the admin API index to export the ops module
- Extend the API client to support WebSocket connections (server-side handshake sketch below)
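
ops.ts authenticates its realtime WebSocket by passing the admin JWT as a "jwt.<token>" entry in Sec-WebSocket-Protocol, since browsers cannot set an Authorization header on WebSocket upgrades. A sketch of the matching server-side extraction, assuming gorilla/websocket (which this series adds to go.mod); the real QPSWSHandler is defined elsewhere and may differ:

    package admin

    import (
        "net/http"
        "strings"

        "github.com/gin-gonic/gin"
        "github.com/gorilla/websocket"
    )

    // upgrader echoes the base subprotocol back to the browser; gorilla/websocket
    // selects the first mutually supported entry from Sec-WebSocket-Protocol.
    var upgrader = websocket.Upgrader{
        Subprotocols: []string{"sub2api-admin"},
        CheckOrigin:  func(r *http.Request) bool { return true }, // tighten in production
    }

    // adminTokenFromSubprotocols pulls the "jwt.<token>" item the frontend sends
    // alongside "sub2api-admin" (see frontend/src/api/admin/ops.ts).
    func adminTokenFromSubprotocols(r *http.Request) string {
        for _, p := range websocket.Subprotocols(r) {
            if strings.HasPrefix(p, "jwt.") {
                return strings.TrimPrefix(p, "jwt.")
            }
        }
        return ""
    }

    // Sketch of a WS endpoint that checks the subprotocol token before upgrading.
    func qpsWS(c *gin.Context) {
        token := adminTokenFromSubprotocols(c.Request)
        if token == "" {
            c.AbortWithStatus(http.StatusUnauthorized)
            return
        }
        // ... validate the token with the existing admin auth service here ...

        conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
        if err != nil {
            return
        }
        defer conn.Close()
        // ... push QPS/TPS samples on a ticker ...
    }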
---
frontend/src/api/admin/index.ts | 7 +-
frontend/src/api/admin/ops.ts | 906 +++++++++++++++++++++++++++++
frontend/src/api/admin/settings.ts | 21 +
frontend/src/api/client.ts | 40 +-
4 files changed, 970 insertions(+), 4 deletions(-)
create mode 100644 frontend/src/api/admin/ops.ts
diff --git a/frontend/src/api/admin/index.ts b/frontend/src/api/admin/index.ts
index ea12f6d2..9e719a90 100644
--- a/frontend/src/api/admin/index.ts
+++ b/frontend/src/api/admin/index.ts
@@ -16,6 +16,7 @@ import usageAPI from './usage'
import geminiAPI from './gemini'
import antigravityAPI from './antigravity'
import userAttributesAPI from './userAttributes'
+import opsAPI from './ops'
/**
* Unified admin API object for convenient access
@@ -33,7 +34,8 @@ export const adminAPI = {
usage: usageAPI,
gemini: geminiAPI,
antigravity: antigravityAPI,
- userAttributes: userAttributesAPI
+ userAttributes: userAttributesAPI,
+ ops: opsAPI
}
export {
@@ -49,7 +51,8 @@ export {
usageAPI,
geminiAPI,
antigravityAPI,
- userAttributesAPI
+ userAttributesAPI,
+ opsAPI
}
export default adminAPI
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
new file mode 100644
index 00000000..3c3529a9
--- /dev/null
+++ b/frontend/src/api/admin/ops.ts
@@ -0,0 +1,906 @@
+/**
+ * Admin Ops API endpoints (vNext)
+ * - Error logs list/detail + retry (client/upstream)
+ * - Dashboard overview (raw path)
+ */
+
+import { apiClient } from '../client'
+import type { PaginatedResponse } from '@/types'
+
+export type OpsRetryMode = 'client' | 'upstream'
+export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
+
+export interface OpsRequestOptions {
+ signal?: AbortSignal
+}
+
+export interface OpsRetryRequest {
+ mode: OpsRetryMode
+ pinned_account_id?: number
+}
+
+export interface OpsRetryResult {
+ attempt_id: number
+ mode: OpsRetryMode
+ status: 'running' | 'succeeded' | 'failed' | string
+
+ pinned_account_id?: number | null
+ used_account_id?: number | null
+
+ http_status_code: number
+ upstream_request_id: string
+
+ response_preview: string
+ response_truncated: boolean
+
+ error_message: string
+
+ started_at: string
+ finished_at: string
+ duration_ms: number
+}
+
+export interface OpsDashboardOverview {
+ start_time: string
+ end_time: string
+ platform: string
+ group_id?: number | null
+
+ system_metrics?: OpsSystemMetricsSnapshot | null
+ job_heartbeats?: OpsJobHeartbeat[] | null
+
+ success_count: number
+ error_count_total: number
+ business_limited_count: number
+ error_count_sla: number
+ request_count_total: number
+ request_count_sla: number
+
+ token_consumed: number
+
+ sla: number
+ error_rate: number
+ upstream_error_rate: number
+ upstream_error_count_excl_429_529: number
+ upstream_429_count: number
+ upstream_529_count: number
+
+ qps: {
+ current: number
+ peak: number
+ avg: number
+ }
+ tps: {
+ current: number
+ peak: number
+ avg: number
+ }
+
+ duration: OpsPercentiles
+ ttft: OpsPercentiles
+}
+
+export interface OpsPercentiles {
+ p50_ms?: number | null
+ p90_ms?: number | null
+ p95_ms?: number | null
+ p99_ms?: number | null
+ avg_ms?: number | null
+ max_ms?: number | null
+}
+
+export interface OpsThroughputTrendPoint {
+ bucket_start: string
+ request_count: number
+ token_consumed: number
+ qps: number
+ tps: number
+}
+
+export interface OpsThroughputPlatformBreakdownItem {
+ platform: string
+ request_count: number
+ token_consumed: number
+}
+
+export interface OpsThroughputGroupBreakdownItem {
+ group_id: number
+ group_name: string
+ request_count: number
+ token_consumed: number
+}
+
+export interface OpsThroughputTrendResponse {
+ bucket: string
+ points: OpsThroughputTrendPoint[]
+ by_platform?: OpsThroughputPlatformBreakdownItem[]
+ top_groups?: OpsThroughputGroupBreakdownItem[]
+}
+
+export type OpsRequestKind = 'success' | 'error'
+export type OpsRequestDetailsKind = OpsRequestKind | 'all'
+export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
+
+export interface OpsRequestDetail {
+ kind: OpsRequestKind
+ created_at: string
+ request_id: string
+
+ platform?: string
+ model?: string
+ duration_ms?: number | null
+ status_code?: number | null
+
+ error_id?: number | null
+ phase?: string
+ severity?: string
+ message?: string
+
+ user_id?: number | null
+ api_key_id?: number | null
+ account_id?: number | null
+ group_id?: number | null
+
+ stream?: boolean
+}
+
+export interface OpsRequestDetailsParams {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+
+ kind?: OpsRequestDetailsKind
+
+ platform?: string
+ group_id?: number | null
+
+ user_id?: number
+ api_key_id?: number
+ account_id?: number
+
+ model?: string
+ request_id?: string
+ q?: string
+
+ min_duration_ms?: number
+ max_duration_ms?: number
+
+ sort?: OpsRequestDetailsSort
+
+ page?: number
+ page_size?: number
+}
+
+export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
+
+export interface OpsLatencyHistogramBucket {
+ range: string
+ count: number
+}
+
+export interface OpsLatencyHistogramResponse {
+ start_time: string
+ end_time: string
+ platform: string
+ group_id?: number | null
+
+ total_requests: number
+ buckets: OpsLatencyHistogramBucket[]
+}
+
+export interface OpsErrorTrendPoint {
+ bucket_start: string
+ error_count_total: number
+ business_limited_count: number
+ error_count_sla: number
+ upstream_error_count_excl_429_529: number
+ upstream_429_count: number
+ upstream_529_count: number
+}
+
+export interface OpsErrorTrendResponse {
+ bucket: string
+ points: OpsErrorTrendPoint[]
+}
+
+export interface OpsErrorDistributionItem {
+ status_code: number
+ total: number
+ sla: number
+ business_limited: number
+}
+
+export interface OpsErrorDistributionResponse {
+ total: number
+ items: OpsErrorDistributionItem[]
+}
+
+export interface OpsSystemMetricsSnapshot {
+ id: number
+ created_at: string
+ window_minutes: number
+
+ cpu_usage_percent?: number | null
+ memory_used_mb?: number | null
+ memory_total_mb?: number | null
+ memory_usage_percent?: number | null
+
+ db_ok?: boolean | null
+ redis_ok?: boolean | null
+
+ db_conn_active?: number | null
+ db_conn_idle?: number | null
+ db_conn_waiting?: number | null
+
+ goroutine_count?: number | null
+ concurrency_queue_depth?: number | null
+}
+
+export interface OpsJobHeartbeat {
+ job_name: string
+ last_run_at?: string | null
+ last_success_at?: string | null
+ last_error_at?: string | null
+ last_error?: string | null
+ last_duration_ms?: number | null
+ updated_at: string
+}
+
+export interface PlatformConcurrencyInfo {
+ platform: string
+ current_in_use: number
+ max_capacity: number
+ load_percentage: number
+ waiting_in_queue: number
+}
+
+export interface GroupConcurrencyInfo {
+ group_id: number
+ group_name: string
+ platform: string
+ current_in_use: number
+ max_capacity: number
+ load_percentage: number
+ waiting_in_queue: number
+}
+
+export interface AccountConcurrencyInfo {
+ account_id: number
+ account_name?: string
+ platform: string
+ group_id: number
+ group_name: string
+ current_in_use: number
+ max_capacity: number
+ load_percentage: number
+ waiting_in_queue: number
+}
+
+export interface OpsConcurrencyStatsResponse {
+ enabled: boolean
+ platform: Record<string, PlatformConcurrencyInfo>
+ group: Record<string, GroupConcurrencyInfo>
+ account: Record<string, AccountConcurrencyInfo>
+ timestamp?: string
+}
+
+export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
+ const params: Record<string, string | number> = {}
+ if (platform) {
+ params.platform = platform
+ }
+ if (typeof groupId === 'number' && groupId > 0) {
+ params.group_id = groupId
+ }
+
+ const { data } = await apiClient.get('/admin/ops/concurrency', { params })
+ return data
+}
+
+export interface PlatformAvailability {
+ platform: string
+ total_accounts: number
+ available_count: number
+ rate_limit_count: number
+ error_count: number
+}
+
+export interface GroupAvailability {
+ group_id: number
+ group_name: string
+ platform: string
+ total_accounts: number
+ available_count: number
+ rate_limit_count: number
+ error_count: number
+}
+
+export interface AccountAvailability {
+ account_id: number
+ account_name: string
+ platform: string
+ group_id: number
+ group_name: string
+ status: string
+ is_available: boolean
+ is_rate_limited: boolean
+ rate_limit_reset_at?: string
+ rate_limit_remaining_sec?: number
+ is_overloaded: boolean
+ overload_until?: string
+ overload_remaining_sec?: number
+ has_error: boolean
+ error_message?: string
+}
+
+export interface OpsAccountAvailabilityStatsResponse {
+ enabled: boolean
+ platform: Record<string, PlatformAvailability>
+ group: Record<string, GroupAvailability>
+ account: Record<string, AccountAvailability>
+ timestamp?: string
+}
+
+export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
+ const params: Record<string, string | number> = {}
+ if (platform) {
+ params.platform = platform
+ }
+ if (typeof groupId === 'number' && groupId > 0) {
+ params.group_id = groupId
+ }
+ const { data } = await apiClient.get('/admin/ops/account-availability', { params })
+ return data
+}
+
+/**
+ * Subscribe to realtime QPS updates via WebSocket.
+ *
+ * Note: browsers cannot set Authorization headers for WebSockets.
+ * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
+ * ["sub2api-admin", "jwt."]
+ */
+export interface SubscribeQPSOptions {
+ token?: string | null
+ onOpen?: () => void
+ onClose?: (event: CloseEvent) => void
+ onError?: (event: Event) => void
+ /**
+ * Called when the server closes with an application close code that indicates
+ * reconnecting is not useful (e.g. feature flag disabled).
+ */
+ onFatalClose?: (event: CloseEvent) => void
+ /**
+ * More granular status updates for UI (connecting/reconnecting/offline/etc).
+ */
+ onStatusChange?: (status: OpsWSStatus) => void
+ /**
+ * Called when a reconnect is scheduled (helps display "retry in Xs").
+ */
+ onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
+ wsBaseUrl?: string
+ /**
+ * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
+ * Set to 0 to disable reconnect.
+ */
+ maxReconnectAttempts?: number
+ reconnectBaseDelayMs?: number
+ reconnectMaxDelayMs?: number
+ /**
+ * Stale connection detection (heartbeat-by-observation).
+ * If no messages are received within this window, the socket is closed to trigger a reconnect.
+ * Set to 0 to disable.
+ */
+ staleTimeoutMs?: number
+ /**
+ * How often to check staleness. Only used when `staleTimeoutMs > 0`.
+ */
+ staleCheckIntervalMs?: number
+}
+
+export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
+
+export const OPS_WS_CLOSE_CODES = {
+ REALTIME_DISABLED: 4001
+} as const
+
+const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
+
+export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
+ let ws: WebSocket | null = null
+ let reconnectAttempts = 0
+ const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
+ ? (options.maxReconnectAttempts as number)
+ : Infinity
+ const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
+ const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
+ let reconnectTimer: ReturnType<typeof setTimeout> | null = null
+ let shouldReconnect = true
+ let isConnecting = false
+ let hasConnectedOnce = false
+ let lastMessageAt = 0
+ const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
+ const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
+ let staleTimer: ReturnType<typeof setInterval> | null = null
+
+ const setStatus = (status: OpsWSStatus) => {
+ options.onStatusChange?.(status)
+ }
+
+ const clearReconnectTimer = () => {
+ if (reconnectTimer) {
+ clearTimeout(reconnectTimer)
+ reconnectTimer = null
+ }
+ }
+
+ const clearStaleTimer = () => {
+ if (staleTimer) {
+ clearInterval(staleTimer)
+ staleTimer = null
+ }
+ }
+
+ const startStaleTimer = () => {
+ clearStaleTimer()
+ if (!staleTimeoutMs || staleTimeoutMs <= 0) return
+ staleTimer = setInterval(() => {
+ if (!shouldReconnect) return
+ if (!ws || ws.readyState !== WebSocket.OPEN) return
+ if (!lastMessageAt) return
+ const ageMs = Date.now() - lastMessageAt
+ if (ageMs > staleTimeoutMs) {
+ // Treat as a half-open connection; closing triggers the normal reconnect path.
+ ws.close()
+ }
+ }, staleCheckIntervalMs)
+ }
+
+ const scheduleReconnect = () => {
+ if (!shouldReconnect) return
+ if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
+
+ // If we're offline, wait for the browser to come back online.
+ if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
+ setStatus('offline')
+ return
+ }
+
+ const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
+ const delay = Math.min(expDelay, maxDelayMs)
+ const jitter = Math.floor(Math.random() * 250)
+ clearReconnectTimer()
+ reconnectTimer = setTimeout(() => {
+ reconnectAttempts++
+ connect()
+ }, delay + jitter)
+ options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
+ }
+
+ const handleOnline = () => {
+ if (!shouldReconnect) return
+ if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
+ connect()
+ }
+
+ const handleOffline = () => {
+ setStatus('offline')
+ }
+
+ const connect = () => {
+ if (!shouldReconnect) return
+ if (isConnecting) return
+ if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
+ if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
+
+ isConnecting = true
+ setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
+ const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
+ const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
+ const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
+
+ // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
+ // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
+    // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
+ const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
+ const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
+ if (rawToken) protocols.push(`jwt.${rawToken}`)
+
+ ws = new WebSocket(wsURL.toString(), protocols)
+
+ ws.onopen = () => {
+ reconnectAttempts = 0
+ isConnecting = false
+ hasConnectedOnce = true
+ clearReconnectTimer()
+ lastMessageAt = Date.now()
+ startStaleTimer()
+ setStatus('connected')
+ options.onOpen?.()
+ }
+
+ ws.onmessage = (e) => {
+ try {
+ const data = JSON.parse(e.data)
+ lastMessageAt = Date.now()
+ onMessage(data)
+ } catch (err) {
+ console.warn('[OpsWS] Failed to parse message:', err)
+ }
+ }
+
+ ws.onerror = (error) => {
+ console.error('[OpsWS] Connection error:', error)
+ options.onError?.(error)
+ }
+
+ ws.onclose = (event) => {
+ isConnecting = false
+ options.onClose?.(event)
+ clearStaleTimer()
+ ws = null
+
+ // If the server explicitly tells us to stop reconnecting, honor it.
+ if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
+ shouldReconnect = false
+ clearReconnectTimer()
+ setStatus('closed')
+ options.onFatalClose?.(event)
+ return
+ }
+
+ scheduleReconnect()
+ }
+ }
+
+ window.addEventListener('online', handleOnline)
+ window.addEventListener('offline', handleOffline)
+ connect()
+
+ return () => {
+ shouldReconnect = false
+ window.removeEventListener('online', handleOnline)
+ window.removeEventListener('offline', handleOffline)
+ clearReconnectTimer()
+ clearStaleTimer()
+ if (ws) ws.close()
+ ws = null
+ setStatus('closed')
+ }
+}
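A minimal usage sketch of subscribeQPS as defined above; the import path and the handler bodies are illustrative assumptions, not part of this patch:

import { subscribeQPS } from '@/api/admin/ops' // path assumed from this patch's file layout

// Start the realtime feed; the returned function tears down the socket, timers and listeners.
const stop = subscribeQPS(
  (sample) => {
    console.log('QPS sample', sample)
  },
  {
    onStatusChange: (status) => console.log('[ops-ws] status:', status),
    onReconnectScheduled: ({ attempt, delayMs }) =>
      console.log(`[ops-ws] retry #${attempt} in ${Math.round(delayMs / 1000)}s`),
    // Close code 4001 (REALTIME_DISABLED) means the server asked us not to reconnect.
    onFatalClose: () => console.warn('[ops-ws] realtime disabled by server'),
    staleTimeoutMs: 120_000
  }
)

// Later, e.g. in onUnmounted():
stop()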
+
+export type OpsSeverity = string
+export type OpsPhase = string
+
+export type AlertSeverity = 'critical' | 'warning' | 'info'
+export type ThresholdMode = 'count' | 'percentage' | 'both'
+export type MetricType =
+ | 'success_rate'
+ | 'error_rate'
+ | 'upstream_error_rate'
+ | 'p95_latency_ms'
+ | 'p99_latency_ms'
+ | 'cpu_usage_percent'
+ | 'memory_usage_percent'
+ | 'concurrency_queue_depth'
+export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
+
+export interface AlertRule {
+ id?: number
+ name: string
+ description?: string
+ enabled: boolean
+ metric_type: MetricType
+ operator: Operator
+ threshold: number
+ window_minutes: number
+ sustained_minutes: number
+ severity: OpsSeverity
+ cooldown_minutes: number
+ notify_email: boolean
+  filters?: Record<string, any>
+ created_at?: string
+ updated_at?: string
+ last_triggered_at?: string | null
+}
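For illustration, a rule in this shape could look like the sketch below; the values are made up, not defaults shipped by this patch:

const highErrorRate: AlertRule = {
  name: 'High error rate',
  description: 'Error rate above 5% for 3 consecutive samples',
  enabled: true,
  metric_type: 'error_rate',
  operator: '>',
  threshold: 5,
  window_minutes: 5,
  sustained_minutes: 3,
  severity: 'critical', // OpsSeverity is a plain string here; the concrete scale is defined elsewhere
  cooldown_minutes: 30,
  notify_email: true
}
// await createAlertRule(highErrorRate)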
+
+export interface AlertEvent {
+ id: number
+ rule_id: number
+ severity: OpsSeverity | string
+ status: 'firing' | 'resolved' | string
+ title?: string
+ description?: string
+ metric_value?: number
+ threshold_value?: number
+  dimensions?: Record<string, any>
+ fired_at: string
+ resolved_at?: string | null
+ email_sent: boolean
+ created_at: string
+}
+
+export interface EmailNotificationConfig {
+ alert: {
+ enabled: boolean
+ recipients: string[]
+ min_severity: AlertSeverity | ''
+ rate_limit_per_hour: number
+ batching_window_seconds: number
+ include_resolved_alerts: boolean
+ }
+ report: {
+ enabled: boolean
+ recipients: string[]
+ daily_summary_enabled: boolean
+ daily_summary_schedule: string
+ weekly_summary_enabled: boolean
+ weekly_summary_schedule: string
+ error_digest_enabled: boolean
+ error_digest_schedule: string
+ error_digest_min_count: number
+ account_health_enabled: boolean
+ account_health_schedule: string
+ account_health_error_rate_threshold: number
+ }
+}
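A sketch of a complete config object for this interface; recipients and cron expressions are placeholders (the settings UI added later in this series describes schedules as cron syntax):

const emailConfig: EmailNotificationConfig = {
  alert: {
    enabled: true,
    recipients: ['ops@example.com'],
    min_severity: 'warning',
    rate_limit_per_hour: 10,
    batching_window_seconds: 60,
    include_resolved_alerts: true
  },
  report: {
    enabled: true,
    recipients: ['ops@example.com'],
    daily_summary_enabled: true,
    daily_summary_schedule: '0 9 * * *',
    weekly_summary_enabled: false,
    weekly_summary_schedule: '0 9 * * 1',
    error_digest_enabled: true,
    error_digest_schedule: '0 * * * *',
    error_digest_min_count: 10,
    account_health_enabled: false,
    account_health_schedule: '0 8 * * *',
    account_health_error_rate_threshold: 20
  }
}
// await updateEmailNotificationConfig(emailConfig)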
+
+export interface OpsDistributedLockSettings {
+ enabled: boolean
+ key: string
+ ttl_seconds: number
+}
+
+export interface OpsAlertRuntimeSettings {
+ evaluation_interval_seconds: number
+ distributed_lock: OpsDistributedLockSettings
+ silencing: {
+ enabled: boolean
+ global_until_rfc3339: string
+ global_reason: string
+ entries?: Array<{
+ rule_id?: number
+      severities?: Array<string>
+ until_rfc3339: string
+ reason: string
+ }>
+ }
+}
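Likewise, a hedged sketch of the runtime settings payload, assuming the P0..P3 severity scale and RFC3339 timestamps that the runtime-settings locale strings later in this series refer to:

const runtime: OpsAlertRuntimeSettings = {
  evaluation_interval_seconds: 60,
  distributed_lock: {
    enabled: true,
    key: 'ops:alert-evaluator', // key name is an assumption; the UI only recommends a prefix
    ttl_seconds: 120
  },
  silencing: {
    enabled: true,
    global_until_rfc3339: '2026-01-05T00:00:00Z',
    global_reason: 'planned maintenance',
    entries: [
      { rule_id: 1, severities: ['P2', 'P3'], until_rfc3339: '2026-01-05T00:00:00Z', reason: 'noisy rule' }
    ]
  }
}
// await updateAlertRuntimeSettings(runtime)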
+
+export interface OpsErrorLog {
+ id: number
+ created_at: string
+ phase: OpsPhase
+ type: string
+ severity: OpsSeverity
+ status_code: number
+ platform: string
+ model: string
+ latency_ms?: number | null
+ client_request_id: string
+ request_id: string
+ message: string
+
+ user_id?: number | null
+ api_key_id?: number | null
+ account_id?: number | null
+ group_id?: number | null
+
+ client_ip?: string | null
+ request_path?: string
+ stream?: boolean
+}
+
+export interface OpsErrorDetail extends OpsErrorLog {
+ error_body: string
+ user_agent: string
+
+ auth_latency_ms?: number | null
+ routing_latency_ms?: number | null
+ upstream_latency_ms?: number | null
+ response_latency_ms?: number | null
+ time_to_first_token_ms?: number | null
+
+ request_body: string
+ request_body_truncated: boolean
+ request_body_bytes?: number | null
+
+ is_business_limited: boolean
+}
+
+export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
+
+export async function getDashboardOverview(
+ params: {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ mode?: OpsQueryMode
+ },
+ options: OpsRequestOptions = {}
+): Promise {
+ const { data } = await apiClient.get('/admin/ops/dashboard/overview', {
+ params,
+ signal: options.signal
+ })
+ return data
+}
+
+export async function getThroughputTrend(
+ params: {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ mode?: OpsQueryMode
+ },
+ options: OpsRequestOptions = {}
+): Promise {
+ const { data } = await apiClient.get('/admin/ops/dashboard/throughput-trend', {
+ params,
+ signal: options.signal
+ })
+ return data
+}
+
+export async function getLatencyHistogram(
+ params: {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ mode?: OpsQueryMode
+ },
+ options: OpsRequestOptions = {}
+): Promise {
+ const { data } = await apiClient.get('/admin/ops/dashboard/latency-histogram', {
+ params,
+ signal: options.signal
+ })
+ return data
+}
+
+export async function getErrorTrend(
+ params: {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ mode?: OpsQueryMode
+ },
+ options: OpsRequestOptions = {}
+): Promise {
+ const { data } = await apiClient.get('/admin/ops/dashboard/error-trend', {
+ params,
+ signal: options.signal
+ })
+ return data
+}
+
+export async function getErrorDistribution(
+ params: {
+ time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ mode?: OpsQueryMode
+ },
+ options: OpsRequestOptions = {}
+): Promise {
+ const { data } = await apiClient.get('/admin/ops/dashboard/error-distribution', {
+ params,
+ signal: options.signal
+ })
+ return data
+}
+
+export async function listErrorLogs(params: {
+ page?: number
+ page_size?: number
+ time_range?: string
+ start_time?: string
+ end_time?: string
+ platform?: string
+ group_id?: number | null
+ account_id?: number | null
+ phase?: string
+ q?: string
+ status_codes?: string
+}): Promise<OpsErrorLogsResponse> {
+ const { data } = await apiClient.get('/admin/ops/errors', { params })
+ return data
+}
+
+export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
+ const { data } = await apiClient.get(`/admin/ops/errors/${id}`)
+ return data
+}
+
+export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise {
+ const { data } = await apiClient.post(`/admin/ops/errors/${id}/retry`, req)
+ return data
+}
+
+export async function listRequestDetails(params: OpsRequestDetailsParams): Promise {
+ const { data } = await apiClient.get('/admin/ops/requests', { params })
+ return data
+}
+
+// Alert rules
+export async function listAlertRules(): Promise<AlertRule[]> {
+ const { data } = await apiClient.get('/admin/ops/alert-rules')
+ return data
+}
+
+export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
+ const { data } = await apiClient.post('/admin/ops/alert-rules', rule)
+ return data
+}
+
+export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
+ const { data } = await apiClient.put(`/admin/ops/alert-rules/${id}`, rule)
+ return data
+}
+
+export async function deleteAlertRule(id: number): Promise<void> {
+ await apiClient.delete(`/admin/ops/alert-rules/${id}`)
+}
+
+export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
+ const { data } = await apiClient.get('/admin/ops/alert-events', { params: { limit } })
+ return data
+}
+
+// Email notification config
+export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
+ const { data } = await apiClient.get('/admin/ops/email-notification/config')
+ return data
+}
+
+export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
+ const { data } = await apiClient.put('/admin/ops/email-notification/config', config)
+ return data
+}
+
+// Runtime settings (DB-backed)
+export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
+ const { data } = await apiClient.get('/admin/ops/runtime/alert')
+ return data
+}
+
+export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
+ const { data } = await apiClient.put('/admin/ops/runtime/alert', config)
+ return data
+}
+
+export const opsAPI = {
+ getDashboardOverview,
+ getThroughputTrend,
+ getLatencyHistogram,
+ getErrorTrend,
+ getErrorDistribution,
+ getConcurrencyStats,
+ getAccountAvailabilityStats,
+ subscribeQPS,
+ listErrorLogs,
+ getErrorLogDetail,
+ retryErrorRequest,
+ listRequestDetails,
+ listAlertRules,
+ createAlertRule,
+ updateAlertRule,
+ deleteAlertRule,
+ listAlertEvents,
+ getEmailNotificationConfig,
+ updateEmailNotificationConfig,
+ getAlertRuntimeSettings,
+ updateAlertRuntimeSettings
+}
+
+export default opsAPI
diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts
index 6b46de7d..37b12e40 100644
--- a/frontend/src/api/admin/settings.ts
+++ b/frontend/src/api/admin/settings.ts
@@ -34,9 +34,22 @@ export interface SystemSettings {
turnstile_enabled: boolean
turnstile_site_key: string
turnstile_secret_key_configured: boolean
+
+ // Model fallback configuration
+ enable_model_fallback: boolean
+ fallback_model_anthropic: string
+ fallback_model_openai: string
+ fallback_model_gemini: string
+ fallback_model_antigravity: string
+
// Identity patch configuration (Claude -> Gemini)
enable_identity_patch: boolean
identity_patch_prompt: string
+
+ // Ops Monitoring (vNext)
+ ops_monitoring_enabled: boolean
+ ops_realtime_monitoring_enabled: boolean
+ ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
}
export interface UpdateSettingsRequest {
@@ -60,8 +73,16 @@ export interface UpdateSettingsRequest {
turnstile_enabled?: boolean
turnstile_site_key?: string
turnstile_secret_key?: string
+ enable_model_fallback?: boolean
+ fallback_model_anthropic?: string
+ fallback_model_openai?: string
+ fallback_model_gemini?: string
+ fallback_model_antigravity?: string
enable_identity_patch?: boolean
identity_patch_prompt?: string
+ ops_monitoring_enabled?: boolean
+ ops_realtime_monitoring_enabled?: boolean
+ ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
}
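A hedged sketch of toggling the new ops flags through this request shape; how the payload is submitted is outside this hunk:

const payload: UpdateSettingsRequest = {
  ops_monitoring_enabled: true,
  ops_realtime_monitoring_enabled: true,
  ops_query_mode_default: 'auto'
}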
/**
diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts
index 4e53069a..3827498b 100644
--- a/frontend/src/api/client.ts
+++ b/frontend/src/api/client.ts
@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
return response
},
(error: AxiosError>) => {
+ // Request cancellation: keep the original axios cancellation error so callers can ignore it.
+ // Otherwise we'd misclassify it as a generic "network error".
+ if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
+ return Promise.reject(error)
+ }
+
// Handle common errors
if (error.response) {
const { status, data } = error.response
+ const url = String(error.config?.url || '')
+
+ // Validate `data` shape to avoid HTML error pages breaking our error handling.
+      const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
+
+ // Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
+ // from ops pages to avoid broken UI states.
+ if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
+ try {
+ localStorage.setItem('ops_monitoring_enabled_cached', 'false')
+ } catch {
+ // ignore localStorage failures
+ }
+ try {
+ window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
+ } catch {
+ // ignore event failures
+ }
+
+ if (window.location.pathname.startsWith('/admin/ops')) {
+ window.location.href = '/admin/settings'
+ }
+
+ return Promise.reject({
+ status,
+ code: 'OPS_DISABLED',
+ message: apiData.message || error.message,
+ url
+ })
+ }
// 401: Unauthorized - clear token and redirect to login
if (status === 401) {
@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
// Return structured error
return Promise.reject({
status,
- code: data?.code,
- message: data?.message || error.message
+ code: apiData.code,
+ message: apiData.message || apiData.detail || error.message
})
}
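A hedged sketch of how a caller might handle the rejection shapes produced by this interceptor; the helper name is illustrative, and the isCancel re-check mirrors the interceptor's own ERR_CANCELED pass-through:

import axios from 'axios'
import { opsAPI } from '@/api/admin/ops' // path assumed

async function loadOverviewOrNull() {
  try {
    return await opsAPI.getDashboardOverview({ time_range: '1h' })
  } catch (err: any) {
    if (axios.isCancel(err)) return null          // aborted request: ignore
    if (err?.code === 'OPS_DISABLED') return null // feature-flagged 404: interceptor already redirected
    throw err                                     // anything else keeps the structured { status, code, message }
  }
}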
From 337a188660174d8f7bd7534ff60f0e9e8acec866 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:59:02 +0800
Subject: [PATCH 12/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AF=E7=8A=B6=E6=80=81?=
=?UTF-8?q?):=20=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?=
=?UTF-8?q?=E7=8A=B6=E6=80=81=E7=AE=A1=E7=90=86=E5=92=8C=E8=B7=AF=E7=94=B1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 新增 adminSettings store 管理 ops 配置状态
- 注册 adminSettings store 到全局 store
- 添加 ops 监控相关路由(dashboard, alerts, realtime, settings)
---
frontend/src/router/index.ts | 12 +++
frontend/src/stores/adminSettings.ts | 130 +++++++++++++++++++++++++++
frontend/src/stores/index.ts | 1 +
3 files changed, 143 insertions(+)
create mode 100644 frontend/src/stores/adminSettings.ts
diff --git a/frontend/src/router/index.ts b/frontend/src/router/index.ts
index 48a6f0fd..c8d0214c 100644
--- a/frontend/src/router/index.ts
+++ b/frontend/src/router/index.ts
@@ -163,6 +163,18 @@ const routes: RouteRecordRaw[] = [
descriptionKey: 'admin.dashboard.description'
}
},
+ {
+ path: '/admin/ops',
+ name: 'AdminOps',
+ component: () => import('@/views/admin/ops/OpsDashboard.vue'),
+ meta: {
+ requiresAuth: true,
+ requiresAdmin: true,
+ title: 'Ops Monitoring',
+ titleKey: 'admin.ops.title',
+ descriptionKey: 'admin.ops.description'
+ }
+ },
{
path: '/admin/users',
name: 'AdminUsers',
diff --git a/frontend/src/stores/adminSettings.ts b/frontend/src/stores/adminSettings.ts
new file mode 100644
index 00000000..460cc92b
--- /dev/null
+++ b/frontend/src/stores/adminSettings.ts
@@ -0,0 +1,130 @@
+import { defineStore } from 'pinia'
+import { ref } from 'vue'
+import { adminAPI } from '@/api'
+
+export const useAdminSettingsStore = defineStore('adminSettings', () => {
+ const loaded = ref(false)
+ const loading = ref(false)
+
+ const readCachedBool = (key: string, defaultValue: boolean): boolean => {
+ try {
+ const raw = localStorage.getItem(key)
+ if (raw === 'true') return true
+ if (raw === 'false') return false
+ } catch {
+ // ignore localStorage failures
+ }
+ return defaultValue
+ }
+
+ const writeCachedBool = (key: string, value: boolean) => {
+ try {
+ localStorage.setItem(key, value ? 'true' : 'false')
+ } catch {
+ // ignore localStorage failures
+ }
+ }
+
+ const readCachedString = (key: string, defaultValue: string): string => {
+ try {
+ const raw = localStorage.getItem(key)
+ if (typeof raw === 'string' && raw.length > 0) return raw
+ } catch {
+ // ignore localStorage failures
+ }
+ return defaultValue
+ }
+
+ const writeCachedString = (key: string, value: string) => {
+ try {
+ localStorage.setItem(key, value)
+ } catch {
+ // ignore localStorage failures
+ }
+ }
+
+  // Default to enabled, but honor the cached value to reduce UI flicker on first paint.
+ const opsMonitoringEnabled = ref(readCachedBool('ops_monitoring_enabled_cached', true))
+ const opsRealtimeMonitoringEnabled = ref(readCachedBool('ops_realtime_monitoring_enabled_cached', true))
+ const opsQueryModeDefault = ref(readCachedString('ops_query_mode_default_cached', 'auto'))
+
+  async function fetch(force = false): Promise<void> {
+ if (loaded.value && !force) return
+ if (loading.value) return
+
+ loading.value = true
+ try {
+ const settings = await adminAPI.settings.getSettings()
+ opsMonitoringEnabled.value = settings.ops_monitoring_enabled ?? true
+ writeCachedBool('ops_monitoring_enabled_cached', opsMonitoringEnabled.value)
+
+ opsRealtimeMonitoringEnabled.value = settings.ops_realtime_monitoring_enabled ?? true
+ writeCachedBool('ops_realtime_monitoring_enabled_cached', opsRealtimeMonitoringEnabled.value)
+
+ opsQueryModeDefault.value = settings.ops_query_mode_default || 'auto'
+ writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
+
+ loaded.value = true
+ } catch (err) {
+ // Keep cached/default value: do not "flip" the UI based on a transient fetch failure.
+ loaded.value = true
+ console.error('[adminSettings] Failed to fetch settings:', err)
+ } finally {
+ loading.value = false
+ }
+ }
+
+ function setOpsMonitoringEnabledLocal(value: boolean) {
+ opsMonitoringEnabled.value = value
+ writeCachedBool('ops_monitoring_enabled_cached', value)
+ loaded.value = true
+ }
+
+ function setOpsRealtimeMonitoringEnabledLocal(value: boolean) {
+ opsRealtimeMonitoringEnabled.value = value
+ writeCachedBool('ops_realtime_monitoring_enabled_cached', value)
+ loaded.value = true
+ }
+
+ function setOpsQueryModeDefaultLocal(value: string) {
+ opsQueryModeDefault.value = value || 'auto'
+ writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
+ loaded.value = true
+ }
+
+ // Keep UI consistent if we learn that ops is disabled via feature-gated 404s.
+ // (event is dispatched from the axios interceptor)
+ let eventHandlerCleanup: (() => void) | null = null
+
+ function initializeEventListeners() {
+ if (eventHandlerCleanup) return
+
+ try {
+ const handler = () => {
+ setOpsMonitoringEnabledLocal(false)
+ }
+ window.addEventListener('ops-monitoring-disabled', handler)
+ eventHandlerCleanup = () => {
+ window.removeEventListener('ops-monitoring-disabled', handler)
+ }
+ } catch {
+ // ignore window access failures (SSR)
+ }
+ }
+
+ if (typeof window !== 'undefined') {
+ initializeEventListeners()
+ }
+
+ return {
+ loaded,
+ loading,
+ opsMonitoringEnabled,
+ opsRealtimeMonitoringEnabled,
+ opsQueryModeDefault,
+ fetch,
+ setOpsMonitoringEnabledLocal,
+ setOpsRealtimeMonitoringEnabledLocal,
+ setOpsQueryModeDefaultLocal
+ }
+})
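A hedged usage sketch of the store defined above, e.g. from a navigation guard; the guard itself is an assumption and is not added by this patch:

import { useAdminSettingsStore } from '@/stores/adminSettings'

export async function ensureOpsEnabled(): Promise<boolean> {
  const settings = useAdminSettingsStore()
  await settings.fetch() // no-op when already loaded; fetch errors fall back to the cached value
  return settings.opsMonitoringEnabled
}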
diff --git a/frontend/src/stores/index.ts b/frontend/src/stores/index.ts
index 0e4caef0..05c18e7e 100644
--- a/frontend/src/stores/index.ts
+++ b/frontend/src/stores/index.ts
@@ -5,6 +5,7 @@
export { useAuthStore } from './auth'
export { useAppStore } from './app'
+export { useAdminSettingsStore } from './adminSettings'
export { useSubscriptionStore } from './subscriptions'
export { useOnboardingStore } from './onboarding'
From fc32b577986ecc4478f901da2479972f84f7c553 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 20:59:33 +0800
Subject: [PATCH 13/53] =?UTF-8?q?feat(=E5=9B=BD=E9=99=85=E5=8C=96):=20?=
=?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=A4=9A?=
=?UTF-8?q?=E8=AF=AD=E8=A8=80=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 添加英文翻译(en.ts)包含 ops 监控所有文案
- 添加中文翻译(zh.ts)包含 ops 监控所有文案
---
frontend/src/i18n/locales/en.ts | 382 ++++++++++++++++++++++++++++++++
frontend/src/i18n/locales/zh.ts | 382 ++++++++++++++++++++++++++++++++
2 files changed, 764 insertions(+)
diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts
index 393641a7..f80a235f 100644
--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -131,6 +131,7 @@ export default {
noData: 'No data',
success: 'Success',
error: 'Error',
+ critical: 'Critical',
warning: 'Warning',
info: 'Info',
active: 'Active',
@@ -145,6 +146,8 @@ export default {
copiedToClipboard: 'Copied to clipboard',
copyFailed: 'Failed to copy',
contactSupport: 'Contact Support',
+ add: 'Add',
+ invalidEmail: 'Please enter a valid email address',
selectOption: 'Select an option',
searchPlaceholder: 'Search...',
noOptionsFound: 'No options found',
@@ -177,6 +180,7 @@ export default {
accounts: 'Accounts',
proxies: 'Proxies',
redeemCodes: 'Redeem Codes',
+ ops: 'Ops',
settings: 'Settings',
myAccount: 'My Account',
lightMode: 'Light Mode',
@@ -1713,6 +1717,370 @@ export default {
failedToLoad: 'Failed to load usage records'
},
+ // Ops Monitoring
+ ops: {
+ title: 'Ops Monitoring',
+ description: 'Operational monitoring and troubleshooting',
+ // Dashboard
+ systemHealth: 'System Health',
+ overview: 'Overview',
+ noSystemMetrics: 'No system metrics collected yet.',
+ collectedAt: 'Collected at:',
+ window: 'window',
+ cpu: 'CPU',
+ memory: 'Memory',
+ db: 'DB',
+ redis: 'Redis',
+ goroutines: 'Goroutines',
+ jobs: 'Jobs',
+ active: 'active',
+ idle: 'idle',
+ ok: 'ok',
+ lastRun: 'last_run:',
+ lastSuccess: 'last_success:',
+ lastError: 'last_error:',
+ noData: 'No data.',
+ loadingText: 'loading',
+ ready: 'ready',
+ requestsTotal: 'Requests (total)',
+ slaScope: 'SLA scope:',
+ tokens: 'Tokens',
+ tps: 'TPS:',
+ current: 'current',
+ peak: 'peak',
+ sla: 'SLA (excl business limits)',
+ businessLimited: 'business_limited:',
+ errors: 'Errors',
+ errorRate: 'error_rate:',
+ upstreamRate: 'upstream_rate:',
+ latencyDuration: 'Latency (duration_ms)',
+ ttftLabel: 'TTFT (first_token_ms)',
+ p50: 'p50:',
+ p90: 'p90:',
+ p95: 'p95:',
+ p99: 'p99:',
+ avg: 'avg:',
+ max: 'max:',
+ qps: 'QPS',
+ requests: 'Requests',
+ upstream: 'Upstream',
+ client: 'Client',
+ system: 'System',
+ other: 'Other',
+ errorsSla: 'Errors (SLA scope)',
+ upstreamExcl429529: 'Upstream (excl 429/529)',
+ failedToLoadData: 'Failed to load ops data.',
+ tpsK: 'TPS (K)',
+ top: 'Top:',
+ throughputTrend: 'Throughput Trend',
+ latencyHistogram: 'Latency Histogram',
+ errorTrend: 'Error Trend',
+ errorDistribution: 'Error Distribution',
+ // Error Log
+ errorLog: {
+ timeId: 'Time / ID',
+ context: 'Context',
+ status: 'Status',
+ message: 'Message',
+ latency: 'Latency',
+ action: 'Action',
+ noErrors: 'No errors in this window.',
+ grp: 'GRP:',
+ acc: 'ACC:',
+ details: 'Details',
+ phase: 'Phase'
+ },
+ // Error Details Modal
+ errorDetails: {
+ upstreamErrors: 'Upstream Errors',
+ requestErrors: 'Request Errors',
+ total: 'Total:',
+ searchPlaceholder: 'Search request_id / client_request_id / message',
+ accountIdPlaceholder: 'account_id'
+ },
+ // Error Detail Modal
+ errorDetail: {
+ loading: 'Loading…',
+ requestId: 'Request ID',
+ time: 'Time',
+ phase: 'Phase',
+ status: 'Status',
+ message: 'Message',
+ basicInfo: 'Basic Info',
+ platform: 'Platform',
+ model: 'Model',
+ latency: 'Latency',
+ ttft: 'TTFT',
+ businessLimited: 'Business Limited',
+ requestPath: 'Request Path',
+ timings: 'Timings',
+ auth: 'Auth',
+ routing: 'Routing',
+ upstream: 'Upstream',
+ response: 'Response',
+ retry: 'Retry',
+ retryClient: 'Retry (Client)',
+ retryUpstream: 'Retry (Upstream pinned)',
+ pinnedAccountId: 'Pinned account_id',
+ retryNotes: 'Retry Notes',
+ requestBody: 'Request Body',
+ errorBody: 'Error Body',
+ trimmed: 'trimmed',
+ confirmRetry: 'Confirm Retry',
+ retrySuccess: 'Retry succeeded',
+ retryFailed: 'Retry failed',
+ na: 'N/A',
+ retryHint: 'Retry will resend the request with the same parameters',
+ retryClientHint: 'Use client retry (no account pinning)',
+ retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
+ pinnedAccountIdHint: '(auto from error log)',
+ retryNote1: 'Retry will use the same request body and parameters',
+ retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
+ retryNote3: 'Client retry will reselect an account',
+ confirmRetryMessage: 'Confirm retry this request?',
+ confirmRetryHint: 'Will resend with the same request parameters'
+ },
+ requestDetails: {
+ title: 'Request Details',
+ details: 'Details',
+ rangeLabel: 'Window: {range}',
+ rangeMinutes: '{n} minutes',
+ rangeHours: '{n} hours',
+ empty: 'No requests in this window.',
+ emptyHint: 'Try a different time range or remove filters.',
+ failedToLoad: 'Failed to load request details',
+ requestIdCopied: 'Request ID copied',
+ copyFailed: 'Copy failed',
+ copy: 'Copy',
+ viewError: 'View Error',
+ kind: {
+ success: 'SUCCESS',
+ error: 'ERROR'
+ },
+ table: {
+ time: 'Time',
+ kind: 'Kind',
+ platform: 'Platform',
+ model: 'Model',
+ duration: 'Duration',
+ status: 'Status',
+ requestId: 'Request ID',
+ actions: 'Actions'
+ }
+ },
+ alertEvents: {
+ title: 'Alert Events',
+ description: 'Recent alert firing/resolution records (email-only)',
+ loading: 'Loading...',
+ empty: 'No alert events',
+ loadFailed: 'Failed to load alert events',
+ table: {
+ time: 'Time',
+ status: 'Status',
+ severity: 'Severity',
+ title: 'Title',
+ metric: 'Metric / Threshold',
+ email: 'Email Sent'
+ }
+ },
+ alertRules: {
+ title: 'Alert Rules',
+ description: 'Create and manage threshold-based system alerts (email-only)',
+ loading: 'Loading...',
+ empty: 'No alert rules',
+ loadFailed: 'Failed to load alert rules',
+ saveFailed: 'Failed to save alert rule',
+ deleteFailed: 'Failed to delete alert rule',
+ create: 'Create Rule',
+ createTitle: 'Create Alert Rule',
+ editTitle: 'Edit Alert Rule',
+ deleteConfirmTitle: 'Delete this rule?',
+ deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
+ metrics: {
+ successRate: 'Success Rate (%)',
+ errorRate: 'Error Rate (%)',
+ p95: 'P95 Latency (ms)',
+ p99: 'P99 Latency (ms)',
+ cpu: 'CPU Usage (%)',
+ memory: 'Memory Usage (%)',
+ queueDepth: 'Concurrency Queue Depth'
+ },
+ table: {
+ name: 'Name',
+ metric: 'Metric',
+ severity: 'Severity',
+ enabled: 'Enabled',
+ actions: 'Actions'
+ },
+ form: {
+ name: 'Name',
+ description: 'Description',
+ metric: 'Metric',
+ operator: 'Operator',
+ threshold: 'Threshold',
+ severity: 'Severity',
+ window: 'Window (minutes)',
+ sustained: 'Sustained (samples)',
+ cooldown: 'Cooldown (minutes)',
+ enabled: 'Enabled',
+ notifyEmail: 'Send email notifications'
+ },
+ validation: {
+ title: 'Please fix the following issues',
+ invalid: 'Invalid rule',
+ nameRequired: 'Name is required',
+ metricRequired: 'Metric is required',
+ operatorRequired: 'Operator is required',
+ thresholdRequired: 'Threshold must be a number',
+ windowRange: 'Window must be one of: 1, 5, 60 minutes',
+ sustainedRange: 'Sustained must be between 1 and 1440 samples',
+ cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
+ }
+ },
+ runtime: {
+ title: 'Ops Runtime Settings',
+ description: 'Stored in database; changes take effect without editing config files.',
+ loading: 'Loading...',
+ noData: 'No runtime settings available',
+ loadFailed: 'Failed to load runtime settings',
+ saveSuccess: 'Runtime settings saved',
+ saveFailed: 'Failed to save runtime settings',
+ alertTitle: 'Alert Evaluator',
+ groupAvailabilityTitle: 'Group Availability Monitor',
+ evalIntervalSeconds: 'Evaluation Interval (seconds)',
+ silencing: {
+ title: 'Alert Silencing (Maintenance Mode)',
+ enabled: 'Enable silencing',
+ globalUntil: 'Silence until (RFC3339)',
+ untilPlaceholder: '2026-01-05T00:00:00Z',
+ untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
+ reason: 'Reason',
+ reasonPlaceholder: 'e.g., planned maintenance',
+ entries: {
+ title: 'Advanced: targeted silencing',
+ hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
+ add: 'Add Entry',
+ empty: 'No targeted entries',
+ entryTitle: 'Entry #{n}',
+ ruleId: 'Rule ID (optional)',
+ ruleIdPlaceholder: 'e.g., 1',
+ severities: 'Severities (optional)',
+ severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
+ until: 'Until (RFC3339)',
+ reason: 'Reason',
+ validation: {
+ untilRequired: 'Entry until time is required',
+ untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
+ ruleIdPositive: 'Entry rule_id must be a positive integer',
+ severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
+ }
+ },
+ validation: {
+ timeFormat: 'Silence time must be a valid RFC3339 timestamp'
+ }
+ },
+ lockEnabled: 'Distributed Lock Enabled',
+ lockKey: 'Distributed Lock Key',
+ lockTTLSeconds: 'Distributed Lock TTL (seconds)',
+ showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
+ advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
+ evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
+ validation: {
+ title: 'Please fix the following issues',
+ invalid: 'Invalid settings',
+ evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
+ lockKeyRequired: 'Distributed lock key is required when lock is enabled',
+ lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
+ lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
+ lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
+ }
+ },
+ email: {
+ title: 'Email Notification',
+ description: 'Configure alert/report email notifications (stored in database).',
+ loading: 'Loading...',
+ noData: 'No email notification config',
+ loadFailed: 'Failed to load email notification config',
+ saveSuccess: 'Email notification config saved',
+ saveFailed: 'Failed to save email notification config',
+ alertTitle: 'Alert Emails',
+ reportTitle: 'Report Emails',
+ recipients: 'Recipients',
+      recipientsHint: 'If empty, the system may fall back to the first admin email.',
+ minSeverity: 'Min Severity',
+ minSeverityAll: 'All severities',
+ rateLimitPerHour: 'Rate limit per hour',
+ batchWindowSeconds: 'Batch window (seconds)',
+ includeResolved: 'Include resolved alerts',
+ dailySummary: 'Daily summary',
+ weeklySummary: 'Weekly summary',
+ errorDigest: 'Error digest',
+ errorDigestMinCount: 'Min errors for digest',
+ accountHealth: 'Account health',
+ accountHealthThreshold: 'Error rate threshold (%)',
+ cronPlaceholder: 'Cron expression',
+ reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
+ validation: {
+ title: 'Please fix the following issues',
+ invalid: 'Invalid email notification config',
+ alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
+ reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
+ invalidRecipients: 'One or more recipient emails are invalid',
+ rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
+ batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
+ cronRequired: 'A cron expression is required when schedule is enabled',
+ cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
+ digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
+ accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
+ }
+ },
+ concurrency: {
+ title: 'Concurrency / Queue',
+ byPlatform: 'By Platform',
+ byGroup: 'By Group',
+ byAccount: 'By Account',
+ totalRows: '{count} rows',
+ disabledHint: 'Realtime monitoring is disabled in settings.',
+ empty: 'No data',
+ queued: 'Queue {count}',
+ rateLimited: 'Rate-limited {count}',
+ errorAccounts: 'Errors {count}',
+ loadFailed: 'Failed to load concurrency data'
+ },
+ realtime: {
+ connected: 'Realtime connected',
+ connecting: 'Realtime connecting',
+ reconnecting: 'Realtime reconnecting',
+ offline: 'Realtime offline',
+ closed: 'Realtime closed',
+ reconnectIn: 'retry in {seconds}s'
+ },
+ queryMode: {
+ auto: 'Auto',
+ raw: 'Raw',
+ preagg: 'Preagg'
+ },
+ accountAvailability: {
+ available: 'Available',
+ unavailable: 'Unavailable',
+ accountError: 'Error'
+ },
+ tooltips: {
+ throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
+ latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
+ errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
+ errorDistribution: 'Error distribution by status code.'
+ },
+ charts: {
+ emptyRequest: 'No requests in this window.',
+ emptyError: 'No errors in this window.',
+ resetZoom: 'Reset',
+ resetZoomHint: 'Reset zoom (if enabled)',
+ downloadChart: 'Download',
+ downloadChartHint: 'Download chart as image'
+ }
+ },
+
// Settings
settings: {
title: 'System Settings',
@@ -1803,6 +2171,20 @@ export default {
sending: 'Sending...',
enterRecipientHint: 'Please enter a recipient email address'
},
+ opsMonitoring: {
+ title: 'Ops Monitoring',
+ description: 'Enable ops monitoring for troubleshooting and health visibility',
+ disabled: 'Ops monitoring is disabled',
+ enabled: 'Enable Ops Monitoring',
+ enabledHint: 'Enable the ops monitoring module (admin only)',
+ realtimeEnabled: 'Enable Realtime Monitoring',
+ realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
+ queryMode: 'Default Query Mode',
+ queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
+ queryModeAuto: 'Auto (recommended)',
+ queryModeRaw: 'Raw (most accurate, slower)',
+ queryModePreagg: 'Preagg (fastest, requires aggregation)'
+ },
adminApiKey: {
title: 'Admin API Key',
description: 'Global API key for external system integration with full admin access',
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts
index fb46bbbe..646511f4 100644
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -128,6 +128,7 @@ export default {
noData: '暂无数据',
success: '成功',
error: '错误',
+ critical: '严重',
warning: '警告',
info: '提示',
active: '启用',
@@ -142,6 +143,8 @@ export default {
copiedToClipboard: '已复制到剪贴板',
copyFailed: '复制失败',
contactSupport: '联系客服',
+ add: '添加',
+ invalidEmail: '请输入有效的邮箱地址',
selectOption: '请选择',
searchPlaceholder: '搜索...',
noOptionsFound: '无匹配选项',
@@ -175,6 +178,7 @@ export default {
accounts: '账号管理',
proxies: 'IP管理',
redeemCodes: '兑换码',
+ ops: '运维监控',
settings: '系统设置',
myAccount: '我的账户',
lightMode: '浅色模式',
@@ -1858,6 +1862,370 @@ export default {
failedToLoad: '加载使用记录失败'
},
+ // Ops Monitoring
+ ops: {
+ title: '运维监控',
+ description: '运维监控与排障',
+ // Dashboard
+ systemHealth: '系统健康',
+ overview: '概览',
+ noSystemMetrics: '尚未收集系统指标。',
+ collectedAt: '采集时间:',
+ window: '窗口',
+ cpu: 'CPU',
+ memory: '内存',
+ db: '数据库',
+ redis: 'Redis',
+ goroutines: '协程',
+ jobs: '后台任务',
+ active: '活跃',
+ idle: '空闲',
+ ok: '正常',
+ lastRun: '最近运行',
+ lastSuccess: '最近成功',
+ lastError: '最近错误',
+ noData: '暂无数据',
+ loadingText: '加载中...',
+ ready: '就绪',
+ requestsTotal: '请求(总计)',
+ slaScope: 'SLA 范围:',
+ tokens: 'Token',
+ tps: 'TPS',
+ current: '当前',
+ peak: '峰值',
+ sla: 'SLA(排除业务限制)',
+ businessLimited: '业务限制:',
+ errors: '错误',
+ errorRate: '错误率:',
+ upstreamRate: '上游错误率:',
+ latencyDuration: '延迟 (duration_ms)',
+ ttftLabel: 'TTFT (first_token_ms)',
+ p50: 'p50',
+ p90: 'p90',
+ p95: 'p95',
+ p99: 'p99',
+ avg: 'avg',
+ max: 'max',
+ qps: 'QPS',
+ requests: '请求',
+ upstream: '上游',
+ client: '客户端',
+ system: '系统',
+ other: '其他',
+ errorsSla: '错误(SLA范围)',
+ upstreamExcl429529: '上游(排除429/529)',
+ failedToLoadData: '加载运维数据失败',
+ tpsK: 'TPS (K)',
+ top: '最高:',
+ throughputTrend: '吞吐趋势',
+ latencyHistogram: '延迟分布',
+ errorTrend: '错误趋势',
+ errorDistribution: '错误分布',
+ // Error Log
+ errorLog: {
+ timeId: '时间 / ID',
+ context: '上下文',
+ status: '状态码',
+ message: '消息',
+ latency: '延迟',
+ action: '操作',
+ noErrors: '该窗口内暂无错误。',
+ grp: 'GRP:',
+ acc: 'ACC:',
+ details: '详情',
+ phase: '阶段'
+ },
+ // Error Details Modal
+ errorDetails: {
+ upstreamErrors: '上游错误',
+ requestErrors: '请求错误',
+ total: '总计:',
+ searchPlaceholder: '搜索 request_id / client_request_id / message',
+ accountIdPlaceholder: 'account_id'
+ },
+ // Error Detail Modal
+ errorDetail: {
+ loading: '加载中…',
+ requestId: '请求 ID',
+ time: '时间',
+ phase: '阶段',
+ status: '状态码',
+ message: '消息',
+ basicInfo: '基本信息',
+ platform: '平台',
+ model: '模型',
+ latency: '延迟',
+ ttft: 'TTFT',
+ businessLimited: '业务限制',
+ requestPath: '请求路径',
+ timings: '时序信息',
+ auth: '认证',
+ routing: '路由',
+ upstream: '上游',
+ response: '响应',
+ retry: '重试',
+ retryClient: '重试(客户端)',
+ retryUpstream: '重试(上游固定)',
+ pinnedAccountId: '固定 account_id',
+ retryNotes: '重试说明',
+ requestBody: '请求体',
+ errorBody: '错误体',
+ trimmed: '已截断',
+ confirmRetry: '确认重试',
+ retrySuccess: '重试成功',
+ retryFailed: '重试失败',
+ na: 'N/A',
+ retryHint: '重试将使用相同的请求参数重新发送请求',
+ retryClientHint: '使用客户端重试(不固定账号)',
+ retryUpstreamHint: '使用上游固定重试(固定到错误的账号)',
+ pinnedAccountIdHint: '(自动从错误日志获取)',
+ retryNote1: '重试会使用相同的请求体和参数',
+ retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败',
+ retryNote3: '客户端重试会重新选择账号',
+ confirmRetryMessage: '确认要重试该请求吗?',
+ confirmRetryHint: '将使用相同的请求参数重新发送'
+ },
+ requestDetails: {
+ title: '请求明细',
+ details: '明细',
+ rangeLabel: '窗口:{range}',
+ rangeMinutes: '{n} 分钟',
+ rangeHours: '{n} 小时',
+ empty: '该窗口内暂无请求。',
+ emptyHint: '可尝试调整时间范围或取消部分筛选。',
+ failedToLoad: '加载请求明细失败',
+ requestIdCopied: '请求ID已复制',
+ copyFailed: '复制失败',
+ copy: '复制',
+ viewError: '查看错误',
+ kind: {
+ success: '成功',
+ error: '失败'
+ },
+ table: {
+ time: '时间',
+ kind: '类型',
+ platform: '平台',
+ model: '模型',
+ duration: '耗时',
+ status: '状态码',
+ requestId: '请求ID',
+ actions: '操作'
+ }
+ },
+ alertEvents: {
+ title: '告警事件',
+ description: '最近的告警触发/恢复记录(仅邮件通知)',
+ loading: '加载中...',
+ empty: '暂无告警事件',
+ loadFailed: '加载告警事件失败',
+ table: {
+ time: '时间',
+ status: '状态',
+ severity: '级别',
+ title: '标题',
+ metric: '指标 / 阈值',
+ email: '邮件已发送'
+ }
+ },
+ alertRules: {
+ title: '告警规则',
+ description: '创建与管理系统阈值告警(仅邮件通知)',
+ loading: '加载中...',
+ empty: '暂无告警规则',
+ loadFailed: '加载告警规则失败',
+ saveFailed: '保存告警规则失败',
+ deleteFailed: '删除告警规则失败',
+ create: '新建规则',
+ createTitle: '新建告警规则',
+ editTitle: '编辑告警规则',
+ deleteConfirmTitle: '确认删除该规则?',
+ deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?',
+ metrics: {
+ successRate: '成功率 (%)',
+ errorRate: '错误率 (%)',
+ p95: 'P95 延迟 (ms)',
+ p99: 'P99 延迟 (ms)',
+ cpu: 'CPU 使用率 (%)',
+ memory: '内存使用率 (%)',
+ queueDepth: '并发排队深度'
+ },
+ table: {
+ name: '名称',
+ metric: '指标',
+ severity: '级别',
+ enabled: '启用',
+ actions: '操作'
+ },
+ form: {
+ name: '名称',
+ description: '描述',
+ metric: '指标',
+ operator: '运算符',
+ threshold: '阈值',
+ severity: '级别',
+ window: '统计窗口(分钟)',
+ sustained: '连续样本数(每分钟)',
+ cooldown: '冷却期(分钟)',
+ enabled: '启用',
+ notifyEmail: '发送邮件通知'
+ },
+ validation: {
+ title: '请先修正以下问题',
+ invalid: '规则不合法',
+ nameRequired: '名称不能为空',
+ metricRequired: '指标不能为空',
+ operatorRequired: '运算符不能为空',
+ thresholdRequired: '阈值必须为数字',
+ windowRange: '统计窗口必须为 1 / 5 / 60 分钟之一',
+ sustainedRange: '连续样本数必须在 1 到 1440 之间',
+ cooldownRange: '冷却期必须在 0 到 1440 分钟之间'
+ }
+ },
+ runtime: {
+ title: '运维监控运行设置',
+ description: '配置存储在数据库中,无需修改 config 文件即可生效。',
+ loading: '加载中...',
+ noData: '暂无运行设置',
+ loadFailed: '加载运行设置失败',
+ saveSuccess: '运行设置已保存',
+ saveFailed: '保存运行设置失败',
+ alertTitle: '告警评估器',
+ groupAvailabilityTitle: '分组可用性监控',
+ evalIntervalSeconds: '评估间隔(秒)',
+ silencing: {
+ title: '告警静默(维护模式)',
+ enabled: '启用静默',
+ globalUntil: '静默截止时间(RFC3339)',
+ untilPlaceholder: '2026-01-05T00:00:00Z',
+ untilHint: '建议填写截止时间,避免忘记关闭静默。',
+ reason: '原因',
+ reasonPlaceholder: '例如:计划维护',
+ entries: {
+ title: '高级:定向静默',
+ hint: '可选:仅静默特定规则或特定级别。字段留空表示匹配全部。',
+ add: '新增条目',
+ empty: '暂无定向静默条目',
+ entryTitle: '条目 #{n}',
+ ruleId: '规则ID(可选)',
+ ruleIdPlaceholder: '例如:1',
+ severities: '级别(可选)',
+ severitiesPlaceholder: '例如:P0,P1(留空=全部)',
+ until: '截止时间(RFC3339)',
+ reason: '原因',
+ validation: {
+ untilRequired: '条目截止时间不能为空',
+ untilFormat: '条目截止时间必须为合法的 RFC3339 时间戳',
+ ruleIdPositive: '条目 rule_id 必须为正整数',
+ severitiesFormat: '条目级别必须为 P0..P3 的逗号分隔列表'
+ }
+ },
+ validation: {
+ timeFormat: '静默时间必须为合法的 RFC3339 时间戳'
+ }
+ },
+ lockEnabled: '启用分布式锁',
+ lockKey: '分布式锁 Key',
+ lockTTLSeconds: '分布式锁 TTL(秒)',
+ showAdvancedDeveloperSettings: '显示高级开发者设置 (Distributed Lock)',
+ advancedSettingsSummary: '高级设置 (分布式锁)',
+ evalIntervalHint: '检测任务的执行频率,建议保持默认。',
+ validation: {
+ title: '请先修正以下问题',
+ invalid: '设置不合法',
+ evalIntervalRange: '评估间隔必须在 1 到 86400 秒之间',
+ lockKeyRequired: '启用分布式锁时必须填写 Lock Key',
+ lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头',
+ lockKeyHint: '建议以「{prefix}」开头以避免冲突',
+ lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间'
+ }
+ },
+ email: {
+ title: '邮件通知配置',
+ description: '配置告警/报告邮件通知(存储在数据库中)。',
+ loading: '加载中...',
+ noData: '暂无邮件通知配置',
+ loadFailed: '加载邮件通知配置失败',
+ saveSuccess: '邮件通知配置已保存',
+ saveFailed: '保存邮件通知配置失败',
+ alertTitle: '告警邮件',
+ reportTitle: '报告邮件',
+ recipients: '收件人',
+ recipientsHint: '若为空,系统可能会回退使用第一个管理员邮箱。',
+ minSeverity: '最低级别',
+ minSeverityAll: '全部级别',
+ rateLimitPerHour: '每小时限额',
+ batchWindowSeconds: '合并窗口(秒)',
+ includeResolved: '包含恢复通知',
+ dailySummary: '每日摘要',
+ weeklySummary: '每周摘要',
+ errorDigest: '错误摘要',
+ errorDigestMinCount: '错误摘要最小数量',
+ accountHealth: '账号健康报告',
+ accountHealthThreshold: '错误率阈值(%)',
+ cronPlaceholder: 'Cron 表达式',
+ reportHint: '发送时间使用 Cron 语法;留空将使用默认值。',
+ validation: {
+ title: '请先修正以下问题',
+ invalid: '邮件通知配置不合法',
+ alertRecipientsRequired: '已启用告警邮件,但未配置任何收件人',
+ reportRecipientsRequired: '已启用报告邮件,但未配置任何收件人',
+ invalidRecipients: '存在不合法的收件人邮箱',
+ rateLimitRange: '每小时限额必须为 ≥ 0 的数字',
+ batchWindowRange: '合并窗口必须在 0 到 86400 秒之间',
+ cronRequired: '启用定时任务时必须填写 Cron 表达式',
+ cronFormat: 'Cron 表达式格式可能不正确(至少应包含 5 段)',
+ digestMinCountRange: '错误摘要最小数量必须为 ≥ 0 的数字',
+ accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间'
+ }
+ },
+ concurrency: {
+ title: '并发 / 排队',
+ byPlatform: '按平台',
+ byGroup: '按分组',
+ byAccount: '按账号',
+ totalRows: '共 {count} 项',
+ disabledHint: '已在设置中关闭实时监控。',
+ empty: '暂无数据',
+ queued: '队列 {count}',
+ rateLimited: '限流 {count}',
+ errorAccounts: '异常 {count}',
+ loadFailed: '加载并发数据失败'
+ },
+ realtime: {
+ connected: '实时已连接',
+ connecting: '实时连接中',
+ reconnecting: '实时重连中',
+ offline: '实时离线',
+ closed: '实时已关闭',
+ reconnectIn: '重连 {seconds}s'
+ },
+ queryMode: {
+ auto: 'Auto(自动)',
+ raw: 'Raw(不聚合)',
+ preagg: 'Preagg(聚合)'
+ },
+ accountAvailability: {
+ available: '可用',
+ unavailable: '不可用',
+ accountError: '异常'
+ },
+ tooltips: {
+ throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
+ latencyHistogram: '成功请求的延迟分布(duration_ms)。',
+ errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
+ errorDistribution: '按状态码统计的错误分布。'
+ },
+ charts: {
+ emptyRequest: '该时间窗口内暂无请求。',
+ emptyError: '该时间窗口内暂无错误。',
+ resetZoom: '重置',
+ resetZoomHint: '重置缩放(若启用)',
+ downloadChart: '下载',
+ downloadChartHint: '下载图表图片'
+ }
+ },
+
// Settings
settings: {
title: '系统设置',
@@ -1947,6 +2315,20 @@ export default {
sending: '发送中...',
enterRecipientHint: '请输入收件人邮箱地址'
},
+ opsMonitoring: {
+ title: '运维监控',
+ description: '启用运维监控模块,用于排障与健康可视化',
+ disabled: '运维监控已关闭',
+ enabled: '启用运维监控',
+ enabledHint: '启用 Ops 运维监控模块(仅管理员可见)',
+ realtimeEnabled: '启用实时监控',
+ realtimeEnabledHint: '启用实时 QPS/指标推送(WebSocket)',
+ queryMode: '默认查询模式',
+ queryModeHint: 'Ops Dashboard 默认查询模式(auto/raw/preagg)',
+ queryModeAuto: '自动(推荐)',
+ queryModeRaw: 'Raw(最准,但较慢)',
+ queryModePreagg: 'Preagg(最快,需预聚合)'
+ },
adminApiKey: {
title: '管理员 API Key',
description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限',
From 8ae75e7f6ecc21fe7e45df143ff0fbe602d95be6 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Fri, 9 Jan 2026 21:00:04 +0800
Subject: [PATCH 14/53] =?UTF-8?q?feat(=E5=89=8D=E7=AB=AFUI):=20=E5=AE=9E?=
=?UTF-8?q?=E7=8E=B0=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7=E5=89=8D=E7=AB=AF?=
=?UTF-8?q?=E7=95=8C=E9=9D=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 新增帮助提示组件(HelpTooltip.vue)
- 更新侧边栏添加 ops 监控菜单项
- 扩展设置视图集成 ops 配置面板
- 新增 ops 监控视图目录(dashboard, alerts, realtime, settings 等)
---
.../src/components/common/HelpTooltip.vue | 44 +
frontend/src/components/layout/AppSidebar.vue | 25 +-
frontend/src/views/admin/SettingsView.vue | 104 ++-
frontend/src/views/admin/ops/OpsDashboard.vue | 854 ++++++++++++++++++
.../ops/components/OpsAlertEventsCard.vue | 165 ++++
.../ops/components/OpsAlertRulesCard.vue | 357 ++++++++
.../ops/components/OpsConcurrencyCard.vue | 525 +++++++++++
.../ops/components/OpsDashboardHeader.vue | 374 ++++++++
.../ops/components/OpsDashboardSkeleton.vue | 53 ++
.../components/OpsEmailNotificationCard.vue | 441 +++++++++
.../ops/components/OpsErrorDetailModal.vue | 360 ++++++++
.../ops/components/OpsErrorDetailsModal.vue | 293 ++++++
.../components/OpsErrorDistributionChart.vue | 157 ++++
.../admin/ops/components/OpsErrorLogTable.vue | 238 +++++
.../ops/components/OpsErrorTrendChart.vue | 185 ++++
.../admin/ops/components/OpsLatencyChart.vue | 101 +++
.../ops/components/OpsRequestDetailsModal.vue | 309 +++++++
.../ops/components/OpsRuntimeSettingsCard.vue | 439 +++++++++
.../components/OpsThroughputTrendChart.vue | 252 ++++++
frontend/src/views/admin/ops/types.ts | 17 +
.../views/admin/ops/utils/opsFormatters.ts | 75 ++
21 files changed, 5362 insertions(+), 6 deletions(-)
create mode 100644 frontend/src/components/common/HelpTooltip.vue
create mode 100644 frontend/src/views/admin/ops/OpsDashboard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsErrorLogTable.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsLatencyChart.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue
create mode 100644 frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue
create mode 100644 frontend/src/views/admin/ops/types.ts
create mode 100644 frontend/src/views/admin/ops/utils/opsFormatters.ts
diff --git a/frontend/src/components/common/HelpTooltip.vue b/frontend/src/components/common/HelpTooltip.vue
new file mode 100644
index 00000000..7679ced4
--- /dev/null
+++ b/frontend/src/components/common/HelpTooltip.vue
@@ -0,0 +1,44 @@
+
+
+
+
+
+
diff --git a/frontend/src/components/layout/AppSidebar.vue b/frontend/src/components/layout/AppSidebar.vue
index 791327a1..78217ec8 100644
--- a/frontend/src/components/layout/AppSidebar.vue
+++ b/frontend/src/components/layout/AppSidebar.vue
@@ -144,10 +144,10 @@
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
new file mode 100644
index 00000000..c2c6adb6
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -0,0 +1,374 @@
+
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.title') }}
+
+
+
+
+
+
+ {{ props.loading ? t('admin.ops.loadingText') : t('admin.ops.ready') }}
+
+ ·
+ {{ t('common.refresh') }}: {{ updatedAtLabel }}
+ ·
+
+
+
+
+ {{ wsStatusLabel }}
+ ({{ wsReconnectHint }})
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.requests') }}
+
+ {{ totalRequestsLabel }}
+
+
+ {{ t('admin.ops.tokens') }}: {{ totalTokensLabel }}
+
+
+
+
QPS / TPS
+
+
+ {{ qpsLabel }} / {{ tpsLabel }}
+
+
+
+
+ {{ t('admin.ops.peak') }}: {{ qpsPeakLabel }} / {{ tpsPeakLabel }}
+
+
+
+
SLA
+
+ {{ slaLabel }}
+
+
+ {{ t('admin.ops.businessLimited') }}: {{ formatNumber(props.overview?.business_limited_count ?? 0) }}
+
+
+
+
{{ t('admin.ops.errors') }}
+
+
+ {{ t('admin.ops.errorRate') }}: {{ errorRateLabel }}
+
+
+
+
+
+
+
+ {{ t('admin.ops.upstreamRate') }}: {{ upstreamErrorRateLabel }}
+
+
+ 429: {{ formatNumber(props.overview?.upstream_429_count ?? 0) }} · 529:
+ {{ formatNumber(props.overview?.upstream_529_count ?? 0) }}
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue
new file mode 100644
index 00000000..5bbadd03
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue
@@ -0,0 +1,53 @@
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue b/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue
new file mode 100644
index 00000000..0204cbeb
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsEmailNotificationCard.vue
@@ -0,0 +1,441 @@
+
+
+
+
+
+
+
{{ t('admin.ops.email.title') }}
+
{{ t('admin.ops.email.description') }}
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.email.loading') }}
+ {{ t('admin.ops.email.noData') }}
+
+
+
+
+
{{ t('admin.ops.email.alertTitle') }}
+
+
+ {{ t('common.enabled') }}:
+
+ {{ config.alert.enabled ? t('common.enabled') : t('common.disabled') }}
+
+
+
+ {{ t('admin.ops.email.recipients') }}:
+ {{ config.alert.recipients.length }}
+
+
+ {{ t('admin.ops.email.minSeverity') }}:
+ {{
+ config.alert.min_severity || t('admin.ops.email.minSeverityAll')
+ }}
+
+
+ {{ t('admin.ops.email.rateLimitPerHour') }}:
+ {{ config.alert.rate_limit_per_hour }}
+
+
+
+
+
+
{{ t('admin.ops.email.reportTitle') }}
+
+
+ {{ t('common.enabled') }}:
+
+ {{ config.report.enabled ? t('common.enabled') : t('common.disabled') }}
+
+
+
+ {{ t('admin.ops.email.recipients') }}:
+ {{ config.report.recipients.length }}
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.email.validation.title') }}
+
+
+
+
{{ t('admin.ops.email.alertTitle') }}
+
+
+
{{ t('common.enabled') }}
+
+
+
+
+
{{ t('admin.ops.email.minSeverity') }}
+
+
+
+
+
{{ t('admin.ops.email.recipients') }}
+
+
+
+
+
{{ alertRecipientError }}
+
+
+ {{ email }}
+
+
+
+
{{ t('admin.ops.email.recipientsHint') }}
+
+
+
+
{{ t('admin.ops.email.rateLimitPerHour') }}
+
+
+
+
+
{{ t('admin.ops.email.batchWindowSeconds') }}
+
+
+
+
+
{{ t('admin.ops.email.includeResolved') }}
+
+
+
+
+
+
+
{{ t('admin.ops.email.reportTitle') }}
+
+
+
{{ t('common.enabled') }}
+
+
+
+
+
{{ t('admin.ops.email.recipients') }}
+
+
+
+
+
{{ reportRecipientError }}
+
+
+ {{ email }}
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.email.errorDigestMinCount') }}
+
+
+
+
+
{{ t('admin.ops.email.accountHealthThreshold') }}
+
+
+
+
{{ t('admin.ops.email.reportHint') }}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
new file mode 100644
index 00000000..118a1f3a
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
@@ -0,0 +1,360 @@
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.loading') }}
+
+
+
+
+ {{ emptyText }}
+
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.requestId') }}
+
+ {{ detail.request_id || detail.client_request_id || '—' }}
+
+
+
+
+
{{ t('admin.ops.errorDetail.time') }}
+
+ {{ formatDateTime(detail.created_at) }}
+
+
+
+
+
{{ t('admin.ops.errorDetail.phase') }}
+
+ {{ detail.phase || '—' }}
+
+
+ {{ detail.type || '—' }}
+
+
+
+
+
{{ t('admin.ops.errorDetail.status') }}
+
+
+ {{ detail.status_code }}
+
+
+ {{ detail.severity }}
+
+
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.message') }}
+
+ {{ detail.message || '—' }}
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.basicInfo') }}
+
+
+
{{ t('admin.ops.errorDetail.platform') }}
+
{{ detail.platform || '—' }}
+
+
+
{{ t('admin.ops.errorDetail.model') }}
+
{{ detail.model || '—' }}
+
+
+
{{ t('admin.ops.errorDetail.latency') }}
+
+ {{ detail.latency_ms != null ? `${detail.latency_ms}ms` : '—' }}
+
+
+
+
{{ t('admin.ops.errorDetail.ttft') }}
+
+ {{ detail.time_to_first_token_ms != null ? `${detail.time_to_first_token_ms}ms` : '—' }}
+
+
+
+
{{ t('admin.ops.errorDetail.businessLimited') }}
+
+ {{ detail.is_business_limited ? 'true' : 'false' }}
+
+
+
+
{{ t('admin.ops.errorDetail.requestPath') }}
+
+ {{ detail.request_path || '—' }}
+
+
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.timings') }}
+
+
+
{{ t('admin.ops.errorDetail.auth') }}
+
+ {{ detail.auth_latency_ms != null ? `${detail.auth_latency_ms}ms` : '—' }}
+
+
+
+
{{ t('admin.ops.errorDetail.routing') }}
+
+ {{ detail.routing_latency_ms != null ? `${detail.routing_latency_ms}ms` : '—' }}
+
+
+
+
{{ t('admin.ops.errorDetail.upstream') }}
+
+ {{ detail.upstream_latency_ms != null ? `${detail.upstream_latency_ms}ms` : '—' }}
+
+
+
+
{{ t('admin.ops.errorDetail.response') }}
+
+ {{ detail.response_latency_ms != null ? `${detail.response_latency_ms}ms` : '—' }}
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.retry') }}
+
+ {{ t('admin.ops.errorDetail.retryNote1') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.errorDetail.retryNote2') }}
+
+
+
+
+
{{ t('admin.ops.errorDetail.retryNotes') }}
+
+ - {{ t('admin.ops.errorDetail.retryNote3') }}
+ - {{ t('admin.ops.errorDetail.retryNote4') }}
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.errorDetail.requestBody') }}
+
+ {{ t('admin.ops.errorDetail.trimmed') }}
+
+
+
{{ prettyJSON(detail.request_body) }}
+
+
+
+
+
{{ t('admin.ops.errorDetail.errorBody') }}
+
{{ prettyJSON(detail.error_body) }}
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue
new file mode 100644
index 00000000..f4a522de
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue
@@ -0,0 +1,293 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ modalTitle }}
+
+ {{ t('admin.ops.errorDetails.total') }} {{ total }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue b/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue
new file mode 100644
index 00000000..a52b5442
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsErrorDistributionChart.vue
@@ -0,0 +1,157 @@
+
+
+
+
+
+
+
+ {{ t('admin.ops.errorDistribution') }}
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.top') }}: {{ topReason.label }}
+
+
+
+
+ {{ item.count }}
+
+
+
+
+
+
+
{{ t('common.loading') }}
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue
new file mode 100644
index 00000000..6a4be1a7
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue
@@ -0,0 +1,238 @@
+
+
+
+
+
+
+
+
+ |
+ {{ t('admin.ops.errorLog.timeId') }}
+ |
+
+ {{ t('admin.ops.errorLog.context') }}
+ |
+
+ {{ t('admin.ops.errorLog.status') }}
+ |
+
+ {{ t('admin.ops.errorLog.message') }}
+ |
+
+ {{ t('admin.ops.errorLog.latency') }}
+ |
+
+ {{ t('admin.ops.errorLog.action') }}
+ |
+
+
+
+
+ |
+ {{ t('admin.ops.errorLog.noErrors') }}
+ |
+
+
+
+
+ |
+
+
+ {{ formatDateTime(log.created_at).split(' ')[1] }}
+
+
+ {{ (log.request_id || log.client_request_id || '').substring(0, 12) }}
+
+
+ |
+
+
+
+
+
+ {{ log.platform || '-' }}
+
+
+ {{ log.model }}
+
+
+ {{ t('admin.ops.errorLog.grp') }} {{ log.group_id }}
+ {{ t('admin.ops.errorLog.acc') }} {{ log.account_id }}
+
+
+ |
+
+
+
+
+
+ {{ log.status_code }}
+
+
+ {{ log.severity }}
+
+
+ |
+
+
+
+
+
+ {{ formatSmartMessage(log.message) || '-' }}
+
+
+
+
+ {{ log.phase }}
+
+
+
+ {{ log.client_ip }}
+
+
+
+ |
+
+
+
+
+
+ {{ log.latency_ms != null ? Math.round(log.latency_ms) + 'ms' : '--' }}
+
+
+ |
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue b/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue
new file mode 100644
index 00000000..032e1205
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsErrorTrendChart.vue
@@ -0,0 +1,185 @@
+
+
+
+
+
+
+
+ {{ t('admin.ops.errorTrend') }}
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('common.loading') }}
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsLatencyChart.vue b/frontend/src/views/admin/ops/components/OpsLatencyChart.vue
new file mode 100644
index 00000000..c62b3aa9
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsLatencyChart.vue
@@ -0,0 +1,101 @@
+
+
+
+
+
+
+
+ {{ t('admin.ops.latencyHistogram') }}
+
+
+
+
+
+
+
+
{{ t('common.loading') }}
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue
new file mode 100644
index 00000000..541aa3ed
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue
@@ -0,0 +1,309 @@
+
+
+
+
+
+
+
+
+
+
+ {{ props.preset.title || t('admin.ops.requestDetails.title') }}
+
+
+ {{ t('admin.ops.requestDetails.rangeLabel', { range: rangeLabel }) }}
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('common.loading') }}
+
+
+
+
+
+
+
{{ t('admin.ops.requestDetails.empty') }}
+
{{ t('admin.ops.requestDetails.emptyHint') }}
+
+
+
+
+
+
+
+ |
+ {{ t('admin.ops.requestDetails.table.time') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.kind') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.platform') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.model') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.duration') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.status') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.requestId') }}
+ |
+
+ {{ t('admin.ops.requestDetails.table.actions') }}
+ |
+
+
+
+
+ |
+ {{ formatDateTime(row.created_at) }}
+ |
+
+
+ {{ row.kind === 'error' ? t('admin.ops.requestDetails.kind.error') : t('admin.ops.requestDetails.kind.success') }}
+
+ |
+
+ {{ (row.platform || 'unknown').toUpperCase() }}
+ |
+
+ {{ row.model || '-' }}
+ |
+
+ {{ typeof row.duration_ms === 'number' ? `${row.duration_ms} ms` : '-' }}
+ |
+
+ {{ row.status_code ?? '-' }}
+ |
+
+
+
+ {{ row.request_id }}
+
+
+
+ -
+ |
+
+
+ -
+ |
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue
new file mode 100644
index 00000000..e9df347d
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue
@@ -0,0 +1,439 @@
+
+
+
+
+
+
+
{{ t('admin.ops.runtime.title') }}
+
{{ t('admin.ops.runtime.description') }}
+
+
+
+
+
+ {{ t('admin.ops.runtime.loading') }}
+ {{ t('admin.ops.runtime.noData') }}
+
+
+
+
+
+
{{ t('admin.ops.runtime.alertTitle') }}
+
+
+
+
+ {{ t('admin.ops.runtime.evalIntervalSeconds') }}:
+ {{ alertSettings.evaluation_interval_seconds }}s
+
+
+ {{ t('admin.ops.runtime.silencing.globalUntil') }}:
+ {{ alertSettings.silencing.global_until_rfc3339 }}
+
+
+
+
+ {{ t('admin.ops.runtime.showAdvancedDeveloperSettings') }}
+
+
+
+ {{ t('admin.ops.runtime.lockEnabled') }}:
+ {{ alertSettings.distributed_lock.enabled }}
+
+
+ {{ t('admin.ops.runtime.lockKey') }}:
+ {{ alertSettings.distributed_lock.key }}
+
+
+ {{ t('admin.ops.runtime.lockTTLSeconds') }}:
+ {{ alertSettings.distributed_lock.ttl_seconds }}s
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.runtime.validation.title') }}
+
+
+
+
+
{{ t('admin.ops.runtime.evalIntervalSeconds') }}
+
+
{{ t('admin.ops.runtime.evalIntervalHint') }}
+
+
+
+
{{ t('admin.ops.runtime.silencing.title') }}
+
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.globalUntil') }}
+
+
{{ t('admin.ops.runtime.silencing.untilHint') }}
+
+
+
+
{{ t('admin.ops.runtime.silencing.reason') }}
+
+
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.entries.title') }}
+
{{ t('admin.ops.runtime.silencing.entries.hint') }}
+
+
+
+
+
+ {{ t('admin.ops.runtime.silencing.entries.empty') }}
+
+
+
+
+
+
+ {{ t('admin.ops.runtime.silencing.entries.entryTitle', { n: idx + 1 }) }}
+
+
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.entries.ruleId') }}
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.entries.severities') }}
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.entries.until') }}
+
+
+
+
+
{{ t('admin.ops.runtime.silencing.entries.reason') }}
+
+
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.runtime.advancedSettingsSummary') }}
+
+
+
+
+
+
{{ t('admin.ops.runtime.lockKey') }}
+
+
+ {{ t('admin.ops.runtime.validation.lockKeyHint', { prefix: 'ops:' }) }}
+
+
+
+
{{ t('admin.ops.runtime.lockTTLSeconds') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue b/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue
new file mode 100644
index 00000000..e3bd26c2
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsThroughputTrendChart.vue
@@ -0,0 +1,252 @@
+
+
+
+
+
+
+
+ {{ t('admin.ops.throughputTrend') }}
+
+
+
+ {{ t('admin.ops.qps') }}
+ {{ t('admin.ops.tpsK') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('common.loading') }}
+
+
+
+
+
diff --git a/frontend/src/views/admin/ops/types.ts b/frontend/src/views/admin/ops/types.ts
new file mode 100644
index 00000000..08830542
--- /dev/null
+++ b/frontend/src/views/admin/ops/types.ts
@@ -0,0 +1,17 @@
+// Shared types for the Ops frontend view layer (decoupled from backend DTOs).
+
+export type ChartState = 'loading' | 'empty' | 'ready'
+
+// Re-export ops alert/settings types so view components can import from a single place
+// while keeping the API contract centralized in `@/api/admin/ops`.
+export type {
+ AlertRule,
+ AlertEvent,
+ AlertSeverity,
+ ThresholdMode,
+ MetricType,
+ Operator,
+ EmailNotificationConfig,
+ OpsDistributedLockSettings,
+ OpsAlertRuntimeSettings
+} from '@/api/admin/ops'
diff --git a/frontend/src/views/admin/ops/utils/opsFormatters.ts b/frontend/src/views/admin/ops/utils/opsFormatters.ts
new file mode 100644
index 00000000..d503b5a5
--- /dev/null
+++ b/frontend/src/views/admin/ops/utils/opsFormatters.ts
@@ -0,0 +1,75 @@
+/**
+ * Shared formatting/styling helpers for the Ops pages.
+ *
+ * Goal: stay visually close to the `docs/sub2api` backup version (keep parity where
+ * the requirements are identical) while avoiding extra UI dependencies.
+ */
+
+import type { OpsSeverity } from '@/api/admin/ops'
+import { formatBytes } from '@/utils/format'
+
+export function getSeverityClass(severity: OpsSeverity): string {
+ const classes: Record<string, string> = {
+ P0: 'bg-red-100 text-red-800 dark:bg-red-900/30 dark:text-red-400',
+ P1: 'bg-orange-100 text-orange-800 dark:bg-orange-900/30 dark:text-orange-400',
+ P2: 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900/30 dark:text-yellow-400',
+ P3: 'bg-blue-100 text-blue-800 dark:bg-blue-900/30 dark:text-blue-400'
+ }
+ return classes[String(severity || '')] || classes.P3
+}
+
+export function truncateMessage(msg: string, maxLength = 80): string {
+ if (!msg) return ''
+ return msg.length > maxLength ? msg.substring(0, maxLength) + '...' : msg
+}
+
+/**
+ * Format a date-time in the short form used by the old Ops page.
+ * Output: `MM-DD HH:mm:ss`
+ */
+export function formatDateTime(dateStr: string): string {
+ const d = new Date(dateStr)
+ if (Number.isNaN(d.getTime())) return ''
+ return `${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')} ${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}:${String(d.getSeconds()).padStart(2, '0')}`
+}
+
+export function sumNumbers(values: Array<number | null | undefined>): number {
+ return values.reduce((acc, v) => {
+ const n = typeof v === 'number' && Number.isFinite(v) ? v : 0
+ return acc + n
+ }, 0)
+}
+
+/**
+ * Parse a time_range string into minutes.
+ * Supported values: `5m/30m/1h/6h/24h`
+ */
+export function parseTimeRangeMinutes(range: string): number {
+ const trimmed = (range || '').trim()
+ if (!trimmed) return 60
+ if (trimmed.endsWith('m')) {
+ const v = Number.parseInt(trimmed.slice(0, -1), 10)
+ return Number.isFinite(v) && v > 0 ? v : 60
+ }
+ if (trimmed.endsWith('h')) {
+ const v = Number.parseInt(trimmed.slice(0, -1), 10)
+ return Number.isFinite(v) && v > 0 ? v * 60 : 60
+ }
+ return 60
+}
+
+export function formatHistoryLabel(date: string | undefined, timeRange: string): string {
+ if (!date) return ''
+ const d = new Date(date)
+ if (Number.isNaN(d.getTime())) return ''
+ const minutes = parseTimeRangeMinutes(timeRange)
+ if (minutes >= 24 * 60) {
+ return `${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')} ${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}`
+ }
+ return `${String(d.getHours()).padStart(2, '0')}:${String(d.getMinutes()).padStart(2, '0')}`
+}
+
+export function formatByteRate(bytes: number, windowMinutes: number): string {
+ const seconds = Math.max(1, (windowMinutes || 1) * 60)
+ return `${formatBytes(bytes / seconds, 1)}/s`
+}
From 585257d34030c5f068a444cc1b718cc73ae9fa37 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sat, 10 Jan 2026 01:38:47 +0800
Subject: [PATCH 15/53] =?UTF-8?q?feat(=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?=
=?UTF-8?q?):=20=E5=A2=9E=E5=BC=BA=E7=9B=91=E6=8E=A7=E5=8A=9F=E8=83=BD?=
=?UTF-8?q?=E5=92=8C=E5=81=A5=E5=BA=B7=E8=AF=84=E5=88=86=E7=B3=BB=E7=BB=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Backend improvements:
- Add a health score computation service (ops_health_score.go)
- Add distributed lock support (ops_advisory_lock.go)
- Improve metrics collection and aggregation logic
- Add an ops metrics collection interval setting (60-3600 seconds)
- Remove the unused WebSocket query-token auth middleware
- Improve the cleanup service and alert evaluation logic
Frontend improvements:
- Simplify the OpsDashboard component structure
- Fill in i18n texts (Chinese and English)
- Add API type definitions for ops monitoring
- Add a settings UI for the ops metrics collection interval
- Polish the error detail modal
Tests:
- Add health score unit tests
- Update API contract tests
---
.../internal/handler/admin/setting_handler.go | 28 +-
backend/internal/handler/dto/settings.go | 5 +-
.../internal/repository/ops_repo_metrics.go | 27 +-
backend/internal/server/api_contract_test.go | 4 +-
.../server/middleware/ws_query_token_auth.go | 54 ----
backend/internal/server/router.go | 2 -
backend/internal/service/ops_advisory_lock.go | 46 ++++
.../service/ops_aggregation_service.go | 33 ++-
.../service/ops_alert_evaluator_service.go | 7 +-
.../internal/service/ops_cleanup_service.go | 36 +--
backend/internal/service/ops_dashboard.go | 13 +
.../internal/service/ops_dashboard_models.go | 4 +
backend/internal/service/ops_health_score.go | 126 +++++++++
.../internal/service/ops_health_score_test.go | 60 +++++
.../internal/service/ops_metrics_collector.go | 66 ++---
backend/internal/service/ops_port.go | 10 +
backend/internal/service/setting_service.go | 16 ++
backend/internal/service/settings_view.go | 1 +
frontend/src/api/admin/ops.ts | 5 +
frontend/src/api/admin/settings.ts | 2 +
frontend/src/i18n/locales/en.ts | 60 ++++-
frontend/src/i18n/locales/zh.ts | 76 +++++-
frontend/src/views/admin/SettingsView.vue | 25 +-
frontend/src/views/admin/ops/OpsDashboard.vue | 245 +-----------------
.../ops/components/OpsErrorDetailModal.vue | 4 +-
25 files changed, 570 insertions(+), 385 deletions(-)
delete mode 100644 backend/internal/server/middleware/ws_query_token_auth.go
create mode 100644 backend/internal/service/ops_advisory_lock.go
create mode 100644 backend/internal/service/ops_health_score.go
create mode 100644 backend/internal/service/ops_health_score_test.go
diff --git a/backend/internal/handler/admin/setting_handler.go b/backend/internal/handler/admin/setting_handler.go
index 4d4d5639..59f47010 100644
--- a/backend/internal/handler/admin/setting_handler.go
+++ b/backend/internal/handler/admin/setting_handler.go
@@ -68,6 +68,7 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
OpsMonitoringEnabled: settings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: settings.OpsQueryModeDefault,
+ OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
})
}
@@ -115,9 +116,10 @@ type UpdateSettingsRequest struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
- OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
- OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
+ OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
+ OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault *string `json:"ops_query_mode_default"`
+ OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
}
// UpdateSettings updates the system settings
@@ -173,6 +175,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
}
}
+ // Ops metrics collector interval validation (seconds).
+ if req.OpsMetricsIntervalSeconds != nil {
+ v := *req.OpsMetricsIntervalSeconds
+ if v < 60 {
+ v = 60
+ }
+ if v > 3600 {
+ v = 3600
+ }
+ req.OpsMetricsIntervalSeconds = &v
+ }
+
settings := &service.SystemSettings{
RegistrationEnabled: req.RegistrationEnabled,
EmailVerifyEnabled: req.EmailVerifyEnabled,
@@ -219,6 +233,12 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
}
return previousSettings.OpsQueryModeDefault
}(),
+ OpsMetricsIntervalSeconds: func() int {
+ if req.OpsMetricsIntervalSeconds != nil {
+ return *req.OpsMetricsIntervalSeconds
+ }
+ return previousSettings.OpsMetricsIntervalSeconds
+ }(),
}
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
@@ -266,6 +286,7 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
+ OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
})
}
@@ -375,6 +396,9 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
changed = append(changed, "ops_query_mode_default")
}
+ if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
+ changed = append(changed, "ops_metrics_interval_seconds")
+ }
return changed
}
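
Note: the interval validation added earlier in this handler pulls out-of-range values back into [60, 3600] instead of rejecting the request. A minimal standalone sketch of the same bounds (illustrative only; the handler inlines this logic rather than calling a helper):

package admin

// clampOpsMetricsInterval mirrors the validation in UpdateSettings above:
// values below 60 seconds are raised to 60, values above 3600 are lowered to 3600.
func clampOpsMetricsInterval(v int) int {
    if v < 60 {
        return 60
    }
    if v > 3600 {
        return 3600
    }
    return v
}

// For example: clampOpsMetricsInterval(10) == 60, clampOpsMetricsInterval(600) == 600,
// clampOpsMetricsInterval(86400) == 3600.
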
diff --git a/backend/internal/handler/dto/settings.go b/backend/internal/handler/dto/settings.go
index 6fd53b26..3f631bfa 100644
--- a/backend/internal/handler/dto/settings.go
+++ b/backend/internal/handler/dto/settings.go
@@ -39,9 +39,10 @@ type SystemSettings struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
- OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
- OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
+ OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
+ OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault string `json:"ops_query_mode_default"`
+ OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
}
type PublicSettings struct {
diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go
index 96bad88a..75345595 100644
--- a/backend/internal/repository/ops_repo_metrics.go
+++ b/backend/internal/repository/ops_repo_metrics.go
@@ -68,6 +68,9 @@ INSERT INTO ops_system_metrics (
db_ok,
redis_ok,
+ redis_conn_total,
+ redis_conn_idle,
+
db_conn_active,
db_conn_idle,
db_conn_waiting,
@@ -83,8 +86,9 @@ INSERT INTO ops_system_metrics (
$21,$22,$23,$24,$25,$26,
$27,$28,$29,$30,
$31,$32,
- $33,$34,$35,
- $36,$37
+ $33,$34,
+ $35,$36,$37,
+ $38,$39
)`
_, err := r.db.ExecContext(
@@ -130,6 +134,9 @@ INSERT INTO ops_system_metrics (
opsNullBool(input.DBOK),
opsNullBool(input.RedisOK),
+ opsNullInt(input.RedisConnTotal),
+ opsNullInt(input.RedisConnIdle),
+
opsNullInt(input.DBConnActive),
opsNullInt(input.DBConnIdle),
opsNullInt(input.DBConnWaiting),
@@ -162,6 +169,9 @@ SELECT
db_ok,
redis_ok,
+ redis_conn_total,
+ redis_conn_idle,
+
db_conn_active,
db_conn_idle,
db_conn_waiting,
@@ -182,6 +192,8 @@ LIMIT 1`
var memPct sql.NullFloat64
var dbOK sql.NullBool
var redisOK sql.NullBool
+ var redisTotal sql.NullInt64
+ var redisIdle sql.NullInt64
var dbActive sql.NullInt64
var dbIdle sql.NullInt64
var dbWaiting sql.NullInt64
@@ -198,6 +210,8 @@ LIMIT 1`
&memPct,
&dbOK,
&redisOK,
+ &redisTotal,
+ &redisIdle,
&dbActive,
&dbIdle,
&dbWaiting,
@@ -231,6 +245,14 @@ LIMIT 1`
v := redisOK.Bool
out.RedisOK = &v
}
+ if redisTotal.Valid {
+ v := int(redisTotal.Int64)
+ out.RedisConnTotal = &v
+ }
+ if redisIdle.Valid {
+ v := int(redisIdle.Int64)
+ out.RedisConnIdle = &v
+ }
if dbActive.Valid {
v := int(dbActive.Int64)
out.DBConnActive = &v
@@ -398,4 +420,3 @@ func opsNullTime(v *time.Time) any {
}
return sql.NullTime{Time: *v, Valid: true}
}
-
diff --git a/backend/internal/server/api_contract_test.go b/backend/internal/server/api_contract_test.go
index 23cab19c..f8140fe6 100644
--- a/backend/internal/server/api_contract_test.go
+++ b/backend/internal/server/api_contract_test.go
@@ -319,7 +319,9 @@ func TestAPIContracts(t *testing.T) {
"enable_identity_patch": true,
"identity_patch_prompt": "",
"ops_monitoring_enabled": true,
- "ops_realtime_monitoring_enabled": true
+ "ops_realtime_monitoring_enabled": true,
+ "ops_query_mode_default": "auto",
+ "ops_metrics_interval_seconds": 60
}
}`,
},
diff --git a/backend/internal/server/middleware/ws_query_token_auth.go b/backend/internal/server/middleware/ws_query_token_auth.go
deleted file mode 100644
index 3b8d086a..00000000
--- a/backend/internal/server/middleware/ws_query_token_auth.go
+++ /dev/null
@@ -1,54 +0,0 @@
-package middleware
-
-import (
- "net/http"
- "strings"
-
- "github.com/gin-gonic/gin"
-)
-
-// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header
-// for WebSocket handshake requests on a small allow-list of endpoints.
-//
-// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes
-// are protected by header-based auth. This keeps the token support scoped to WS only.
-func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc {
- return func(c *gin.Context) {
- if c == nil || c.Request == nil {
- if c != nil {
- c.Next()
- }
- return
- }
-
- // Only GET websocket upgrades.
- if c.Request.Method != http.MethodGet {
- c.Next()
- return
- }
- if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") {
- c.Next()
- return
- }
-
- // If caller already supplied auth headers, don't override.
- if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" {
- c.Next()
- return
- }
-
- // Allow-list ops websocket endpoints.
- path := strings.TrimSpace(c.Request.URL.Path)
- if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") {
- c.Next()
- return
- }
-
- token := strings.TrimSpace(c.Query("token"))
- if token != "" {
- c.Request.Header.Set("Authorization", "Bearer "+token)
- }
-
- c.Next()
- }
-}
diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go
index 85df99bd..3ea087d6 100644
--- a/backend/internal/server/router.go
+++ b/backend/internal/server/router.go
@@ -25,8 +25,6 @@ func SetupRouter(
) *gin.Engine {
// Apply middleware
r.Use(middleware2.Logger())
- // WebSocket handshake auth helper (token via query param, WS endpoints only).
- r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket())
r.Use(middleware2.CORS(cfg.CORS))
r.Use(middleware2.SecurityHeaders(cfg.Security.CSP))
diff --git a/backend/internal/service/ops_advisory_lock.go b/backend/internal/service/ops_advisory_lock.go
new file mode 100644
index 00000000..f7ef4cee
--- /dev/null
+++ b/backend/internal/service/ops_advisory_lock.go
@@ -0,0 +1,46 @@
+package service
+
+import (
+ "context"
+ "database/sql"
+ "hash/fnv"
+ "time"
+)
+
+func hashAdvisoryLockID(key string) int64 {
+ h := fnv.New64a()
+ _, _ = h.Write([]byte(key))
+ return int64(h.Sum64())
+}
+
+func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
+ if db == nil {
+ return nil, false
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ conn, err := db.Conn(ctx)
+ if err != nil {
+ return nil, false
+ }
+
+ acquired := false
+ if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&acquired); err != nil {
+ _ = conn.Close()
+ return nil, false
+ }
+ if !acquired {
+ _ = conn.Close()
+ return nil, false
+ }
+
+ release := func() {
+ unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
+ _ = conn.Close()
+ }
+ return release, true
+}
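
Note: the new helpers hash a string key into a 64-bit PostgreSQL advisory-lock ID and return a release closure plus an acquired flag. A hypothetical caller (not part of the patch) could use them like this, assuming it lives in the same package and already holds a *sql.DB pointed at PostgreSQL:

package service

import (
    "context"
    "database/sql"
)

// runExclusively is an illustrative sketch: it runs job only if this instance wins the
// PostgreSQL advisory lock derived from the given key, and releases it afterwards.
// The "ops:example-job" key is hypothetical; real callers reuse their leader-lock keys.
func runExclusively(ctx context.Context, db *sql.DB, job func(context.Context)) {
    release, ok := tryAcquireDBAdvisoryLock(ctx, db, hashAdvisoryLockID("ops:example-job"))
    if !ok {
        return // another instance holds the lock (or the DB is unreachable); skip this cycle
    }
    defer release()
    job(ctx)
}
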
diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go
index 04dbb11b..2a6afbba 100644
--- a/backend/internal/service/ops_aggregation_service.go
+++ b/backend/internal/service/ops_aggregation_service.go
@@ -376,28 +376,37 @@ return 0
`)
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
- if s == nil || s.redisClient == nil {
- return nil, true
+ if s == nil {
+ return nil, false
}
if ctx == nil {
ctx = context.Background()
}
- ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
- if err != nil {
- // Fail-open: do not block single-instance deployments.
- return nil, true
+ // Prefer Redis leader lock when available (multi-instance), but avoid stampeding
+ // the DB when Redis is flaky by falling back to a DB advisory lock.
+ if s.redisClient != nil {
+ ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
+ if err == nil {
+ if !ok {
+ s.maybeLogSkip(logPrefix)
+ return nil, false
+ }
+ release := func() {
+ ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
+ }
+ return release, true
+ }
+ // Redis error: fall through to DB advisory lock.
}
+
+ release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok {
s.maybeLogSkip(logPrefix)
return nil, false
}
-
- release := func() {
- ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
- defer cancel()
- _, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
- }
return release, true
}
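
Note: the comment in the hunk above describes the intended behaviour: prefer a Redis SETNX leader lock and fall back to the DB advisory lock only when Redis errors out. As a rough sketch of the Redis half (assuming github.com/redis/go-redis/v9; the release script here is the common compare-and-delete pattern, since the body of opsAggReleaseScript is not fully visible in this hunk and may differ):

package service

import (
    "context"
    "time"

    "github.com/redis/go-redis/v9"
)

// exampleReleaseScript deletes the lock key only if this instance still owns it.
var exampleReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
  return redis.call("DEL", KEYS[1])
end
return 0
`)

// tryRedisLeaderLock is an illustrative helper, not part of the patch. It returns
// (nil, false) both on a lost election and on a Redis error; the patch distinguishes
// the two (errors fall back to the DB advisory lock, a lost election skips the cycle).
func tryRedisLeaderLock(ctx context.Context, rdb *redis.Client, key, instanceID string, ttl time.Duration) (func(), bool) {
    ok, err := rdb.SetNX(ctx, key, instanceID, ttl).Result()
    if err != nil || !ok {
        return nil, false
    }
    release := func() {
        ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
        defer cancel()
        _, _ = exampleReleaseScript.Run(ctx2, rdb, []string{key}, instanceID).Result()
    }
    return release, true
}
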
diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go
index b970c720..81712136 100644
--- a/backend/internal/service/ops_alert_evaluator_service.go
+++ b/backend/internal/service/ops_alert_evaluator_service.go
@@ -720,11 +720,12 @@ func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, loc
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
- // Fail-open for single-node environments, but warn.
+ // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
+ // Single-node deployments can disable the distributed lock via runtime settings.
s.warnNoRedisOnce.Do(func() {
- log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err)
+ log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
})
- return nil, true
+ return nil, false
}
if !ok {
s.maybeLogSkip(key)
diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go
index ef825c04..08c6a16e 100644
--- a/backend/internal/service/ops_cleanup_service.go
+++ b/backend/internal/service/ops_cleanup_service.go
@@ -300,30 +300,36 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b
return nil, true
}
- if s.redisClient == nil {
- s.warnNoRedisOnce.Do(func() {
- log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
- })
- return nil, true
- }
-
key := opsCleanupLeaderLockKeyDefault
ttl := opsCleanupLeaderLockTTLDefault
- ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
- if err != nil {
+ // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
+ // falling back to a DB advisory lock.
+ if s.redisClient != nil {
+ ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
+ if err == nil {
+ if !ok {
+ return nil, false
+ }
+ return func() {
+ _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
+ }, true
+ }
+ // Redis error: fall back to DB advisory lock.
s.warnNoRedisOnce.Do(func() {
- log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err)
+ log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
+ })
+ } else {
+ s.warnNoRedisOnce.Do(func() {
+ log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
})
- return nil, true
}
+
+ release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok {
return nil, false
}
-
- return func() {
- _, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
- }, true
+ return release, true
}
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
diff --git a/backend/internal/service/ops_dashboard.go b/backend/internal/service/ops_dashboard.go
index 23d6d82f..31822ba8 100644
--- a/backend/internal/service/ops_dashboard.go
+++ b/backend/internal/service/ops_dashboard.go
@@ -5,6 +5,7 @@ import (
"database/sql"
"errors"
"log"
+ "time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
@@ -39,6 +40,16 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
+ // Attach config-derived limits so the UI can show "current / max" for connection pools.
+ // These are best-effort and should never block the dashboard rendering.
+ if s != nil && s.cfg != nil {
+ if s.cfg.Database.MaxOpenConns > 0 {
+ metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
+ }
+ if s.cfg.Redis.PoolSize > 0 {
+ metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
+ }
+ }
overview.SystemMetrics = metrics
} else if err != nil && !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
@@ -50,6 +61,8 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
}
+ overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
+
return overview, nil
}
diff --git a/backend/internal/service/ops_dashboard_models.go b/backend/internal/service/ops_dashboard_models.go
index 51a0b1fb..f189031b 100644
--- a/backend/internal/service/ops_dashboard_models.go
+++ b/backend/internal/service/ops_dashboard_models.go
@@ -35,6 +35,10 @@ type OpsDashboardOverview struct {
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
+ // HealthScore is a backend-computed overall health score (0-100).
+ // It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
+ HealthScore int `json:"health_score"`
+
// Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go
new file mode 100644
index 00000000..68cfc10d
--- /dev/null
+++ b/backend/internal/service/ops_health_score.go
@@ -0,0 +1,126 @@
+package service
+
+import (
+ "math"
+ "time"
+)
+
+// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
+//
+// Design goals:
+// - Backend-owned scoring (UI only displays).
+// - Uses "overall" business indicators (SLA/error/latency) plus infra indicators (db/redis/cpu/mem/jobs).
+// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
+func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
+ if overview == nil {
+ return 0
+ }
+
+ // Idle/no-data: avoid showing a "bad" score when there is no traffic.
+ // UI can still render a gray/idle state based on QPS + error rate.
+ if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
+ return 100
+ }
+
+ score := 100.0
+
+ // --- SLA (primary signal) ---
+ // SLA is a ratio (0..1). Target is intentionally modest for LLM gateways; it can be tuned later.
+ slaPct := clampFloat64(overview.SLA*100, 0, 100)
+ if slaPct < 99.5 {
+ // Up to -45 points as SLA drops.
+ score -= math.Min(45, (99.5-slaPct)*12)
+ }
+
+ // --- Error rates (secondary signal) ---
+ errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
+ if errorPct > 1 {
+ // Cap at -20 points by 6% error rate.
+ score -= math.Min(20, (errorPct-1)*4)
+ }
+
+ upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
+ if upstreamPct > 1 {
+ // Upstream instability deserves extra weight, but keep it smaller than SLA/error.
+ score -= math.Min(15, (upstreamPct-1)*3)
+ }
+
+ // --- Latency (tail-focused) ---
+ // Use p99 of duration + TTFT. Penalize only when clearly elevated.
+ if overview.Duration.P99 != nil {
+ p99 := float64(*overview.Duration.P99)
+ if p99 > 2000 {
+ // From 2s upward, gradually penalize up to -20.
+ score -= math.Min(20, (p99-2000)/900) // ~20s => ~-20
+ }
+ }
+ if overview.TTFT.P99 != nil {
+ p99 := float64(*overview.TTFT.P99)
+ if p99 > 500 {
+ // TTFT > 500ms starts hurting; cap at -10.
+ score -= math.Min(10, (p99-500)/200) // 2.5s => -10
+ }
+ }
+
+ // --- System metrics snapshot (best-effort) ---
+ if overview.SystemMetrics != nil {
+ if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
+ score -= 20
+ }
+ if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
+ score -= 15
+ }
+
+ if overview.SystemMetrics.CPUUsagePercent != nil {
+ cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
+ if cpuPct > 85 {
+ score -= math.Min(10, (cpuPct-85)*1.5)
+ }
+ }
+ if overview.SystemMetrics.MemoryUsagePercent != nil {
+ memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
+ if memPct > 90 {
+ score -= math.Min(10, (memPct-90)*1.0)
+ }
+ }
+
+ if overview.SystemMetrics.DBConnWaiting != nil && *overview.SystemMetrics.DBConnWaiting > 0 {
+ waiting := float64(*overview.SystemMetrics.DBConnWaiting)
+ score -= math.Min(10, waiting*2)
+ }
+ if overview.SystemMetrics.ConcurrencyQueueDepth != nil && *overview.SystemMetrics.ConcurrencyQueueDepth > 0 {
+ depth := float64(*overview.SystemMetrics.ConcurrencyQueueDepth)
+ score -= math.Min(10, depth*0.5)
+ }
+ }
+
+ // --- Job heartbeats (best-effort) ---
+ // Penalize only clear "error after last success" signals, and cap the impact.
+ jobPenalty := 0.0
+ for _, hb := range overview.JobHeartbeats {
+ if hb == nil {
+ continue
+ }
+ if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
+ jobPenalty += 5
+ continue
+ }
+ if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
+ jobPenalty += 2
+ }
+ }
+ score -= math.Min(15, jobPenalty)
+
+ score = clampFloat64(score, 0, 100)
+ return int(math.Round(score))
+}
+
+func clampFloat64(v float64, min float64, max float64) float64 {
+ if v < min {
+ return min
+ }
+ if v > max {
+ return max
+ }
+ return v
+}
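
Note: to make the weighting concrete, here is a hand-worked example (illustrative only, not shipped with the patch); field and type names follow the unit test added alongside in ops_health_score_test.go:

package service

import (
    "fmt"
    "time"
)

// Example_healthScore walks through the penalties for a moderately degraded overview:
// SLA 98%, 3% errors, 2% upstream errors, p99 latency 4.7s, p99 TTFT 900ms,
// with infra metrics and job heartbeats absent (no extra penalties).
func Example_healthScore() {
    ov := &OpsDashboardOverview{
        RequestCountTotal: 1000,
        RequestCountSLA:   1000,
        ErrorCountTotal:   30,

        SLA:               0.98, // -18 = min(45, (99.5-98.0)*12)
        ErrorRate:         0.03, // -8  = min(20, (3-1)*4)
        UpstreamErrorRate: 0.02, // -3  = min(15, (2-1)*3)

        Duration: OpsPercentiles{P99: intPtr(4_700)}, // -3 = min(20, (4700-2000)/900)
        TTFT:     OpsPercentiles{P99: intPtr(900)},   // -2 = min(10, (900-500)/200)
    }
    // 100 - 18 - 8 - 3 - 3 - 2 = 66
    fmt.Println(computeDashboardHealthScore(time.Now().UTC(), ov))
}
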
diff --git a/backend/internal/service/ops_health_score_test.go b/backend/internal/service/ops_health_score_test.go
new file mode 100644
index 00000000..d7e5dd8c
--- /dev/null
+++ b/backend/internal/service/ops_health_score_test.go
@@ -0,0 +1,60 @@
+//go:build unit
+
+package service
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
+ t.Parallel()
+
+ score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
+ require.Equal(t, 100, score)
+}
+
+func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
+ t.Parallel()
+
+ ov := &OpsDashboardOverview{
+ RequestCountTotal: 100,
+ RequestCountSLA: 100,
+ SuccessCount: 90,
+ ErrorCountTotal: 10,
+ ErrorCountSLA: 10,
+
+ SLA: 0.90,
+ ErrorRate: 0.10,
+ UpstreamErrorRate: 0.08,
+
+ Duration: OpsPercentiles{P99: intPtr(20_000)},
+ TTFT: OpsPercentiles{P99: intPtr(2_000)},
+
+ SystemMetrics: &OpsSystemMetricsSnapshot{
+ DBOK: boolPtr(false),
+ RedisOK: boolPtr(false),
+ CPUUsagePercent: float64Ptr(98.0),
+ MemoryUsagePercent: float64Ptr(97.0),
+ DBConnWaiting: intPtr(3),
+ ConcurrencyQueueDepth: intPtr(10),
+ },
+ JobHeartbeats: []*OpsJobHeartbeat{
+ {
+ JobName: "job-a",
+ LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
+ LastError: stringPtr("boom"),
+ },
+ },
+ }
+
+ score := computeDashboardHealthScore(time.Now().UTC(), ov)
+ require.Less(t, score, 80)
+ require.GreaterOrEqual(t, score, 0)
+}
+
+func timePtr(v time.Time) *time.Time { return &v }
+
+func stringPtr(v string) *string { return &v }
diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go
index cd90e1bd..e55e365b 100644
--- a/backend/internal/service/ops_metrics_collector.go
+++ b/backend/internal/service/ops_metrics_collector.go
@@ -5,7 +5,6 @@ import (
"database/sql"
"errors"
"fmt"
- "hash/fnv"
"log"
"math"
"os"
@@ -262,6 +261,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats()
+ redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil {
@@ -327,6 +327,19 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK),
+ RedisConnTotal: func() *int {
+ if !redisStatsOK {
+ return nil
+ }
+ return intPtr(redisTotal)
+ }(),
+ RedisConnIdle: func() *int {
+ if !redisStatsOK {
+ return nil
+ }
+ return intPtr(redisIdle)
+ }(),
+
DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines),
@@ -722,6 +735,17 @@ func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
return c.redisClient.Ping(ctx).Err() == nil
}
+func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
+ if c == nil || c.redisClient == nil {
+ return 0, 0, false
+ }
+ stats := c.redisClient.PoolStats()
+ if stats == nil {
+ return 0, 0, false
+ }
+ return int(stats.TotalConns), int(stats.IdleConns), true
+}
+
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil {
return 0, 0
@@ -749,7 +773,7 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
if err != nil {
// Prefer fail-closed to avoid stampeding the database when Redis is flaky.
// Fallback to a DB advisory lock when Redis is present but unavailable.
- release, ok := c.tryAcquireDBAdvisoryLock(ctx)
+ release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
if !ok {
c.maybeLogSkip()
return nil, false
@@ -769,38 +793,6 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
return release, true
}
-func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
- if c == nil || c.db == nil {
- return nil, false
- }
- if ctx == nil {
- ctx = context.Background()
- }
-
- conn, err := c.db.Conn(ctx)
- if err != nil {
- return nil, false
- }
-
- acquired := false
- if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
- _ = conn.Close()
- return nil, false
- }
- if !acquired {
- _ = conn.Close()
- return nil, false
- }
-
- release := func() {
- unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
- defer cancel()
- _, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
- _ = conn.Close()
- }
- return release, true
-}
-
func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock()
defer c.skipLogMu.Unlock()
@@ -853,9 +845,3 @@ func float64Ptr(v float64) *float64 {
out := v
return &out
}
-
-func hashAdvisoryLockID(s string) int64 {
- h := fnv.New64a()
- _, _ = h.Write([]byte(s))
- return int64(h.Sum64())
-}
diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go
index a3d847e0..90591a56 100644
--- a/backend/internal/service/ops_port.go
+++ b/backend/internal/service/ops_port.go
@@ -165,6 +165,9 @@ type OpsInsertSystemMetricsInput struct {
DBOK *bool
RedisOK *bool
+ RedisConnTotal *int
+ RedisConnIdle *int
+
DBConnActive *int
DBConnIdle *int
DBConnWaiting *int
@@ -186,6 +189,13 @@ type OpsSystemMetricsSnapshot struct {
DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"`
+ // Config-derived limits (best-effort). These are not historical metrics; they help the UI render "current vs max".
+ DBMaxOpenConns *int `json:"db_max_open_conns"`
+ RedisPoolSize *int `json:"redis_pool_size"`
+
+ RedisConnTotal *int `json:"redis_conn_total"`
+ RedisConnIdle *int `json:"redis_conn_idle"`
+
DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"`
diff --git a/backend/internal/service/setting_service.go b/backend/internal/service/setting_service.go
index 1aea32be..09772616 100644
--- a/backend/internal/service/setting_service.go
+++ b/backend/internal/service/setting_service.go
@@ -139,6 +139,9 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
+ if settings.OpsMetricsIntervalSeconds > 0 {
+ updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
+ }
return s.settingRepo.SetMultiple(ctx, updates)
}
@@ -231,6 +234,7 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
+ SettingKeyOpsMetricsIntervalSeconds: "60",
}
return s.settingRepo.SetMultiple(ctx, defaults)
@@ -301,6 +305,18 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
+ result.OpsMetricsIntervalSeconds = 60
+ if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
+ if v, err := strconv.Atoi(raw); err == nil {
+ if v < 60 {
+ v = 60
+ }
+ if v > 3600 {
+ v = 3600
+ }
+ result.OpsMetricsIntervalSeconds = v
+ }
+ }
return result
}
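
Note: the stored string is parsed and clamped on every read, so interval changes can take effect without a restart. A rough, hypothetical sketch of how a collection loop might consume the setting (the actual wiring in ops_metrics_collector.go is not shown in this hunk; the function names here are illustrative):

package service

import (
    "context"
    "time"
)

// runMetricsLoop is a hypothetical sketch, not part of the patch: it re-reads the
// configured interval each cycle so changes to ops_metrics_interval_seconds
// (already clamped to 60-3600 by the settings layer) are picked up on the next tick.
func runMetricsLoop(ctx context.Context, intervalSeconds func() int, collect func(context.Context) error) {
    for {
        interval := intervalSeconds()
        if interval < 60 { // defensive re-clamp; the settings layer normally guarantees this
            interval = 60
        }
        select {
        case <-ctx.Done():
            return
        case <-time.After(time.Duration(interval) * time.Second):
            _ = collect(ctx)
        }
    }
}
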
diff --git a/backend/internal/service/settings_view.go b/backend/internal/service/settings_view.go
index e9d07bca..1f3d925a 100644
--- a/backend/internal/service/settings_view.go
+++ b/backend/internal/service/settings_view.go
@@ -43,6 +43,7 @@ type SystemSettings struct {
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
+ OpsMetricsIntervalSeconds int
}
type PublicSettings struct {
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
index 3c3529a9..851993ca 100644
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -46,6 +46,8 @@ export interface OpsDashboardOverview {
platform: string
group_id?: number | null
+ health_score?: number
+
system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null
@@ -228,6 +230,9 @@ export interface OpsSystemMetricsSnapshot {
db_ok?: boolean | null
redis_ok?: boolean | null
+ redis_conn_total?: number | null
+ redis_conn_idle?: number | null
+
db_conn_active?: number | null
db_conn_idle?: number | null
db_conn_waiting?: number | null
diff --git a/frontend/src/api/admin/settings.ts b/frontend/src/api/admin/settings.ts
index 37b12e40..9ddeb5bf 100644
--- a/frontend/src/api/admin/settings.ts
+++ b/frontend/src/api/admin/settings.ts
@@ -50,6 +50,7 @@ export interface SystemSettings {
ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
+ ops_metrics_interval_seconds: number
}
export interface UpdateSettingsRequest {
@@ -83,6 +84,7 @@ export interface UpdateSettingsRequest {
ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
+ ops_metrics_interval_seconds?: number
}
/**
diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts
index f80a235f..1caae1d5 100644
--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -1733,8 +1733,10 @@ export default {
redis: 'Redis',
goroutines: 'Goroutines',
jobs: 'Jobs',
+ jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active',
idle: 'idle',
+ waiting: 'waiting',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
@@ -1770,12 +1772,50 @@ export default {
errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.',
+ failedToLoadOverview: 'Failed to load overview',
+ failedToLoadThroughputTrend: 'Failed to load throughput trend',
+ failedToLoadLatencyHistogram: 'Failed to load latency histogram',
+ failedToLoadErrorTrend: 'Failed to load error trend',
+ failedToLoadErrorDistribution: 'Failed to load error distribution',
+ failedToLoadErrorDetail: 'Failed to load error detail',
+ retryFailed: 'Retry failed',
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
+ // Health Score & Diagnosis
+ health: 'Health',
+ healthCondition: 'Health Condition',
+ healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
+ healthyStatus: 'Healthy',
+ riskyStatus: 'At Risk',
+ idleStatus: 'Idle',
+ diagnosis: {
+ title: 'Smart Diagnosis',
+ footer: 'Automated diagnostic suggestions based on current metrics',
+ idle: 'System is currently idle',
+ idleImpact: 'No active traffic',
+ upstreamCritical: 'Upstream error rate critically high ({rate}%)',
+ upstreamCriticalImpact: 'May affect many user requests',
+ upstreamHigh: 'Upstream error rate elevated ({rate}%)',
+ upstreamHighImpact: 'Recommend checking upstream service status',
+ slaCritical: 'SLA critically below target ({sla}%)',
+ slaCriticalImpact: 'User experience severely degraded',
+ slaLow: 'SLA below target ({sla}%)',
+ slaLowImpact: 'Service quality needs attention',
+ errorHigh: 'Error rate too high ({rate}%)',
+ errorHighImpact: 'Many requests failing',
+ errorElevated: 'Error rate elevated ({rate}%)',
+ errorElevatedImpact: 'Recommend checking error logs',
+ healthCritical: 'Overall health score critically low ({score})',
+ healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
+ healthLow: 'Overall health score low ({score})',
+ healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
+ healthy: 'All system metrics normal',
+ healthyImpact: 'Service running stable'
+ },
// Error Log
errorLog: {
timeId: 'Time / ID',
@@ -2069,7 +2109,21 @@ export default {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
- errorDistribution: 'Error distribution by status code.'
+ errorDistribution: 'Error distribution by status code.',
+ goroutines:
+ 'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
+ cpu: 'CPU usage percentage, showing system processor load.',
+ memory: 'Memory usage, including used and total available memory.',
+ db: 'Database connection pool status, including active, idle, and waiting connections.',
+ redis: 'Redis connection pool status, showing active and idle connections.',
+ jobs: 'Background job execution status, including last run time, success time, and error information.',
+ qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
+ tokens: 'Total number of tokens processed in the current time window.',
+ sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
+ errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
+ latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
+ ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
+ health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},
charts: {
emptyRequest: 'No requests in this window.',
@@ -2183,7 +2237,9 @@ export default {
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)',
- queryModePreagg: 'Preagg (fastest, requires aggregation)'
+ queryModePreagg: 'Preagg (fastest, requires aggregation)',
+ metricsInterval: 'Metrics Collection Interval (seconds)',
+ metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
},
adminApiKey: {
title: 'Admin API Key',
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts
index 646511f4..d8ce293c 100644
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -1878,8 +1878,10 @@ export default {
redis: 'Redis',
goroutines: '协程',
jobs: '后台任务',
+ jobsHelp: '点击“明细”查看任务心跳与报错信息',
active: '活跃',
idle: '空闲',
+ waiting: '等待',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
@@ -1898,8 +1900,8 @@ export default {
errors: '错误',
errorRate: '错误率:',
upstreamRate: '上游错误率:',
- latencyDuration: '延迟 (duration_ms)',
- ttftLabel: 'TTFT (first_token_ms)',
+ latencyDuration: '延迟(毫秒)',
+ ttftLabel: '首字延迟(毫秒)',
p50: 'p50',
p90: 'p90',
p95: 'p95',
@@ -1915,12 +1917,50 @@ export default {
errorsSla: '错误(SLA范围)',
upstreamExcl429529: '上游(排除429/529)',
failedToLoadData: '加载运维数据失败',
- tpsK: 'TPS (K)',
+ failedToLoadOverview: '加载概览数据失败',
+ failedToLoadThroughputTrend: '加载吞吐趋势失败',
+ failedToLoadLatencyHistogram: '加载延迟分布失败',
+ failedToLoadErrorTrend: '加载错误趋势失败',
+ failedToLoadErrorDistribution: '加载错误分布失败',
+ failedToLoadErrorDetail: '加载错误详情失败',
+ retryFailed: '重试失败',
+ tpsK: 'TPS(千)',
top: '最高:',
throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布',
errorTrend: '错误趋势',
errorDistribution: '错误分布',
+ // Health Score & Diagnosis
+ health: '健康',
+ healthCondition: '健康状况',
+ healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分',
+ healthyStatus: '健康',
+ riskyStatus: '风险',
+ idleStatus: '待机',
+ diagnosis: {
+ title: '智能诊断',
+ footer: '基于当前指标的自动诊断建议',
+ idle: '系统当前处于待机状态',
+ idleImpact: '无活跃流量',
+ upstreamCritical: '上游错误率严重偏高 ({rate}%)',
+ upstreamCriticalImpact: '可能影响大量用户请求',
+ upstreamHigh: '上游错误率偏高 ({rate}%)',
+ upstreamHighImpact: '建议检查上游服务状态',
+ slaCritical: 'SLA 严重低于目标 ({sla}%)',
+ slaCriticalImpact: '用户体验严重受损',
+ slaLow: 'SLA 低于目标 ({sla}%)',
+ slaLowImpact: '需要关注服务质量',
+ errorHigh: '错误率过高 ({rate}%)',
+ errorHighImpact: '大量请求失败',
+ errorElevated: '错误率偏高 ({rate}%)',
+ errorElevatedImpact: '建议检查错误日志',
+ healthCritical: '综合健康评分过低 ({score})',
+ healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
+ healthLow: '综合健康评分偏低 ({score})',
+ healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
+ healthy: '所有系统指标正常',
+ healthyImpact: '服务运行稳定'
+ },
// Error Log
errorLog: {
timeId: '时间 / ID',
@@ -2212,9 +2252,23 @@ export default {
},
tooltips: {
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
- latencyHistogram: '成功请求的延迟分布(duration_ms)。',
+ latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
- errorDistribution: '按状态码统计的错误分布。'
+ errorDistribution: '按状态码统计的错误分布。',
+ goroutines:
+ 'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
+ cpu: 'CPU 使用率,显示系统处理器的负载情况。',
+ memory: '内存使用率,包括已使用和总可用内存。',
+ db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
+ redis: 'Redis 连接池状态,显示活跃和空闲的连接数。',
+ jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。',
+ qps: '每秒查询数(QPS)和每秒Token数(TPS),实时显示系统吞吐量。',
+ tokens: '当前时间窗口内处理的总Token数量。',
+ sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
+ errors: '错误统计,包括总错误数、错误率和上游错误率。',
+ latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
+ ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。',
+ health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。'
},
charts: {
emptyRequest: '该时间窗口内暂无请求。',
@@ -2320,14 +2374,16 @@ export default {
description: '启用运维监控模块,用于排障与健康可视化',
disabled: '运维监控已关闭',
enabled: '启用运维监控',
- enabledHint: '启用 Ops 运维监控模块(仅管理员可见)',
+ enabledHint: '启用运维监控模块(仅管理员可见)',
realtimeEnabled: '启用实时监控',
- realtimeEnabledHint: '启用实时 QPS/指标推送(WebSocket)',
+ realtimeEnabledHint: '启用实时请求速率和指标推送(WebSocket)',
queryMode: '默认查询模式',
- queryModeHint: 'Ops Dashboard 默认查询模式(auto/raw/preagg)',
+ queryModeHint: '运维监控默认查询模式(自动/原始/预聚合)',
queryModeAuto: '自动(推荐)',
- queryModeRaw: 'Raw(最准,但较慢)',
- queryModePreagg: 'Preagg(最快,需预聚合)'
+ queryModeRaw: '原始(最准确,但较慢)',
+ queryModePreagg: '预聚合(最快,需预聚合)',
+ metricsInterval: '采集频率(秒)',
+ metricsIntervalHint: '系统/请求指标采集频率(60-3600 秒)'
},
adminApiKey: {
title: '管理员 API Key',
diff --git a/frontend/src/views/admin/SettingsView.vue b/frontend/src/views/admin/SettingsView.vue
index 4375a6cc..cf7a2867 100644
--- a/frontend/src/views/admin/SettingsView.vue
+++ b/frontend/src/views/admin/SettingsView.vue
@@ -715,6 +715,25 @@
class="w-[220px]"
/>
+
+
+
+
+
+ {{ t('admin.settings.opsMonitoring.metricsIntervalHint') }}
+
+
+
+
@@ -824,7 +843,8 @@ const form = reactive({
// Ops Monitoring (vNext)
ops_monitoring_enabled: true,
ops_realtime_monitoring_enabled: true,
- ops_query_mode_default: 'auto'
+ ops_query_mode_default: 'auto',
+ ops_metrics_interval_seconds: 60
})
const opsQueryModeOptions = computed(() => [
@@ -922,7 +942,8 @@ async function saveSettings() {
identity_patch_prompt: form.identity_patch_prompt,
ops_monitoring_enabled: form.ops_monitoring_enabled,
ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled,
- ops_query_mode_default: form.ops_query_mode_default
+ ops_query_mode_default: form.ops_query_mode_default,
+ ops_metrics_interval_seconds: form.ops_metrics_interval_seconds
}
const updated = await adminAPI.settings.updateSettings(payload)
Object.assign(form, updated)
diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue
index 56add66f..212717fb 100644
--- a/frontend/src/views/admin/ops/OpsDashboard.vue
+++ b/frontend/src/views/admin/ops/OpsDashboard.vue
@@ -33,190 +33,6 @@
@open-error-details="openErrorDetails"
/>
-
-
-
-
{{ t('admin.ops.systemHealth') }}
-
-
-
-
-
- {{ t('admin.ops.noSystemMetrics') }}
-
-
-
-
- {{ t('admin.ops.collectedAt') }} {{ formatDateTime(overview.system_metrics.created_at) }} ({{ t('admin.ops.window') }}
- {{ overview.system_metrics.window_minutes }}m)
-
-
-
-
-
{{ t('admin.ops.cpu') }}
-
- {{ formatPercent0to100(overview.system_metrics.cpu_usage_percent) }}
-
-
-
-
-
{{ t('admin.ops.memory') }}
-
- {{ formatPercent0to100(overview.system_metrics.memory_usage_percent) }}
-
-
- {{ formatMBPair(overview.system_metrics.memory_used_mb, overview.system_metrics.memory_total_mb) }}
-
-
-
-
-
{{ t('admin.ops.db') }}
-
- {{ boolOkLabel(overview.system_metrics.db_ok) }}
-
-
- {{ t('admin.ops.active') }}: {{ overview.system_metrics.db_conn_active ?? '-' }}, {{ t('admin.ops.idle') }}:
- {{ overview.system_metrics.db_conn_idle ?? '-' }}
-
-
-
-
-
{{ t('admin.ops.redis') }}
-
- {{ boolOkLabel(overview.system_metrics.redis_ok) }}
-
-
-
-
-
{{ t('admin.ops.goroutines') }}
-
- {{ overview.system_metrics.goroutine_count ?? '-' }}
-
-
-
-
-
-
- {{ t('admin.ops.jobs') }}
-
-
-
-
- {{ job.job_name }}
-
-
- {{ t('admin.ops.lastRun') }}: {{ job.last_run_at ? formatDateTime(job.last_run_at) : '-' }} · {{ t('admin.ops.lastSuccess') }}:
- {{ job.last_success_at ? formatDateTime(job.last_success_at) : '-' }} ·
-
- {{ t('admin.ops.lastError') }}: {{ job.last_error }}
-
- {{ t('admin.ops.ok') }}
-
-
-
-
-
-
-
-
-
-
-
{{ t('admin.ops.overview') }}
-
-
-
-
-
- {{ t('admin.ops.noData') }}
-
-
-
-
-
-
{{ t('admin.ops.requestsTotal') }}
-
- {{ formatInt(overview.request_count_total) }}
-
-
- {{ t('admin.ops.slaScope') }} {{ formatInt(overview.request_count_sla) }}
-
-
-
-
-
{{ t('admin.ops.tokens') }}
-
- {{ formatInt(overview.token_consumed) }}
-
-
- {{ t('admin.ops.tps') }} {{ overview.tps.current }} ({{ t('admin.ops.peak') }} {{ overview.tps.peak }})
-
-
-
-
-
{{ t('admin.ops.sla') }}
-
- {{ formatPercent(overview.sla) }}
-
-
- {{ t('admin.ops.businessLimited') }}: {{ formatInt(overview.business_limited_count) }}
-
-
-
-
-
{{ t('admin.ops.errors') }}
-
- {{ t('admin.ops.errorRate') }}: {{ formatPercent(overview.error_rate) }}
-
-
- {{ t('admin.ops.upstreamRate') }}: {{ formatPercent(overview.upstream_error_rate) }}
-
-
- 429: {{ formatInt(overview.upstream_429_count) }} · 529:
- {{ formatInt(overview.upstream_529_count) }}
-
-
-
-
-
-
-
{{ t('admin.ops.latencyDuration') }}
-
-
{{ t('admin.ops.p50') }}: {{ formatMs(overview.duration.p50_ms) }}
-
{{ t('admin.ops.p90') }}: {{ formatMs(overview.duration.p90_ms) }}
-
{{ t('admin.ops.p95') }}: {{ formatMs(overview.duration.p95_ms) }}
-
{{ t('admin.ops.p99') }}: {{ formatMs(overview.duration.p99_ms) }}
-
{{ t('admin.ops.avg') }}: {{ formatMs(overview.duration.avg_ms) }}
-
{{ t('admin.ops.max') }}: {{ formatMs(overview.duration.max_ms) }}
-
-
-
-
-
{{ t('admin.ops.ttftLabel') }}
-
-
{{ t('admin.ops.p50') }}: {{ formatMs(overview.ttft.p50_ms) }}
-
{{ t('admin.ops.p90') }}: {{ formatMs(overview.ttft.p90_ms) }}
-
{{ t('admin.ops.p95') }}: {{ formatMs(overview.ttft.p95_ms) }}
-
{{ t('admin.ops.p99') }}: {{ formatMs(overview.ttft.p99_ms) }}
-
{{ t('admin.ops.avg') }}: {{ formatMs(overview.ttft.avg_ms) }}
-
{{ t('admin.ops.max') }}: {{ formatMs(overview.ttft.max_ms) }}
-
-
-
-
-
-
-
@@ -308,7 +124,6 @@ import OpsLatencyChart from './components/OpsLatencyChart.vue'
import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue'
import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue'
import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue'
-import { formatDateTime, formatNumberLocaleString } from '@/utils/format'
const route = useRoute()
const router = useRouter()
@@ -486,7 +301,6 @@ const syncQueryToRoute = useDebounceFn(async () => {
}, 250)
const overview = ref<OpsDashboardOverview | null>(null)
(null)
-const loadingOverview = ref(false)
const throughputTrend = ref(null)
const loadingTrend = ref(false)
@@ -523,12 +337,15 @@ function handleThroughputSelectGroup(nextGroupId: number) {
groupId.value = id
}
-function handleOpenRequestDetails() {
- requestDetailsPreset.value = {
+function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) {
+ const basePreset: OpsRequestDetailsPreset = {
title: t('admin.ops.requestDetails.title'),
kind: 'all',
sort: 'created_at_desc'
}
+
+ requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) }
+ if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title
showRequestDetails.value = true
}
@@ -573,46 +390,8 @@ function openError(id: number) {
showErrorModal.value = true
}
-function formatInt(v: number | null | undefined): string {
- if (typeof v !== 'number') return '0'
- return formatNumberLocaleString(v)
-}
-
-function formatPercent(v: number | null | undefined): string {
- if (typeof v !== 'number') return '-'
- return `${(v * 100).toFixed(2)}%`
-}
-
-function formatPercent0to100(v: number | null | undefined): string {
- if (typeof v !== 'number') return '-'
- return `${v.toFixed(1)}%`
-}
-
-function formatMBPair(used: number | null | undefined, total: number | null | undefined): string {
- if (typeof used !== 'number' || typeof total !== 'number') return '-'
- return `${formatNumberLocaleString(used)} / ${formatNumberLocaleString(total)} MB`
-}
-
-function boolOkLabel(v: boolean | null | undefined): string {
- if (v === true) return 'OK'
- if (v === false) return 'FAIL'
- return '-'
-}
-
-function boolOkClass(v: boolean | null | undefined): string {
- if (v === true) return 'text-emerald-600 dark:text-emerald-400'
- if (v === false) return 'text-rose-600 dark:text-rose-400'
- return 'text-gray-900 dark:text-white'
-}
-
-function formatMs(v: number | null | undefined): string {
- if (v == null) return '-'
- return `${v}ms`
-}
-
async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) {
if (!opsEnabled.value) return
- loadingOverview.value = true
try {
const data = await opsAPI.getDashboardOverview(
{
@@ -628,11 +407,7 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal)
} catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
overview.value = null
- appStore.showError(err?.message || 'Failed to load overview')
- } finally {
- if (fetchSeq === dashboardFetchSeq) {
- loadingOverview.value = false
- }
+ appStore.showError(err?.message || t('admin.ops.failedToLoadOverview'))
}
}
@@ -654,7 +429,7 @@ async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortS
} catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
throughputTrend.value = null
- appStore.showError(err?.message || 'Failed to load throughput trend')
+ appStore.showError(err?.message || t('admin.ops.failedToLoadThroughputTrend'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingTrend.value = false
@@ -680,7 +455,7 @@ async function refreshLatencyHistogramWithCancel(fetchSeq: number, signal: Abort
} catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
latencyHistogram.value = null
- appStore.showError(err?.message || 'Failed to load latency histogram')
+ appStore.showError(err?.message || t('admin.ops.failedToLoadLatencyHistogram'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingLatency.value = false
@@ -706,7 +481,7 @@ async function refreshErrorTrendWithCancel(fetchSeq: number, signal: AbortSignal
} catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorTrend.value = null
- appStore.showError(err?.message || 'Failed to load error trend')
+ appStore.showError(err?.message || t('admin.ops.failedToLoadErrorTrend'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingErrorTrend.value = false
@@ -732,7 +507,7 @@ async function refreshErrorDistributionWithCancel(fetchSeq: number, signal: Abor
} catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorDistribution.value = null
- appStore.showError(err?.message || 'Failed to load error distribution')
+ appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDistribution'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingErrorDistribution.value = false
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
index 118a1f3a..f8166040 100644
--- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
@@ -286,7 +286,7 @@ async function fetchDetail(id: number) {
}
} catch (err: any) {
detail.value = null
- appStore.showError(err?.message || 'Failed to load error detail')
+ appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDetail'))
} finally {
loading.value = false
}
@@ -348,7 +348,7 @@ async function runConfirmedRetry() {
const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed')
appStore.showSuccess(summary)
} catch (err: any) {
- appStore.showError(err?.message || 'Retry failed')
+ appStore.showError(err?.message || t('admin.ops.retryFailed'))
} finally {
retrying.value = false
}
From c48dc097ff5ddb59552c2f51c6432007951f4231 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sat, 10 Jan 2026 02:17:38 +0800
Subject: [PATCH 16/53] =?UTF-8?q?feat(=E8=BF=90=E7=BB=B4=E7=9B=91=E6=8E=A7?=
=?UTF-8?q?):=20=E9=87=8D=E6=9E=84=E4=BB=AA=E8=A1=A8=E6=9D=BF=E5=B8=83?=
=?UTF-8?q?=E5=B1=80=E5=92=8C=E5=A2=9E=E5=BC=BA=E6=95=B0=E6=8D=AE=E5=B1=95?=
=?UTF-8?q?=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Main changes:
- Restructure the dashboard into a left/right layout (5:7 ratio)
- Left: health score + realtime info (current/peak/average QPS/TPS)
- Right: 6 cards with detailed metrics (3 columns x 2 rows)
  - Total requests: request count, token count, average QPS/TPS, average latency/TTFT
  - SLA: percentage, exception count, progress bar
  - Latency: P99/P95/P90/P50/Avg/Max (color coded)
  - TTFT: P99/P95/P90/P50/Avg/Max (color coded)
  - Request errors: error rate, error count, business-limited count
  - Upstream errors: error rate, error count (excluding 429/529), 429/529 counts
- Add latency/TTFT color coding (<500ms green, <1s yellow, <2s orange, >=2s red; see the sketch after this list)
- Add a realtime window selector (1min/5min/30min/1h)
- Improve time-range selector labels ("Last 5 minutes", etc.)
- Complete the Chinese/English i18n translations
- Database: add Redis connection pool fields (redis_conn_total, redis_conn_idle)
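Latency/TTFT color coding sketch — this reproduces the getLatencyColor helper added to OpsDashboardHeader.vue in this patch; TTFT values go through the same function:

```ts
// Maps a millisecond value to a Tailwind text class: <500ms green, <1s yellow,
// <2s orange, >=2s red; missing values keep the neutral text color.
function getLatencyColor(ms: number | null | undefined): string {
  if (ms == null) return 'text-gray-900 dark:text-white'
  if (ms < 500) return 'text-green-600 dark:text-green-400'
  if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400'
  if (ms < 2000) return 'text-orange-600 dark:text-orange-400'
  return 'text-red-600 dark:text-red-400'
}
```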
---
.../migrations/030_ops_monitoring_vnext.sql | 10 +
frontend/src/api/admin/ops.ts | 4 +
frontend/src/i18n/locales/en.ts | 49 +
frontend/src/i18n/locales/zh.ts | 49 +
.../ops/components/OpsDashboardHeader.vue | 1103 +++++++++++++++--
5 files changed, 1104 insertions(+), 111 deletions(-)
diff --git a/backend/migrations/030_ops_monitoring_vnext.sql b/backend/migrations/030_ops_monitoring_vnext.sql
index 39b19e5d..a18c061d 100644
--- a/backend/migrations/030_ops_monitoring_vnext.sql
+++ b/backend/migrations/030_ops_monitoring_vnext.sql
@@ -705,3 +705,13 @@ INSERT INTO ops_alert_rules (
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
+
+-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
+-- This migration is intentionally idempotent.
+
+ALTER TABLE ops_system_metrics
+ ADD COLUMN IF NOT EXISTS redis_conn_total INT,
+ ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
+
+COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
+COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
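For context, a small TypeScript sketch of how the two new columns are consumed by the dashboard; it mirrors the redisConnActiveValue / redisUsagePercent computed properties added to OpsDashboardHeader.vue later in this patch series, and the standalone helper names here are illustrative only:

```ts
// redis_conn_total / redis_conn_idle come from go-redis PoolStats. The dashboard
// derives the number of active connections and, when the configured pool size is
// known, a pool usage percentage clamped to 0-100.
function redisActiveConns(total?: number | null, idle?: number | null): number | null {
  if (total == null || idle == null) return null
  return Math.max(total - idle, 0)
}

function redisUsagePercent(total?: number | null, poolSize?: number | null): number | null {
  if (total == null || poolSize == null || poolSize <= 0) return null
  return Math.min(100, Math.max(0, (total / poolSize) * 100))
}
```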
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
index 851993ca..42b9e70d 100644
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -230,6 +230,10 @@ export interface OpsSystemMetricsSnapshot {
db_ok?: boolean | null
redis_ok?: boolean | null
+ // Config-derived limits (best-effort) for rendering "current vs max".
+ db_max_open_conns?: number | null
+ redis_pool_size?: number | null
+
redis_conn_total?: number | null
redis_conn_idle?: number | null
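A minimal sketch of the "current vs max" rendering these limit fields enable, following the dbConnOpenValue / dbUsagePercent logic added to OpsDashboardHeader.vue in this patch; the formatting helper itself is illustrative:

```ts
// Open DB connections = active + idle; the percentage is only shown when the
// config-derived db_max_open_conns limit is present and positive.
function dbPoolLabel(active?: number | null, idle?: number | null, maxOpen?: number | null): string {
  if (active == null || idle == null) return '-'
  const open = active + idle
  if (maxOpen == null || maxOpen <= 0) return `${open} conns`
  const pct = Math.min(100, Math.max(0, (open / maxOpen) * 100))
  return `${open} / ${maxOpen} conns (${pct.toFixed(0)}%)`
}
```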
diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts
index 1caae1d5..a4c631cb 100644
--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -1737,6 +1737,8 @@ export default {
active: 'active',
idle: 'idle',
waiting: 'waiting',
+ conns: 'conns',
+ queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
@@ -1750,6 +1752,17 @@ export default {
tps: 'TPS:',
current: 'current',
peak: 'peak',
+ average: 'average',
+ totalRequests: 'Total Requests',
+ avgQps: 'Avg QPS',
+ avgTps: 'Avg TPS',
+ avgLatency: 'Avg Latency',
+ avgTtft: 'Avg TTFT',
+ exceptions: 'Exceptions',
+ requestErrors: 'Request Errors',
+ errorCount: 'Error Count',
+ upstreamErrors: 'Upstream Errors',
+ errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
@@ -1792,6 +1805,42 @@ export default {
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
+ realtime: {
+ title: 'Realtime',
+ connected: 'Connected',
+ connecting: 'Connecting',
+ reconnecting: 'Reconnecting',
+ offline: 'Offline',
+ closed: 'Closed',
+ reconnectIn: 'Reconnect in {seconds}s'
+ },
+ tooltips: {
+ qps: 'Queries per second - real-time request rate',
+ sla: 'Service Level Agreement - percentage of requests within acceptable latency',
+ latency: 'Request duration from start to finish',
+ ttft: 'Time to First Token - latency until first response token',
+ errors: 'Request errors within SLA scope',
+ upstreamErrors: 'Errors from upstream services (excluding rate limits)',
+ totalRequests: 'Total requests and tokens consumed in this time window',
+ cpu: 'CPU usage percentage',
+ memory: 'Memory usage percentage',
+ db: 'Database connection pool status',
+ redis: 'Redis connection pool status',
+ goroutines: 'Go routine count (concurrent tasks)',
+ jobs: 'Background job health status'
+ },
+ timeRange: {
+ '5m': 'Last 5 minutes',
+ '30m': 'Last 30 minutes',
+ '1h': 'Last 1 hour',
+ '6h': 'Last 6 hours',
+ '24h': 'Last 24 hours'
+ },
+ queryMode: {
+ auto: 'Auto',
+ raw: 'Raw Query',
+ preagg: 'Pre-aggregated'
+ },
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts
index d8ce293c..ced386d5 100644
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -1882,6 +1882,8 @@ export default {
active: '活跃',
idle: '空闲',
waiting: '等待',
+ conns: '连接',
+ queue: '队列',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
@@ -1895,6 +1897,17 @@ export default {
tps: 'TPS',
current: '当前',
peak: '峰值',
+ average: '平均',
+ totalRequests: '总请求',
+ avgQps: '平均 QPS',
+ avgTps: '平均 TPS',
+ avgLatency: '平均延迟',
+ avgTtft: '平均首字延迟',
+ exceptions: '异常数',
+ requestErrors: '请求错误',
+ errorCount: '错误数',
+ upstreamErrors: '上游错误',
+ errorCountExcl429529: '错误数(排除429/529)',
sla: 'SLA(排除业务限制)',
businessLimited: '业务限制:',
errors: '错误',
@@ -1937,6 +1950,42 @@ export default {
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
+ realtime: {
+ title: '实时信息',
+ connected: '已连接',
+ connecting: '连接中',
+ reconnecting: '重连中',
+ offline: '离线',
+ closed: '已关闭',
+ reconnectIn: '{seconds}秒后重连'
+ },
+ tooltips: {
+ qps: '每秒查询数 - 实时请求速率',
+ sla: '服务等级协议 - 可接受延迟范围内的请求百分比',
+ latency: '从开始到结束的请求持续时间',
+ ttft: '首字延迟 - 直到第一个响应令牌的延迟',
+ errors: 'SLA 范围内的请求错误',
+ upstreamErrors: '上游服务错误(不包括速率限制)',
+ totalRequests: '此时间窗口内的总请求数和消耗的令牌数',
+ cpu: 'CPU 使用率',
+ memory: '内存使用率',
+ db: '数据库连接池状态',
+ redis: 'Redis 连接池状态',
+ goroutines: 'Go 协程数(并发任务)',
+ jobs: '后台任务健康状态'
+ },
+ timeRange: {
+ '5m': '近5分钟',
+ '30m': '近30分钟',
+ '1h': '近1小时',
+ '6h': '近6小时',
+ '24h': '近24小时'
+ },
+ queryMode: {
+ auto: '自动',
+ raw: '原始查询',
+ preagg: '预聚合'
+ },
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
index c2c6adb6..04cae822 100644
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -2,10 +2,15 @@
import { computed, onMounted, ref, watch } from 'vue'
import { useI18n } from 'vue-i18n'
import Select from '@/components/common/Select.vue'
+import HelpTooltip from '@/components/common/HelpTooltip.vue'
+import BaseDialog from '@/components/common/BaseDialog.vue'
import { adminAPI } from '@/api'
import type { OpsDashboardOverview, OpsWSStatus } from '@/api/admin/ops'
+import type { OpsRequestDetailsPreset } from './OpsRequestDetailsModal.vue'
import { formatNumber } from '@/utils/format'
+type RealtimeWindow = '1min' | '5min' | '30min' | '1h'
+
interface Props {
overview?: OpsDashboardOverview | null
wsStatus: OpsWSStatus
@@ -27,7 +32,7 @@ interface Emits {
(e: 'update:timeRange', value: string): void
(e: 'update:queryMode', value: string): void
(e: 'refresh'): void
- (e: 'openRequestDetails'): void
+ (e: 'openRequestDetails', preset?: OpsRequestDetailsPreset): void
(e: 'openErrorDetails', kind: 'request' | 'upstream'): void
}
@@ -36,6 +41,13 @@ const emit = defineEmits()
const { t } = useI18n()
+const realtimeWindow = ref('1min')
+
+const overview = computed(() => props.overview ?? null)
+const systemMetrics = computed(() => overview.value?.system_metrics ?? null)
+
+// --- Filters ---
+
const groups = ref>([])
const platformOptions = computed(() => [
@@ -47,11 +59,11 @@ const platformOptions = computed(() => [
])
const timeRangeOptions = computed(() => [
- { value: '5m', label: '5m' },
- { value: '30m', label: '30m' },
- { value: '1h', label: '1h' },
- { value: '6h', label: '6h' },
- { value: '24h', label: '24h' }
+ { value: '5m', label: t('admin.ops.timeRange.5m') },
+ { value: '30m', label: t('admin.ops.timeRange.30m') },
+ { value: '1h', label: t('admin.ops.timeRange.1h') },
+ { value: '6h', label: t('admin.ops.timeRange.6h') },
+ { value: '24h', label: t('admin.ops.timeRange.24h') }
])
const queryModeOptions = computed(() => [
@@ -107,65 +119,107 @@ function handleQueryModeChange(val: string | number | boolean | null) {
emit('update:queryMode', String(val || 'auto'))
}
+function openDetails(preset?: OpsRequestDetailsPreset) {
+ emit('openRequestDetails', preset)
+}
+
+function openErrorDetails(kind: 'request' | 'upstream') {
+ emit('openErrorDetails', kind)
+}
+
const updatedAtLabel = computed(() => {
if (!props.lastUpdated) return t('common.unknown')
return props.lastUpdated.toLocaleTimeString()
})
-const totalRequestsLabel = computed(() => {
- const n = props.overview?.request_count_total ?? 0
- return formatNumber(n)
-})
+// --- Color coding for latency/TTFT ---
+function getLatencyColor(ms: number | null | undefined): string {
+ if (ms == null) return 'text-gray-900 dark:text-white'
+ if (ms < 500) return 'text-green-600 dark:text-green-400'
+ if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400'
+ if (ms < 2000) return 'text-orange-600 dark:text-orange-400'
+ return 'text-red-600 dark:text-red-400'
+}
-const totalTokensLabel = computed(() => {
- const n = props.overview?.token_consumed ?? 0
- return formatNumber(n)
-})
+// --- Realtime / Overview labels ---
-const qpsLabel = computed(() => {
+const totalRequestsLabel = computed(() => formatNumber(overview.value?.request_count_total ?? 0))
+const totalTokensLabel = computed(() => formatNumber(overview.value?.token_consumed ?? 0))
+
+const displayRealTimeQps = computed(() => {
+ const ov = overview.value
+ if (!ov) return 0
const useRealtime = props.wsStatus === 'connected' && !!props.wsHasData
- const n = useRealtime ? props.realTimeQps : props.overview?.qps?.current
- if (typeof n !== 'number') return '-'
- return n.toFixed(1)
+ const v = useRealtime ? props.realTimeQps : ov.qps?.current
+ return typeof v === 'number' && Number.isFinite(v) ? v : 0
})
-const tpsLabel = computed(() => {
+const displayRealTimeTps = computed(() => {
+ const ov = overview.value
+ if (!ov) return 0
const useRealtime = props.wsStatus === 'connected' && !!props.wsHasData
- const n = useRealtime ? props.realTimeTps : props.overview?.tps?.current
- if (typeof n !== 'number') return '-'
- return n.toFixed(1)
+ const v = useRealtime ? props.realTimeTps : ov.tps?.current
+ return typeof v === 'number' && Number.isFinite(v) ? v : 0
})
const qpsPeakLabel = computed(() => {
- const n = props.overview?.qps?.peak
- if (typeof n !== 'number') return '-'
- return n.toFixed(1)
+ const v = overview.value?.qps?.peak
+ if (typeof v !== 'number') return '-'
+ return v.toFixed(1)
})
const tpsPeakLabel = computed(() => {
- const n = props.overview?.tps?.peak
- if (typeof n !== 'number') return '-'
- return n.toFixed(1)
+ const v = overview.value?.tps?.peak
+ if (typeof v !== 'number') return '-'
+ return v.toFixed(1)
})
-const slaLabel = computed(() => {
- const v = props.overview?.sla
+const qpsAvgLabel = computed(() => {
+ const v = overview.value?.qps?.avg
if (typeof v !== 'number') return '-'
- return `${(v * 100).toFixed(3)}%`
+ return v.toFixed(1)
})
-const errorRateLabel = computed(() => {
- const v = props.overview?.error_rate
+const tpsAvgLabel = computed(() => {
+ const v = overview.value?.tps?.avg
if (typeof v !== 'number') return '-'
- return `${(v * 100).toFixed(2)}%`
+ return v.toFixed(1)
})
-const upstreamErrorRateLabel = computed(() => {
- const v = props.overview?.upstream_error_rate
- if (typeof v !== 'number') return '-'
- return `${(v * 100).toFixed(2)}%`
+const slaPercent = computed(() => {
+ const v = overview.value?.sla
+ if (typeof v !== 'number') return null
+ return v * 100
})
+const errorRatePercent = computed(() => {
+ const v = overview.value?.error_rate
+ if (typeof v !== 'number') return null
+ return v * 100
+})
+
+const upstreamErrorRatePercent = computed(() => {
+ const v = overview.value?.upstream_error_rate
+ if (typeof v !== 'number') return null
+ return v * 100
+})
+
+const durationP99Ms = computed(() => overview.value?.duration?.p99_ms ?? null)
+const durationP95Ms = computed(() => overview.value?.duration?.p95_ms ?? null)
+const durationP90Ms = computed(() => overview.value?.duration?.p90_ms ?? null)
+const durationP50Ms = computed(() => overview.value?.duration?.p50_ms ?? null)
+const durationAvgMs = computed(() => overview.value?.duration?.avg_ms ?? null)
+const durationMaxMs = computed(() => overview.value?.duration?.max_ms ?? null)
+
+const ttftP99Ms = computed(() => overview.value?.ttft?.p99_ms ?? null)
+const ttftP95Ms = computed(() => overview.value?.ttft?.p95_ms ?? null)
+const ttftP90Ms = computed(() => overview.value?.ttft?.p90_ms ?? null)
+const ttftP50Ms = computed(() => overview.value?.ttft?.p50_ms ?? null)
+const ttftAvgMs = computed(() => overview.value?.ttft?.avg_ms ?? null)
+const ttftMaxMs = computed(() => overview.value?.ttft?.max_ms ?? null)
+
+// --- WebSocket status ---
+
const wsStatusLabel = computed(() => {
switch (props.wsStatus) {
case 'connected':
@@ -204,11 +258,365 @@ const wsReconnectHint = computed(() => {
const sec = Math.max(1, Math.ceil(delayMs / 1000))
return t('admin.ops.realtime.reconnectIn', { seconds: sec })
})
+
+// --- Health Score & Diagnosis (primary) ---
+
+const isSystemIdle = computed(() => {
+ const ov = overview.value
+ if (!ov) return true
+ const qps = props.wsStatus === 'connected' && props.wsHasData ? props.realTimeQps : ov.qps?.current
+ const errorRate = ov.error_rate ?? 0
+ return (qps ?? 0) === 0 && errorRate === 0
+})
+
+const healthScoreValue = computed(() => {
+ const v = overview.value?.health_score
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const healthScoreColor = computed(() => {
+ if (isSystemIdle.value) return '#9ca3af' // gray-400
+ const score = healthScoreValue.value
+ if (score == null) return '#9ca3af'
+ if (score >= 90) return '#10b981' // green
+ if (score >= 60) return '#f59e0b' // yellow
+ return '#ef4444' // red
+})
+
+const healthScoreClass = computed(() => {
+ if (isSystemIdle.value) return 'text-gray-400'
+ const score = healthScoreValue.value
+ if (score == null) return 'text-gray-400'
+ if (score >= 90) return 'text-green-500'
+ if (score >= 60) return 'text-yellow-500'
+ return 'text-red-500'
+})
+
+const circleSize = 100
+const strokeWidth = 8
+const radius = (circleSize - strokeWidth) / 2
+const circumference = 2 * Math.PI * radius
+const dashOffset = computed(() => {
+ if (isSystemIdle.value) return 0
+ if (healthScoreValue.value == null) return 0
+ const score = Math.max(0, Math.min(100, healthScoreValue.value))
+ return circumference - (score / 100) * circumference
+})
+
+interface DiagnosisItem {
+ type: 'critical' | 'warning' | 'info'
+ message: string
+ impact: string
+}
+
+const diagnosisReport = computed(() => {
+ const ov = overview.value
+ if (!ov) return []
+
+ const report: DiagnosisItem[] = []
+
+ if (isSystemIdle.value) {
+ report.push({
+ type: 'info',
+ message: t('admin.ops.diagnosis.idle'),
+ impact: t('admin.ops.diagnosis.idleImpact')
+ })
+ return report
+ }
+
+ const upstreamRatePct = (ov.upstream_error_rate ?? 0) * 100
+ if (upstreamRatePct > 10) {
+ report.push({
+ type: 'critical',
+ message: t('admin.ops.diagnosis.upstreamCritical', { rate: upstreamRatePct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.upstreamCriticalImpact')
+ })
+ } else if (upstreamRatePct > 3) {
+ report.push({
+ type: 'warning',
+ message: t('admin.ops.diagnosis.upstreamHigh', { rate: upstreamRatePct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.upstreamHighImpact')
+ })
+ }
+
+ const slaPct = (ov.sla ?? 0) * 100
+ if (slaPct < 90) {
+ report.push({
+ type: 'critical',
+ message: t('admin.ops.diagnosis.slaCritical', { sla: slaPct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.slaCriticalImpact')
+ })
+ } else if (slaPct < 98) {
+ report.push({
+ type: 'warning',
+ message: t('admin.ops.diagnosis.slaLow', { sla: slaPct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.slaLowImpact')
+ })
+ }
+
+ const errorPct = (ov.error_rate ?? 0) * 100
+ if (errorPct > 5) {
+ report.push({
+ type: 'critical',
+ message: t('admin.ops.diagnosis.errorHigh', { rate: errorPct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.errorHighImpact')
+ })
+ } else if (errorPct > 1) {
+ report.push({
+ type: 'warning',
+ message: t('admin.ops.diagnosis.errorElevated', { rate: errorPct.toFixed(2) }),
+ impact: t('admin.ops.diagnosis.errorElevatedImpact')
+ })
+ }
+
+ if (healthScoreValue.value != null) {
+ if (healthScoreValue.value < 60) {
+ report.push({
+ type: 'critical',
+ message: t('admin.ops.diagnosis.healthCritical', { score: healthScoreValue.value }),
+ impact: t('admin.ops.diagnosis.healthCriticalImpact')
+ })
+ } else if (healthScoreValue.value < 90) {
+ report.push({
+ type: 'warning',
+ message: t('admin.ops.diagnosis.healthLow', { score: healthScoreValue.value }),
+ impact: t('admin.ops.diagnosis.healthLowImpact')
+ })
+ }
+ }
+
+ if (report.length === 0) {
+ report.push({
+ type: 'info',
+ message: t('admin.ops.diagnosis.healthy'),
+ impact: t('admin.ops.diagnosis.healthyImpact')
+ })
+ }
+
+ return report
+})
+
+// --- System health (secondary) ---
+
+function formatTimeShort(ts?: string | null): string {
+ if (!ts) return '-'
+ const d = new Date(ts)
+ if (Number.isNaN(d.getTime())) return '-'
+ return d.toLocaleTimeString()
+}
+
+const cpuPercentValue = computed(() => {
+ const v = systemMetrics.value?.cpu_usage_percent
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const cpuPercentClass = computed(() => {
+ const v = cpuPercentValue.value
+ if (v == null) return 'text-gray-900 dark:text-white'
+ if (v >= 95) return 'text-rose-600 dark:text-rose-400'
+ if (v >= 80) return 'text-yellow-600 dark:text-yellow-400'
+ return 'text-emerald-600 dark:text-emerald-400'
+})
+
+const memPercentValue = computed(() => {
+ const v = systemMetrics.value?.memory_usage_percent
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const memPercentClass = computed(() => {
+ const v = memPercentValue.value
+ if (v == null) return 'text-gray-900 dark:text-white'
+ if (v >= 95) return 'text-rose-600 dark:text-rose-400'
+ if (v >= 85) return 'text-yellow-600 dark:text-yellow-400'
+ return 'text-emerald-600 dark:text-emerald-400'
+})
+
+const dbConnActiveValue = computed(() => {
+ const v = systemMetrics.value?.db_conn_active
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const dbConnIdleValue = computed(() => {
+ const v = systemMetrics.value?.db_conn_idle
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const dbConnWaitingValue = computed(() => {
+ const v = systemMetrics.value?.db_conn_waiting
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const dbConnOpenValue = computed(() => {
+ if (dbConnActiveValue.value == null || dbConnIdleValue.value == null) return null
+ return dbConnActiveValue.value + dbConnIdleValue.value
+})
+
+const dbMaxOpenConnsValue = computed(() => {
+ const v = systemMetrics.value?.db_max_open_conns
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const dbUsagePercent = computed(() => {
+ if (dbConnOpenValue.value == null || dbMaxOpenConnsValue.value == null || dbMaxOpenConnsValue.value <= 0) return null
+ return Math.min(100, Math.max(0, (dbConnOpenValue.value / dbMaxOpenConnsValue.value) * 100))
+})
+
+const dbMiddleLabel = computed(() => {
+ if (systemMetrics.value?.db_ok === false) return 'FAIL'
+ if (dbUsagePercent.value != null) return `${dbUsagePercent.value.toFixed(0)}%`
+ if (systemMetrics.value?.db_ok === true) return t('admin.ops.ok')
+ return t('admin.ops.noData')
+})
+
+const dbMiddleClass = computed(() => {
+ if (systemMetrics.value?.db_ok === false) return 'text-rose-600 dark:text-rose-400'
+ if (dbUsagePercent.value != null) {
+ if (dbUsagePercent.value >= 90) return 'text-rose-600 dark:text-rose-400'
+ if (dbUsagePercent.value >= 70) return 'text-yellow-600 dark:text-yellow-400'
+ return 'text-emerald-600 dark:text-emerald-400'
+ }
+ if (systemMetrics.value?.db_ok === true) return 'text-emerald-600 dark:text-emerald-400'
+ return 'text-gray-900 dark:text-white'
+})
+
+const redisConnTotalValue = computed(() => {
+ const v = systemMetrics.value?.redis_conn_total
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const redisConnIdleValue = computed(() => {
+ const v = systemMetrics.value?.redis_conn_idle
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const redisConnActiveValue = computed(() => {
+ if (redisConnTotalValue.value == null || redisConnIdleValue.value == null) return null
+ return Math.max(redisConnTotalValue.value - redisConnIdleValue.value, 0)
+})
+
+const redisPoolSizeValue = computed(() => {
+ const v = systemMetrics.value?.redis_pool_size
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const redisUsagePercent = computed(() => {
+ if (redisConnTotalValue.value == null || redisPoolSizeValue.value == null || redisPoolSizeValue.value <= 0) return null
+ return Math.min(100, Math.max(0, (redisConnTotalValue.value / redisPoolSizeValue.value) * 100))
+})
+
+const redisMiddleLabel = computed(() => {
+ if (systemMetrics.value?.redis_ok === false) return 'FAIL'
+ if (redisUsagePercent.value != null) return `${redisUsagePercent.value.toFixed(0)}%`
+ if (systemMetrics.value?.redis_ok === true) return t('admin.ops.ok')
+ return t('admin.ops.noData')
+})
+
+const redisMiddleClass = computed(() => {
+ if (systemMetrics.value?.redis_ok === false) return 'text-rose-600 dark:text-rose-400'
+ if (redisUsagePercent.value != null) {
+ if (redisUsagePercent.value >= 90) return 'text-rose-600 dark:text-rose-400'
+ if (redisUsagePercent.value >= 70) return 'text-yellow-600 dark:text-yellow-400'
+ return 'text-emerald-600 dark:text-emerald-400'
+ }
+ if (systemMetrics.value?.redis_ok === true) return 'text-emerald-600 dark:text-emerald-400'
+ return 'text-gray-900 dark:text-white'
+})
+
+const goroutineCountValue = computed(() => {
+ const v = systemMetrics.value?.goroutine_count
+ return typeof v === 'number' && Number.isFinite(v) ? v : null
+})
+
+const goroutinesWarnThreshold = 8_000
+const goroutinesCriticalThreshold = 15_000
+
+const goroutineStatus = computed<'ok' | 'warning' | 'critical' | 'unknown'>(() => {
+ const n = goroutineCountValue.value
+ if (n == null) return 'unknown'
+ if (n >= goroutinesCriticalThreshold) return 'critical'
+ if (n >= goroutinesWarnThreshold) return 'warning'
+ return 'ok'
+})
+
+const goroutineStatusLabel = computed(() => {
+ switch (goroutineStatus.value) {
+ case 'ok':
+ return t('admin.ops.ok')
+ case 'warning':
+ return t('common.warning')
+ case 'critical':
+ return t('common.critical')
+ default:
+ return t('admin.ops.noData')
+ }
+})
+
+const goroutineStatusClass = computed(() => {
+ switch (goroutineStatus.value) {
+ case 'ok':
+ return 'text-emerald-600 dark:text-emerald-400'
+ case 'warning':
+ return 'text-yellow-600 dark:text-yellow-400'
+ case 'critical':
+ return 'text-rose-600 dark:text-rose-400'
+ default:
+ return 'text-gray-900 dark:text-white'
+ }
+})
+
+const jobHeartbeats = computed(() => overview.value?.job_heartbeats ?? [])
+
+const jobsStatus = computed<'ok' | 'warn' | 'unknown'>(() => {
+ const list = jobHeartbeats.value
+ if (!list.length) return 'unknown'
+ for (const hb of list) {
+ if (!hb) continue
+ if (hb.last_error_at && (!hb.last_success_at || hb.last_error_at > hb.last_success_at)) return 'warn'
+ }
+ return 'ok'
+})
+
+const jobsWarnCount = computed(() => {
+ let warn = 0
+ for (const hb of jobHeartbeats.value) {
+ if (!hb) continue
+ if (hb.last_error_at && (!hb.last_success_at || hb.last_error_at > hb.last_success_at)) warn++
+ }
+ return warn
+})
+
+const jobsStatusLabel = computed(() => {
+ switch (jobsStatus.value) {
+ case 'ok':
+ return t('admin.ops.ok')
+ case 'warn':
+ return t('common.warning')
+ default:
+ return t('admin.ops.noData')
+ }
+})
+
+const jobsStatusClass = computed(() => {
+ switch (jobsStatus.value) {
+ case 'ok':
+ return 'text-emerald-600 dark:text-emerald-400'
+ case 'warn':
+ return 'text-yellow-600 dark:text-yellow-400'
+ default:
+ return 'text-gray-900 dark:text-white'
+ }
+})
+
+const showJobsDetails = ref(false)
+
+function openJobsDetails() {
+ showJobsDetails.value = true
+}
-
+
@@ -222,21 +630,25 @@ const wsReconnectHint = computed(() => {
{{ t('admin.ops.title') }}
+
-
+
{{ props.loading ? t('admin.ops.loadingText') : t('admin.ops.ready') }}
+
·
{{ t('common.refresh') }}: {{ updatedAtLabel }}
·
+
+
{{ wsStatusLabel }}
@@ -256,7 +668,7 @@ const wsReconnectHint = computed(() => {
@@ -295,80 +707,549 @@ const wsReconnectHint = computed(() => {
-
-
-
-
{{ t('admin.ops.requests') }}
-
- {{ totalRequestsLabel }}
-
-
- {{ t('admin.ops.tokens') }}: {{ totalTokensLabel }}
-
-
-
-
QPS / TPS
-
-
- {{ qpsLabel }} / {{ tpsLabel }}
-
-
-
+
-
-
{{ t('admin.ops.average') }}
QPS: {{ qpsAvgLabel }}
TPS: {{ tpsAvgLabel }}
+
+
+
From e5857161ffde222e691002cf5df1589792a7b579 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 15:31:48 +0800
Subject: [PATCH 23/53] =?UTF-8?q?feat(ops):=20=E5=A2=9E=E5=BC=BA=E9=94=99?=
=?UTF-8?q?=E8=AF=AF=E8=AF=A6=E6=83=85=E5=BC=B9=E7=AA=97=E4=B8=8EAPI?=
=?UTF-8?q?=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
**Frontend changes**:
1. OpsErrorDetailModal.vue:
   - Add upstream error detail display (the event shape is sketched after this list)
   - Support inspecting upstream error debug info such as request headers and response bodies
   - Improve error message formatting and readability
2. ops.ts API:
   - Add getUpstreamErrors for calling the upstream error query API
**Backend configuration**:
- config.go / config.yaml / deploy/config.example.yaml:
  - Update the configuration to support the upstream error event logging switch
  - Document the related config options
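The upstream errors are stored on the error detail as a JSON string, and the modal parses them defensively into a list of events. A self-contained sketch of the shape and the parse step — it mirrors the UpstreamErrorEvent type and the computed property added in the diff below; parseUpstreamErrors is an illustrative name:

```ts
// One entry per upstream attempt captured by the gateway; all fields are optional
// because older error logs may not carry the enriched context.
type UpstreamErrorEvent = {
  at_unix_ms?: number
  platform?: string
  account_id?: number
  upstream_status_code?: number
  upstream_request_id?: string
  kind?: string
  message?: string
  detail?: string
}

// upstream_errors may be absent or malformed JSON; fall back to an empty list.
function parseUpstreamErrors(raw?: string | null): UpstreamErrorEvent[] {
  if (!raw) return []
  try {
    const parsed = JSON.parse(raw)
    return Array.isArray(parsed) ? (parsed as UpstreamErrorEvent[]) : []
  } catch {
    return []
  }
}
```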
---
backend/internal/config/config.go | 2 +-
config.yaml | 2 +-
deploy/config.example.yaml | 2 +-
frontend/src/api/admin/ops.ts | 6 ++
.../ops/components/OpsErrorDetailModal.vue | 97 +++++++++++++++++++
5 files changed, 106 insertions(+), 3 deletions(-)
diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go
index 579e498a..25c6cb65 100644
--- a/backend/internal/config/config.go
+++ b/backend/internal/config/config.go
@@ -635,7 +635,7 @@ func setDefaults() {
// Gateway
viper.SetDefault("gateway.response_header_timeout", 600) // 600秒(10分钟)等待上游响应头,LLM高负载时可能排队较久
- viper.SetDefault("gateway.log_upstream_error_body", false)
+ viper.SetDefault("gateway.log_upstream_error_body", true)
viper.SetDefault("gateway.log_upstream_error_body_max_bytes", 2048)
viper.SetDefault("gateway.inject_beta_for_apikey", false)
viper.SetDefault("gateway.failover_on_400", false)
diff --git a/config.yaml b/config.yaml
index 106de2c3..13e7977c 100644
--- a/config.yaml
+++ b/config.yaml
@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
- log_upstream_error_body: false
+ log_upstream_error_body: true
# Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048
diff --git a/deploy/config.example.yaml b/deploy/config.example.yaml
index 87ff3148..7ca26968 100644
--- a/deploy/config.example.yaml
+++ b/deploy/config.example.yaml
@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容)
- log_upstream_error_body: false
+ log_upstream_error_body: true
# Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
index 42b9e70d..3c39a32b 100644
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -704,6 +704,12 @@ export interface OpsErrorDetail extends OpsErrorLog {
error_body: string
user_agent: string
+ // Upstream context (optional; enriched by gateway services)
+ upstream_status_code?: number | null
+ upstream_error_message?: string
+ upstream_error_detail?: string
+ upstream_errors?: string
+
auth_latency_ms?: number | null
routing_latency_ms?: number | null
upstream_latency_ms?: number | null
diff --git a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
index f8166040..0726bacd 100644
--- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
@@ -177,6 +177,81 @@
+
+
+
+ {{ t('admin.ops.errorDetails.upstreamErrors') }}
+
+
+
+
+
status
+
+ {{ detail.upstream_status_code != null ? detail.upstream_status_code : '—' }}
+
+
+
+
message
+
+ {{ detail.upstream_error_message || '—' }}
+
+
+
+
+
+
detail
+
{{ prettyJSON(detail.upstream_error_detail) }}
+
+
+
+
upstream_errors
+
+
+
+
+
+ #{{ idx + 1 }} {{ ev.kind }}
+
+
+ {{ ev.at_unix_ms ? formatDateTime(new Date(ev.at_unix_ms)) : '' }}
+
+
+
+
+
account_id: {{ ev.account_id ?? '—' }}
+
status: {{ ev.upstream_status_code ?? '—' }}
+
+ request_id: {{ ev.upstream_request_id || '—' }}
+
+
+
+
+ {{ ev.message }}
+
+
+
{{ prettyJSON(ev.detail) }}
+
+
+
+
{{ prettyJSON(detail.upstream_errors) }}
+
+
+
@@ -259,6 +334,28 @@ const title = computed(() => {
const emptyText = computed(() => 'No error selected.')
+type UpstreamErrorEvent = {
+ at_unix_ms?: number
+ platform?: string
+ account_id?: number
+ upstream_status_code?: number
+ upstream_request_id?: string
+ kind?: string
+ message?: string
+ detail?: string
+}
+
+const upstreamErrors = computed<UpstreamErrorEvent[]>(() => {
+ const raw = detail.value?.upstream_errors
+ if (!raw) return []
+ try {
+ const parsed = JSON.parse(raw)
+ return Array.isArray(parsed) ? (parsed as UpstreamErrorEvent[]) : []
+ } catch {
+ return []
+ }
+})
+
function close() {
emit('update:show', false)
}
From e4bc9f6fb05ce0383c5c3f6e9da6fa487a7f2634 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 15:50:26 +0800
Subject: [PATCH 24/53] =?UTF-8?q?feat(ops):=20=E4=BC=98=E5=8C=96=E4=BB=AA?=
=?UTF-8?q?=E8=A1=A8=E7=9B=98Header=E5=93=8D=E5=BA=94=E5=BC=8F=E5=B8=83?=
=?UTF-8?q?=E5=B1=80=E4=B8=8E=E6=8C=87=E6=A0=87=E5=B1=95=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
**Responsive improvements**:
- Add flex-wrap so the time-range selector wraps on narrow screens
- Current QPS/TPS values wrap on narrow screens instead of overflowing
- Time buttons use a smaller font size and tighter spacing on narrow screens (9px / 1.5px)
- Current values use a responsive font size (xl → sm:2xl)
**Metric display improvements**:
1. Requests card:
   - Simplify the title: "总请求" (Total Requests) → "请求" (Requests)
   - Rename the field: "请求" (Requests) → "请求数" (Request count)
   - Remove average latency and average TTFT to avoid redundancy
2. Latency and TTFT cards:
   - Layout: grid → flex-wrap (adaptive)
   - Keep each metric on one line: add whitespace-nowrap
   - Minimum width: min-w-[60px] keeps items readable
   - Inline units: name, value, and unit on the same line (P95: 123 ms)
   - Wrapping: each metric item wraps as a whole
**Result**:
- Narrow screens: all elements adapt without overflow
- Wide screens: space is used fully with a clear presentation
- Flexible layout: metric arrangement adjusts to the container width
---
.../ops/components/OpsDashboardHeader.vue | 186 ++++++++++--------
1 file changed, 101 insertions(+), 85 deletions(-)
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
index 05b711d4..312642c3 100644
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -833,9 +833,9 @@ function openJobsDetails() {
-
+
-
+
@@ -844,12 +844,12 @@ function openJobsDetails() {
-
+
-
-
+
+
{{ t('admin.ops.current') }}
-
- {{ displayRealTimeQps.toFixed(1) }}
- QPS
-
-
- TPS: {{ displayRealTimeTps.toFixed(1) }}
+
+
+ {{ displayRealTimeQps.toFixed(1) }}
+ QPS
+
+
+ {{ displayRealTimeTps.toFixed(1) }}
+ TPS
+
-
-
-
{{ t('admin.ops.peak') }}
-
-
{{ qpsPeakLabel }}
-
QPS
+
+
+
+
+
{{ t('admin.ops.peak') }}
+
+
+ {{ qpsPeakLabel }}
+ QPS
+
+
+ {{ tpsPeakLabel }}
+ TPS
+
+
-
- TPS: {{ tpsPeakLabel }}
+
+
+
+
{{ t('admin.ops.average') }}
+
+
+ {{ qpsAvgLabel }}
+ QPS
+
+
+ {{ tpsAvgLabel }}
+ TPS
+
+
-
-
-
{{ t('admin.ops.average') }}
-
- QPS: {{ qpsAvgLabel }}
-
- TPS: {{ tpsAvgLabel }}
-
-
-
-
+
+
@@ -924,11 +938,11 @@ function openJobsDetails() {
-
+
- {{ t('admin.ops.totalRequests') }}
+ {{ t('admin.ops.requests') }}
- {{ t('admin.ops.requests') }}:
+ 请求数:
{{ totalRequestsLabel }}
- {{ t('admin.ops.tokens') }}:
+ Token:
{{ totalTokensLabel }}
@@ -956,14 +970,6 @@ function openJobsDetails() {
{{ t('admin.ops.avgTps') }}:
{{ tpsAvgLabel }}
-
- {{ t('admin.ops.avgLatency') }}:
- {{ durationAvgMs ?? '-' }}ms
-
-
- {{ t('admin.ops.avgTtft') }}:
- {{ ttftAvgMs ?? '-' }}ms
-
@@ -1018,26 +1024,31 @@ function openJobsDetails() {
ms (P99)
-
-
+
+
P95:
- {{ durationP95Ms ?? '-' }}ms
+ {{ durationP95Ms ?? '-' }}
+ ms
-
+
P90:
- {{ durationP90Ms ?? '-' }}ms
+ {{ durationP90Ms ?? '-' }}
+ ms
-
+
P50:
- {{ durationP50Ms ?? '-' }}ms
+ {{ durationP50Ms ?? '-' }}
+ ms
-
+
Avg:
- {{ durationAvgMs ?? '-' }}ms
+ {{ durationAvgMs ?? '-' }}
+ ms
-
+
Max:
- {{ durationMaxMs ?? '-' }}ms
+ {{ durationMaxMs ?? '-' }}
+ ms
@@ -1063,26 +1074,31 @@ function openJobsDetails() {
ms (P99)
-
-
+
+
P95:
- {{ ttftP95Ms ?? '-' }}ms
+ {{ ttftP95Ms ?? '-' }}
+ ms
-
+
P90:
- {{ ttftP90Ms ?? '-' }}ms
+ {{ ttftP90Ms ?? '-' }}
+ ms
-
+
P50:
- {{ ttftP50Ms ?? '-' }}ms
+ {{ ttftP50Ms ?? '-' }}
+ ms
-
+
Avg:
- {{ ttftAvgMs ?? '-' }}ms
+ {{ ttftAvgMs ?? '-' }}
+ ms
-
+
Max:
- {{ ttftMaxMs ?? '-' }}ms
+ {{ ttftMaxMs ?? '-' }}
+ ms
From abbde130abc40c1fa507ed6ddc107debd3bfe39e Mon Sep 17 00:00:00 2001
From: cyhhao
Date: Sun, 11 Jan 2026 18:43:47 +0800
Subject: [PATCH 25/53] Revert Codex OAuth fallback handling
---
.../service/openai_codex_transform.go | 124 ------------------
1 file changed, 124 deletions(-)
diff --git a/backend/internal/service/openai_codex_transform.go b/backend/internal/service/openai_codex_transform.go
index 965fb770..94e74f22 100644
--- a/backend/internal/service/openai_codex_transform.go
+++ b/backend/internal/service/openai_codex_transform.go
@@ -115,12 +115,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
existingInstructions = strings.TrimSpace(existingInstructions)
if instructions != "" {
- if existingInstructions != "" && existingInstructions != instructions {
- if input, ok := reqBody["input"].([]any); ok {
- reqBody["input"] = prependSystemInstruction(input, existingInstructions)
- result.Modified = true
- }
- }
if existingInstructions != instructions {
reqBody["instructions"] = instructions
result.Modified = true
@@ -129,7 +123,6 @@ func applyCodexOAuthTransform(reqBody map[string]any) codexTransformResult {
if input, ok := reqBody["input"].([]any); ok {
input = filterCodexInput(input)
- input = normalizeOrphanedToolOutputs(input)
reqBody["input"] = input
result.Modified = true
}
@@ -266,19 +259,6 @@ func filterCodexInput(input []any) []any {
return filtered
}
-func prependSystemInstruction(input []any, instructions string) []any {
- message := map[string]any{
- "role": "system",
- "content": []any{
- map[string]any{
- "type": "input_text",
- "text": instructions,
- },
- },
- }
- return append([]any{message}, input...)
-}
-
func normalizeCodexTools(reqBody map[string]any) bool {
rawTools, ok := reqBody["tools"]
if !ok || rawTools == nil {
@@ -341,110 +321,6 @@ func normalizeCodexTools(reqBody map[string]any) bool {
return modified
}
-func normalizeOrphanedToolOutputs(input []any) []any {
- functionCallIDs := map[string]bool{}
- localShellCallIDs := map[string]bool{}
- customToolCallIDs := map[string]bool{}
-
- for _, item := range input {
- m, ok := item.(map[string]any)
- if !ok {
- continue
- }
- callID := getCallID(m)
- if callID == "" {
- continue
- }
- switch m["type"] {
- case "function_call":
- functionCallIDs[callID] = true
- case "local_shell_call":
- localShellCallIDs[callID] = true
- case "custom_tool_call":
- customToolCallIDs[callID] = true
- }
- }
-
- output := make([]any, 0, len(input))
- for _, item := range input {
- m, ok := item.(map[string]any)
- if !ok {
- output = append(output, item)
- continue
- }
- switch m["type"] {
- case "function_call_output":
- callID := getCallID(m)
- if callID == "" || (!functionCallIDs[callID] && !localShellCallIDs[callID]) {
- output = append(output, convertOrphanedOutputToMessage(m, callID))
- continue
- }
- case "custom_tool_call_output":
- callID := getCallID(m)
- if callID == "" || !customToolCallIDs[callID] {
- output = append(output, convertOrphanedOutputToMessage(m, callID))
- continue
- }
- case "local_shell_call_output":
- callID := getCallID(m)
- if callID == "" || !localShellCallIDs[callID] {
- output = append(output, convertOrphanedOutputToMessage(m, callID))
- continue
- }
- }
- output = append(output, m)
- }
- return output
-}
-
-func getCallID(item map[string]any) string {
- raw, ok := item["call_id"]
- if !ok {
- return ""
- }
- callID, ok := raw.(string)
- if !ok {
- return ""
- }
- callID = strings.TrimSpace(callID)
- if callID == "" {
- return ""
- }
- return callID
-}
-
-func convertOrphanedOutputToMessage(item map[string]any, callID string) map[string]any {
- toolName := "tool"
- if name, ok := item["name"].(string); ok && name != "" {
- toolName = name
- }
- labelID := callID
- if labelID == "" {
- labelID = "unknown"
- }
- text := stringifyOutput(item["output"])
- if len(text) > 16000 {
- text = text[:16000] + "\n...[truncated]"
- }
- return map[string]any{
- "type": "message",
- "role": "assistant",
- "content": fmt.Sprintf("[Previous %s result; call_id=%s]: %s", toolName, labelID, text),
- }
-}
-
-func stringifyOutput(output any) string {
- switch v := output.(type) {
- case string:
- return v
- default:
- if data, err := json.Marshal(v); err == nil {
- return string(data)
- }
- return fmt.Sprintf("%v", v)
- }
-}
-
func codexCachePath(filename string) string {
home, err := os.UserHomeDir()
if err != nil {
From f541636840b76961d856a671b6a4f44a32b0ebcc Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 19:50:43 +0800
Subject: [PATCH 26/53] =?UTF-8?q?feat(ops):=20=E4=BC=98=E5=8C=96=E8=AD=A6?=
=?UTF-8?q?=E6=8A=A5=E8=A7=84=E5=88=99=E5=92=8C=E8=AE=BE=E7=BD=AE=E7=9A=84?=
=?UTF-8?q?=E6=88=90=E5=8A=9F=E6=8F=90=E7=A4=BA=E4=BF=A1=E6=81=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add a success toast for saving alert rules: "警报规则保存成功" (alert rule saved)
- Add a success toast for deleting alert rules: "警报规则删除成功" (alert rule deleted)
- Add a success toast for saving ops monitoring settings: "运维监控设置保存成功" (ops monitoring settings saved)
- Replace the generic "operation succeeded" toast with these specific business messages
- On failure, show the detailed error message returned by the backend (see the sketch below)
Related files:
- frontend/src/i18n/locales/zh.ts
- frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
- frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
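The toast pattern, sketched in TypeScript; it mirrors the save flow in OpsAlertRulesCard.vue, with saveRule() standing in for the actual API call and appStore / t provided by the component:

```ts
// Prefer the backend's detailed error ("detail" in the response body) and fall back
// to the generic i18n failure message; on success show the specific business message.
async function handleSave() {
  try {
    await saveRule() // placeholder for the real opsAPI call used by the card
    appStore.showSuccess(t('admin.ops.alertRules.saveSuccess'))
  } catch (err: any) {
    appStore.showError(err?.response?.data?.detail || t('admin.ops.alertRules.saveFailed'))
  }
}
```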
---
frontend/src/i18n/locales/zh.ts | 42 ++
.../ops/components/OpsAlertRulesCard.vue | 4 +-
.../ops/components/OpsSettingsDialog.vue | 395 ++++++++++++++++++
3 files changed, 439 insertions(+), 2 deletions(-)
create mode 100644 frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts
index dacf2c61..95406179 100644
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -154,6 +154,7 @@ export default {
saving: '保存中...',
selectedCount: '(已选 {count} 个)',
refresh: '刷新',
+ settings: '设置',
notAvailable: '不可用',
now: '现在',
unknown: '未知',
@@ -2205,13 +2206,16 @@ export default {
loading: '加载中...',
empty: '暂无告警规则',
loadFailed: '加载告警规则失败',
+ saveSuccess: '警报规则保存成功',
saveFailed: '保存告警规则失败',
+ deleteSuccess: '警报规则删除成功',
deleteFailed: '删除告警规则失败',
create: '新建规则',
createTitle: '新建告警规则',
editTitle: '编辑告警规则',
deleteConfirmTitle: '确认删除该规则?',
deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?',
+ manage: '预警规则',
metrics: {
successRate: '成功率 (%)',
errorRate: '错误率 (%)',
@@ -2350,6 +2354,42 @@ export default {
accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间'
}
},
+ settings: {
+ title: '运维监控设置',
+ loadFailed: '加载设置失败',
+ saveSuccess: '运维监控设置保存成功',
+ saveFailed: '保存设置失败',
+ dataCollection: '数据采集',
+ evaluationInterval: '评估间隔(秒)',
+ evaluationIntervalHint: '检测任务的执行频率,建议保持默认',
+ alertConfig: '预警配置',
+ enableAlert: '开启预警',
+ alertRecipients: '预警接收邮箱',
+ emailPlaceholder: '输入邮箱地址',
+ recipientsHint: '若为空,系统将使用第一个管理员邮箱作为默认收件人',
+ minSeverity: '最低级别',
+ reportConfig: '评估报告配置',
+ enableReport: '开启评估报告',
+ reportRecipients: '评估报告接收邮箱',
+ dailySummary: '每日摘要',
+ weeklySummary: '每周摘要',
+ advancedSettings: '高级设置',
+ dataRetention: '数据保留策略',
+ enableCleanup: '启用数据清理',
+ cleanupSchedule: '清理计划(Cron)',
+ cleanupScheduleHint: '例如:0 2 * * * 表示每天凌晨2点',
+ errorLogRetentionDays: '错误日志保留天数',
+ minuteMetricsRetentionDays: '分钟指标保留天数',
+ hourlyMetricsRetentionDays: '小时指标保留天数',
+ retentionDaysHint: '建议保留7-90天,过长会占用存储空间',
+ aggregation: '预聚合任务',
+ enableAggregation: '启用预聚合任务',
+ aggregationHint: '预聚合可提升长时间窗口查询性能',
+ validation: {
+ title: '请先修正以下问题',
+ retentionDaysRange: '保留天数必须在1-365天之间'
+ }
+ },
concurrency: {
title: '并发 / 排队',
byPlatform: '按平台',
@@ -2383,10 +2423,12 @@ export default {
accountError: '异常'
},
tooltips: {
+ totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
errorDistribution: '按状态码统计的错误分布。',
+ upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
diff --git a/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue b/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
index 6bf1dcae..edf8c40c 100644
--- a/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
+++ b/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
@@ -136,7 +136,7 @@ async function save() {
draft.value = null
editingId.value = null
await load()
- appStore.showSuccess(t('common.success'))
+ appStore.showSuccess(t('admin.ops.alertRules.saveSuccess'))
} catch (err: any) {
console.error('[OpsAlertRulesCard] Failed to save rule', err)
appStore.showError(err?.response?.data?.detail || t('admin.ops.alertRules.saveFailed'))
@@ -160,7 +160,7 @@ async function confirmDelete() {
showDeleteConfirm.value = false
pendingDelete.value = null
await load()
- appStore.showSuccess(t('common.success'))
+ appStore.showSuccess(t('admin.ops.alertRules.deleteSuccess'))
} catch (err: any) {
console.error('[OpsAlertRulesCard] Failed to delete rule', err)
appStore.showError(err?.response?.data?.detail || t('admin.ops.alertRules.deleteFailed'))
diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
new file mode 100644
index 00000000..968c5081
--- /dev/null
+++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
@@ -0,0 +1,395 @@
+
+
+
+
+
+ {{ t('common.loading') }}
+
+
+
+
+
+
{{ t('admin.ops.settings.validation.title') }}
+
+
+
+
+
+
{{ t('admin.ops.settings.dataCollection') }}
+
+
+
+
{{ t('admin.ops.settings.evaluationIntervalHint') }}
+
+
+
+
+
+
{{ t('admin.ops.settings.alertConfig') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('common.add') }}
+
+
+
+
+ {{ email }}
+ ×
+
+
+
+ {{ t('admin.ops.settings.recipientsHint') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.settings.reportConfig') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('common.add') }}
+
+
+
+
+ {{ email }}
+ ×
+
+
+
+ {{ t('admin.ops.settings.recipientsHint') }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ t('admin.ops.settings.advancedSettings') }}
+
+
+
+
+
{{ t('admin.ops.settings.dataRetention') }}
+
+
+
+
+
+
+
+
+
+
{{ t('admin.ops.settings.cleanupScheduleHint') }}
+
+
+
+
{{ t('admin.ops.settings.retentionDaysHint') }}
+
+
+
+
+
{{ t('admin.ops.settings.aggregation') }}
+
+
+
+
+
{{ t('admin.ops.settings.aggregationHint') }}
+
+
+
+
+
+
+
+
+
+
+ {{ t('common.cancel') }}
+
+ {{ saving ? t('common.saving') : t('common.save') }}
+
+
+
+
+
From 988b4d0254635ecd127fedeb5bdbc99f23aee996 Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 19:51:18 +0800
Subject: [PATCH 27/53] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0=E9=AB=98?=
=?UTF-8?q?=E7=BA=A7=E8=AE=BE=E7=BD=AEAPI=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Add the OpsAdvancedSettings data model
- Support data retention configuration (error logs, minute-level metrics, hourly metrics)
- Support the data aggregation toggle
- Add GET/PUT /admin/ops/advanced-settings endpoints (an example payload follows below)
- Add config validation and default-value handling
Related files:
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go
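An example payload for PUT /admin/ops/advanced-settings. Field names follow the JSON tags in ops_settings_models.go; retention days must be within 1-365, and an empty cleanup_schedule falls back to "0 2 * * *". The values shown match the defaults except for cleanup_enabled:

```ts
// Advanced ops settings: data retention policy plus the pre-aggregation toggle.
const advancedSettings = {
  data_retention: {
    cleanup_enabled: true,
    cleanup_schedule: '0 2 * * *', // every day at 02:00
    error_log_retention_days: 30,
    minute_metrics_retention_days: 30,
    hourly_metrics_retention_days: 30
  },
  aggregation: {
    aggregation_enabled: false
  }
}
```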
---
.../handler/admin/ops_settings_handler.go | 46 +++++++
backend/internal/server/routes/admin.go | 4 +
backend/internal/service/domain_constants.go | 3 +
backend/internal/service/ops_settings.go | 112 ++++++++++++++++++
.../internal/service/ops_settings_models.go | 18 +++
5 files changed, 183 insertions(+)
diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go
index e76c1b20..deac13b7 100644
--- a/backend/internal/handler/admin/ops_settings_handler.go
+++ b/backend/internal/handler/admin/ops_settings_handler.go
@@ -101,3 +101,49 @@ func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
response.Success(c, updated)
}
+// GetAdvancedSettings returns Ops advanced settings (DB-backed).
+// GET /api/v1/admin/ops/advanced-settings
+func (h *OpsHandler) GetAdvancedSettings(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ cfg, err := h.opsService.GetOpsAdvancedSettings(c.Request.Context())
+ if err != nil {
+ response.Error(c, http.StatusInternalServerError, "Failed to get advanced settings")
+ return
+ }
+ response.Success(c, cfg)
+}
+
+// UpdateAdvancedSettings updates Ops advanced settings (DB-backed).
+// PUT /api/v1/admin/ops/advanced-settings
+func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) {
+ if h.opsService == nil {
+ response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
+ return
+ }
+ if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
+ response.ErrorFrom(c, err)
+ return
+ }
+
+ var req service.OpsAdvancedSettings
+ if err := c.ShouldBindJSON(&req); err != nil {
+ response.BadRequest(c, "Invalid request body")
+ return
+ }
+
+ updated, err := h.opsService.UpdateOpsAdvancedSettings(c.Request.Context(), &req)
+ if err != nil {
+ response.Error(c, http.StatusBadRequest, err.Error())
+ return
+ }
+ response.Success(c, updated)
+}
+
diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go
index e3385ef1..f3e66d04 100644
--- a/backend/internal/server/routes/admin.go
+++ b/backend/internal/server/routes/admin.go
@@ -92,6 +92,10 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
runtime.PUT("/alert", h.Admin.Ops.UpdateAlertRuntimeSettings)
}
+ // Advanced settings (DB-backed)
+ ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings)
+ ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings)
+
// WebSocket realtime (QPS/TPS)
ws := ops.Group("/ws")
{
diff --git a/backend/internal/service/domain_constants.go b/backend/internal/service/domain_constants.go
index 4fcebe2b..398d9fbd 100644
--- a/backend/internal/service/domain_constants.go
+++ b/backend/internal/service/domain_constants.go
@@ -143,6 +143,9 @@ const (
// SettingKeyOpsMetricsIntervalSeconds controls the ops metrics collector interval (>=60).
SettingKeyOpsMetricsIntervalSeconds = "ops_metrics_interval_seconds"
+
+ // SettingKeyOpsAdvancedSettings stores JSON config for ops advanced settings (data retention, aggregation).
+ SettingKeyOpsAdvancedSettings = "ops_advanced_settings"
)
// AdminAPIKeyPrefix is the prefix for admin API keys (distinct from user "sk-" keys).
diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go
index 2f15bc79..00db8e99 100644
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -352,3 +352,115 @@ func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *Ops
return updated, nil
}
+// =========================
+// Advanced settings
+// =========================
+
+func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
+ return &OpsAdvancedSettings{
+ DataRetention: OpsDataRetentionSettings{
+ CleanupEnabled: false,
+ CleanupSchedule: "0 2 * * *",
+ ErrorLogRetentionDays: 30,
+ MinuteMetricsRetentionDays: 30,
+ HourlyMetricsRetentionDays: 30,
+ },
+ Aggregation: OpsAggregationSettings{
+ AggregationEnabled: false,
+ },
+ }
+}
+
+func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
+ if cfg == nil {
+ return
+ }
+ cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
+ if cfg.DataRetention.CleanupSchedule == "" {
+ cfg.DataRetention.CleanupSchedule = "0 2 * * *"
+ }
+ if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
+ cfg.DataRetention.ErrorLogRetentionDays = 30
+ }
+ if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
+ cfg.DataRetention.MinuteMetricsRetentionDays = 30
+ }
+ if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
+ cfg.DataRetention.HourlyMetricsRetentionDays = 30
+ }
+}
+
+func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
+ if cfg == nil {
+ return errors.New("invalid config")
+ }
+ if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
+ return errors.New("error_log_retention_days must be between 1 and 365")
+ }
+ if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
+ return errors.New("minute_metrics_retention_days must be between 1 and 365")
+ }
+ if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
+ return errors.New("hourly_metrics_retention_days must be between 1 and 365")
+ }
+ return nil
+}
+
+func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
+ defaultCfg := defaultOpsAdvancedSettings()
+ if s == nil || s.settingRepo == nil {
+ return defaultCfg, nil
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+
+ raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
+ if err != nil {
+ if errors.Is(err, ErrSettingNotFound) {
+ if b, mErr := json.Marshal(defaultCfg); mErr == nil {
+ _ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
+ }
+ return defaultCfg, nil
+ }
+ return nil, err
+ }
+
+ cfg := &OpsAdvancedSettings{}
+ if err := json.Unmarshal([]byte(raw), cfg); err != nil {
+ return defaultCfg, nil
+ }
+
+ normalizeOpsAdvancedSettings(cfg)
+ return cfg, nil
+}
+
+func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
+ if s == nil || s.settingRepo == nil {
+ return nil, errors.New("setting repository not initialized")
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ if cfg == nil {
+ return nil, errors.New("invalid config")
+ }
+
+ if err := validateOpsAdvancedSettings(cfg); err != nil {
+ return nil, err
+ }
+
+ normalizeOpsAdvancedSettings(cfg)
+ raw, err := json.Marshal(cfg)
+ if err != nil {
+ return nil, err
+ }
+ if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
+ return nil, err
+ }
+
+ updated := &OpsAdvancedSettings{}
+ _ = json.Unmarshal(raw, updated)
+ return updated, nil
+}
+
diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go
index 78399c49..52a9db66 100644
--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -68,3 +68,21 @@ type OpsAlertRuntimeSettings struct {
Silencing OpsAlertSilencingSettings `json:"silencing"`
}
+// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
+type OpsAdvancedSettings struct {
+ DataRetention OpsDataRetentionSettings `json:"data_retention"`
+ Aggregation OpsAggregationSettings `json:"aggregation"`
+}
+
+type OpsDataRetentionSettings struct {
+ CleanupEnabled bool `json:"cleanup_enabled"`
+ CleanupSchedule string `json:"cleanup_schedule"`
+ ErrorLogRetentionDays int `json:"error_log_retention_days"`
+ MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"`
+ HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"`
+}
+
+type OpsAggregationSettings struct {
+ AggregationEnabled bool `json:"aggregation_enabled"`
+}
+
From a39316e004baff96487423ac40093b9dabeda3df Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 19:51:37 +0800
Subject: [PATCH 28/53] feat(ops): integrate the ops monitoring settings dialog
 into the dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add settings and alert-rules buttons to OpsDashboardHeader
- Integrate the OpsSettingsDialog component into OpsDashboard
- Add an alert-rules modal display
- Add API type definitions for the advanced settings
- Allow quick access to settings and rule management from the header

Related files (a usage sketch of the new API follows the list):
- frontend/src/api/admin/ops.ts
- frontend/src/views/admin/ops/types.ts
- frontend/src/views/admin/ops/OpsDashboard.vue
- frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
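
For reviewers, a minimal sketch of how a settings dialog is expected to consume the
new endpoints. The refs and function names below are illustrative assumptions; only
getAdvancedSettings, updateAdvancedSettings and OpsAdvancedSettings come from this
patch, and the real OpsSettingsDialog may be wired differently.

    import { ref } from 'vue'
    import { getAdvancedSettings, updateAdvancedSettings } from '@/api/admin/ops'
    import type { OpsAdvancedSettings } from '@/api/admin/ops'

    // Hypothetical dialog-side state; names are not taken from this patch.
    const advanced = ref<OpsAdvancedSettings | null>(null)

    async function loadAdvanced() {
      advanced.value = await getAdvancedSettings()
    }

    async function saveAdvanced() {
      if (!advanced.value) return
      // The backend validates retention days (1-365), normalizes the cleanup
      // cron schedule, persists the value, and echoes the stored settings.
      advanced.value = await updateAdvancedSettings(advanced.value)
    }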
---
frontend/src/api/admin/ops.ts | 32 +++++++++++++++++-
frontend/src/views/admin/ops/OpsDashboard.vue | 16 +++++++++
.../ops/components/OpsDashboardHeader.vue | 33 +++++++++++++++++--
frontend/src/views/admin/ops/types.ts | 5 ++-
4 files changed, 82 insertions(+), 4 deletions(-)
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
index 3c39a32b..c0df4605 100644
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -676,6 +676,23 @@ export interface OpsAlertRuntimeSettings {
}
}
+export interface OpsAdvancedSettings {
+ data_retention: OpsDataRetentionSettings
+ aggregation: OpsAggregationSettings
+}
+
+export interface OpsDataRetentionSettings {
+ cleanup_enabled: boolean
+ cleanup_schedule: string
+ error_log_retention_days: number
+ minute_metrics_retention_days: number
+ hourly_metrics_retention_days: number
+}
+
+export interface OpsAggregationSettings {
+ aggregation_enabled: boolean
+}
+
export interface OpsErrorLog {
id: number
created_at: string
@@ -894,6 +911,17 @@ export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings
return data
}
+// Advanced settings (DB-backed)
+export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
+  const { data } = await apiClient.get('/admin/ops/advanced-settings')
+  return data
+}
+
+export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
+  const { data } = await apiClient.put('/admin/ops/advanced-settings', config)
+  return data
+}
+
export const opsAPI = {
getDashboardOverview,
getThroughputTrend,
@@ -915,7 +943,9 @@ export const opsAPI = {
getEmailNotificationConfig,
updateEmailNotificationConfig,
getAlertRuntimeSettings,
- updateAlertRuntimeSettings
+ updateAlertRuntimeSettings,
+ getAdvancedSettings,
+ updateAdvancedSettings
}
export default opsAPI
diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue
index 212717fb..e8fedc5a 100644
--- a/frontend/src/views/admin/ops/OpsDashboard.vue
+++ b/frontend/src/views/admin/ops/OpsDashboard.vue
@@ -31,6 +31,8 @@
@refresh="fetchData"
@open-request-details="handleOpenRequestDetails"
@open-error-details="openErrorDetails"
+ @open-settings="showSettingsDialog = true"
+ @open-alert-rules="showAlertRulesCard = true"
/>
@@ -72,6 +74,14 @@
+
+
+
+
+
+
+
+
({
sort: 'created_at_desc'
})
+const showSettingsDialog = ref(false)
+const showAlertRulesCard = ref(false)
+
function handleThroughputSelectPlatform(nextPlatform: string) {
platform.value = nextPlatform || ''
groupId.value = null
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
index 312642c3..23609a06 100644
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -34,6 +34,8 @@ interface Emits {
(e: 'refresh'): void
(e: 'openRequestDetails', preset?: OpsRequestDetailsPreset): void
(e: 'openErrorDetails', kind: 'request' | 'upstream'): void
+ (e: 'openSettings'): void
+ (e: 'openAlertRules'): void
}
const props = defineProps<Props>()
@@ -723,6 +725,33 @@ function openJobsDetails() {
/>
+
+
+
+
+
+ {{ t('admin.ops.alertRules.manage') }}
+
+
+
+
+ {{ t('common.settings') }}
+
@@ -955,11 +984,11 @@ function openJobsDetails() {
- 请求数:
+ {{ t('admin.ops.requests') }}:
{{ totalRequestsLabel }}
- Token:
+ {{ t('admin.ops.tokens') }}:
{{ totalTokensLabel }}
diff --git a/frontend/src/views/admin/ops/types.ts b/frontend/src/views/admin/ops/types.ts
index 08830542..45ba031f 100644
--- a/frontend/src/views/admin/ops/types.ts
+++ b/frontend/src/views/admin/ops/types.ts
@@ -13,5 +13,8 @@ export type {
Operator,
EmailNotificationConfig,
OpsDistributedLockSettings,
- OpsAlertRuntimeSettings
+ OpsAlertRuntimeSettings,
+ OpsAdvancedSettings,
+ OpsDataRetentionSettings,
+ OpsAggregationSettings
} from '@/api/admin/ops'
From 63dc6a68dfb76ef21c5207de57a224b5a36fb25d Mon Sep 17 00:00:00 2001
From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com>
Date: Sun, 11 Jan 2026 19:58:38 +0800
Subject: [PATCH 29/53] feat(ops): hide the query mode selector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Hide the queryMode selector in OpsDashboardHeader (via v-if="false")
- Keep all backend logic and frontend state management
- Auto mode: prefer pre-aggregated data and fall back to real-time computation
  when no aggregate is available
- Cleaner UI; the backend picks the optimal query strategy automatically

Related files (a small sketch of the retained frontend state follows the list):
- frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
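
A minimal sketch of the state that stays in place even though the selector is no
longer rendered. The query_mode parameter name and the mode literals are
assumptions for illustration; this patch itself only touches the template.

    import { ref } from 'vue'

    type QueryMode = 'auto' | 'aggregated' | 'realtime'

    // The ref survives; only its selector is hidden with v-if="false".
    const queryMode = ref<QueryMode>('auto')

    // Dashboard queries still carry the mode; in 'auto' the backend prefers
    // pre-aggregated metrics and falls back to real-time computation when no
    // aggregate exists for the requested window.
    function withQueryMode(params: Record<string, unknown>) {
      return { ...params, query_mode: queryMode.value }
    }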
---
frontend/src/views/admin/ops/components/OpsDashboardHeader.vue | 1 +
1 file changed, 1 insertion(+)
diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
index 23609a06..fb622eaa 100644
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -703,6 +703,7 @@ function openJobsDetails() {
/>