diff --git a/.gitignore b/.gitignore index 93ae19f3..ec218bfa 100644 --- a/.gitignore +++ b/.gitignore @@ -123,6 +123,4 @@ backend/cmd/server/server deploy/docker-compose.override.yml .gocache/ vite.config.js -!docs/ docs/* -!docs/dependency-security.md diff --git a/backend/go.mod b/backend/go.mod index 9ac48305..97f599f8 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -8,9 +8,11 @@ require ( github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 github.com/google/wire v0.7.0 + github.com/gorilla/websocket v1.5.3 github.com/imroc/req/v3 v3.57.0 github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.17.2 + github.com/shirou/gopsutil/v4 v4.25.6 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0 @@ -104,9 +106,9 @@ require ( github.com/quic-go/quic-go v0.57.1 // indirect github.com/refraction-networking/utls v1.8.1 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/shirou/gopsutil/v4 v4.25.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 38e2b53e..0adfa4de 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -113,6 +113,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4= github.com/google/wire v0.7.0/go.mod h1:n6YbUQD9cPKTnHXEBN2DXlOp/mVADhVErcMFb0v3J18= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -220,6 +222,8 @@ github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkr github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index e49c188b..6e66b22c 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -42,6 +42,7 @@ type Config struct { Turnstile TurnstileConfig `mapstructure:"turnstile"` Database DatabaseConfig `mapstructure:"database"` Redis RedisConfig `mapstructure:"redis"` + Ops OpsConfig `mapstructure:"ops"` JWT JWTConfig `mapstructure:"jwt"` Default DefaultConfig `mapstructure:"default"` RateLimit RateLimitConfig `mapstructure:"rate_limit"` @@ -304,6 +305,47 @@ func (r *RedisConfig) Address() string { return fmt.Sprintf("%s:%d", r.Host, r.Port) } +type OpsConfig struct { + // Enabled controls whether ops features should run. + // + // NOTE: vNext still has a DB-backed feature flag (ops_monitoring_enabled) for runtime on/off. + // This config flag is the "hard switch" for deployments that want to disable ops completely. + Enabled bool `mapstructure:"enabled"` + + // UsePreaggregatedTables prefers ops_metrics_hourly/daily for long-window dashboard queries. + UsePreaggregatedTables bool `mapstructure:"use_preaggregated_tables"` + + // Cleanup controls periodic deletion of old ops data to prevent unbounded growth. + Cleanup OpsCleanupConfig `mapstructure:"cleanup"` + + // MetricsCollectorCache controls Redis caching for expensive per-window collector queries. + MetricsCollectorCache OpsMetricsCollectorCacheConfig `mapstructure:"metrics_collector_cache"` + + // Pre-aggregation configuration. + Aggregation OpsAggregationConfig `mapstructure:"aggregation"` +} + +type OpsCleanupConfig struct { + Enabled bool `mapstructure:"enabled"` + Schedule string `mapstructure:"schedule"` + + // Retention days (0 disables that cleanup target). + // + // vNext requirement: default 30 days across ops datasets. + ErrorLogRetentionDays int `mapstructure:"error_log_retention_days"` + MinuteMetricsRetentionDays int `mapstructure:"minute_metrics_retention_days"` + HourlyMetricsRetentionDays int `mapstructure:"hourly_metrics_retention_days"` +} + +type OpsAggregationConfig struct { + Enabled bool `mapstructure:"enabled"` +} + +type OpsMetricsCollectorCacheConfig struct { + Enabled bool `mapstructure:"enabled"` + TTL time.Duration `mapstructure:"ttl"` +} + type JWTConfig struct { Secret string `mapstructure:"secret"` ExpireHour int `mapstructure:"expire_hour"` @@ -489,6 +531,20 @@ func setDefaults() { viper.SetDefault("redis.pool_size", 128) viper.SetDefault("redis.min_idle_conns", 10) + // Ops (vNext) + viper.SetDefault("ops.enabled", true) + viper.SetDefault("ops.use_preaggregated_tables", false) + viper.SetDefault("ops.cleanup.enabled", true) + viper.SetDefault("ops.cleanup.schedule", "0 2 * * *") + // Retention days: vNext defaults to 30 days across ops datasets. + viper.SetDefault("ops.cleanup.error_log_retention_days", 30) + viper.SetDefault("ops.cleanup.minute_metrics_retention_days", 30) + viper.SetDefault("ops.cleanup.hourly_metrics_retention_days", 30) + viper.SetDefault("ops.aggregation.enabled", true) + viper.SetDefault("ops.metrics_collector_cache.enabled", true) + // TTL should be slightly larger than collection interval (1m) to maximize cross-replica cache hits. + viper.SetDefault("ops.metrics_collector_cache.ttl", 65*time.Second) + // JWT viper.SetDefault("jwt.secret", "") viper.SetDefault("jwt.expire_hour", 24) @@ -687,6 +743,21 @@ func (c *Config) Validate() error { if c.Gateway.Scheduling.SlotCleanupInterval < 0 { return fmt.Errorf("gateway.scheduling.slot_cleanup_interval must be non-negative") } + if c.Ops.MetricsCollectorCache.TTL < 0 { + return fmt.Errorf("ops.metrics_collector_cache.ttl must be non-negative") + } + if c.Ops.Cleanup.ErrorLogRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.error_log_retention_days must be non-negative") + } + if c.Ops.Cleanup.MinuteMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.minute_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.HourlyMetricsRetentionDays < 0 { + return fmt.Errorf("ops.cleanup.hourly_metrics_retention_days must be non-negative") + } + if c.Ops.Cleanup.Enabled && strings.TrimSpace(c.Ops.Cleanup.Schedule) == "" { + return fmt.Errorf("ops.cleanup.schedule is required when ops.cleanup.enabled=true") + } if c.Concurrency.PingInterval < 5 || c.Concurrency.PingInterval > 30 { return fmt.Errorf("concurrency.ping_interval must be between 5-30 seconds") } diff --git a/backend/internal/pkg/ctxkey/ctxkey.go b/backend/internal/pkg/ctxkey/ctxkey.go index 8920ea69..61d98cc2 100644 --- a/backend/internal/pkg/ctxkey/ctxkey.go +++ b/backend/internal/pkg/ctxkey/ctxkey.go @@ -7,4 +7,10 @@ type Key string const ( // ForcePlatform 强制平台(用于 /antigravity 路由),由 middleware.ForcePlatform 设置 ForcePlatform Key = "ctx_force_platform" + + // ClientRequestID 客户端请求的唯一标识,用于追踪请求全生命周期(用于 Ops 监控与排障)。 + ClientRequestID Key = "ctx_client_request_id" + + // RetryCount 表示当前请求在网关层的重试次数(用于 Ops 记录与排障)。 + RetryCount Key = "ctx_retry_count" ) diff --git a/config.yaml b/config.yaml index f43c9c19..0ce796e7 100644 --- a/config.yaml +++ b/config.yaml @@ -221,6 +221,41 @@ redis: # 数据库编号(0-15) db: 0 +# ============================================================================= +# Ops Monitoring (Optional) +# 运维监控 (可选) +# ============================================================================= +ops: + # Hard switch: disable all ops background jobs and APIs when false + # 硬开关:为 false 时禁用所有 Ops 后台任务与接口 + enabled: true + + # Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries. + # 优先使用预聚合表(用于长时间窗口查询性能) + use_preaggregated_tables: false + + # Data cleanup configuration + # 数据清理配置(vNext 默认统一保留 30 天) + cleanup: + enabled: true + # Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM + # Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点 + schedule: "0 2 * * *" + error_log_retention_days: 30 + minute_metrics_retention_days: 30 + hourly_metrics_retention_days: 30 + + # Pre-aggregation configuration + # 预聚合任务配置 + aggregation: + enabled: true + + # OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments) + # 指标采集 Redis 缓存(多副本部署时减少重复计算) + metrics_collector_cache: + enabled: true + ttl: 65s + # ============================================================================= # JWT Configuration # JWT 配置