From df1ef3deb63af2946bf56750a2c7f4ded362150e Mon Sep 17 00:00:00 2001
From: ianshaw
Date: Sat, 3 Jan 2026 06:18:44 -0800
Subject: [PATCH] refactor: remove Ops monitoring module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the unfinished ops monitoring feature and simplify the system architecture:

- Delete backend code such as ops_handler, ops_service, and ops_repo
- Delete ops-related database migration files
- Delete the frontend OpsDashboard page and API
---
 backend/internal/handler/admin/ops_handler.go | 402 -----
 .../internal/handler/admin/ops_ws_handler.go | 286 ----
 .../handler/admin/ops_ws_handler_test.go | 123 --
 backend/internal/handler/ops_error_logger.go | 166 --
 backend/internal/repository/ops.go | 190 ---
 backend/internal/repository/ops_cache.go | 127 --
 backend/internal/repository/ops_repo.go | 1333 -----------------
 .../middleware/ops_auth_error_logger.go | 55 -
 backend/internal/service/ops.go | 99 --
 backend/internal/service/ops_alert_service.go | 834 -----------
 .../ops_alert_service_integration_test.go | 271 ----
 .../service/ops_alert_service_test.go | 315 ----
 backend/internal/service/ops_alerts.go | 92 --
 .../internal/service/ops_metrics_collector.go | 203 ---
 backend/internal/service/ops_service.go | 1020 -------------
 .../017_ops_metrics_and_error_logs.sql | 48 -
 .../018_ops_metrics_system_stats.sql | 14 -
 backend/migrations/019_ops_alerts.sql | 42 -
 .../migrations/020_seed_ops_alert_rules.sql | 32 -
 .../021_seed_ops_alert_rules_more.sql | 205 ---
 .../022_enable_ops_alert_webhook.sql | 7 -
 .../023_ops_metrics_request_counts.sql | 6 -
 .../migrations/025_enhance_ops_monitoring.sql | 272 ----
 frontend/src/api/admin/ops.ts | 324 ----
 frontend/src/views/admin/ops/OpsDashboard.vue | 417 ------
 25 files changed, 6883 deletions(-)
 delete mode 100644 backend/internal/handler/admin/ops_handler.go
 delete mode 100644 backend/internal/handler/admin/ops_ws_handler.go
 delete mode 100644 backend/internal/handler/admin/ops_ws_handler_test.go
 delete mode 100644 backend/internal/handler/ops_error_logger.go
 delete mode 100644 backend/internal/repository/ops.go
 delete mode 100644 backend/internal/repository/ops_cache.go
 delete mode 100644 backend/internal/repository/ops_repo.go
 delete mode 100644 backend/internal/server/middleware/ops_auth_error_logger.go
 delete mode 100644 backend/internal/service/ops.go
 delete mode 100644 backend/internal/service/ops_alert_service.go
 delete mode 100644 backend/internal/service/ops_alert_service_integration_test.go
 delete mode 100644 backend/internal/service/ops_alert_service_test.go
 delete mode 100644 backend/internal/service/ops_alerts.go
 delete mode 100644 backend/internal/service/ops_metrics_collector.go
 delete mode 100644 backend/internal/service/ops_service.go
 delete mode 100644 backend/migrations/017_ops_metrics_and_error_logs.sql
 delete mode 100644 backend/migrations/018_ops_metrics_system_stats.sql
 delete mode 100644 backend/migrations/019_ops_alerts.sql
 delete mode 100644 backend/migrations/020_seed_ops_alert_rules.sql
 delete mode 100644 backend/migrations/021_seed_ops_alert_rules_more.sql
 delete mode 100644 backend/migrations/022_enable_ops_alert_webhook.sql
 delete mode 100644 backend/migrations/023_ops_metrics_request_counts.sql
 delete mode 100644 backend/migrations/025_enhance_ops_monitoring.sql
 delete mode 100644 frontend/src/api/admin/ops.ts
 delete mode 100644 frontend/src/views/admin/ops/OpsDashboard.vue
diff --git a/backend/internal/handler/admin/ops_handler.go b/backend/internal/handler/admin/ops_handler.go deleted file mode 100644 index 
0d1402fe..00000000 --- a/backend/internal/handler/admin/ops_handler.go +++ /dev/null @@ -1,402 +0,0 @@ -package admin - -import ( - "math" - "net/http" - "strconv" - "time" - - "github.com/Wei-Shaw/sub2api/internal/pkg/response" - "github.com/Wei-Shaw/sub2api/internal/service" - "github.com/gin-gonic/gin" -) - -// OpsHandler handles ops dashboard endpoints. -type OpsHandler struct { - opsService *service.OpsService -} - -// NewOpsHandler creates a new OpsHandler. -func NewOpsHandler(opsService *service.OpsService) *OpsHandler { - return &OpsHandler{opsService: opsService} -} - -// GetMetrics returns the latest ops metrics snapshot. -// GET /api/v1/admin/ops/metrics -func (h *OpsHandler) GetMetrics(c *gin.Context) { - metrics, err := h.opsService.GetLatestMetrics(c.Request.Context()) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get ops metrics") - return - } - response.Success(c, metrics) -} - -// ListMetricsHistory returns a time-range slice of metrics for charts. -// GET /api/v1/admin/ops/metrics/history -// -// Query params: -// - window_minutes: int (default 1) -// - minutes: int (lookback; optional) -// - start_time/end_time: RFC3339 timestamps (optional; overrides minutes when provided) -// - limit: int (optional; max 100, default 300 for backward compatibility) -func (h *OpsHandler) ListMetricsHistory(c *gin.Context) { - windowMinutes := 1 - if v := c.Query("window_minutes"); v != "" { - if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 { - windowMinutes = parsed - } else { - response.BadRequest(c, "Invalid window_minutes") - return - } - } - - limit := 300 - limitProvided := false - if v := c.Query("limit"); v != "" { - parsed, err := strconv.Atoi(v) - if err != nil || parsed <= 0 || parsed > 5000 { - response.BadRequest(c, "Invalid limit (must be 1-5000)") - return - } - limit = parsed - limitProvided = true - } - - endTime := time.Now() - startTime := time.Time{} - - if startTimeStr := c.Query("start_time"); startTimeStr != "" { - parsed, err := time.Parse(time.RFC3339, startTimeStr) - if err != nil { - response.BadRequest(c, "Invalid start_time format (RFC3339)") - return - } - startTime = parsed - } - if endTimeStr := c.Query("end_time"); endTimeStr != "" { - parsed, err := time.Parse(time.RFC3339, endTimeStr) - if err != nil { - response.BadRequest(c, "Invalid end_time format (RFC3339)") - return - } - endTime = parsed - } - - // If explicit range not provided, use lookback minutes. - if startTime.IsZero() { - if v := c.Query("minutes"); v != "" { - minutes, err := strconv.Atoi(v) - if err != nil || minutes <= 0 { - response.BadRequest(c, "Invalid minutes") - return - } - if minutes > 60*24*7 { - minutes = 60 * 24 * 7 - } - startTime = endTime.Add(-time.Duration(minutes) * time.Minute) - } - } - - // Default time range: last 24 hours. - if startTime.IsZero() { - startTime = endTime.Add(-24 * time.Hour) - if !limitProvided { - // Metrics are collected at 1-minute cadence; 24h requires ~1440 points. - limit = 24 * 60 - } - } - - if startTime.After(endTime) { - response.BadRequest(c, "Invalid time range: start_time must be <= end_time") - return - } - - items, err := h.opsService.ListMetricsHistory(c.Request.Context(), windowMinutes, startTime, endTime, limit) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to list ops metrics history") - return - } - response.Success(c, gin.H{"items": items}) -} - -// ListErrorLogs lists recent error logs with optional filters. 
-// GET /api/v1/admin/ops/error-logs -// -// Query params: -// - start_time/end_time: RFC3339 timestamps (optional) -// - platform: string (optional) -// - phase: string (optional) -// - severity: string (optional) -// - q: string (optional; fuzzy match) -// - limit: int (optional; default 100; max 500) -func (h *OpsHandler) ListErrorLogs(c *gin.Context) { - var filters service.OpsErrorLogFilters - - if startTimeStr := c.Query("start_time"); startTimeStr != "" { - startTime, err := time.Parse(time.RFC3339, startTimeStr) - if err != nil { - response.BadRequest(c, "Invalid start_time format (RFC3339)") - return - } - filters.StartTime = &startTime - } - if endTimeStr := c.Query("end_time"); endTimeStr != "" { - endTime, err := time.Parse(time.RFC3339, endTimeStr) - if err != nil { - response.BadRequest(c, "Invalid end_time format (RFC3339)") - return - } - filters.EndTime = &endTime - } - - if filters.StartTime != nil && filters.EndTime != nil && filters.StartTime.After(*filters.EndTime) { - response.BadRequest(c, "Invalid time range: start_time must be <= end_time") - return - } - - filters.Platform = c.Query("platform") - filters.Phase = c.Query("phase") - filters.Severity = c.Query("severity") - filters.Query = c.Query("q") - - filters.Limit = 100 - if limitStr := c.Query("limit"); limitStr != "" { - limit, err := strconv.Atoi(limitStr) - if err != nil || limit <= 0 || limit > 500 { - response.BadRequest(c, "Invalid limit (must be 1-500)") - return - } - filters.Limit = limit - } - - items, total, err := h.opsService.ListErrorLogs(c.Request.Context(), filters) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to list error logs") - return - } - - response.Success(c, gin.H{ - "items": items, - "total": total, - }) -} - -// GetDashboardOverview returns realtime ops dashboard overview. -// GET /api/v1/admin/ops/dashboard/overview -// -// Query params: -// - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h -func (h *OpsHandler) GetDashboardOverview(c *gin.Context) { - timeRange := c.Query("time_range") - if timeRange == "" { - timeRange = "1h" - } - - switch timeRange { - case "5m", "30m", "1h", "6h", "24h": - default: - response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)") - return - } - - data, err := h.opsService.GetDashboardOverview(c.Request.Context(), timeRange) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get dashboard overview") - return - } - response.Success(c, data) -} - -// GetProviderHealth returns upstream provider health comparison data. 
-// GET /api/v1/admin/ops/dashboard/providers -// -// Query params: -// - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h -func (h *OpsHandler) GetProviderHealth(c *gin.Context) { - timeRange := c.Query("time_range") - if timeRange == "" { - timeRange = "1h" - } - - switch timeRange { - case "5m", "30m", "1h", "6h", "24h": - default: - response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)") - return - } - - providers, err := h.opsService.GetProviderHealth(c.Request.Context(), timeRange) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get provider health") - return - } - - var totalRequests int64 - var weightedSuccess float64 - var bestProvider string - var worstProvider string - var bestRate float64 - var worstRate float64 - hasRate := false - - for _, p := range providers { - if p == nil { - continue - } - totalRequests += p.RequestCount - weightedSuccess += (p.SuccessRate / 100) * float64(p.RequestCount) - - if p.RequestCount <= 0 { - continue - } - if !hasRate { - bestProvider = p.Name - worstProvider = p.Name - bestRate = p.SuccessRate - worstRate = p.SuccessRate - hasRate = true - continue - } - - if p.SuccessRate > bestRate { - bestProvider = p.Name - bestRate = p.SuccessRate - } - if p.SuccessRate < worstRate { - worstProvider = p.Name - worstRate = p.SuccessRate - } - } - - avgSuccessRate := 0.0 - if totalRequests > 0 { - avgSuccessRate = (weightedSuccess / float64(totalRequests)) * 100 - avgSuccessRate = math.Round(avgSuccessRate*100) / 100 - } - - response.Success(c, gin.H{ - "providers": providers, - "summary": gin.H{ - "total_requests": totalRequests, - "avg_success_rate": avgSuccessRate, - "best_provider": bestProvider, - "worst_provider": worstProvider, - }, - }) -} - -// GetErrorLogs returns a paginated error log list with multi-dimensional filters. -// GET /api/v1/admin/ops/errors -func (h *OpsHandler) GetErrorLogs(c *gin.Context) { - page, pageSize := response.ParsePagination(c) - - filter := &service.ErrorLogFilter{ - Page: page, - PageSize: pageSize, - } - - if startTimeStr := c.Query("start_time"); startTimeStr != "" { - startTime, err := time.Parse(time.RFC3339, startTimeStr) - if err != nil { - response.BadRequest(c, "Invalid start_time format (RFC3339)") - return - } - filter.StartTime = &startTime - } - if endTimeStr := c.Query("end_time"); endTimeStr != "" { - endTime, err := time.Parse(time.RFC3339, endTimeStr) - if err != nil { - response.BadRequest(c, "Invalid end_time format (RFC3339)") - return - } - filter.EndTime = &endTime - } - - if filter.StartTime != nil && filter.EndTime != nil && filter.StartTime.After(*filter.EndTime) { - response.BadRequest(c, "Invalid time range: start_time must be <= end_time") - return - } - - if errorCodeStr := c.Query("error_code"); errorCodeStr != "" { - code, err := strconv.Atoi(errorCodeStr) - if err != nil || code < 0 { - response.BadRequest(c, "Invalid error_code") - return - } - filter.ErrorCode = &code - } - - // Keep both parameter names for compatibility: provider (docs) and platform (legacy). 
- filter.Provider = c.Query("provider") - if filter.Provider == "" { - filter.Provider = c.Query("platform") - } - - if accountIDStr := c.Query("account_id"); accountIDStr != "" { - accountID, err := strconv.ParseInt(accountIDStr, 10, 64) - if err != nil || accountID <= 0 { - response.BadRequest(c, "Invalid account_id") - return - } - filter.AccountID = &accountID - } - - out, err := h.opsService.GetErrorLogs(c.Request.Context(), filter) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get error logs") - return - } - - response.Success(c, gin.H{ - "errors": out.Errors, - "total": out.Total, - "page": out.Page, - "page_size": out.PageSize, - }) -} - -// GetLatencyHistogram returns the latency distribution histogram. -// GET /api/v1/admin/ops/dashboard/latency-histogram -func (h *OpsHandler) GetLatencyHistogram(c *gin.Context) { - timeRange := c.Query("time_range") - if timeRange == "" { - timeRange = "1h" - } - - buckets, err := h.opsService.GetLatencyHistogram(c.Request.Context(), timeRange) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get latency histogram") - return - } - - totalRequests := int64(0) - for _, b := range buckets { - totalRequests += b.Count - } - - response.Success(c, gin.H{ - "buckets": buckets, - "total_requests": totalRequests, - "slow_request_threshold": 1000, - }) -} - -// GetErrorDistribution returns the error distribution. -// GET /api/v1/admin/ops/dashboard/errors/distribution -func (h *OpsHandler) GetErrorDistribution(c *gin.Context) { - timeRange := c.Query("time_range") - if timeRange == "" { - timeRange = "1h" - } - - items, err := h.opsService.GetErrorDistribution(c.Request.Context(), timeRange) - if err != nil { - response.Error(c, http.StatusInternalServerError, "Failed to get error distribution") - return - } - - response.Success(c, gin.H{ - "items": items, - }) -} diff --git a/backend/internal/handler/admin/ops_ws_handler.go b/backend/internal/handler/admin/ops_ws_handler.go deleted file mode 100644 index 429f6ae4..00000000 --- a/backend/internal/handler/admin/ops_ws_handler.go +++ /dev/null @@ -1,286 +0,0 @@ -package admin - -import ( - "context" - "encoding/json" - "log" - "net" - "net/http" - "net/netip" - "net/url" - "os" - "strconv" - "strings" - "time" - - "github.com/gin-gonic/gin" - "github.com/gorilla/websocket" -) - -type OpsWSProxyConfig struct { - TrustProxy bool - TrustedProxies []netip.Prefix - OriginPolicy string -} - -const ( - envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY" - envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES" - envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY" -) - -const ( - OriginPolicyStrict = "strict" - OriginPolicyPermissive = "permissive" -) - -var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv() - -var upgrader = websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { - return isAllowedOpsWSOrigin(r) - }, -} - -// QPSWSHandler handles realtime QPS push via WebSocket. 
-// GET /api/v1/admin/ops/ws/qps -func (h *OpsHandler) QPSWSHandler(c *gin.Context) { - conn, err := upgrader.Upgrade(c.Writer, c.Request, nil) - if err != nil { - log.Printf("[OpsWS] upgrade failed: %v", err) - return - } - defer func() { _ = conn.Close() }() - - // Set pong handler - if err := conn.SetReadDeadline(time.Now().Add(60 * time.Second)); err != nil { - log.Printf("[OpsWS] set read deadline failed: %v", err) - return - } - conn.SetPongHandler(func(string) error { - return conn.SetReadDeadline(time.Now().Add(60 * time.Second)) - }) - - // Push QPS data every 2 seconds - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - // Heartbeat ping every 30 seconds - pingTicker := time.NewTicker(30 * time.Second) - defer pingTicker.Stop() - - ctx, cancel := context.WithCancel(c.Request.Context()) - defer cancel() - - for { - select { - case <-ticker.C: - // Fetch 1m window stats for current QPS - data, err := h.opsService.GetDashboardOverview(ctx, "5m") - if err != nil { - log.Printf("[OpsWS] get overview failed: %v", err) - continue - } - - payload := gin.H{ - "type": "qps_update", - "timestamp": time.Now().Format(time.RFC3339), - "data": gin.H{ - "qps": data.QPS.Current, - "tps": data.TPS.Current, - "request_count": data.Errors.TotalCount + int64(data.QPS.Avg1h*60), // Rough estimate - }, - } - - msg, _ := json.Marshal(payload) - if err := conn.WriteMessage(websocket.TextMessage, msg); err != nil { - log.Printf("[OpsWS] write failed: %v", err) - return - } - case <-pingTicker.C: - if err := conn.WriteMessage(websocket.PingMessage, nil); err != nil { - log.Printf("[OpsWS] ping failed: %v", err) - return - } - case <-ctx.Done(): - return - } - } -} - -func isAllowedOpsWSOrigin(r *http.Request) bool { - if r == nil { - return false - } - origin := strings.TrimSpace(r.Header.Get("Origin")) - if origin == "" { - switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) { - case OriginPolicyStrict: - return false - case OriginPolicyPermissive, "": - return true - default: - return true - } - } - parsed, err := url.Parse(origin) - if err != nil || parsed.Hostname() == "" { - return false - } - originHost := strings.ToLower(parsed.Hostname()) - - trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r) - reqHost := hostWithoutPort(r.Host) - if trustProxyHeaders { - xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host")) - if xfHost != "" { - xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0]) - if xfHost != "" { - reqHost = hostWithoutPort(xfHost) - } - } - } - reqHost = strings.ToLower(reqHost) - if reqHost == "" { - return false - } - return originHost == reqHost -} - -func shouldTrustOpsWSProxyHeaders(r *http.Request) bool { - if r == nil { - return false - } - if !opsWSProxyConfig.TrustProxy { - return false - } - peerIP, ok := requestPeerIP(r) - if !ok { - return false - } - return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies) -} - -func requestPeerIP(r *http.Request) (netip.Addr, bool) { - if r == nil { - return netip.Addr{}, false - } - host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr)) - if err != nil { - host = strings.TrimSpace(r.RemoteAddr) - } - host = strings.TrimPrefix(host, "[") - host = strings.TrimSuffix(host, "]") - if host == "" { - return netip.Addr{}, false - } - addr, err := netip.ParseAddr(host) - if err != nil { - return netip.Addr{}, false - } - return addr.Unmap(), true -} - -func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool { - if !addr.IsValid() { - return false - } - for 
_, p := range trusted { - if p.Contains(addr) { - return true - } - } - return false -} - -func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig { - cfg := OpsWSProxyConfig{ - TrustProxy: true, - TrustedProxies: defaultTrustedProxies(), - OriginPolicy: OriginPolicyPermissive, - } - - if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" { - if parsed, err := strconv.ParseBool(v); err == nil { - cfg.TrustProxy = parsed - } else { - log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy) - } - } - - if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" { - prefixes, invalid := parseTrustedProxyList(raw) - if len(invalid) > 0 { - log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", ")) - } - cfg.TrustedProxies = prefixes - } - - if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" { - normalized := strings.ToLower(v) - switch normalized { - case OriginPolicyStrict, OriginPolicyPermissive: - cfg.OriginPolicy = normalized - default: - log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy) - } - } - - return cfg -} - -func defaultTrustedProxies() []netip.Prefix { - prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128") - return prefixes -} - -func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) { - for _, token := range strings.Split(raw, ",") { - item := strings.TrimSpace(token) - if item == "" { - continue - } - - var ( - p netip.Prefix - err error - ) - if strings.Contains(item, "/") { - p, err = netip.ParsePrefix(item) - } else { - var addr netip.Addr - addr, err = netip.ParseAddr(item) - if err == nil { - addr = addr.Unmap() - bits := 128 - if addr.Is4() { - bits = 32 - } - p = netip.PrefixFrom(addr, bits) - } - } - - if err != nil || !p.IsValid() { - invalid = append(invalid, item) - continue - } - - prefixes = append(prefixes, p.Masked()) - } - return prefixes, invalid -} - -func hostWithoutPort(hostport string) string { - hostport = strings.TrimSpace(hostport) - if hostport == "" { - return "" - } - if host, _, err := net.SplitHostPort(hostport); err == nil { - return host - } - if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") { - return strings.Trim(hostport, "[]") - } - parts := strings.Split(hostport, ":") - return parts[0] -} diff --git a/backend/internal/handler/admin/ops_ws_handler_test.go b/backend/internal/handler/admin/ops_ws_handler_test.go deleted file mode 100644 index b53a3723..00000000 --- a/backend/internal/handler/admin/ops_ws_handler_test.go +++ /dev/null @@ -1,123 +0,0 @@ -package admin - -import ( - "net/http" - "net/netip" - "testing" -) - -func TestIsAllowedOpsWSOrigin_AllowsEmptyOrigin(t *testing.T) { - original := opsWSProxyConfig - t.Cleanup(func() { opsWSProxyConfig = original }) - opsWSProxyConfig = OpsWSProxyConfig{OriginPolicy: OriginPolicyPermissive} - - req, err := http.NewRequest(http.MethodGet, "http://example.test", nil) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - - if !isAllowedOpsWSOrigin(req) { - t.Fatalf("expected empty Origin to be allowed") - } -} - -func TestIsAllowedOpsWSOrigin_RejectsEmptyOrigin_WhenStrict(t *testing.T) { - original := opsWSProxyConfig - t.Cleanup(func() { opsWSProxyConfig = original }) - opsWSProxyConfig = OpsWSProxyConfig{OriginPolicy: OriginPolicyStrict} - - req, err := http.NewRequest(http.MethodGet, 
"http://example.test", nil) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - - if isAllowedOpsWSOrigin(req) { - t.Fatalf("expected empty Origin to be rejected under strict policy") - } -} - -func TestIsAllowedOpsWSOrigin_UsesXForwardedHostOnlyFromTrustedProxy(t *testing.T) { - original := opsWSProxyConfig - t.Cleanup(func() { opsWSProxyConfig = original }) - - opsWSProxyConfig = OpsWSProxyConfig{ - TrustProxy: true, - TrustedProxies: []netip.Prefix{ - netip.MustParsePrefix("127.0.0.0/8"), - }, - } - - // Untrusted peer: ignore X-Forwarded-Host and compare against r.Host. - { - req, err := http.NewRequest(http.MethodGet, "http://internal.service.local", nil) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - req.RemoteAddr = "192.0.2.1:12345" - req.Host = "internal.service.local" - req.Header.Set("Origin", "https://public.example.com") - req.Header.Set("X-Forwarded-Host", "public.example.com") - - if isAllowedOpsWSOrigin(req) { - t.Fatalf("expected Origin to be rejected when peer is not a trusted proxy") - } - } - - // Trusted peer: allow X-Forwarded-Host to participate in Origin validation. - { - req, err := http.NewRequest(http.MethodGet, "http://internal.service.local", nil) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - req.RemoteAddr = "127.0.0.1:23456" - req.Host = "internal.service.local" - req.Header.Set("Origin", "https://public.example.com") - req.Header.Set("X-Forwarded-Host", "public.example.com") - - if !isAllowedOpsWSOrigin(req) { - t.Fatalf("expected Origin to be accepted when peer is a trusted proxy") - } - } -} - -func TestLoadOpsWSProxyConfigFromEnv_OriginPolicy(t *testing.T) { - t.Setenv(envOpsWSOriginPolicy, "STRICT") - cfg := loadOpsWSProxyConfigFromEnv() - if cfg.OriginPolicy != OriginPolicyStrict { - t.Fatalf("OriginPolicy=%q, want %q", cfg.OriginPolicy, OriginPolicyStrict) - } -} - -func TestLoadOpsWSProxyConfigFromEnv_OriginPolicyInvalidUsesDefault(t *testing.T) { - t.Setenv(envOpsWSOriginPolicy, "nope") - cfg := loadOpsWSProxyConfigFromEnv() - if cfg.OriginPolicy != OriginPolicyPermissive { - t.Fatalf("OriginPolicy=%q, want %q", cfg.OriginPolicy, OriginPolicyPermissive) - } -} - -func TestParseTrustedProxyList(t *testing.T) { - prefixes, invalid := parseTrustedProxyList("10.0.0.1, 10.0.0.0/8, bad, ::1/128") - if len(prefixes) != 3 { - t.Fatalf("prefixes=%d, want 3", len(prefixes)) - } - if len(invalid) != 1 || invalid[0] != "bad" { - t.Fatalf("invalid=%v, want [bad]", invalid) - } -} - -func TestRequestPeerIP_ParsesIPv6(t *testing.T) { - req, err := http.NewRequest(http.MethodGet, "http://example.test", nil) - if err != nil { - t.Fatalf("NewRequest: %v", err) - } - req.RemoteAddr = "[::1]:1234" - - addr, ok := requestPeerIP(req) - if !ok { - t.Fatalf("expected IPv6 peer IP to parse") - } - if addr != netip.MustParseAddr("::1") { - t.Fatalf("addr=%s, want ::1", addr) - } -} diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go deleted file mode 100644 index 5b5e1edd..00000000 --- a/backend/internal/handler/ops_error_logger.go +++ /dev/null @@ -1,166 +0,0 @@ -package handler - -import ( - "context" - "strings" - "sync" - "time" - - middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware" - "github.com/Wei-Shaw/sub2api/internal/service" - "github.com/gin-gonic/gin" -) - -const ( - opsModelKey = "ops_model" - opsStreamKey = "ops_stream" -) - -const ( - opsErrorLogWorkerCount = 10 - opsErrorLogQueueSize = 256 - opsErrorLogTimeout = 2 * time.Second -) - -type opsErrorLogJob struct { 
- ops *service.OpsService - entry *service.OpsErrorLog -} - -var ( - opsErrorLogOnce sync.Once - opsErrorLogQueue chan opsErrorLogJob -) - -func startOpsErrorLogWorkers() { - opsErrorLogQueue = make(chan opsErrorLogJob, opsErrorLogQueueSize) - for i := 0; i < opsErrorLogWorkerCount; i++ { - go func() { - for job := range opsErrorLogQueue { - if job.ops == nil || job.entry == nil { - continue - } - ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout) - _ = job.ops.RecordError(ctx, job.entry) - cancel() - } - }() - } -} - -func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsErrorLog) { - if ops == nil || entry == nil { - return - } - - opsErrorLogOnce.Do(startOpsErrorLogWorkers) - - select { - case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry}: - default: - // Queue is full; drop to avoid blocking request handling. - } -} - -func setOpsRequestContext(c *gin.Context, model string, stream bool) { - c.Set(opsModelKey, model) - c.Set(opsStreamKey, stream) -} - -func recordOpsError(c *gin.Context, ops *service.OpsService, status int, errType, message, fallbackPlatform string) { - if ops == nil || c == nil { - return - } - - model, _ := c.Get(opsModelKey) - stream, _ := c.Get(opsStreamKey) - - var modelName string - if m, ok := model.(string); ok { - modelName = m - } - streaming, _ := stream.(bool) - - apiKey, _ := middleware2.GetAPIKeyFromContext(c) - - logEntry := &service.OpsErrorLog{ - Phase: classifyOpsPhase(errType, message), - Type: errType, - Severity: classifyOpsSeverity(errType, status), - StatusCode: status, - Platform: resolveOpsPlatform(apiKey, fallbackPlatform), - Model: modelName, - RequestID: c.Writer.Header().Get("x-request-id"), - Message: message, - ClientIP: c.ClientIP(), - RequestPath: func() string { - if c.Request != nil && c.Request.URL != nil { - return c.Request.URL.Path - } - return "" - }(), - Stream: streaming, - } - - if apiKey != nil { - logEntry.APIKeyID = &apiKey.ID - if apiKey.User != nil { - logEntry.UserID = &apiKey.User.ID - } - if apiKey.GroupID != nil { - logEntry.GroupID = apiKey.GroupID - } - } - - enqueueOpsErrorLog(ops, logEntry) -} - -func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string { - if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" { - return apiKey.Group.Platform - } - return fallback -} - -func classifyOpsPhase(errType, message string) string { - msg := strings.ToLower(message) - switch errType { - case "authentication_error": - return "auth" - case "billing_error", "subscription_error": - return "billing" - case "rate_limit_error": - if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") { - return "concurrency" - } - return "upstream" - case "invalid_request_error": - return "response" - case "upstream_error", "overloaded_error": - return "upstream" - case "api_error": - if strings.Contains(msg, "no available accounts") { - return "scheduling" - } - return "internal" - default: - return "internal" - } -} - -func classifyOpsSeverity(errType string, status int) string { - switch errType { - case "invalid_request_error", "authentication_error", "billing_error", "subscription_error": - return "P3" - } - if status >= 500 { - return "P1" - } - if status == 429 { - return "P1" - } - if status >= 400 { - return "P2" - } - return "P3" -} diff --git a/backend/internal/repository/ops.go b/backend/internal/repository/ops.go deleted file mode 100644 index 969a49a7..00000000 --- a/backend/internal/repository/ops.go +++ /dev/null @@ -1,190 +0,0 
@@ -package repository - -import ( - "context" - "database/sql" - "fmt" - "strconv" - "strings" - "time" - - "github.com/Wei-Shaw/sub2api/internal/service" -) - -// ListErrorLogs queries ops_error_logs with optional filters and pagination. -// It returns the list items and the total count of matching rows. -func (r *OpsRepository) ListErrorLogs(ctx context.Context, filter *service.ErrorLogFilter) ([]*service.ErrorLog, int64, error) { - page := 1 - pageSize := 20 - if filter != nil { - if filter.Page > 0 { - page = filter.Page - } - if filter.PageSize > 0 { - pageSize = filter.PageSize - } - } - if pageSize > 100 { - pageSize = 100 - } - offset := (page - 1) * pageSize - - conditions := make([]string, 0) - args := make([]any, 0) - - addCondition := func(condition string, values ...any) { - conditions = append(conditions, condition) - args = append(args, values...) - } - - if filter != nil { - // Default to querying the most recent 24 hours - if filter.StartTime == nil && filter.EndTime == nil { - defaultStart := time.Now().Add(-24 * time.Hour) - filter.StartTime = &defaultStart - } - - if filter.StartTime != nil { - addCondition(fmt.Sprintf("created_at >= $%d", len(args)+1), *filter.StartTime) - } - if filter.EndTime != nil { - addCondition(fmt.Sprintf("created_at <= $%d", len(args)+1), *filter.EndTime) - } - if filter.ErrorCode != nil { - addCondition(fmt.Sprintf("status_code = $%d", len(args)+1), *filter.ErrorCode) - } - if provider := strings.TrimSpace(filter.Provider); provider != "" { - addCondition(fmt.Sprintf("platform = $%d", len(args)+1), provider) - } - if filter.AccountID != nil { - addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID) - } - } - - where := "" - if len(conditions) > 0 { - where = "WHERE " + strings.Join(conditions, " AND ") - } - - countQuery := fmt.Sprintf(`SELECT COUNT(1) FROM ops_error_logs %s`, where) - var total int64 - if err := scanSingleRow(ctx, r.sql, countQuery, args, &total); err != nil { - if err == sql.ErrNoRows { - total = 0 - } else { - return nil, 0, err - } - } - - listQuery := fmt.Sprintf(` - SELECT - id, - created_at, - severity, - request_id, - account_id, - request_path, - platform, - model, - status_code, - error_message, - duration_ms, - retry_count, - stream - FROM ops_error_logs - %s - ORDER BY created_at DESC - LIMIT $%d OFFSET $%d - `, where, len(args)+1, len(args)+2) - - listArgs := append(append([]any{}, args...), pageSize, offset) - rows, err := r.sql.QueryContext(ctx, listQuery, listArgs...) 
- if err != nil { - return nil, 0, err - } - defer func() { _ = rows.Close() }() - - results := make([]*service.ErrorLog, 0) - for rows.Next() { - var ( - id int64 - createdAt time.Time - severity sql.NullString - requestID sql.NullString - accountID sql.NullInt64 - requestURI sql.NullString - platform sql.NullString - model sql.NullString - statusCode sql.NullInt64 - message sql.NullString - durationMs sql.NullInt64 - retryCount sql.NullInt64 - stream sql.NullBool - ) - - if err := rows.Scan( - &id, - &createdAt, - &severity, - &requestID, - &accountID, - &requestURI, - &platform, - &model, - &statusCode, - &message, - &durationMs, - &retryCount, - &stream, - ); err != nil { - return nil, 0, err - } - - entry := &service.ErrorLog{ - ID: id, - Timestamp: createdAt, - Level: levelFromSeverity(severity.String), - RequestID: requestID.String, - APIPath: requestURI.String, - Provider: platform.String, - Model: model.String, - HTTPCode: int(statusCode.Int64), - Stream: stream.Bool, - } - if accountID.Valid { - entry.AccountID = strconv.FormatInt(accountID.Int64, 10) - } - if message.Valid { - entry.ErrorMessage = message.String - } - if durationMs.Valid { - v := int(durationMs.Int64) - entry.DurationMs = &v - } - if retryCount.Valid { - v := int(retryCount.Int64) - entry.RetryCount = &v - } - - results = append(results, entry) - } - if err := rows.Err(); err != nil { - return nil, 0, err - } - - return results, total, nil -} - -func levelFromSeverity(severity string) string { - sev := strings.ToUpper(strings.TrimSpace(severity)) - switch sev { - case "P0", "P1": - return "CRITICAL" - case "P2": - return "ERROR" - case "P3": - return "WARN" - default: - return "ERROR" - } -} diff --git a/backend/internal/repository/ops_cache.go b/backend/internal/repository/ops_cache.go deleted file mode 100644 index 99d60634..00000000 --- a/backend/internal/repository/ops_cache.go +++ /dev/null @@ -1,127 +0,0 @@ -package repository - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "strings" - "time" - - "github.com/Wei-Shaw/sub2api/internal/service" - "github.com/redis/go-redis/v9" -) - -const ( - opsLatestMetricsKey = "ops:metrics:latest" - - opsDashboardOverviewKeyPrefix = "ops:dashboard:overview:" - - opsLatestMetricsTTL = 10 * time.Second -) - -func (r *OpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (*service.OpsMetrics, error) { - if ctx == nil { - ctx = context.Background() - } - if r == nil || r.rdb == nil { - return nil, nil - } - - data, err := r.rdb.Get(ctx, opsLatestMetricsKey).Bytes() - if errors.Is(err, redis.Nil) { - return nil, nil - } - if err != nil { - return nil, fmt.Errorf("redis get cached latest system metric: %w", err) - } - - var metric service.OpsMetrics - if err := json.Unmarshal(data, &metric); err != nil { - return nil, fmt.Errorf("unmarshal cached latest system metric: %w", err) - } - return &metric, nil -} - -func (r *OpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric *service.OpsMetrics) error { - if metric == nil { - return nil - } - if ctx == nil { - ctx = context.Background() - } - if r == nil || r.rdb == nil { - return nil - } - - data, err := json.Marshal(metric) - if err != nil { - return fmt.Errorf("marshal cached latest system metric: %w", err) - } - return r.rdb.Set(ctx, opsLatestMetricsKey, data, opsLatestMetricsTTL).Err() -} - -func (r *OpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (*service.DashboardOverviewData, error) { - if ctx == nil { - ctx = context.Background() - } - if r == nil 
|| r.rdb == nil { - return nil, nil - } - rangeKey := strings.TrimSpace(timeRange) - if rangeKey == "" { - rangeKey = "1h" - } - - key := opsDashboardOverviewKeyPrefix + rangeKey - data, err := r.rdb.Get(ctx, key).Bytes() - if errors.Is(err, redis.Nil) { - return nil, nil - } - if err != nil { - return nil, fmt.Errorf("redis get cached dashboard overview: %w", err) - } - - var overview service.DashboardOverviewData - if err := json.Unmarshal(data, &overview); err != nil { - return nil, fmt.Errorf("unmarshal cached dashboard overview: %w", err) - } - return &overview, nil -} - -func (r *OpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *service.DashboardOverviewData, ttl time.Duration) error { - if data == nil { - return nil - } - if ttl <= 0 { - ttl = 10 * time.Second - } - if ctx == nil { - ctx = context.Background() - } - if r == nil || r.rdb == nil { - return nil - } - - rangeKey := strings.TrimSpace(timeRange) - if rangeKey == "" { - rangeKey = "1h" - } - - payload, err := json.Marshal(data) - if err != nil { - return fmt.Errorf("marshal cached dashboard overview: %w", err) - } - key := opsDashboardOverviewKeyPrefix + rangeKey - return r.rdb.Set(ctx, key, payload, ttl).Err() -} - -func (r *OpsRepository) PingRedis(ctx context.Context) error { - if ctx == nil { - ctx = context.Background() - } - if r == nil || r.rdb == nil { - return errors.New("redis client is nil") - } - return r.rdb.Ping(ctx).Err() -} diff --git a/backend/internal/repository/ops_repo.go b/backend/internal/repository/ops_repo.go deleted file mode 100644 index f75f9abf..00000000 --- a/backend/internal/repository/ops_repo.go +++ /dev/null @@ -1,1333 +0,0 @@ -package repository - -import ( - "context" - "database/sql" - "encoding/json" - "errors" - "fmt" - "math" - "strings" - "time" - - dbent "github.com/Wei-Shaw/sub2api/ent" - "github.com/Wei-Shaw/sub2api/internal/service" - "github.com/redis/go-redis/v9" -) - -const ( - DefaultWindowMinutes = 1 - - MaxErrorLogsLimit = 500 - DefaultErrorLogsLimit = 200 - - MaxRecentSystemMetricsLimit = 500 - DefaultRecentSystemMetricsLimit = 60 - - MaxMetricsLimit = 5000 - DefaultMetricsLimit = 300 -) - -type OpsRepository struct { - sql sqlExecutor - rdb *redis.Client -} - -func NewOpsRepository(_ *dbent.Client, sqlDB *sql.DB, rdb *redis.Client) service.OpsRepository { - return &OpsRepository{sql: sqlDB, rdb: rdb} -} - -func (r *OpsRepository) CreateErrorLog(ctx context.Context, log *service.OpsErrorLog) error { - if log == nil { - return nil - } - - createdAt := log.CreatedAt - if createdAt.IsZero() { - createdAt = time.Now() - } - - query := ` - INSERT INTO ops_error_logs ( - request_id, - user_id, - api_key_id, - account_id, - group_id, - client_ip, - error_phase, - error_type, - severity, - status_code, - platform, - model, - request_path, - stream, - error_message, - duration_ms, - created_at - ) VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, $9, $10, - $11, $12, $13, $14, $15, - $16, $17 - ) - RETURNING id, created_at - ` - - requestID := nullString(log.RequestID) - clientIP := nullString(log.ClientIP) - platform := nullString(log.Platform) - model := nullString(log.Model) - requestPath := nullString(log.RequestPath) - message := nullString(log.Message) - latency := nullInt(log.LatencyMs) - - args := []any{ - requestID, - nullInt64(log.UserID), - nullInt64(log.APIKeyID), - nullInt64(log.AccountID), - nullInt64(log.GroupID), - clientIP, - log.Phase, - log.Type, - log.Severity, - log.StatusCode, - platform, - model, - requestPath, - log.Stream, - 
message, - latency, - createdAt, - } - - if err := scanSingleRow(ctx, r.sql, query, args, &log.ID, &log.CreatedAt); err != nil { - return err - } - return nil -} - -func (r *OpsRepository) ListErrorLogsLegacy(ctx context.Context, filters service.OpsErrorLogFilters) ([]service.OpsErrorLog, error) { - conditions := make([]string, 0) - args := make([]any, 0) - - addCondition := func(condition string, values ...any) { - conditions = append(conditions, condition) - args = append(args, values...) - } - - if filters.StartTime != nil { - addCondition(fmt.Sprintf("created_at >= $%d", len(args)+1), *filters.StartTime) - } - if filters.EndTime != nil { - addCondition(fmt.Sprintf("created_at <= $%d", len(args)+1), *filters.EndTime) - } - if filters.Platform != "" { - addCondition(fmt.Sprintf("platform = $%d", len(args)+1), filters.Platform) - } - if filters.Phase != "" { - addCondition(fmt.Sprintf("error_phase = $%d", len(args)+1), filters.Phase) - } - if filters.Severity != "" { - addCondition(fmt.Sprintf("severity = $%d", len(args)+1), filters.Severity) - } - if filters.Query != "" { - like := "%" + strings.ToLower(filters.Query) + "%" - startIdx := len(args) + 1 - addCondition( - fmt.Sprintf("(LOWER(request_id) LIKE $%d OR LOWER(model) LIKE $%d OR LOWER(error_message) LIKE $%d OR LOWER(error_type) LIKE $%d)", - startIdx, startIdx+1, startIdx+2, startIdx+3, - ), - like, like, like, like, - ) - } - - limit := filters.Limit - if limit <= 0 || limit > MaxErrorLogsLimit { - limit = DefaultErrorLogsLimit - } - - where := "" - if len(conditions) > 0 { - where = "WHERE " + strings.Join(conditions, " AND ") - } - - query := fmt.Sprintf(` - SELECT - id, - created_at, - user_id, - api_key_id, - account_id, - group_id, - client_ip, - error_phase, - error_type, - severity, - status_code, - platform, - model, - request_path, - stream, - duration_ms, - request_id, - error_message - FROM ops_error_logs - %s - ORDER BY created_at DESC - LIMIT $%d - `, where, len(args)+1) - - args = append(args, limit) - - rows, err := r.sql.QueryContext(ctx, query, args...) 
- if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]service.OpsErrorLog, 0) - for rows.Next() { - logEntry, err := scanOpsErrorLog(rows) - if err != nil { - return nil, err - } - results = append(results, *logEntry) - } - if err := rows.Err(); err != nil { - return nil, err - } - return results, nil -} - -func (r *OpsRepository) GetLatestSystemMetric(ctx context.Context) (*service.OpsMetrics, error) { - query := ` - SELECT - window_minutes, - request_count, - success_count, - error_count, - success_rate, - error_rate, - p95_latency_ms, - p99_latency_ms, - http2_errors, - active_alerts, - cpu_usage_percent, - memory_used_mb, - memory_total_mb, - memory_usage_percent, - heap_alloc_mb, - gc_pause_ms, - concurrency_queue_depth, - created_at AS updated_at - FROM ops_system_metrics - WHERE window_minutes = $1 - ORDER BY updated_at DESC, id DESC - LIMIT 1 - ` - - var windowMinutes sql.NullInt64 - var requestCount, successCount, errorCount sql.NullInt64 - var successRate, errorRate sql.NullFloat64 - var p95Latency, p99Latency, http2Errors, activeAlerts sql.NullInt64 - var cpuUsage, memoryUsage, gcPause sql.NullFloat64 - var memoryUsed, memoryTotal, heapAlloc, queueDepth sql.NullInt64 - var createdAt time.Time - if err := scanSingleRow( - ctx, - r.sql, - query, - []any{DefaultWindowMinutes}, - &windowMinutes, - &requestCount, - &successCount, - &errorCount, - &successRate, - &errorRate, - &p95Latency, - &p99Latency, - &http2Errors, - &activeAlerts, - &cpuUsage, - &memoryUsed, - &memoryTotal, - &memoryUsage, - &heapAlloc, - &gcPause, - &queueDepth, - &createdAt, - ); err != nil { - return nil, err - } - - metric := &service.OpsMetrics{ - UpdatedAt: createdAt, - } - if windowMinutes.Valid { - metric.WindowMinutes = int(windowMinutes.Int64) - } - if requestCount.Valid { - metric.RequestCount = requestCount.Int64 - } - if successCount.Valid { - metric.SuccessCount = successCount.Int64 - } - if errorCount.Valid { - metric.ErrorCount = errorCount.Int64 - } - if successRate.Valid { - metric.SuccessRate = successRate.Float64 - } - if errorRate.Valid { - metric.ErrorRate = errorRate.Float64 - } - if p95Latency.Valid { - metric.P95LatencyMs = int(p95Latency.Int64) - } - if p99Latency.Valid { - metric.P99LatencyMs = int(p99Latency.Int64) - } - if http2Errors.Valid { - metric.HTTP2Errors = int(http2Errors.Int64) - } - if activeAlerts.Valid { - metric.ActiveAlerts = int(activeAlerts.Int64) - } - if cpuUsage.Valid { - metric.CPUUsagePercent = cpuUsage.Float64 - } - if memoryUsed.Valid { - metric.MemoryUsedMB = memoryUsed.Int64 - } - if memoryTotal.Valid { - metric.MemoryTotalMB = memoryTotal.Int64 - } - if memoryUsage.Valid { - metric.MemoryUsagePercent = memoryUsage.Float64 - } - if heapAlloc.Valid { - metric.HeapAllocMB = heapAlloc.Int64 - } - if gcPause.Valid { - metric.GCPauseMs = gcPause.Float64 - } - if queueDepth.Valid { - metric.ConcurrencyQueueDepth = int(queueDepth.Int64) - } - return metric, nil -} - -func (r *OpsRepository) CreateSystemMetric(ctx context.Context, metric *service.OpsMetrics) error { - if metric == nil { - return nil - } - createdAt := metric.UpdatedAt - if createdAt.IsZero() { - createdAt = time.Now() - } - windowMinutes := metric.WindowMinutes - if windowMinutes <= 0 { - windowMinutes = DefaultWindowMinutes - } - - query := ` - INSERT INTO ops_system_metrics ( - window_minutes, - request_count, - success_count, - error_count, - success_rate, - error_rate, - p95_latency_ms, - p99_latency_ms, - http2_errors, - active_alerts, - 
cpu_usage_percent, - memory_used_mb, - memory_total_mb, - memory_usage_percent, - heap_alloc_mb, - gc_pause_ms, - concurrency_queue_depth, - created_at - ) VALUES ( - $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, - $11, $12, $13, $14, $15, $16, $17, $18 - ) - ` - _, err := r.sql.ExecContext(ctx, query, - windowMinutes, - metric.RequestCount, - metric.SuccessCount, - metric.ErrorCount, - metric.SuccessRate, - metric.ErrorRate, - metric.P95LatencyMs, - metric.P99LatencyMs, - metric.HTTP2Errors, - metric.ActiveAlerts, - metric.CPUUsagePercent, - metric.MemoryUsedMB, - metric.MemoryTotalMB, - metric.MemoryUsagePercent, - metric.HeapAllocMB, - metric.GCPauseMs, - metric.ConcurrencyQueueDepth, - createdAt, - ) - return err -} - -func (r *OpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]service.OpsMetrics, error) { - if windowMinutes <= 0 { - windowMinutes = DefaultWindowMinutes - } - if limit <= 0 || limit > MaxRecentSystemMetricsLimit { - limit = DefaultRecentSystemMetricsLimit - } - - query := ` - SELECT - window_minutes, - request_count, - success_count, - error_count, - success_rate, - error_rate, - p95_latency_ms, - p99_latency_ms, - http2_errors, - active_alerts, - cpu_usage_percent, - memory_used_mb, - memory_total_mb, - memory_usage_percent, - heap_alloc_mb, - gc_pause_ms, - concurrency_queue_depth, - created_at AS updated_at - FROM ops_system_metrics - WHERE window_minutes = $1 - ORDER BY updated_at DESC, id DESC - LIMIT $2 - ` - - rows, err := r.sql.QueryContext(ctx, query, windowMinutes, limit) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]service.OpsMetrics, 0) - for rows.Next() { - metric, err := scanOpsSystemMetric(rows) - if err != nil { - return nil, err - } - results = append(results, *metric) - } - if err := rows.Err(); err != nil { - return nil, err - } - return results, nil -} - -func (r *OpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]service.OpsMetrics, error) { - if windowMinutes <= 0 { - windowMinutes = DefaultWindowMinutes - } - if limit <= 0 || limit > MaxMetricsLimit { - limit = DefaultMetricsLimit - } - if endTime.IsZero() { - endTime = time.Now() - } - if startTime.IsZero() { - startTime = endTime.Add(-time.Duration(limit) * time.Minute) - } - if startTime.After(endTime) { - startTime, endTime = endTime, startTime - } - - query := ` - SELECT - window_minutes, - request_count, - success_count, - error_count, - success_rate, - error_rate, - p95_latency_ms, - p99_latency_ms, - http2_errors, - active_alerts, - cpu_usage_percent, - memory_used_mb, - memory_total_mb, - memory_usage_percent, - heap_alloc_mb, - gc_pause_ms, - concurrency_queue_depth, - created_at - FROM ops_system_metrics - WHERE window_minutes = $1 - AND created_at >= $2 - AND created_at <= $3 - ORDER BY created_at ASC - LIMIT $4 - ` - - rows, err := r.sql.QueryContext(ctx, query, windowMinutes, startTime, endTime, limit) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]service.OpsMetrics, 0) - for rows.Next() { - metric, err := scanOpsSystemMetric(rows) - if err != nil { - return nil, err - } - results = append(results, *metric) - } - if err := rows.Err(); err != nil { - return nil, err - } - return results, nil -} - -func (r *OpsRepository) ListAlertRules(ctx context.Context) ([]service.OpsAlertRule, error) { - query := ` - SELECT - id, - name, - description, - enabled, - metric_type, - 
operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes, - dimension_filters, - notify_channels, - notify_config, - created_at, - updated_at - FROM ops_alert_rules - ORDER BY id ASC - ` - - rows, err := r.sql.QueryContext(ctx, query) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - rules := make([]service.OpsAlertRule, 0) - for rows.Next() { - var rule service.OpsAlertRule - var description sql.NullString - var webhookURL sql.NullString - var dimensionFilters, notifyChannels, notifyConfig []byte - if err := rows.Scan( - &rule.ID, - &rule.Name, - &description, - &rule.Enabled, - &rule.MetricType, - &rule.Operator, - &rule.Threshold, - &rule.WindowMinutes, - &rule.SustainedMinutes, - &rule.Severity, - &rule.NotifyEmail, - &rule.NotifyWebhook, - &webhookURL, - &rule.CooldownMinutes, - &dimensionFilters, - ¬ifyChannels, - ¬ifyConfig, - &rule.CreatedAt, - &rule.UpdatedAt, - ); err != nil { - return nil, err - } - if description.Valid { - rule.Description = description.String - } - if webhookURL.Valid { - rule.WebhookURL = webhookURL.String - } - if len(dimensionFilters) > 0 { - _ = json.Unmarshal(dimensionFilters, &rule.DimensionFilters) - } - if len(notifyChannels) > 0 { - _ = json.Unmarshal(notifyChannels, &rule.NotifyChannels) - } - if len(notifyConfig) > 0 { - _ = json.Unmarshal(notifyConfig, &rule.NotifyConfig) - } - rules = append(rules, rule) - } - if err := rows.Err(); err != nil { - return nil, err - } - return rules, nil -} - -func (r *OpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { - return r.getAlertEvent(ctx, `WHERE rule_id = $1 AND status = $2`, []any{ruleID, service.OpsAlertStatusFiring}) -} - -func (r *OpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) { - return r.getAlertEvent(ctx, `WHERE rule_id = $1`, []any{ruleID}) -} - -func (r *OpsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) error { - if event == nil { - return nil - } - if event.FiredAt.IsZero() { - event.FiredAt = time.Now() - } - if event.CreatedAt.IsZero() { - event.CreatedAt = event.FiredAt - } - if event.Status == "" { - event.Status = service.OpsAlertStatusFiring - } - - query := ` - INSERT INTO ops_alert_events ( - rule_id, - severity, - status, - title, - description, - metric_value, - threshold_value, - fired_at, - resolved_at, - email_sent, - webhook_sent, - created_at - ) VALUES ( - $1, $2, $3, $4, $5, $6, - $7, $8, $9, $10, $11, $12 - ) - RETURNING id, created_at - ` - - var resolvedAt sql.NullTime - if event.ResolvedAt != nil { - resolvedAt = sql.NullTime{Time: *event.ResolvedAt, Valid: true} - } - - if err := scanSingleRow( - ctx, - r.sql, - query, - []any{ - event.RuleID, - event.Severity, - event.Status, - event.Title, - event.Description, - event.MetricValue, - event.ThresholdValue, - event.FiredAt, - resolvedAt, - event.EmailSent, - event.WebhookSent, - event.CreatedAt, - }, - &event.ID, - &event.CreatedAt, - ); err != nil { - return err - } - return nil -} - -func (r *OpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { - var resolved sql.NullTime - if resolvedAt != nil { - resolved = sql.NullTime{Time: *resolvedAt, Valid: true} - } - _, err := r.sql.ExecContext(ctx, ` - UPDATE ops_alert_events - SET status = $2, resolved_at = $3 - WHERE id = $1 - `, eventID, status, resolved) - 
return err -} - -func (r *OpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error { - _, err := r.sql.ExecContext(ctx, ` - UPDATE ops_alert_events - SET email_sent = $2, webhook_sent = $3 - WHERE id = $1 - `, eventID, emailSent, webhookSent) - return err -} - -func (r *OpsRepository) CountActiveAlerts(ctx context.Context) (int, error) { - var count int64 - if err := scanSingleRow( - ctx, - r.sql, - `SELECT COUNT(*) FROM ops_alert_events WHERE status = $1`, - []any{service.OpsAlertStatusFiring}, - &count, - ); err != nil { - if errors.Is(err, sql.ErrNoRows) { - return 0, nil - } - return 0, err - } - return int(count), nil -} - -func (r *OpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*service.OpsWindowStats, error) { - query := ` - WITH - usage_agg AS ( - SELECT - COUNT(*) AS success_count, - percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) - FILTER (WHERE duration_ms IS NOT NULL) AS p95, - percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) - FILTER (WHERE duration_ms IS NOT NULL) AS p99 - FROM usage_logs - WHERE created_at >= $1 AND created_at < $2 - ), - error_agg AS ( - SELECT - COUNT(*) AS error_count, - COUNT(*) FILTER ( - WHERE - error_type = 'network_error' - OR error_message ILIKE '%http2%' - OR error_message ILIKE '%http/2%' - ) AS http2_errors - FROM ops_error_logs - WHERE created_at >= $1 AND created_at < $2 - ) - SELECT - usage_agg.success_count, - error_agg.error_count, - usage_agg.p95, - usage_agg.p99, - error_agg.http2_errors - FROM usage_agg - CROSS JOIN error_agg - ` - - var stats service.OpsWindowStats - var p95Latency, p99Latency sql.NullFloat64 - var http2Errors int64 - if err := scanSingleRow( - ctx, - r.sql, - query, - []any{startTime, endTime}, - &stats.SuccessCount, - &stats.ErrorCount, - &p95Latency, - &p99Latency, - &http2Errors, - ); err != nil { - return nil, err - } - - stats.HTTP2Errors = int(http2Errors) - if p95Latency.Valid { - stats.P95LatencyMs = int(math.Round(p95Latency.Float64)) - } - if p99Latency.Valid { - stats.P99LatencyMs = int(math.Round(p99Latency.Float64)) - } - - return &stats, nil -} - -func (r *OpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*service.OverviewStats, error) { - query := ` - WITH - usage_stats AS ( - SELECT - COUNT(*) AS request_count, - COUNT(*) FILTER (WHERE duration_ms IS NOT NULL) AS success_count, - percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p50, - percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p95, - percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p99, - percentile_cont(0.999) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p999, - AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS avg_latency, - MAX(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS max_latency - FROM usage_logs - WHERE created_at >= $1 AND created_at < $2 - ), - error_stats AS ( - SELECT - COUNT(*) AS error_count, - COUNT(*) FILTER (WHERE status_code >= 400 AND status_code < 500) AS error_4xx, - COUNT(*) FILTER (WHERE status_code >= 500) AS error_5xx, - COUNT(*) FILTER ( - WHERE - error_type IN ('timeout', 'timeout_error') - OR error_message ILIKE '%timeout%' - OR error_message ILIKE '%deadline exceeded%' - ) AS timeout_count - FROM ops_error_logs - WHERE created_at >= $1 AND created_at < $2 - ), - top_error 
AS ( - SELECT - COALESCE(status_code::text, 'unknown') AS error_code, - error_message, - COUNT(*) AS error_count - FROM ops_error_logs - WHERE created_at >= $1 AND created_at < $2 - GROUP BY status_code, error_message - ORDER BY error_count DESC - LIMIT 1 - ), - latest_metrics AS ( - SELECT - cpu_usage_percent, - memory_usage_percent, - memory_used_mb, - memory_total_mb, - concurrency_queue_depth - FROM ops_system_metrics - ORDER BY created_at DESC - LIMIT 1 - ) - SELECT - COALESCE(usage_stats.request_count, 0) + COALESCE(error_stats.error_count, 0) AS request_count, - COALESCE(usage_stats.success_count, 0), - COALESCE(error_stats.error_count, 0), - COALESCE(error_stats.error_4xx, 0), - COALESCE(error_stats.error_5xx, 0), - COALESCE(error_stats.timeout_count, 0), - COALESCE(usage_stats.p50, 0), - COALESCE(usage_stats.p95, 0), - COALESCE(usage_stats.p99, 0), - COALESCE(usage_stats.p999, 0), - COALESCE(usage_stats.avg_latency, 0), - COALESCE(usage_stats.max_latency, 0), - COALESCE(top_error.error_code, ''), - COALESCE(top_error.error_message, ''), - COALESCE(top_error.error_count, 0), - COALESCE(latest_metrics.cpu_usage_percent, 0), - COALESCE(latest_metrics.memory_usage_percent, 0), - COALESCE(latest_metrics.memory_used_mb, 0), - COALESCE(latest_metrics.memory_total_mb, 0), - COALESCE(latest_metrics.concurrency_queue_depth, 0) - FROM usage_stats - CROSS JOIN error_stats - LEFT JOIN top_error ON true - LEFT JOIN latest_metrics ON true - ` - - var stats service.OverviewStats - var p50, p95, p99, p999, avgLatency, maxLatency sql.NullFloat64 - - err := scanSingleRow( - ctx, - r.sql, - query, - []any{startTime, endTime}, - &stats.RequestCount, - &stats.SuccessCount, - &stats.ErrorCount, - &stats.Error4xxCount, - &stats.Error5xxCount, - &stats.TimeoutCount, - &p50, - &p95, - &p99, - &p999, - &avgLatency, - &maxLatency, - &stats.TopErrorCode, - &stats.TopErrorMsg, - &stats.TopErrorCount, - &stats.CPUUsage, - &stats.MemoryUsage, - &stats.MemoryUsedMB, - &stats.MemoryTotalMB, - &stats.ConcurrencyQueueDepth, - ) - if err != nil { - return nil, err - } - - if p50.Valid { - stats.LatencyP50 = int(p50.Float64) - } - if p95.Valid { - stats.LatencyP95 = int(p95.Float64) - } - if p99.Valid { - stats.LatencyP99 = int(p99.Float64) - } - if p999.Valid { - stats.LatencyP999 = int(p999.Float64) - } - if avgLatency.Valid { - stats.LatencyAvg = int(avgLatency.Float64) - } - if maxLatency.Valid { - stats.LatencyMax = int(maxLatency.Float64) - } - - return &stats, nil -} - -func (r *OpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*service.ProviderStats, error) { - if startTime.IsZero() || endTime.IsZero() { - return nil, nil - } - if startTime.After(endTime) { - startTime, endTime = endTime, startTime - } - - query := ` - WITH combined AS ( - SELECT - COALESCE(g.platform, a.platform, '') AS platform, - u.duration_ms AS duration_ms, - 1 AS is_success, - 0 AS is_error, - NULL::INT AS status_code, - NULL::TEXT AS error_type, - NULL::TEXT AS error_message - FROM usage_logs u - LEFT JOIN groups g ON g.id = u.group_id - LEFT JOIN accounts a ON a.id = u.account_id - WHERE u.created_at >= $1 AND u.created_at < $2 - - UNION ALL - - SELECT - COALESCE(NULLIF(o.platform, ''), g.platform, a.platform, '') AS platform, - o.duration_ms AS duration_ms, - 0 AS is_success, - 1 AS is_error, - o.status_code AS status_code, - o.error_type AS error_type, - o.error_message AS error_message - FROM ops_error_logs o - LEFT JOIN groups g ON g.id = o.group_id - LEFT JOIN accounts a ON a.id = o.account_id - 
WHERE o.created_at >= $1 AND o.created_at < $2 - ) - SELECT - platform, - COUNT(*) AS request_count, - COALESCE(SUM(is_success), 0) AS success_count, - COALESCE(SUM(is_error), 0) AS error_count, - COALESCE(AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL), 0) AS avg_latency_ms, - percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) - FILTER (WHERE duration_ms IS NOT NULL) AS p99_latency_ms, - COUNT(*) FILTER (WHERE is_error = 1 AND status_code >= 400 AND status_code < 500) AS error_4xx, - COUNT(*) FILTER (WHERE is_error = 1 AND status_code >= 500 AND status_code < 600) AS error_5xx, - COUNT(*) FILTER ( - WHERE - is_error = 1 - AND ( - status_code = 504 - OR error_type ILIKE '%timeout%' - OR error_message ILIKE '%timeout%' - ) - ) AS timeout_count - FROM combined - WHERE platform <> '' - GROUP BY platform - ORDER BY request_count DESC, platform ASC - ` - - rows, err := r.sql.QueryContext(ctx, query, startTime, endTime) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]*service.ProviderStats, 0) - for rows.Next() { - var item service.ProviderStats - var avgLatency sql.NullFloat64 - var p99Latency sql.NullFloat64 - if err := rows.Scan( - &item.Platform, - &item.RequestCount, - &item.SuccessCount, - &item.ErrorCount, - &avgLatency, - &p99Latency, - &item.Error4xxCount, - &item.Error5xxCount, - &item.TimeoutCount, - ); err != nil { - return nil, err - } - - if avgLatency.Valid { - item.AvgLatencyMs = int(math.Round(avgLatency.Float64)) - } - if p99Latency.Valid { - item.P99LatencyMs = int(math.Round(p99Latency.Float64)) - } - - results = append(results, &item) - } - if err := rows.Err(); err != nil { - return nil, err - } - return results, nil -} - -func (r *OpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*service.LatencyHistogramItem, error) { - query := ` - WITH buckets AS ( - SELECT - CASE - WHEN duration_ms < 200 THEN '<200ms' - WHEN duration_ms < 500 THEN '200-500ms' - WHEN duration_ms < 1000 THEN '500-1000ms' - WHEN duration_ms < 3000 THEN '1000-3000ms' - ELSE '>3000ms' - END AS range_name, - CASE - WHEN duration_ms < 200 THEN 1 - WHEN duration_ms < 500 THEN 2 - WHEN duration_ms < 1000 THEN 3 - WHEN duration_ms < 3000 THEN 4 - ELSE 5 - END AS range_order, - COUNT(*) AS count - FROM usage_logs - WHERE created_at >= $1 AND created_at < $2 AND duration_ms IS NOT NULL - GROUP BY 1, 2 - ), - total AS ( - SELECT SUM(count) AS total_count FROM buckets - ) - SELECT - b.range_name, - b.count, - ROUND((b.count::numeric / t.total_count) * 100, 2) AS percentage - FROM buckets b - CROSS JOIN total t - ORDER BY b.range_order ASC - ` - - rows, err := r.sql.QueryContext(ctx, query, startTime, endTime) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]*service.LatencyHistogramItem, 0) - for rows.Next() { - var item service.LatencyHistogramItem - if err := rows.Scan(&item.Range, &item.Count, &item.Percentage); err != nil { - return nil, err - } - results = append(results, &item) - } - return results, nil -} - -func (r *OpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*service.ErrorDistributionItem, error) { - query := ` - WITH errors AS ( - SELECT - COALESCE(status_code::text, 'unknown') AS code, - COALESCE(error_message, 'Unknown error') AS message, - COUNT(*) AS count - FROM ops_error_logs - WHERE created_at >= $1 AND created_at < $2 - GROUP BY 1, 2 - ), - total AS ( - SELECT SUM(count) AS total_count FROM errors - 
) - SELECT - e.code, - e.message, - e.count, - ROUND((e.count::numeric / t.total_count) * 100, 2) AS percentage - FROM errors e - CROSS JOIN total t - ORDER BY e.count DESC - LIMIT 20 - ` - - rows, err := r.sql.QueryContext(ctx, query, startTime, endTime) - if err != nil { - return nil, err - } - defer func() { _ = rows.Close() }() - - results := make([]*service.ErrorDistributionItem, 0) - for rows.Next() { - var item service.ErrorDistributionItem - if err := rows.Scan(&item.Code, &item.Message, &item.Count, &item.Percentage); err != nil { - return nil, err - } - results = append(results, &item) - } - return results, nil -} - -func (r *OpsRepository) getAlertEvent(ctx context.Context, whereClause string, args []any) (*service.OpsAlertEvent, error) { - query := fmt.Sprintf(` - SELECT - id, - rule_id, - severity, - status, - title, - description, - metric_value, - threshold_value, - fired_at, - resolved_at, - email_sent, - webhook_sent, - created_at - FROM ops_alert_events - %s - ORDER BY fired_at DESC - LIMIT 1 - `, whereClause) - - var event service.OpsAlertEvent - var resolvedAt sql.NullTime - var metricValue sql.NullFloat64 - var thresholdValue sql.NullFloat64 - if err := scanSingleRow( - ctx, - r.sql, - query, - args, - &event.ID, - &event.RuleID, - &event.Severity, - &event.Status, - &event.Title, - &event.Description, - &metricValue, - &thresholdValue, - &event.FiredAt, - &resolvedAt, - &event.EmailSent, - &event.WebhookSent, - &event.CreatedAt, - ); err != nil { - if errors.Is(err, sql.ErrNoRows) { - return nil, nil - } - return nil, err - } - - if metricValue.Valid { - event.MetricValue = metricValue.Float64 - } - if thresholdValue.Valid { - event.ThresholdValue = thresholdValue.Float64 - } - if resolvedAt.Valid { - event.ResolvedAt = &resolvedAt.Time - } - return &event, nil -} - -func scanOpsSystemMetric(rows *sql.Rows) (*service.OpsMetrics, error) { - var metric service.OpsMetrics - var windowMinutes sql.NullInt64 - var requestCount, successCount, errorCount sql.NullInt64 - var successRate, errorRate sql.NullFloat64 - var p95Latency, p99Latency, http2Errors, activeAlerts sql.NullInt64 - var cpuUsage, memoryUsage, gcPause sql.NullFloat64 - var memoryUsed, memoryTotal, heapAlloc, queueDepth sql.NullInt64 - - if err := rows.Scan( - &windowMinutes, - &requestCount, - &successCount, - &errorCount, - &successRate, - &errorRate, - &p95Latency, - &p99Latency, - &http2Errors, - &activeAlerts, - &cpuUsage, - &memoryUsed, - &memoryTotal, - &memoryUsage, - &heapAlloc, - &gcPause, - &queueDepth, - &metric.UpdatedAt, - ); err != nil { - return nil, err - } - - if windowMinutes.Valid { - metric.WindowMinutes = int(windowMinutes.Int64) - } - if requestCount.Valid { - metric.RequestCount = requestCount.Int64 - } - if successCount.Valid { - metric.SuccessCount = successCount.Int64 - } - if errorCount.Valid { - metric.ErrorCount = errorCount.Int64 - } - if successRate.Valid { - metric.SuccessRate = successRate.Float64 - } - if errorRate.Valid { - metric.ErrorRate = errorRate.Float64 - } - if p95Latency.Valid { - metric.P95LatencyMs = int(p95Latency.Int64) - } - if p99Latency.Valid { - metric.P99LatencyMs = int(p99Latency.Int64) - } - if http2Errors.Valid { - metric.HTTP2Errors = int(http2Errors.Int64) - } - if activeAlerts.Valid { - metric.ActiveAlerts = int(activeAlerts.Int64) - } - if cpuUsage.Valid { - metric.CPUUsagePercent = cpuUsage.Float64 - } - if memoryUsed.Valid { - metric.MemoryUsedMB = memoryUsed.Int64 - } - if memoryTotal.Valid { - metric.MemoryTotalMB = memoryTotal.Int64 - } - if 
memoryUsage.Valid { - metric.MemoryUsagePercent = memoryUsage.Float64 - } - if heapAlloc.Valid { - metric.HeapAllocMB = heapAlloc.Int64 - } - if gcPause.Valid { - metric.GCPauseMs = gcPause.Float64 - } - if queueDepth.Valid { - metric.ConcurrencyQueueDepth = int(queueDepth.Int64) - } - - return &metric, nil -} - -func scanOpsErrorLog(rows *sql.Rows) (*service.OpsErrorLog, error) { - var entry service.OpsErrorLog - var userID, apiKeyID, accountID, groupID sql.NullInt64 - var clientIP sql.NullString - var statusCode sql.NullInt64 - var platform sql.NullString - var model sql.NullString - var requestPath sql.NullString - var stream sql.NullBool - var latency sql.NullInt64 - var requestID sql.NullString - var message sql.NullString - - if err := rows.Scan( - &entry.ID, - &entry.CreatedAt, - &userID, - &apiKeyID, - &accountID, - &groupID, - &clientIP, - &entry.Phase, - &entry.Type, - &entry.Severity, - &statusCode, - &platform, - &model, - &requestPath, - &stream, - &latency, - &requestID, - &message, - ); err != nil { - return nil, err - } - - if userID.Valid { - v := userID.Int64 - entry.UserID = &v - } - if apiKeyID.Valid { - v := apiKeyID.Int64 - entry.APIKeyID = &v - } - if accountID.Valid { - v := accountID.Int64 - entry.AccountID = &v - } - if groupID.Valid { - v := groupID.Int64 - entry.GroupID = &v - } - if clientIP.Valid { - entry.ClientIP = clientIP.String - } - if statusCode.Valid { - entry.StatusCode = int(statusCode.Int64) - } - if platform.Valid { - entry.Platform = platform.String - } - if model.Valid { - entry.Model = model.String - } - if requestPath.Valid { - entry.RequestPath = requestPath.String - } - if stream.Valid { - entry.Stream = stream.Bool - } - if latency.Valid { - value := int(latency.Int64) - entry.LatencyMs = &value - } - if requestID.Valid { - entry.RequestID = requestID.String - } - if message.Valid { - entry.Message = message.String - } - - return &entry, nil -} - -func nullString(value string) sql.NullString { - if value == "" { - return sql.NullString{} - } - return sql.NullString{String: value, Valid: true} -} diff --git a/backend/internal/server/middleware/ops_auth_error_logger.go b/backend/internal/server/middleware/ops_auth_error_logger.go deleted file mode 100644 index 1c89b807..00000000 --- a/backend/internal/server/middleware/ops_auth_error_logger.go +++ /dev/null @@ -1,55 +0,0 @@ -package middleware - -import ( - "context" - "sync" - "time" - - "github.com/Wei-Shaw/sub2api/internal/service" -) - -const ( - opsAuthErrorLogWorkerCount = 10 - opsAuthErrorLogQueueSize = 256 - opsAuthErrorLogTimeout = 2 * time.Second -) - -type opsAuthErrorLogJob struct { - ops *service.OpsService - entry *service.OpsErrorLog -} - -var ( - opsAuthErrorLogOnce sync.Once - opsAuthErrorLogQueue chan opsAuthErrorLogJob -) - -func startOpsAuthErrorLogWorkers() { - opsAuthErrorLogQueue = make(chan opsAuthErrorLogJob, opsAuthErrorLogQueueSize) - for i := 0; i < opsAuthErrorLogWorkerCount; i++ { - go func() { - for job := range opsAuthErrorLogQueue { - if job.ops == nil || job.entry == nil { - continue - } - ctx, cancel := context.WithTimeout(context.Background(), opsAuthErrorLogTimeout) - _ = job.ops.RecordError(ctx, job.entry) - cancel() - } - }() - } -} - -func enqueueOpsAuthErrorLog(ops *service.OpsService, entry *service.OpsErrorLog) { - if ops == nil || entry == nil { - return - } - - opsAuthErrorLogOnce.Do(startOpsAuthErrorLogWorkers) - - select { - case opsAuthErrorLogQueue <- opsAuthErrorLogJob{ops: ops, entry: entry}: - default: - // Queue is full; drop to avoid 
blocking request handling. - } -} diff --git a/backend/internal/service/ops.go b/backend/internal/service/ops.go deleted file mode 100644 index 6a44d75c..00000000 --- a/backend/internal/service/ops.go +++ /dev/null @@ -1,99 +0,0 @@ -package service - -import ( - "context" - "time" -) - -// ErrorLog represents an ops error log item for list queries. -// -// Field naming matches docs/API-运维监控中心2.0.md (L3 根因追踪 - 错误日志列表). -type ErrorLog struct { - ID int64 `json:"id"` - Timestamp time.Time `json:"timestamp"` - - Level string `json:"level,omitempty"` - RequestID string `json:"request_id,omitempty"` - AccountID string `json:"account_id,omitempty"` - APIPath string `json:"api_path,omitempty"` - Provider string `json:"provider,omitempty"` - Model string `json:"model,omitempty"` - HTTPCode int `json:"http_code,omitempty"` - ErrorMessage string `json:"error_message,omitempty"` - - DurationMs *int `json:"duration_ms,omitempty"` - RetryCount *int `json:"retry_count,omitempty"` - Stream bool `json:"stream,omitempty"` -} - -// ErrorLogFilter describes optional filters and pagination for listing ops error logs. -type ErrorLogFilter struct { - StartTime *time.Time - EndTime *time.Time - - ErrorCode *int - Provider string - AccountID *int64 - - Page int - PageSize int -} - -func (f *ErrorLogFilter) normalize() (page, pageSize int) { - page = 1 - pageSize = 20 - if f == nil { - return page, pageSize - } - - if f.Page > 0 { - page = f.Page - } - if f.PageSize > 0 { - pageSize = f.PageSize - } - if pageSize > 100 { - pageSize = 100 - } - return page, pageSize -} - -type ErrorLogListResponse struct { - Errors []*ErrorLog `json:"errors"` - Total int64 `json:"total"` - Page int `json:"page"` - PageSize int `json:"page_size"` -} - -func (s *OpsService) GetErrorLogs(ctx context.Context, filter *ErrorLogFilter) (*ErrorLogListResponse, error) { - if s == nil || s.repo == nil { - return &ErrorLogListResponse{ - Errors: []*ErrorLog{}, - Total: 0, - Page: 1, - PageSize: 20, - }, nil - } - - page, pageSize := filter.normalize() - if filter == nil { - filter = &ErrorLogFilter{} - } - filter.Page = page - filter.PageSize = pageSize - - items, total, err := s.repo.ListErrorLogs(ctx, filter) - if err != nil { - return nil, err - } - if items == nil { - items = []*ErrorLog{} - } - - return &ErrorLogListResponse{ - Errors: items, - Total: total, - Page: page, - PageSize: pageSize, - }, nil -} diff --git a/backend/internal/service/ops_alert_service.go b/backend/internal/service/ops_alert_service.go deleted file mode 100644 index afe283af..00000000 --- a/backend/internal/service/ops_alert_service.go +++ /dev/null @@ -1,834 +0,0 @@ -package service - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "log" - "net" - "net/http" - "net/url" - "strconv" - "strings" - "sync" - "time" -) - -type OpsAlertService struct { - opsService *OpsService - userService *UserService - emailService *EmailService - httpClient *http.Client - - interval time.Duration - - startOnce sync.Once - stopOnce sync.Once - stopCtx context.Context - stop context.CancelFunc - wg sync.WaitGroup -} - -// opsAlertEvalInterval defines how often OpsAlertService evaluates alert rules. -// -// Production uses opsMetricsInterval. Tests may override this variable to keep -// integration tests fast without changing production defaults. 
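// Illustrative sketch only; it is not part of the original ops_alert_service_test.go and the
// test name is hypothetical. It shows how a test in this package could shrink the evaluation
// interval and restore it afterwards, which is exactly what the integration test further down
// does with opsAlertEvalInterval. Assumes the usual context/testing/time imports.
func TestOpsAlertService_FastEvalInterval(t *testing.T) {
	old := opsAlertEvalInterval
	opsAlertEvalInterval = 25 * time.Millisecond // fast ticks for the test only
	t.Cleanup(func() { opsAlertEvalInterval = old })

	svc := NewOpsAlertService(nil, nil, nil) // picks up the shortened interval at construction
	svc.StartWithContext(context.Background())
	t.Cleanup(svc.Stop)
	// ... assertions against the service's observable behaviour go here ...
}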
-var opsAlertEvalInterval = opsMetricsInterval - -func NewOpsAlertService(opsService *OpsService, userService *UserService, emailService *EmailService) *OpsAlertService { - return &OpsAlertService{ - opsService: opsService, - userService: userService, - emailService: emailService, - httpClient: &http.Client{Timeout: 10 * time.Second}, - interval: opsAlertEvalInterval, - } -} - -// Start launches the background alert evaluation loop. -// -// Stop must be called during shutdown to ensure the goroutine exits. -func (s *OpsAlertService) Start() { - s.StartWithContext(context.Background()) -} - -// StartWithContext is like Start but allows the caller to provide a parent context. -// When the parent context is canceled, the service stops automatically. -func (s *OpsAlertService) StartWithContext(ctx context.Context) { - if s == nil { - return - } - if ctx == nil { - ctx = context.Background() - } - - s.startOnce.Do(func() { - if s.interval <= 0 { - s.interval = opsAlertEvalInterval - } - - s.stopCtx, s.stop = context.WithCancel(ctx) - s.wg.Add(1) - go s.run() - }) -} - -// Stop gracefully stops the background goroutine started by Start/StartWithContext. -// It is safe to call Stop multiple times. -func (s *OpsAlertService) Stop() { - if s == nil { - return - } - - s.stopOnce.Do(func() { - if s.stop != nil { - s.stop() - } - }) - s.wg.Wait() -} - -func (s *OpsAlertService) run() { - defer s.wg.Done() - - ticker := time.NewTicker(s.interval) - defer ticker.Stop() - - s.evaluateOnce() - for { - select { - case <-ticker.C: - s.evaluateOnce() - case <-s.stopCtx.Done(): - return - } - } -} - -func (s *OpsAlertService) evaluateOnce() { - ctx, cancel := context.WithTimeout(s.stopCtx, opsAlertEvaluateTimeout) - defer cancel() - - s.Evaluate(ctx, time.Now()) -} - -func (s *OpsAlertService) Evaluate(ctx context.Context, now time.Time) { - if s == nil || s.opsService == nil { - return - } - - rules, err := s.opsService.ListAlertRules(ctx) - if err != nil { - log.Printf("[OpsAlert] failed to list rules: %v", err) - return - } - if len(rules) == 0 { - return - } - - maxSustainedByWindow := make(map[int]int) - for _, rule := range rules { - if !rule.Enabled { - continue - } - window := rule.WindowMinutes - if window <= 0 { - window = 1 - } - sustained := rule.SustainedMinutes - if sustained <= 0 { - sustained = 1 - } - if sustained > maxSustainedByWindow[window] { - maxSustainedByWindow[window] = sustained - } - } - - metricsByWindow := make(map[int][]OpsMetrics) - for window, limit := range maxSustainedByWindow { - metrics, err := s.opsService.ListRecentSystemMetrics(ctx, window, limit) - if err != nil { - log.Printf("[OpsAlert] failed to load metrics window=%dm: %v", window, err) - continue - } - metricsByWindow[window] = metrics - } - - for _, rule := range rules { - if !rule.Enabled { - continue - } - window := rule.WindowMinutes - if window <= 0 { - window = 1 - } - sustained := rule.SustainedMinutes - if sustained <= 0 { - sustained = 1 - } - - metrics := metricsByWindow[window] - selected, ok := selectContiguousMetrics(metrics, sustained, now) - if !ok { - continue - } - - breached, latestValue, ok := evaluateRule(rule, selected) - if !ok { - continue - } - - activeEvent, err := s.opsService.GetActiveAlertEvent(ctx, rule.ID) - if err != nil { - log.Printf("[OpsAlert] failed to get active event (rule=%d): %v", rule.ID, err) - continue - } - - if breached { - if activeEvent != nil { - continue - } - - lastEvent, err := s.opsService.GetLatestAlertEvent(ctx, rule.ID) - if err != nil { - 
log.Printf("[OpsAlert] failed to get latest event (rule=%d): %v", rule.ID, err) - continue - } - if lastEvent != nil && rule.CooldownMinutes > 0 { - cooldown := time.Duration(rule.CooldownMinutes) * time.Minute - if now.Sub(lastEvent.FiredAt) < cooldown { - continue - } - } - - event := &OpsAlertEvent{ - RuleID: rule.ID, - Severity: rule.Severity, - Status: OpsAlertStatusFiring, - Title: fmt.Sprintf("%s: %s", rule.Severity, rule.Name), - Description: buildAlertDescription(rule, latestValue), - MetricValue: latestValue, - ThresholdValue: rule.Threshold, - FiredAt: now, - CreatedAt: now, - } - - if err := s.opsService.CreateAlertEvent(ctx, event); err != nil { - log.Printf("[OpsAlert] failed to create event (rule=%d): %v", rule.ID, err) - continue - } - - emailSent, webhookSent := s.dispatchNotifications(ctx, rule, event) - if emailSent || webhookSent { - if err := s.opsService.UpdateAlertEventNotifications(ctx, event.ID, emailSent, webhookSent); err != nil { - log.Printf("[OpsAlert] failed to update notification flags (event=%d): %v", event.ID, err) - } - } - } else if activeEvent != nil { - resolvedAt := now - if err := s.opsService.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { - log.Printf("[OpsAlert] failed to resolve event (event=%d): %v", activeEvent.ID, err) - } - } - } -} - -const opsMetricsContinuityTolerance = 20 * time.Second - -// selectContiguousMetrics picks the newest N metrics and verifies they are continuous. -// -// This prevents a sustained rule from triggering when metrics sampling has gaps -// (e.g. collector downtime) and avoids evaluating "stale" data. -// -// Assumptions: -// - Metrics are ordered by UpdatedAt DESC (newest first). -// - Metrics are expected to be collected at opsMetricsInterval cadence. 
-func selectContiguousMetrics(metrics []OpsMetrics, needed int, now time.Time) ([]OpsMetrics, bool) { - if needed <= 0 { - return nil, false - } - if len(metrics) < needed { - return nil, false - } - newest := metrics[0].UpdatedAt - if newest.IsZero() { - return nil, false - } - if now.Sub(newest) > opsMetricsInterval+opsMetricsContinuityTolerance { - return nil, false - } - - selected := metrics[:needed] - for i := 0; i < len(selected)-1; i++ { - a := selected[i].UpdatedAt - b := selected[i+1].UpdatedAt - if a.IsZero() || b.IsZero() { - return nil, false - } - gap := a.Sub(b) - if gap < opsMetricsInterval-opsMetricsContinuityTolerance || gap > opsMetricsInterval+opsMetricsContinuityTolerance { - return nil, false - } - } - return selected, true -} - -func evaluateRule(rule OpsAlertRule, metrics []OpsMetrics) (bool, float64, bool) { - if len(metrics) == 0 { - return false, 0, false - } - - latestValue, ok := metricValue(metrics[0], rule.MetricType) - if !ok { - return false, 0, false - } - - for _, metric := range metrics { - value, ok := metricValue(metric, rule.MetricType) - if !ok || !compareMetric(value, rule.Operator, rule.Threshold) { - return false, latestValue, true - } - } - - return true, latestValue, true -} - -func metricValue(metric OpsMetrics, metricType string) (float64, bool) { - switch metricType { - case OpsMetricSuccessRate: - if metric.RequestCount == 0 { - return 0, false - } - return metric.SuccessRate, true - case OpsMetricErrorRate: - if metric.RequestCount == 0 { - return 0, false - } - return metric.ErrorRate, true - case OpsMetricP95LatencyMs: - return float64(metric.P95LatencyMs), true - case OpsMetricP99LatencyMs: - return float64(metric.P99LatencyMs), true - case OpsMetricHTTP2Errors: - return float64(metric.HTTP2Errors), true - case OpsMetricCPUUsagePercent: - return metric.CPUUsagePercent, true - case OpsMetricMemoryUsagePercent: - return metric.MemoryUsagePercent, true - case OpsMetricQueueDepth: - return float64(metric.ConcurrencyQueueDepth), true - default: - return 0, false - } -} - -func compareMetric(value float64, operator string, threshold float64) bool { - switch operator { - case ">": - return value > threshold - case ">=": - return value >= threshold - case "<": - return value < threshold - case "<=": - return value <= threshold - case "==": - return value == threshold - default: - return false - } -} - -func buildAlertDescription(rule OpsAlertRule, value float64) string { - window := rule.WindowMinutes - if window <= 0 { - window = 1 - } - return fmt.Sprintf("Rule %s triggered: %s %s %.2f (current %.2f) over last %dm", - rule.Name, - rule.MetricType, - rule.Operator, - rule.Threshold, - value, - window, - ) -} - -func (s *OpsAlertService) dispatchNotifications(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) (bool, bool) { - emailSent := false - webhookSent := false - - notifyCtx, cancel := s.notificationContext(ctx) - defer cancel() - - if rule.NotifyEmail { - emailSent = s.sendEmailNotification(notifyCtx, rule, event) - } - if rule.NotifyWebhook && rule.WebhookURL != "" { - webhookSent = s.sendWebhookNotification(notifyCtx, rule, event) - } - // Fallback channel: if email is enabled but ultimately fails, try webhook even if the - // webhook toggle is off (as long as a webhook URL is configured). 
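// Hypothetical helper, added only to spell out the fallback condition implemented just below;
// it is not part of the original file.
func shouldFallBackToWebhook(rule OpsAlertRule, emailSent bool) bool {
	// Email was requested but could not be delivered, the webhook channel was not already
	// tried, and a webhook URL exists to fall back to.
	return rule.NotifyEmail && !emailSent && !rule.NotifyWebhook && rule.WebhookURL != ""
}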
- if rule.NotifyEmail && !emailSent && !rule.NotifyWebhook && rule.WebhookURL != "" { - log.Printf("[OpsAlert] email failed; attempting webhook fallback (rule=%d)", rule.ID) - webhookSent = s.sendWebhookNotification(notifyCtx, rule, event) - } - - return emailSent, webhookSent -} - -const ( - opsAlertEvaluateTimeout = 45 * time.Second - opsAlertNotificationTimeout = 30 * time.Second - opsAlertEmailMaxRetries = 3 -) - -var opsAlertEmailBackoff = []time.Duration{ - 1 * time.Second, - 2 * time.Second, - 4 * time.Second, -} - -func (s *OpsAlertService) notificationContext(ctx context.Context) (context.Context, context.CancelFunc) { - parent := ctx - if s != nil && s.stopCtx != nil { - parent = s.stopCtx - } - if parent == nil { - parent = context.Background() - } - return context.WithTimeout(parent, opsAlertNotificationTimeout) -} - -var opsAlertSleep = sleepWithContext - -func sleepWithContext(ctx context.Context, d time.Duration) error { - if d <= 0 { - return nil - } - if ctx == nil { - time.Sleep(d) - return nil - } - timer := time.NewTimer(d) - defer timer.Stop() - select { - case <-ctx.Done(): - return ctx.Err() - case <-timer.C: - return nil - } -} - -func retryWithBackoff( - ctx context.Context, - maxRetries int, - backoff []time.Duration, - fn func() error, - onError func(attempt int, total int, nextDelay time.Duration, err error), -) error { - if ctx == nil { - ctx = context.Background() - } - if maxRetries < 0 { - maxRetries = 0 - } - totalAttempts := maxRetries + 1 - - var lastErr error - for attempt := 1; attempt <= totalAttempts; attempt++ { - if attempt > 1 { - backoffIdx := attempt - 2 - if backoffIdx < len(backoff) { - if err := opsAlertSleep(ctx, backoff[backoffIdx]); err != nil { - return err - } - } - } - - if err := ctx.Err(); err != nil { - return err - } - - if err := fn(); err != nil { - lastErr = err - nextDelay := time.Duration(0) - if attempt < totalAttempts { - nextIdx := attempt - 1 - if nextIdx < len(backoff) { - nextDelay = backoff[nextIdx] - } - } - if onError != nil { - onError(attempt, totalAttempts, nextDelay, err) - } - continue - } - return nil - } - - return lastErr -} - -func (s *OpsAlertService) sendEmailNotification(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) bool { - if s.emailService == nil || s.userService == nil { - return false - } - - if ctx == nil { - ctx = context.Background() - } - - admin, err := s.userService.GetFirstAdmin(ctx) - if err != nil || admin == nil || admin.Email == "" { - return false - } - - subject := fmt.Sprintf("[Ops Alert][%s] %s", rule.Severity, rule.Name) - body := fmt.Sprintf( - "Alert triggered: %s\n\nMetric: %s\nThreshold: %.2f\nCurrent: %.2f\nWindow: %dm\nStatus: %s\nTime: %s", - rule.Name, - rule.MetricType, - rule.Threshold, - event.MetricValue, - rule.WindowMinutes, - event.Status, - event.FiredAt.Format(time.RFC3339), - ) - - config, err := s.emailService.GetSMTPConfig(ctx) - if err != nil { - log.Printf("[OpsAlert] email config load failed: %v", err) - return false - } - - if err := retryWithBackoff( - ctx, - opsAlertEmailMaxRetries, - opsAlertEmailBackoff, - func() error { - return s.emailService.SendEmailWithConfig(config, admin.Email, subject, body) - }, - func(attempt int, total int, nextDelay time.Duration, err error) { - if attempt < total { - log.Printf("[OpsAlert] email send failed (attempt=%d/%d), retrying in %s: %v", attempt, total, nextDelay, err) - return - } - log.Printf("[OpsAlert] email send failed (attempt=%d/%d), giving up: %v", attempt, total, err) - }, - ); err != nil { - if 
errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { - log.Printf("[OpsAlert] email send canceled: %v", err) - } - return false - } - return true -} - -func (s *OpsAlertService) sendWebhookNotification(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) bool { - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - webhookTarget, err := validateWebhookURL(ctx, rule.WebhookURL) - if err != nil { - log.Printf("[OpsAlert] invalid webhook url (rule=%d): %v", rule.ID, err) - return false - } - - payload := map[string]any{ - "rule_id": rule.ID, - "rule_name": rule.Name, - "severity": rule.Severity, - "status": event.Status, - "metric_type": rule.MetricType, - "metric_value": event.MetricValue, - "threshold_value": rule.Threshold, - "window_minutes": rule.WindowMinutes, - "fired_at": event.FiredAt.Format(time.RFC3339), - } - - body, err := json.Marshal(payload) - if err != nil { - return false - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, webhookTarget.URL.String(), bytes.NewReader(body)) - if err != nil { - return false - } - req.Header.Set("Content-Type", "application/json") - - resp, err := buildWebhookHTTPClient(s.httpClient, webhookTarget).Do(req) - if err != nil { - log.Printf("[OpsAlert] webhook send failed: %v", err) - return false - } - defer func() { _ = resp.Body.Close() }() - - if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices { - log.Printf("[OpsAlert] webhook returned status %d", resp.StatusCode) - return false - } - return true -} - -const webhookHTTPClientTimeout = 10 * time.Second - -func buildWebhookHTTPClient(base *http.Client, webhookTarget *validatedWebhookTarget) *http.Client { - var client http.Client - if base != nil { - client = *base - } - if client.Timeout <= 0 { - client.Timeout = webhookHTTPClientTimeout - } - client.CheckRedirect = func(req *http.Request, via []*http.Request) error { - return http.ErrUseLastResponse - } - if webhookTarget != nil { - client.Transport = buildWebhookTransport(client.Transport, webhookTarget) - } - return &client -} - -var disallowedWebhookIPNets = []net.IPNet{ - // "this host on this network" / unspecified. 
- mustParseCIDR("0.0.0.0/8"), - mustParseCIDR("127.0.0.0/8"), // loopback (includes 127.0.0.1) - mustParseCIDR("10.0.0.0/8"), // RFC1918 - mustParseCIDR("192.168.0.0/16"), // RFC1918 - mustParseCIDR("172.16.0.0/12"), // RFC1918 (172.16.0.0 - 172.31.255.255) - mustParseCIDR("100.64.0.0/10"), // RFC6598 (carrier-grade NAT) - mustParseCIDR("169.254.0.0/16"), // IPv4 link-local (includes 169.254.169.254 metadata IP on many clouds) - mustParseCIDR("198.18.0.0/15"), // RFC2544 benchmark testing - mustParseCIDR("224.0.0.0/4"), // IPv4 multicast - mustParseCIDR("240.0.0.0/4"), // IPv4 reserved - mustParseCIDR("::/128"), // IPv6 unspecified - mustParseCIDR("::1/128"), // IPv6 loopback - mustParseCIDR("fc00::/7"), // IPv6 unique local - mustParseCIDR("fe80::/10"), // IPv6 link-local - mustParseCIDR("ff00::/8"), // IPv6 multicast -} - -func mustParseCIDR(cidr string) net.IPNet { - _, block, err := net.ParseCIDR(cidr) - if err != nil { - panic(err) - } - return *block -} - -var lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - return net.DefaultResolver.LookupIPAddr(ctx, host) -} - -type validatedWebhookTarget struct { - URL *url.URL - - host string - port string - pinnedIPs []net.IP -} - -var webhookBaseDialContext = func(ctx context.Context, network, addr string) (net.Conn, error) { - dialer := net.Dialer{ - Timeout: 5 * time.Second, - KeepAlive: 30 * time.Second, - } - return dialer.DialContext(ctx, network, addr) -} - -func buildWebhookTransport(base http.RoundTripper, webhookTarget *validatedWebhookTarget) http.RoundTripper { - if webhookTarget == nil || webhookTarget.URL == nil { - return base - } - - var transport *http.Transport - switch typed := base.(type) { - case *http.Transport: - if typed != nil { - transport = typed.Clone() - } - } - if transport == nil { - if defaultTransport, ok := http.DefaultTransport.(*http.Transport); ok && defaultTransport != nil { - transport = defaultTransport.Clone() - } else { - transport = (&http.Transport{}).Clone() - } - } - - webhookHost := webhookTarget.host - webhookPort := webhookTarget.port - pinnedIPs := append([]net.IP(nil), webhookTarget.pinnedIPs...) - - transport.Proxy = nil - transport.DialTLSContext = nil - transport.DialContext = func(ctx context.Context, network, addr string) (net.Conn, error) { - host, port, err := net.SplitHostPort(addr) - if err != nil || host == "" || port == "" { - return nil, fmt.Errorf("webhook dial target is invalid: %q", addr) - } - - canonicalHost := strings.TrimSuffix(strings.ToLower(host), ".") - if canonicalHost != webhookHost || port != webhookPort { - return nil, fmt.Errorf("webhook dial target mismatch: %q", addr) - } - - var lastErr error - for _, ip := range pinnedIPs { - if isDisallowedWebhookIP(ip) { - lastErr = fmt.Errorf("webhook target resolves to a disallowed ip") - continue - } - - dialAddr := net.JoinHostPort(ip.String(), port) - conn, err := webhookBaseDialContext(ctx, network, dialAddr) - if err == nil { - return conn, nil - } - lastErr = err - } - if lastErr == nil { - lastErr = errors.New("webhook target has no resolved addresses") - } - return nil, lastErr - } - - return transport -} - -func validateWebhookURL(ctx context.Context, raw string) (*validatedWebhookTarget, error) { - raw = strings.TrimSpace(raw) - if raw == "" { - return nil, errors.New("webhook url is empty") - } - // Avoid request smuggling / header injection vectors. 
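// Illustrative usage only, not part of the original file; the function name is hypothetical.
// It summarises the rejections exercised by the unit tests further down: plain HTTP, localhost,
// and private/link-local address literals are all refused before any request is sent.
func exampleWebhookURLRejections(ctx context.Context) []error {
	var errs []error
	for _, raw := range []string{
		"http://example.com/webhook",      // scheme must be https
		"https://localhost/webhook",       // localhost is always refused
		"https://169.254.169.254/webhook", // link-local metadata address is refused
	} {
		if _, err := validateWebhookURL(ctx, raw); err != nil {
			errs = append(errs, err)
		}
	}
	return errs // three errors expected
}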
- if strings.ContainsAny(raw, "\r\n") { - return nil, errors.New("webhook url contains invalid characters") - } - - parsed, err := url.Parse(raw) - if err != nil { - return nil, errors.New("webhook url format is invalid") - } - if !strings.EqualFold(parsed.Scheme, "https") { - return nil, errors.New("webhook url scheme must be https") - } - parsed.Scheme = "https" - if parsed.Host == "" || parsed.Hostname() == "" { - return nil, errors.New("webhook url must include host") - } - if parsed.User != nil { - return nil, errors.New("webhook url must not include userinfo") - } - if parsed.Port() != "" { - port, err := strconv.Atoi(parsed.Port()) - if err != nil || port < 1 || port > 65535 { - return nil, errors.New("webhook url port is invalid") - } - } - - host := strings.TrimSuffix(strings.ToLower(parsed.Hostname()), ".") - if host == "localhost" { - return nil, errors.New("webhook url host must not be localhost") - } - - if ip := net.ParseIP(host); ip != nil { - if isDisallowedWebhookIP(ip) { - return nil, errors.New("webhook url host resolves to a disallowed ip") - } - return &validatedWebhookTarget{ - URL: parsed, - host: host, - port: portForScheme(parsed), - pinnedIPs: []net.IP{ip}, - }, nil - } - - if ctx == nil { - ctx = context.Background() - } - ips, err := lookupIPAddrs(ctx, host) - if err != nil || len(ips) == 0 { - return nil, errors.New("webhook url host cannot be resolved") - } - pinned := make([]net.IP, 0, len(ips)) - for _, addr := range ips { - if isDisallowedWebhookIP(addr.IP) { - return nil, errors.New("webhook url host resolves to a disallowed ip") - } - if addr.IP != nil { - pinned = append(pinned, addr.IP) - } - } - - if len(pinned) == 0 { - return nil, errors.New("webhook url host cannot be resolved") - } - - return &validatedWebhookTarget{ - URL: parsed, - host: host, - port: portForScheme(parsed), - pinnedIPs: uniqueResolvedIPs(pinned), - }, nil -} - -func isDisallowedWebhookIP(ip net.IP) bool { - if ip == nil { - return false - } - if ip4 := ip.To4(); ip4 != nil { - ip = ip4 - } else if ip16 := ip.To16(); ip16 != nil { - ip = ip16 - } else { - return false - } - - // Disallow non-public addresses even if they're not explicitly covered by the CIDR list. - // This provides defense-in-depth against SSRF targets such as link-local, multicast, and - // unspecified addresses, and ensures any "pinned" IP is still blocked at dial time. 
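// Small standalone sketch, not part of the original file and with a hypothetical name, showing
// the net.IP predicates the check below leans on (assumes the standard fmt and net imports).
func exampleClassifyIPs() {
	for _, s := range []string{"127.0.0.1", "10.0.0.2", "169.254.169.254", "93.184.216.34"} {
		ip := net.ParseIP(s)
		nonPublic := ip.IsUnspecified() || ip.IsLoopback() || ip.IsMulticast() ||
			ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() || ip.IsPrivate()
		fmt.Printf("%-16s non-public=%v\n", s, nonPublic) // only 93.184.216.34 prints false
	}
}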
- if ip.IsUnspecified() || - ip.IsLoopback() || - ip.IsMulticast() || - ip.IsLinkLocalUnicast() || - ip.IsLinkLocalMulticast() || - ip.IsPrivate() { - return true - } - - for _, block := range disallowedWebhookIPNets { - if block.Contains(ip) { - return true - } - } - return false -} - -func portForScheme(u *url.URL) string { - if u != nil && u.Port() != "" { - return u.Port() - } - return "443" -} - -func uniqueResolvedIPs(ips []net.IP) []net.IP { - seen := make(map[string]struct{}, len(ips)) - out := make([]net.IP, 0, len(ips)) - for _, ip := range ips { - if ip == nil { - continue - } - key := ip.String() - if _, ok := seen[key]; ok { - continue - } - seen[key] = struct{}{} - out = append(out, ip) - } - return out -} diff --git a/backend/internal/service/ops_alert_service_integration_test.go b/backend/internal/service/ops_alert_service_integration_test.go deleted file mode 100644 index 695cd2e5..00000000 --- a/backend/internal/service/ops_alert_service_integration_test.go +++ /dev/null @@ -1,271 +0,0 @@ -//go:build integration - -package service - -import ( - "context" - "database/sql" - "sync" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -// This integration test protects the DI startup contract for OpsAlertService. -// -// Background: -// - OpsMetricsCollector previously called alertService.Start()/Evaluate() directly. -// - Those direct calls were removed, so OpsAlertService must now start via DI -// (ProvideOpsAlertService in wire.go) and run its own evaluation ticker. -// -// What we validate here: -// 1. When we construct via the Wire provider functions (ProvideOpsAlertService + -// ProvideOpsMetricsCollector), OpsAlertService starts automatically. -// 2. Its evaluation loop continues to tick even if OpsMetricsCollector is stopped, -// proving the alert evaluator is independent. -// 3. The evaluation path can trigger alert logic (CreateAlertEvent called). -func TestOpsAlertService_StartedViaWireProviders_RunsIndependentTicker(t *testing.T) { - oldInterval := opsAlertEvalInterval - opsAlertEvalInterval = 25 * time.Millisecond - t.Cleanup(func() { opsAlertEvalInterval = oldInterval }) - - repo := newFakeOpsRepository() - opsService := NewOpsService(repo, nil) - - // Start via the Wire provider function (the production DI path). - alertService := ProvideOpsAlertService(opsService, nil, nil) - t.Cleanup(alertService.Stop) - - // Construct via ProvideOpsMetricsCollector (wire.go). Stop immediately to ensure - // the alert ticker keeps running without the metrics collector. - collector := ProvideOpsMetricsCollector(opsService, NewConcurrencyService(nil)) - collector.Stop() - - // Wait for at least one evaluation (run() calls evaluateOnce immediately). - require.Eventually(t, func() bool { - return repo.listRulesCalls.Load() >= 1 - }, 1*time.Second, 5*time.Millisecond) - - // Confirm the evaluation loop keeps ticking after the metrics collector is stopped. - callsAfterCollectorStop := repo.listRulesCalls.Load() - require.Eventually(t, func() bool { - return repo.listRulesCalls.Load() >= callsAfterCollectorStop+2 - }, 1*time.Second, 5*time.Millisecond) - - // Confirm the evaluation logic actually fires an alert event at least once. 
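// Illustrative sketch of the signalling pattern used by the fake repository below; it is not
// part of the original test file and the names are hypothetical. Fire closes the channel
// exactly once, so a test can wait for "at least one event" with a plain select plus timeout.
type firedSignal struct {
	once sync.Once
	ch   chan struct{}
}

func newFiredSignal() *firedSignal {
	return &firedSignal{ch: make(chan struct{})}
}

func (s *firedSignal) Fire() {
	s.once.Do(func() { close(s.ch) })
}

func (s *firedSignal) Done() <-chan struct{} {
	return s.ch
}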
- select { - case <-repo.eventCreatedCh: - // ok - case <-time.After(2 * time.Second): - t.Fatalf("expected OpsAlertService to create an alert event, but none was created (ListAlertRules calls=%d)", repo.listRulesCalls.Load()) - } -} - -func newFakeOpsRepository() *fakeOpsRepository { - return &fakeOpsRepository{ - eventCreatedCh: make(chan struct{}), - } -} - -// fakeOpsRepository is a lightweight in-memory stub of OpsRepository for integration tests. -// It avoids real DB/Redis usage and provides deterministic responses fast. -type fakeOpsRepository struct { - listRulesCalls atomic.Int64 - - mu sync.Mutex - activeEvent *OpsAlertEvent - latestEvent *OpsAlertEvent - nextEventID int64 - eventCreatedCh chan struct{} - eventOnce sync.Once -} - -func (r *fakeOpsRepository) CreateErrorLog(ctx context.Context, log *OpsErrorLog) error { - return nil -} - -func (r *fakeOpsRepository) ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) { - return nil, nil -} - -func (r *fakeOpsRepository) ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error) { - return nil, 0, nil -} - -func (r *fakeOpsRepository) GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) { - return &OpsMetrics{WindowMinutes: 1}, sql.ErrNoRows -} - -func (r *fakeOpsRepository) CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error { - return nil -} - -func (r *fakeOpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { - return &OpsWindowStats{}, nil -} - -func (r *fakeOpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error) { - return nil, nil -} - -func (r *fakeOpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error) { - return nil, nil -} - -func (r *fakeOpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error) { - return nil, nil -} - -func (r *fakeOpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) { - if limit <= 0 { - limit = 1 - } - now := time.Now() - metrics := make([]OpsMetrics, 0, limit) - for i := 0; i < limit; i++ { - metrics = append(metrics, OpsMetrics{ - WindowMinutes: windowMinutes, - CPUUsagePercent: 99, - UpdatedAt: now.Add(-time.Duration(i) * opsMetricsInterval), - }) - } - return metrics, nil -} - -func (r *fakeOpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) { - return nil, nil -} - -func (r *fakeOpsRepository) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) { - call := r.listRulesCalls.Add(1) - // Delay enabling rules slightly so the test can stop OpsMetricsCollector first, - // then observe the alert evaluator ticking independently. 
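// Hypothetical mini-example, not part of the original file, of the same staging technique used
// below: an atomic call counter lets a fake return "nothing yet" for the first few calls and
// real data afterwards, without any locking in the hot path.
type stagedFake struct {
	calls atomic.Int64
}

func (f *stagedFake) Rules() []OpsAlertRule {
	if f.calls.Add(1) < 5 {
		return nil // pretend no rules are configured yet
	}
	return []OpsAlertRule{{ID: 1, Name: "cpu too high (test)", Enabled: true}}
}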
- if call < 5 { - return nil, nil - } - return []OpsAlertRule{ - { - ID: 1, - Name: "cpu too high (test)", - Enabled: true, - MetricType: OpsMetricCPUUsagePercent, - Operator: ">", - Threshold: 0, - WindowMinutes: 1, - SustainedMinutes: 1, - Severity: "P1", - NotifyEmail: false, - NotifyWebhook: false, - CooldownMinutes: 0, - }, - }, nil -} - -func (r *fakeOpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { - r.mu.Lock() - defer r.mu.Unlock() - if r.activeEvent == nil { - return nil, nil - } - if r.activeEvent.RuleID != ruleID { - return nil, nil - } - if r.activeEvent.Status != OpsAlertStatusFiring { - return nil, nil - } - clone := *r.activeEvent - return &clone, nil -} - -func (r *fakeOpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { - r.mu.Lock() - defer r.mu.Unlock() - if r.latestEvent == nil || r.latestEvent.RuleID != ruleID { - return nil, nil - } - clone := *r.latestEvent - return &clone, nil -} - -func (r *fakeOpsRepository) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error { - if event == nil { - return nil - } - r.mu.Lock() - defer r.mu.Unlock() - - r.nextEventID++ - event.ID = r.nextEventID - - clone := *event - r.latestEvent = &clone - if clone.Status == OpsAlertStatusFiring { - r.activeEvent = &clone - } - - r.eventOnce.Do(func() { close(r.eventCreatedCh) }) - return nil -} - -func (r *fakeOpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { - r.mu.Lock() - defer r.mu.Unlock() - if r.activeEvent != nil && r.activeEvent.ID == eventID { - r.activeEvent.Status = status - r.activeEvent.ResolvedAt = resolvedAt - } - if r.latestEvent != nil && r.latestEvent.ID == eventID { - r.latestEvent.Status = status - r.latestEvent.ResolvedAt = resolvedAt - } - return nil -} - -func (r *fakeOpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error { - r.mu.Lock() - defer r.mu.Unlock() - if r.activeEvent != nil && r.activeEvent.ID == eventID { - r.activeEvent.EmailSent = emailSent - r.activeEvent.WebhookSent = webhookSent - } - if r.latestEvent != nil && r.latestEvent.ID == eventID { - r.latestEvent.EmailSent = emailSent - r.latestEvent.WebhookSent = webhookSent - } - return nil -} - -func (r *fakeOpsRepository) CountActiveAlerts(ctx context.Context) (int, error) { - r.mu.Lock() - defer r.mu.Unlock() - if r.activeEvent == nil { - return 0, nil - } - return 1, nil -} - -func (r *fakeOpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error) { - return &OverviewStats{}, nil -} - -func (r *fakeOpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) { - return nil, nil -} - -func (r *fakeOpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error { - return nil -} - -func (r *fakeOpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) { - return nil, nil -} - -func (r *fakeOpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error { - return nil -} - -func (r *fakeOpsRepository) PingRedis(ctx context.Context) error { - return nil -} diff --git a/backend/internal/service/ops_alert_service_test.go b/backend/internal/service/ops_alert_service_test.go deleted file mode 100644 index ec20d81c..00000000 --- 
a/backend/internal/service/ops_alert_service_test.go +++ /dev/null @@ -1,315 +0,0 @@ -//go:build unit || opsalert_unit - -package service - -import ( - "context" - "errors" - "net" - "net/http" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestSelectContiguousMetrics_Contiguous(t *testing.T) { - now := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) - metrics := []OpsMetrics{ - {UpdatedAt: now}, - {UpdatedAt: now.Add(-1 * time.Minute)}, - {UpdatedAt: now.Add(-2 * time.Minute)}, - } - - selected, ok := selectContiguousMetrics(metrics, 3, now) - require.True(t, ok) - require.Len(t, selected, 3) -} - -func TestSelectContiguousMetrics_GapFails(t *testing.T) { - now := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) - metrics := []OpsMetrics{ - {UpdatedAt: now}, - // Missing the -1m sample (gap ~=2m). - {UpdatedAt: now.Add(-2 * time.Minute)}, - {UpdatedAt: now.Add(-3 * time.Minute)}, - } - - _, ok := selectContiguousMetrics(metrics, 3, now) - require.False(t, ok) -} - -func TestSelectContiguousMetrics_StaleNewestFails(t *testing.T) { - now := time.Date(2026, 1, 1, 0, 10, 0, 0, time.UTC) - metrics := []OpsMetrics{ - {UpdatedAt: now.Add(-10 * time.Minute)}, - {UpdatedAt: now.Add(-11 * time.Minute)}, - } - - _, ok := selectContiguousMetrics(metrics, 2, now) - require.False(t, ok) -} - -func TestMetricValue_SuccessRate_NoTrafficIsNoData(t *testing.T) { - metric := OpsMetrics{ - RequestCount: 0, - SuccessRate: 0, - } - value, ok := metricValue(metric, OpsMetricSuccessRate) - require.False(t, ok) - require.Equal(t, 0.0, value) -} - -func TestOpsAlertService_StopWithoutStart_NoPanic(t *testing.T) { - s := NewOpsAlertService(nil, nil, nil) - require.NotPanics(t, func() { s.Stop() }) -} - -func TestOpsAlertService_StartStop_Graceful(t *testing.T) { - s := NewOpsAlertService(nil, nil, nil) - s.interval = 5 * time.Millisecond - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - s.StartWithContext(ctx) - - done := make(chan struct{}) - go func() { - s.Stop() - close(done) - }() - - select { - case <-done: - // ok - case <-time.After(1 * time.Second): - t.Fatal("Stop did not return; background goroutine likely stuck") - } - - require.NotPanics(t, func() { s.Stop() }) -} - -func TestBuildWebhookHTTPClient_DefaultTimeout(t *testing.T) { - client := buildWebhookHTTPClient(nil, nil) - require.Equal(t, webhookHTTPClientTimeout, client.Timeout) - require.NotNil(t, client.CheckRedirect) - require.ErrorIs(t, client.CheckRedirect(nil, nil), http.ErrUseLastResponse) - - base := &http.Client{} - client = buildWebhookHTTPClient(base, nil) - require.Equal(t, webhookHTTPClientTimeout, client.Timeout) - require.NotNil(t, client.CheckRedirect) - - base = &http.Client{Timeout: 2 * time.Second} - client = buildWebhookHTTPClient(base, nil) - require.Equal(t, 2*time.Second, client.Timeout) - require.NotNil(t, client.CheckRedirect) -} - -func TestValidateWebhookURL_RequiresHTTPS(t *testing.T) { - oldLookup := lookupIPAddrs - t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil - } - - _, err := validateWebhookURL(context.Background(), "http://example.com/webhook") - require.Error(t, err) -} - -func TestValidateWebhookURL_InvalidFormatRejected(t *testing.T) { - _, err := validateWebhookURL(context.Background(), "https://[::1") - require.Error(t, err) -} - -func TestValidateWebhookURL_RejectsUserinfo(t *testing.T) { - oldLookup := lookupIPAddrs - 
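// Illustrative sketch, not part of the original file and with hypothetical names, of the seam
// pattern these tests rely on: production code calls through a package-level function variable
// (here lookupIPAddrs), and a test swaps it out and restores the original via t.Cleanup.
var resolveHost = net.LookupHost

func TestWithStubbedResolver(t *testing.T) {
	orig := resolveHost
	t.Cleanup(func() { resolveHost = orig })
	resolveHost = func(host string) ([]string, error) {
		return []string{"93.184.216.34"}, nil // deterministic answer, no network access
	}
	// ... exercise code that calls resolveHost ...
}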
t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil - } - - _, err := validateWebhookURL(context.Background(), "https://user:pass@example.com/webhook") - require.Error(t, err) -} - -func TestValidateWebhookURL_RejectsLocalhost(t *testing.T) { - _, err := validateWebhookURL(context.Background(), "https://localhost/webhook") - require.Error(t, err) -} - -func TestValidateWebhookURL_RejectsPrivateIPLiteral(t *testing.T) { - cases := []string{ - "https://0.0.0.0/webhook", - "https://127.0.0.1/webhook", - "https://10.0.0.1/webhook", - "https://192.168.1.2/webhook", - "https://172.16.0.1/webhook", - "https://172.31.255.255/webhook", - "https://100.64.0.1/webhook", - "https://169.254.169.254/webhook", - "https://198.18.0.1/webhook", - "https://224.0.0.1/webhook", - "https://240.0.0.1/webhook", - "https://[::]/webhook", - "https://[::1]/webhook", - "https://[ff02::1]/webhook", - } - for _, tc := range cases { - t.Run(tc, func(t *testing.T) { - _, err := validateWebhookURL(context.Background(), tc) - require.Error(t, err) - }) - } -} - -func TestValidateWebhookURL_RejectsPrivateIPViaDNS(t *testing.T) { - oldLookup := lookupIPAddrs - t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - require.Equal(t, "internal.example", host) - return []net.IPAddr{{IP: net.ParseIP("10.0.0.2")}}, nil - } - - _, err := validateWebhookURL(context.Background(), "https://internal.example/webhook") - require.Error(t, err) -} - -func TestValidateWebhookURL_RejectsLinkLocalIPViaDNS(t *testing.T) { - oldLookup := lookupIPAddrs - t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - require.Equal(t, "metadata.example", host) - return []net.IPAddr{{IP: net.ParseIP("169.254.169.254")}}, nil - } - - _, err := validateWebhookURL(context.Background(), "https://metadata.example/webhook") - require.Error(t, err) -} - -func TestValidateWebhookURL_AllowsPublicHostViaDNS(t *testing.T) { - oldLookup := lookupIPAddrs - t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - require.Equal(t, "example.com", host) - return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil - } - - target, err := validateWebhookURL(context.Background(), "https://example.com:443/webhook") - require.NoError(t, err) - require.Equal(t, "https", target.URL.Scheme) - require.Equal(t, "example.com", target.URL.Hostname()) - require.Equal(t, "443", target.URL.Port()) -} - -func TestValidateWebhookURL_RejectsInvalidPort(t *testing.T) { - oldLookup := lookupIPAddrs - t.Cleanup(func() { lookupIPAddrs = oldLookup }) - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil - } - - _, err := validateWebhookURL(context.Background(), "https://example.com:99999/webhook") - require.Error(t, err) -} - -func TestWebhookTransport_UsesPinnedIP_NoDNSRebinding(t *testing.T) { - oldLookup := lookupIPAddrs - oldDial := webhookBaseDialContext - t.Cleanup(func() { - lookupIPAddrs = oldLookup - webhookBaseDialContext = oldDial - }) - - lookupCalls := 0 - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - lookupCalls++ - require.Equal(t, "example.com", host) - return []net.IPAddr{{IP: 
net.ParseIP("93.184.216.34")}}, nil - } - - target, err := validateWebhookURL(context.Background(), "https://example.com/webhook") - require.NoError(t, err) - require.Equal(t, 1, lookupCalls) - - lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) { - lookupCalls++ - return []net.IPAddr{{IP: net.ParseIP("10.0.0.1")}}, nil - } - - var dialAddrs []string - webhookBaseDialContext = func(ctx context.Context, network, addr string) (net.Conn, error) { - dialAddrs = append(dialAddrs, addr) - return nil, errors.New("dial blocked in test") - } - - client := buildWebhookHTTPClient(nil, target) - transport, ok := client.Transport.(*http.Transport) - require.True(t, ok) - - _, err = transport.DialContext(context.Background(), "tcp", "example.com:443") - require.Error(t, err) - require.Equal(t, []string{"93.184.216.34:443"}, dialAddrs) - require.Equal(t, 1, lookupCalls, "dial path must not re-resolve DNS") -} - -func TestRetryWithBackoff_SucceedsAfterRetries(t *testing.T) { - oldSleep := opsAlertSleep - t.Cleanup(func() { opsAlertSleep = oldSleep }) - - var slept []time.Duration - opsAlertSleep = func(ctx context.Context, d time.Duration) error { - slept = append(slept, d) - return nil - } - - attempts := 0 - err := retryWithBackoff( - context.Background(), - 3, - []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}, - func() error { - attempts++ - if attempts <= 3 { - return errors.New("send failed") - } - return nil - }, - nil, - ) - require.NoError(t, err) - require.Equal(t, 4, attempts) - require.Equal(t, []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}, slept) -} - -func TestRetryWithBackoff_ContextCanceledStopsRetries(t *testing.T) { - oldSleep := opsAlertSleep - t.Cleanup(func() { opsAlertSleep = oldSleep }) - - var slept []time.Duration - opsAlertSleep = func(ctx context.Context, d time.Duration) error { - slept = append(slept, d) - return ctx.Err() - } - - ctx, cancel := context.WithCancel(context.Background()) - attempts := 0 - err := retryWithBackoff( - ctx, - 3, - []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}, - func() error { - attempts++ - return errors.New("send failed") - }, - func(attempt int, total int, nextDelay time.Duration, err error) { - if attempt == 1 { - cancel() - } - }, - ) - require.ErrorIs(t, err, context.Canceled) - require.Equal(t, 1, attempts) - require.Equal(t, []time.Duration{time.Second}, slept) -} diff --git a/backend/internal/service/ops_alerts.go b/backend/internal/service/ops_alerts.go deleted file mode 100644 index 0a239864..00000000 --- a/backend/internal/service/ops_alerts.go +++ /dev/null @@ -1,92 +0,0 @@ -package service - -import ( - "context" - "time" -) - -const ( - OpsAlertStatusFiring = "firing" - OpsAlertStatusResolved = "resolved" -) - -const ( - OpsMetricSuccessRate = "success_rate" - OpsMetricErrorRate = "error_rate" - OpsMetricP95LatencyMs = "p95_latency_ms" - OpsMetricP99LatencyMs = "p99_latency_ms" - OpsMetricHTTP2Errors = "http2_errors" - OpsMetricCPUUsagePercent = "cpu_usage_percent" - OpsMetricMemoryUsagePercent = "memory_usage_percent" - OpsMetricQueueDepth = "concurrency_queue_depth" -) - -type OpsAlertRule struct { - ID int64 `json:"id"` - Name string `json:"name"` - Description string `json:"description"` - Enabled bool `json:"enabled"` - MetricType string `json:"metric_type"` - Operator string `json:"operator"` - Threshold float64 `json:"threshold"` - WindowMinutes int `json:"window_minutes"` - SustainedMinutes int `json:"sustained_minutes"` - Severity string 
`json:"severity"` - NotifyEmail bool `json:"notify_email"` - NotifyWebhook bool `json:"notify_webhook"` - WebhookURL string `json:"webhook_url"` - CooldownMinutes int `json:"cooldown_minutes"` - DimensionFilters map[string]any `json:"dimension_filters,omitempty"` - NotifyChannels []string `json:"notify_channels,omitempty"` - NotifyConfig map[string]any `json:"notify_config,omitempty"` - CreatedAt time.Time `json:"created_at"` - UpdatedAt time.Time `json:"updated_at"` -} - -type OpsAlertEvent struct { - ID int64 `json:"id"` - RuleID int64 `json:"rule_id"` - Severity string `json:"severity"` - Status string `json:"status"` - Title string `json:"title"` - Description string `json:"description"` - MetricValue float64 `json:"metric_value"` - ThresholdValue float64 `json:"threshold_value"` - FiredAt time.Time `json:"fired_at"` - ResolvedAt *time.Time `json:"resolved_at"` - EmailSent bool `json:"email_sent"` - WebhookSent bool `json:"webhook_sent"` - CreatedAt time.Time `json:"created_at"` -} - -func (s *OpsService) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) { - return s.repo.ListAlertRules(ctx) -} - -func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { - return s.repo.GetActiveAlertEvent(ctx, ruleID) -} - -func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) { - return s.repo.GetLatestAlertEvent(ctx, ruleID) -} - -func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error { - return s.repo.CreateAlertEvent(ctx, event) -} - -func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error { - return s.repo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt) -} - -func (s *OpsService) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error { - return s.repo.UpdateAlertEventNotifications(ctx, eventID, emailSent, webhookSent) -} - -func (s *OpsService) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) { - return s.repo.ListRecentSystemMetrics(ctx, windowMinutes, limit) -} - -func (s *OpsService) CountActiveAlerts(ctx context.Context) (int, error) { - return s.repo.CountActiveAlerts(ctx) -} diff --git a/backend/internal/service/ops_metrics_collector.go b/backend/internal/service/ops_metrics_collector.go deleted file mode 100644 index 01bd4596..00000000 --- a/backend/internal/service/ops_metrics_collector.go +++ /dev/null @@ -1,203 +0,0 @@ -package service - -import ( - "context" - "log" - "runtime" - "sync" - "time" - - "github.com/shirou/gopsutil/v4/cpu" - "github.com/shirou/gopsutil/v4/mem" -) - -const ( - opsMetricsInterval = 1 * time.Minute - opsMetricsCollectTimeout = 10 * time.Second - - opsMetricsWindowShortMinutes = 1 - opsMetricsWindowLongMinutes = 5 - - bytesPerMB = 1024 * 1024 - cpuUsageSampleInterval = 0 * time.Second - - percentScale = 100 -) - -type OpsMetricsCollector struct { - opsService *OpsService - concurrencyService *ConcurrencyService - interval time.Duration - lastGCPauseTotal uint64 - lastGCPauseMu sync.Mutex - stopCh chan struct{} - startOnce sync.Once - stopOnce sync.Once -} - -func NewOpsMetricsCollector(opsService *OpsService, concurrencyService *ConcurrencyService) *OpsMetricsCollector { - return &OpsMetricsCollector{ - opsService: opsService, - concurrencyService: concurrencyService, - interval: opsMetricsInterval, - } -} - -func (c *OpsMetricsCollector) Start() { - if c == nil { - 
return - } - c.startOnce.Do(func() { - if c.stopCh == nil { - c.stopCh = make(chan struct{}) - } - go c.run() - }) -} - -func (c *OpsMetricsCollector) Stop() { - if c == nil { - return - } - c.stopOnce.Do(func() { - if c.stopCh != nil { - close(c.stopCh) - } - }) -} - -func (c *OpsMetricsCollector) run() { - ticker := time.NewTicker(c.interval) - defer ticker.Stop() - - c.collectOnce() - for { - select { - case <-ticker.C: - c.collectOnce() - case <-c.stopCh: - return - } - } -} - -func (c *OpsMetricsCollector) collectOnce() { - if c.opsService == nil { - return - } - - ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectTimeout) - defer cancel() - - now := time.Now() - systemStats := c.collectSystemStats(ctx) - queueDepth := c.collectQueueDepth(ctx) - activeAlerts := c.collectActiveAlerts(ctx) - - for _, window := range []int{opsMetricsWindowShortMinutes, opsMetricsWindowLongMinutes} { - startTime := now.Add(-time.Duration(window) * time.Minute) - windowStats, err := c.opsService.GetWindowStats(ctx, startTime, now) - if err != nil { - log.Printf("[OpsMetrics] failed to get window stats (%dm): %v", window, err) - continue - } - - successRate, errorRate := computeRates(windowStats.SuccessCount, windowStats.ErrorCount) - requestCount := windowStats.SuccessCount + windowStats.ErrorCount - metric := &OpsMetrics{ - WindowMinutes: window, - RequestCount: requestCount, - SuccessCount: windowStats.SuccessCount, - ErrorCount: windowStats.ErrorCount, - SuccessRate: successRate, - ErrorRate: errorRate, - P95LatencyMs: windowStats.P95LatencyMs, - P99LatencyMs: windowStats.P99LatencyMs, - HTTP2Errors: windowStats.HTTP2Errors, - ActiveAlerts: activeAlerts, - CPUUsagePercent: systemStats.cpuUsage, - MemoryUsedMB: systemStats.memoryUsedMB, - MemoryTotalMB: systemStats.memoryTotalMB, - MemoryUsagePercent: systemStats.memoryUsagePercent, - HeapAllocMB: systemStats.heapAllocMB, - GCPauseMs: systemStats.gcPauseMs, - ConcurrencyQueueDepth: queueDepth, - UpdatedAt: now, - } - - if err := c.opsService.RecordMetrics(ctx, metric); err != nil { - log.Printf("[OpsMetrics] failed to record metrics (%dm): %v", window, err) - } - } - -} - -func computeRates(successCount, errorCount int64) (float64, float64) { - total := successCount + errorCount - if total == 0 { - // No traffic => no data. Rates are kept at 0 and request_count will be 0. - // The UI should render this as N/A instead of "100% success". 
- return 0, 0 - } - successRate := float64(successCount) / float64(total) * percentScale - errorRate := float64(errorCount) / float64(total) * percentScale - return successRate, errorRate -} - -type opsSystemStats struct { - cpuUsage float64 - memoryUsedMB int64 - memoryTotalMB int64 - memoryUsagePercent float64 - heapAllocMB int64 - gcPauseMs float64 -} - -func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) opsSystemStats { - stats := opsSystemStats{} - - if percents, err := cpu.PercentWithContext(ctx, cpuUsageSampleInterval, false); err == nil && len(percents) > 0 { - stats.cpuUsage = percents[0] - } - - if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil { - stats.memoryUsedMB = int64(vm.Used / bytesPerMB) - stats.memoryTotalMB = int64(vm.Total / bytesPerMB) - stats.memoryUsagePercent = vm.UsedPercent - } - - var memStats runtime.MemStats - runtime.ReadMemStats(&memStats) - stats.heapAllocMB = int64(memStats.HeapAlloc / bytesPerMB) - c.lastGCPauseMu.Lock() - if c.lastGCPauseTotal != 0 && memStats.PauseTotalNs >= c.lastGCPauseTotal { - stats.gcPauseMs = float64(memStats.PauseTotalNs-c.lastGCPauseTotal) / float64(time.Millisecond) - } - c.lastGCPauseTotal = memStats.PauseTotalNs - c.lastGCPauseMu.Unlock() - - return stats -} - -func (c *OpsMetricsCollector) collectQueueDepth(ctx context.Context) int { - if c.concurrencyService == nil { - return 0 - } - depth, err := c.concurrencyService.GetTotalWaitCount(ctx) - if err != nil { - log.Printf("[OpsMetrics] failed to get queue depth: %v", err) - return 0 - } - return depth -} - -func (c *OpsMetricsCollector) collectActiveAlerts(ctx context.Context) int { - if c.opsService == nil { - return 0 - } - count, err := c.opsService.CountActiveAlerts(ctx) - if err != nil { - return 0 - } - return count -} diff --git a/backend/internal/service/ops_service.go b/backend/internal/service/ops_service.go deleted file mode 100644 index 63a539d4..00000000 --- a/backend/internal/service/ops_service.go +++ /dev/null @@ -1,1020 +0,0 @@ -package service - -import ( - "context" - "database/sql" - "errors" - "fmt" - "log" - "math" - "runtime" - "strings" - "sync" - "time" - - "github.com/shirou/gopsutil/v4/disk" -) - -type OpsMetrics struct { - WindowMinutes int `json:"window_minutes"` - RequestCount int64 `json:"request_count"` - SuccessCount int64 `json:"success_count"` - ErrorCount int64 `json:"error_count"` - SuccessRate float64 `json:"success_rate"` - ErrorRate float64 `json:"error_rate"` - P95LatencyMs int `json:"p95_latency_ms"` - P99LatencyMs int `json:"p99_latency_ms"` - HTTP2Errors int `json:"http2_errors"` - ActiveAlerts int `json:"active_alerts"` - CPUUsagePercent float64 `json:"cpu_usage_percent"` - MemoryUsedMB int64 `json:"memory_used_mb"` - MemoryTotalMB int64 `json:"memory_total_mb"` - MemoryUsagePercent float64 `json:"memory_usage_percent"` - HeapAllocMB int64 `json:"heap_alloc_mb"` - GCPauseMs float64 `json:"gc_pause_ms"` - ConcurrencyQueueDepth int `json:"concurrency_queue_depth"` - UpdatedAt time.Time `json:"updated_at,omitempty"` -} - -type OpsErrorLog struct { - ID int64 `json:"id"` - CreatedAt time.Time `json:"created_at"` - Phase string `json:"phase"` - Type string `json:"type"` - Severity string `json:"severity"` - StatusCode int `json:"status_code"` - Platform string `json:"platform"` - Model string `json:"model"` - LatencyMs *int `json:"latency_ms"` - RequestID string `json:"request_id"` - Message string `json:"message"` - - UserID *int64 `json:"user_id,omitempty"` - APIKeyID *int64 `json:"api_key_id,omitempty"` 
- AccountID *int64 `json:"account_id,omitempty"` - GroupID *int64 `json:"group_id,omitempty"` - ClientIP string `json:"client_ip,omitempty"` - RequestPath string `json:"request_path,omitempty"` - Stream bool `json:"stream"` -} - -type OpsErrorLogFilters struct { - StartTime *time.Time - EndTime *time.Time - Platform string - Phase string - Severity string - Query string - Limit int -} - -type OpsWindowStats struct { - SuccessCount int64 - ErrorCount int64 - P95LatencyMs int - P99LatencyMs int - HTTP2Errors int -} - -type ProviderStats struct { - Platform string - - RequestCount int64 - SuccessCount int64 - ErrorCount int64 - - AvgLatencyMs int - P99LatencyMs int - - Error4xxCount int64 - Error5xxCount int64 - TimeoutCount int64 -} - -type ProviderHealthErrorsByType struct { - HTTP4xx int64 `json:"4xx"` - HTTP5xx int64 `json:"5xx"` - Timeout int64 `json:"timeout"` -} - -type ProviderHealthData struct { - Name string `json:"name"` - RequestCount int64 `json:"request_count"` - SuccessRate float64 `json:"success_rate"` - ErrorRate float64 `json:"error_rate"` - LatencyAvg int `json:"latency_avg"` - LatencyP99 int `json:"latency_p99"` - Status string `json:"status"` - ErrorsByType ProviderHealthErrorsByType `json:"errors_by_type"` -} - -type LatencyHistogramItem struct { - Range string `json:"range"` - Count int64 `json:"count"` - Percentage float64 `json:"percentage"` -} - -type ErrorDistributionItem struct { - Code string `json:"code"` - Message string `json:"message"` - Count int64 `json:"count"` - Percentage float64 `json:"percentage"` -} - -type OpsRepository interface { - CreateErrorLog(ctx context.Context, log *OpsErrorLog) error - // ListErrorLogsLegacy keeps the original non-paginated query API used by the - // existing /api/v1/admin/ops/error-logs endpoint (limit is capped at 500; for - // stable pagination use /api/v1/admin/ops/errors). - ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) - - // ListErrorLogs provides a paginated error-log query API (with total count). 
- ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error) - GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) - CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error - GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) - GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error) - GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error) - GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error) - ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) - ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) - ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) - GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) - GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) - CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error - UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error - UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error - CountActiveAlerts(ctx context.Context) (int, error) - GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error) - - // Redis-backed cache/health (best-effort; implementation lives in repository layer). - GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) - SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error - GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) - SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error - PingRedis(ctx context.Context) error -} - -type OpsService struct { - repo OpsRepository - sqlDB *sql.DB - - redisNilWarnOnce sync.Once - dbNilWarnOnce sync.Once -} - -const opsDBQueryTimeout = 5 * time.Second - -func NewOpsService(repo OpsRepository, sqlDB *sql.DB) *OpsService { - svc := &OpsService{repo: repo, sqlDB: sqlDB} - - // Best-effort startup health checks: log warnings if Redis/DB is unavailable, - // but never fail service startup (graceful degradation). 
- log.Printf("[OpsService] Performing startup health checks...") - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - redisStatus := svc.checkRedisHealth(ctx) - dbStatus := svc.checkDatabaseHealth(ctx) - - log.Printf("[OpsService] Startup health check complete: Redis=%s, Database=%s", redisStatus, dbStatus) - if redisStatus == "critical" || dbStatus == "critical" { - log.Printf("[OpsService][WARN] Service starting with degraded dependencies - some features may be unavailable") - } - - return svc -} - -func (s *OpsService) RecordError(ctx context.Context, log *OpsErrorLog) error { - if log == nil { - return nil - } - if log.CreatedAt.IsZero() { - log.CreatedAt = time.Now() - } - if log.Severity == "" { - log.Severity = "P2" - } - if log.Phase == "" { - log.Phase = "internal" - } - if log.Type == "" { - log.Type = "unknown_error" - } - if log.Message == "" { - log.Message = "Unknown error" - } - - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - return s.repo.CreateErrorLog(ctxDB, log) -} - -func (s *OpsService) RecordMetrics(ctx context.Context, metric *OpsMetrics) error { - if metric == nil { - return nil - } - if metric.UpdatedAt.IsZero() { - metric.UpdatedAt = time.Now() - } - - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - if err := s.repo.CreateSystemMetric(ctxDB, metric); err != nil { - return err - } - - // Latest metrics snapshot is queried frequently by the ops dashboard; keep a short-lived cache - // to avoid unnecessary DB pressure. Only cache the default (1-minute) window metrics. - windowMinutes := metric.WindowMinutes - if windowMinutes == 0 { - windowMinutes = 1 - } - if windowMinutes == 1 { - if repo := s.repo; repo != nil { - _ = repo.SetCachedLatestSystemMetric(ctx, metric) - } - } - return nil -} - -func (s *OpsService) ListErrorLogs(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, int, error) { - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - logs, err := s.repo.ListErrorLogsLegacy(ctxDB, filters) - if err != nil { - return nil, 0, err - } - return logs, len(logs), nil -} - -func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - return s.repo.GetWindowStats(ctxDB, startTime, endTime) -} - -func (s *OpsService) GetLatestMetrics(ctx context.Context) (*OpsMetrics, error) { - // Cache first (best-effort): cache errors should not break the dashboard. - if s != nil { - if repo := s.repo; repo != nil { - if cached, err := repo.GetCachedLatestSystemMetric(ctx); err == nil && cached != nil { - if cached.WindowMinutes == 0 { - cached.WindowMinutes = 1 - } - return cached, nil - } - } - } - - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - metric, err := s.repo.GetLatestSystemMetric(ctxDB) - if err != nil { - if errors.Is(err, sql.ErrNoRows) { - return &OpsMetrics{WindowMinutes: 1}, nil - } - return nil, err - } - if metric == nil { - return &OpsMetrics{WindowMinutes: 1}, nil - } - if metric.WindowMinutes == 0 { - metric.WindowMinutes = 1 - } - - // Backfill cache (best-effort). 
- if s != nil { - if repo := s.repo; repo != nil { - _ = repo.SetCachedLatestSystemMetric(ctx, metric) - } - } - return metric, nil -} - -func (s *OpsService) ListMetricsHistory(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) { - if s == nil || s.repo == nil { - return nil, nil - } - if windowMinutes <= 0 { - windowMinutes = 1 - } - if limit <= 0 || limit > 5000 { - limit = 300 - } - if endTime.IsZero() { - endTime = time.Now() - } - if startTime.IsZero() { - startTime = endTime.Add(-time.Duration(limit) * opsMetricsInterval) - } - if startTime.After(endTime) { - startTime, endTime = endTime, startTime - } - - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - return s.repo.ListSystemMetricsRange(ctxDB, windowMinutes, startTime, endTime, limit) -} - -// DashboardOverviewData represents aggregated metrics for the ops dashboard overview. -type DashboardOverviewData struct { - Timestamp time.Time `json:"timestamp"` - HealthScore int `json:"health_score"` - SLA SLAData `json:"sla"` - QPS QPSData `json:"qps"` - TPS TPSData `json:"tps"` - Latency LatencyData `json:"latency"` - Errors ErrorData `json:"errors"` - Resources ResourceData `json:"resources"` - SystemStatus SystemStatusData `json:"system_status"` -} - -type SLAData struct { - Current float64 `json:"current"` - Threshold float64 `json:"threshold"` - Status string `json:"status"` - Trend string `json:"trend"` - Change24h float64 `json:"change_24h"` -} - -type QPSData struct { - Current float64 `json:"current"` - Peak1h float64 `json:"peak_1h"` - Avg1h float64 `json:"avg_1h"` - ChangeVsYesterday float64 `json:"change_vs_yesterday"` -} - -type TPSData struct { - Current float64 `json:"current"` - Peak1h float64 `json:"peak_1h"` - Avg1h float64 `json:"avg_1h"` -} - -type LatencyData struct { - P50 int `json:"p50"` - P95 int `json:"p95"` - P99 int `json:"p99"` - P999 int `json:"p999"` - Avg int `json:"avg"` - Max int `json:"max"` - ThresholdP99 int `json:"threshold_p99"` - Status string `json:"status"` -} - -type ErrorData struct { - TotalCount int64 `json:"total_count"` - ErrorRate float64 `json:"error_rate"` - Count4xx int64 `json:"4xx_count"` - Count5xx int64 `json:"5xx_count"` - TimeoutCount int64 `json:"timeout_count"` - TopError *TopError `json:"top_error,omitempty"` -} - -type TopError struct { - Code string `json:"code"` - Message string `json:"message"` - Count int64 `json:"count"` -} - -type ResourceData struct { - CPUUsage float64 `json:"cpu_usage"` - MemoryUsage float64 `json:"memory_usage"` - DiskUsage float64 `json:"disk_usage"` - Goroutines int `json:"goroutines"` - DBConnections DBConnectionsData `json:"db_connections"` -} - -type DBConnectionsData struct { - Active int `json:"active"` - Idle int `json:"idle"` - Waiting int `json:"waiting"` - Max int `json:"max"` -} - -type SystemStatusData struct { - Redis string `json:"redis"` - Database string `json:"database"` - BackgroundJobs string `json:"background_jobs"` -} - -type OverviewStats struct { - RequestCount int64 - SuccessCount int64 - ErrorCount int64 - Error4xxCount int64 - Error5xxCount int64 - TimeoutCount int64 - LatencyP50 int - LatencyP95 int - LatencyP99 int - LatencyP999 int - LatencyAvg int - LatencyMax int - TopErrorCode string - TopErrorMsg string - TopErrorCount int64 - CPUUsage float64 - MemoryUsage float64 - MemoryUsedMB int64 - MemoryTotalMB int64 - ConcurrencyQueueDepth int -} - -func (s *OpsService) GetDashboardOverview(ctx context.Context, timeRange string) 
(*DashboardOverviewData, error) { - if s == nil { - return nil, errors.New("ops service not initialized") - } - repo := s.repo - if repo == nil { - return nil, errors.New("ops repository not initialized") - } - if s.sqlDB == nil { - return nil, errors.New("ops service not initialized") - } - if strings.TrimSpace(timeRange) == "" { - timeRange = "1h" - } - - duration, err := parseTimeRange(timeRange) - if err != nil { - return nil, err - } - - if cached, err := repo.GetCachedDashboardOverview(ctx, timeRange); err == nil && cached != nil { - return cached, nil - } - - now := time.Now().UTC() - startTime := now.Add(-duration) - - ctxStats, cancelStats := context.WithTimeout(ctx, opsDBQueryTimeout) - stats, err := repo.GetOverviewStats(ctxStats, startTime, now) - cancelStats() - if err != nil { - return nil, fmt.Errorf("get overview stats: %w", err) - } - if stats == nil { - return nil, errors.New("get overview stats returned nil") - } - - var statsYesterday *OverviewStats - { - yesterdayEnd := now.Add(-24 * time.Hour) - yesterdayStart := yesterdayEnd.Add(-duration) - ctxYesterday, cancelYesterday := context.WithTimeout(ctx, opsDBQueryTimeout) - ys, err := repo.GetOverviewStats(ctxYesterday, yesterdayStart, yesterdayEnd) - cancelYesterday() - if err != nil { - // Best-effort: overview should still work when historical comparison fails. - log.Printf("[OpsOverview] get yesterday overview stats failed: %v", err) - } else { - statsYesterday = ys - } - } - - totalReqs := stats.SuccessCount + stats.ErrorCount - successRate, errorRate := calculateRates(stats.SuccessCount, stats.ErrorCount, totalReqs) - - successRateYesterday := 0.0 - totalReqsYesterday := int64(0) - if statsYesterday != nil { - totalReqsYesterday = statsYesterday.SuccessCount + statsYesterday.ErrorCount - successRateYesterday, _ = calculateRates(statsYesterday.SuccessCount, statsYesterday.ErrorCount, totalReqsYesterday) - } - - slaThreshold := 99.9 - slaChange24h := roundTo2DP(successRate - successRateYesterday) - slaTrend := classifyTrend(slaChange24h, 0.05) - slaStatus := classifySLAStatus(successRate, slaThreshold) - - latencyThresholdP99 := 1000 - latencyStatus := classifyLatencyStatus(stats.LatencyP99, latencyThresholdP99) - - qpsCurrent := 0.0 - { - ctxWindow, cancelWindow := context.WithTimeout(ctx, opsDBQueryTimeout) - windowStats, err := repo.GetWindowStats(ctxWindow, now.Add(-1*time.Minute), now) - cancelWindow() - if err == nil && windowStats != nil { - qpsCurrent = roundTo1DP(float64(windowStats.SuccessCount+windowStats.ErrorCount) / 60) - } else if err != nil { - log.Printf("[OpsOverview] get realtime qps failed: %v", err) - } - } - - qpsAvg := roundTo1DP(safeDivide(float64(totalReqs), duration.Seconds())) - qpsPeak := qpsAvg - { - limit := int(duration.Minutes()) + 5 - if limit < 10 { - limit = 10 - } - if limit > 5000 { - limit = 5000 - } - ctxMetrics, cancelMetrics := context.WithTimeout(ctx, opsDBQueryTimeout) - items, err := repo.ListSystemMetricsRange(ctxMetrics, 1, startTime, now, limit) - cancelMetrics() - if err != nil { - log.Printf("[OpsOverview] get metrics range for peak qps failed: %v", err) - } else { - maxQPS := 0.0 - for _, item := range items { - v := float64(item.RequestCount) / 60 - if v > maxQPS { - maxQPS = v - } - } - if maxQPS > 0 { - qpsPeak = roundTo1DP(maxQPS) - } - } - } - - qpsAvgYesterday := 0.0 - if duration.Seconds() > 0 && totalReqsYesterday > 0 { - qpsAvgYesterday = float64(totalReqsYesterday) / duration.Seconds() - } - qpsChangeVsYesterday := roundTo1DP(percentChange(qpsAvgYesterday, 
float64(totalReqs)/duration.Seconds())) - - tpsCurrent, tpsPeak, tpsAvg := 0.0, 0.0, 0.0 - if current, peak, avg, err := s.getTokenTPS(ctx, now, startTime, duration); err != nil { - log.Printf("[OpsOverview] get token tps failed: %v", err) - } else { - tpsCurrent, tpsPeak, tpsAvg = roundTo1DP(current), roundTo1DP(peak), roundTo1DP(avg) - } - - diskUsage := 0.0 - if v, err := getDiskUsagePercent(ctx, "/"); err != nil { - log.Printf("[OpsOverview] get disk usage failed: %v", err) - } else { - diskUsage = roundTo1DP(v) - } - - redisStatus := s.checkRedisHealth(ctx) - dbStatus := s.checkDatabaseHealth(ctx) - healthScore := calculateHealthScore(successRate, stats.LatencyP99, errorRate, redisStatus, dbStatus) - - data := &DashboardOverviewData{ - Timestamp: now, - HealthScore: healthScore, - SLA: SLAData{ - Current: successRate, - Threshold: slaThreshold, - Status: slaStatus, - Trend: slaTrend, - Change24h: slaChange24h, - }, - QPS: QPSData{ - Current: qpsCurrent, - Peak1h: qpsPeak, - Avg1h: qpsAvg, - ChangeVsYesterday: qpsChangeVsYesterday, - }, - TPS: TPSData{ - Current: tpsCurrent, - Peak1h: tpsPeak, - Avg1h: tpsAvg, - }, - Latency: LatencyData{ - P50: stats.LatencyP50, - P95: stats.LatencyP95, - P99: stats.LatencyP99, - P999: stats.LatencyP999, - Avg: stats.LatencyAvg, - Max: stats.LatencyMax, - ThresholdP99: latencyThresholdP99, - Status: latencyStatus, - }, - Errors: ErrorData{ - TotalCount: stats.ErrorCount, - ErrorRate: errorRate, - Count4xx: stats.Error4xxCount, - Count5xx: stats.Error5xxCount, - TimeoutCount: stats.TimeoutCount, - }, - Resources: ResourceData{ - CPUUsage: roundTo1DP(stats.CPUUsage), - MemoryUsage: roundTo1DP(stats.MemoryUsage), - DiskUsage: diskUsage, - Goroutines: runtime.NumGoroutine(), - DBConnections: s.getDBConnections(), - }, - SystemStatus: SystemStatusData{ - Redis: redisStatus, - Database: dbStatus, - BackgroundJobs: "healthy", - }, - } - - if stats.TopErrorCount > 0 { - data.Errors.TopError = &TopError{ - Code: stats.TopErrorCode, - Message: stats.TopErrorMsg, - Count: stats.TopErrorCount, - } - } - - _ = repo.SetCachedDashboardOverview(ctx, timeRange, data, 10*time.Second) - - return data, nil -} - -func (s *OpsService) GetProviderHealth(ctx context.Context, timeRange string) ([]*ProviderHealthData, error) { - if s == nil || s.repo == nil { - return nil, nil - } - - if strings.TrimSpace(timeRange) == "" { - timeRange = "1h" - } - window, err := parseTimeRange(timeRange) - if err != nil { - return nil, err - } - - endTime := time.Now() - startTime := endTime.Add(-window) - - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - stats, err := s.repo.GetProviderStats(ctxDB, startTime, endTime) - cancel() - if err != nil { - return nil, err - } - - results := make([]*ProviderHealthData, 0, len(stats)) - for _, item := range stats { - if item == nil { - continue - } - - successRate, errorRate := calculateRates(item.SuccessCount, item.ErrorCount, item.RequestCount) - - results = append(results, &ProviderHealthData{ - Name: formatPlatformName(item.Platform), - RequestCount: item.RequestCount, - SuccessRate: successRate, - ErrorRate: errorRate, - LatencyAvg: item.AvgLatencyMs, - LatencyP99: item.P99LatencyMs, - Status: classifyProviderStatus(successRate, item.P99LatencyMs, item.TimeoutCount, item.RequestCount), - ErrorsByType: ProviderHealthErrorsByType{ - HTTP4xx: item.Error4xxCount, - HTTP5xx: item.Error5xxCount, - Timeout: item.TimeoutCount, - }, - }) - } - - return results, nil -} - -func (s *OpsService) GetLatencyHistogram(ctx context.Context, 
timeRange string) ([]*LatencyHistogramItem, error) { - if s == nil || s.repo == nil { - return nil, nil - } - duration, err := parseTimeRange(timeRange) - if err != nil { - return nil, err - } - endTime := time.Now() - startTime := endTime.Add(-duration) - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - return s.repo.GetLatencyHistogram(ctxDB, startTime, endTime) -} - -func (s *OpsService) GetErrorDistribution(ctx context.Context, timeRange string) ([]*ErrorDistributionItem, error) { - if s == nil || s.repo == nil { - return nil, nil - } - duration, err := parseTimeRange(timeRange) - if err != nil { - return nil, err - } - endTime := time.Now() - startTime := endTime.Add(-duration) - ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - defer cancel() - return s.repo.GetErrorDistribution(ctxDB, startTime, endTime) -} - -func parseTimeRange(timeRange string) (time.Duration, error) { - value := strings.TrimSpace(timeRange) - if value == "" { - return 0, errors.New("invalid time range") - } - - // Support "7d" style day ranges for convenience. - if strings.HasSuffix(value, "d") { - numberPart := strings.TrimSuffix(value, "d") - if numberPart == "" { - return 0, errors.New("invalid time range") - } - days := 0 - for _, ch := range numberPart { - if ch < '0' || ch > '9' { - return 0, errors.New("invalid time range") - } - days = days*10 + int(ch-'0') - } - if days <= 0 { - return 0, errors.New("invalid time range") - } - return time.Duration(days) * 24 * time.Hour, nil - } - - dur, err := time.ParseDuration(value) - if err != nil || dur <= 0 { - return 0, errors.New("invalid time range") - } - - // Cap to avoid unbounded queries. - const maxWindow = 30 * 24 * time.Hour - if dur > maxWindow { - dur = maxWindow - } - - return dur, nil -} - -func calculateHealthScore(successRate float64, p99Latency int, errorRate float64, redisStatus, dbStatus string) int { - score := 100.0 - - // SLA impact (max -45 points) - if successRate < 99.9 { - score -= math.Min(45, (99.9-successRate)*12) - } - - // Latency impact (max -35 points) - if p99Latency > 1000 { - score -= math.Min(35, float64(p99Latency-1000)/80) - } - - // Error rate impact (max -20 points) - if errorRate > 0.1 { - score -= math.Min(20, (errorRate-0.1)*60) - } - - // Infra status impact - if redisStatus != "healthy" { - score -= 15 - } - if dbStatus != "healthy" { - score -= 20 - } - - if score < 0 { - score = 0 - } - if score > 100 { - score = 100 - } - - return int(math.Round(score)) -} - -func calculateRates(successCount, errorCount, requestCount int64) (successRate float64, errorRate float64) { - if requestCount <= 0 { - return 0, 0 - } - successRate = (float64(successCount) / float64(requestCount)) * 100 - errorRate = (float64(errorCount) / float64(requestCount)) * 100 - return roundTo2DP(successRate), roundTo2DP(errorRate) -} - -func roundTo2DP(v float64) float64 { - return math.Round(v*100) / 100 -} - -func roundTo1DP(v float64) float64 { - return math.Round(v*10) / 10 -} - -func safeDivide(numerator float64, denominator float64) float64 { - if denominator <= 0 { - return 0 - } - return numerator / denominator -} - -func percentChange(previous float64, current float64) float64 { - if previous == 0 { - if current > 0 { - return 100.0 - } - return 0 - } - return (current - previous) / previous * 100 -} - -func classifyTrend(delta float64, deadband float64) string { - if delta > deadband { - return "up" - } - if delta < -deadband { - return "down" - } - return "stable" -} - -func 
classifySLAStatus(successRate float64, threshold float64) string { - if successRate >= threshold { - return "healthy" - } - if successRate >= threshold-0.5 { - return "warning" - } - return "critical" -} - -func classifyLatencyStatus(p99LatencyMs int, thresholdP99 int) string { - if thresholdP99 <= 0 { - return "healthy" - } - if p99LatencyMs <= thresholdP99 { - return "healthy" - } - if p99LatencyMs <= thresholdP99*2 { - return "warning" - } - return "critical" -} - -func getDiskUsagePercent(ctx context.Context, path string) (float64, error) { - usage, err := disk.UsageWithContext(ctx, path) - if err != nil { - return 0, err - } - if usage == nil { - return 0, nil - } - return usage.UsedPercent, nil -} - -func (s *OpsService) checkRedisHealth(ctx context.Context) string { - if s == nil { - log.Printf("[OpsOverview][WARN] ops service is nil; redis health check skipped") - return "critical" - } - if s.repo == nil { - s.redisNilWarnOnce.Do(func() { - log.Printf("[OpsOverview][WARN] ops repository is nil; redis health check skipped") - }) - return "critical" - } - - ctxPing, cancel := context.WithTimeout(ctx, 800*time.Millisecond) - defer cancel() - - if err := s.repo.PingRedis(ctxPing); err != nil { - log.Printf("[OpsOverview][WARN] redis ping failed: %v", err) - return "critical" - } - return "healthy" -} - -func (s *OpsService) checkDatabaseHealth(ctx context.Context) string { - if s == nil { - log.Printf("[OpsOverview][WARN] ops service is nil; db health check skipped") - return "critical" - } - if s.sqlDB == nil { - s.dbNilWarnOnce.Do(func() { - log.Printf("[OpsOverview][WARN] database is nil; db health check skipped") - }) - return "critical" - } - - ctxPing, cancel := context.WithTimeout(ctx, 800*time.Millisecond) - defer cancel() - - if err := s.sqlDB.PingContext(ctxPing); err != nil { - log.Printf("[OpsOverview][WARN] db ping failed: %v", err) - return "critical" - } - return "healthy" -} - -func (s *OpsService) getDBConnections() DBConnectionsData { - if s == nil || s.sqlDB == nil { - return DBConnectionsData{} - } - - stats := s.sqlDB.Stats() - maxOpen := stats.MaxOpenConnections - if maxOpen < 0 { - maxOpen = 0 - } - - return DBConnectionsData{ - Active: stats.InUse, - Idle: stats.Idle, - Waiting: 0, - Max: maxOpen, - } -} - -func (s *OpsService) getTokenTPS(ctx context.Context, endTime time.Time, startTime time.Time, duration time.Duration) (current float64, peak float64, avg float64, err error) { - if s == nil || s.sqlDB == nil { - return 0, 0, 0, nil - } - - if duration <= 0 { - return 0, 0, 0, nil - } - - // Current TPS: last 1 minute. 
- var tokensLastMinute int64 - { - lastMinuteStart := endTime.Add(-1 * time.Minute) - ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - row := s.sqlDB.QueryRowContext(ctxQuery, ` - SELECT COALESCE(SUM(input_tokens + output_tokens), 0) - FROM usage_logs - WHERE created_at >= $1 AND created_at < $2 - `, lastMinuteStart, endTime) - scanErr := row.Scan(&tokensLastMinute) - cancel() - if scanErr != nil { - return 0, 0, 0, scanErr - } - } - - var totalTokens int64 - var maxTokensPerMinute int64 - { - ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) - row := s.sqlDB.QueryRowContext(ctxQuery, ` - WITH buckets AS ( - SELECT - date_trunc('minute', created_at) AS bucket, - SUM(input_tokens + output_tokens) AS tokens - FROM usage_logs - WHERE created_at >= $1 AND created_at < $2 - GROUP BY 1 - ) - SELECT - COALESCE(SUM(tokens), 0) AS total_tokens, - COALESCE(MAX(tokens), 0) AS max_tokens_per_minute - FROM buckets - `, startTime, endTime) - scanErr := row.Scan(&totalTokens, &maxTokensPerMinute) - cancel() - if scanErr != nil { - return 0, 0, 0, scanErr - } - } - - current = safeDivide(float64(tokensLastMinute), 60) - peak = safeDivide(float64(maxTokensPerMinute), 60) - avg = safeDivide(float64(totalTokens), duration.Seconds()) - return current, peak, avg, nil -} - -func formatPlatformName(platform string) string { - switch strings.ToLower(strings.TrimSpace(platform)) { - case PlatformOpenAI: - return "OpenAI" - case PlatformAnthropic: - return "Anthropic" - case PlatformGemini: - return "Gemini" - case PlatformAntigravity: - return "Antigravity" - default: - if platform == "" { - return "Unknown" - } - if len(platform) == 1 { - return strings.ToUpper(platform) - } - return strings.ToUpper(platform[:1]) + platform[1:] - } -} - -func classifyProviderStatus(successRate float64, p99LatencyMs int, timeoutCount int64, requestCount int64) string { - if requestCount <= 0 { - return "healthy" - } - - if successRate < 98 { - return "critical" - } - if successRate < 99.5 { - return "warning" - } - - // Heavy timeout volume should be highlighted even if the overall success rate is okay. 
- if timeoutCount >= 10 && requestCount >= 100 { - return "warning" - } - - if p99LatencyMs > 0 && p99LatencyMs >= 5000 { - return "warning" - } - - return "healthy" -} diff --git a/backend/migrations/017_ops_metrics_and_error_logs.sql b/backend/migrations/017_ops_metrics_and_error_logs.sql deleted file mode 100644 index fd6a0215..00000000 --- a/backend/migrations/017_ops_metrics_and_error_logs.sql +++ /dev/null @@ -1,48 +0,0 @@ --- Ops error logs and system metrics - -CREATE TABLE IF NOT EXISTS ops_error_logs ( - id BIGSERIAL PRIMARY KEY, - request_id VARCHAR(64), - user_id BIGINT, - api_key_id BIGINT, - account_id BIGINT, - group_id BIGINT, - client_ip INET, - error_phase VARCHAR(32) NOT NULL, - error_type VARCHAR(64) NOT NULL, - severity VARCHAR(4) NOT NULL, - status_code INT, - platform VARCHAR(32), - model VARCHAR(100), - request_path VARCHAR(256), - stream BOOLEAN NOT NULL DEFAULT FALSE, - error_message TEXT, - error_body TEXT, - provider_error_code VARCHAR(64), - provider_error_type VARCHAR(64), - is_retryable BOOLEAN NOT NULL DEFAULT FALSE, - is_user_actionable BOOLEAN NOT NULL DEFAULT FALSE, - retry_count INT NOT NULL DEFAULT 0, - completion_status VARCHAR(16), - duration_ms INT, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at ON ops_error_logs (created_at DESC); -CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase ON ops_error_logs (error_phase); -CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform ON ops_error_logs (platform); -CREATE INDEX IF NOT EXISTS idx_ops_error_logs_severity ON ops_error_logs (severity); -CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_platform_time ON ops_error_logs (error_phase, platform, created_at DESC); - -CREATE TABLE IF NOT EXISTS ops_system_metrics ( - id BIGSERIAL PRIMARY KEY, - success_rate DOUBLE PRECISION, - error_rate DOUBLE PRECISION, - p95_latency_ms INT, - p99_latency_ms INT, - http2_errors INT, - active_alerts INT, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at ON ops_system_metrics (created_at DESC); diff --git a/backend/migrations/018_ops_metrics_system_stats.sql b/backend/migrations/018_ops_metrics_system_stats.sql deleted file mode 100644 index e92d2137..00000000 --- a/backend/migrations/018_ops_metrics_system_stats.sql +++ /dev/null @@ -1,14 +0,0 @@ --- Extend ops_system_metrics with windowed/system stats - -ALTER TABLE ops_system_metrics - ADD COLUMN IF NOT EXISTS window_minutes INT NOT NULL DEFAULT 1, - ADD COLUMN IF NOT EXISTS cpu_usage_percent DOUBLE PRECISION, - ADD COLUMN IF NOT EXISTS memory_used_mb BIGINT, - ADD COLUMN IF NOT EXISTS memory_total_mb BIGINT, - ADD COLUMN IF NOT EXISTS memory_usage_percent DOUBLE PRECISION, - ADD COLUMN IF NOT EXISTS heap_alloc_mb BIGINT, - ADD COLUMN IF NOT EXISTS gc_pause_ms DOUBLE PRECISION, - ADD COLUMN IF NOT EXISTS concurrency_queue_depth INT; - -CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time - ON ops_system_metrics (window_minutes, created_at DESC); diff --git a/backend/migrations/019_ops_alerts.sql b/backend/migrations/019_ops_alerts.sql deleted file mode 100644 index 91dfd848..00000000 --- a/backend/migrations/019_ops_alerts.sql +++ /dev/null @@ -1,42 +0,0 @@ --- Ops alert rules and events - -CREATE TABLE IF NOT EXISTS ops_alert_rules ( - id BIGSERIAL PRIMARY KEY, - name VARCHAR(128) NOT NULL, - description TEXT, - enabled BOOLEAN NOT NULL DEFAULT TRUE, - metric_type VARCHAR(64) NOT NULL, - operator VARCHAR(8) NOT NULL, - threshold 
DOUBLE PRECISION NOT NULL, - window_minutes INT NOT NULL DEFAULT 1, - sustained_minutes INT NOT NULL DEFAULT 1, - severity VARCHAR(4) NOT NULL DEFAULT 'P1', - notify_email BOOLEAN NOT NULL DEFAULT FALSE, - notify_webhook BOOLEAN NOT NULL DEFAULT FALSE, - webhook_url TEXT, - cooldown_minutes INT NOT NULL DEFAULT 10, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled ON ops_alert_rules (enabled); -CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_metric ON ops_alert_rules (metric_type, window_minutes); - -CREATE TABLE IF NOT EXISTS ops_alert_events ( - id BIGSERIAL PRIMARY KEY, - rule_id BIGINT NOT NULL REFERENCES ops_alert_rules(id) ON DELETE CASCADE, - severity VARCHAR(4) NOT NULL, - status VARCHAR(16) NOT NULL DEFAULT 'firing', - title VARCHAR(200), - description TEXT, - metric_value DOUBLE PRECISION, - threshold_value DOUBLE PRECISION, - fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - resolved_at TIMESTAMPTZ, - email_sent BOOLEAN NOT NULL DEFAULT FALSE, - webhook_sent BOOLEAN NOT NULL DEFAULT FALSE, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status ON ops_alert_events (rule_id, status); -CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at ON ops_alert_events (fired_at DESC); diff --git a/backend/migrations/020_seed_ops_alert_rules.sql b/backend/migrations/020_seed_ops_alert_rules.sql deleted file mode 100644 index eaf128a3..00000000 --- a/backend/migrations/020_seed_ops_alert_rules.sql +++ /dev/null @@ -1,32 +0,0 @@ --- Seed default ops alert rules (idempotent) - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'Global success rate < 99%', - 'Trigger when the 1-minute success rate drops below 99% for 2 consecutive minutes.', - TRUE, - 'success_rate', - '<', - 99, - 1, - 2, - 'P1', - TRUE, - FALSE, - NULL, - 10 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules); diff --git a/backend/migrations/021_seed_ops_alert_rules_more.sql b/backend/migrations/021_seed_ops_alert_rules_more.sql deleted file mode 100644 index 1b0413fc..00000000 --- a/backend/migrations/021_seed_ops_alert_rules_more.sql +++ /dev/null @@ -1,205 +0,0 @@ --- Seed additional ops alert rules (idempotent) - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'Global error rate > 1%', - 'Trigger when the 1-minute error rate exceeds 1% for 2 consecutive minutes.', - TRUE, - 'error_rate', - '>', - 1, - 1, - 2, - 'P1', - TRUE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 10 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Global error rate > 1%'); - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'P99 latency > 2000ms', - 'Trigger when the 5-minute P99 latency exceeds 2000ms for 2 
consecutive samples.', - TRUE, - 'p99_latency_ms', - '>', - 2000, - 5, - 2, - 'P1', - TRUE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 15 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'P99 latency > 2000ms'); - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'HTTP/2 errors > 20', - 'Trigger when HTTP/2 errors exceed 20 in the last minute for 2 consecutive minutes.', - TRUE, - 'http2_errors', - '>', - 20, - 1, - 2, - 'P2', - FALSE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 10 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'HTTP/2 errors > 20'); - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'CPU usage > 85%', - 'Trigger when CPU usage exceeds 85% for 5 consecutive minutes.', - TRUE, - 'cpu_usage_percent', - '>', - 85, - 1, - 5, - 'P2', - FALSE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 15 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'CPU usage > 85%'); - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'Memory usage > 90%', - 'Trigger when memory usage exceeds 90% for 5 consecutive minutes.', - TRUE, - 'memory_usage_percent', - '>', - 90, - 1, - 5, - 'P2', - FALSE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 15 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Memory usage > 90%'); - -INSERT INTO ops_alert_rules ( - name, - description, - enabled, - metric_type, - operator, - threshold, - window_minutes, - sustained_minutes, - severity, - notify_email, - notify_webhook, - webhook_url, - cooldown_minutes -) -SELECT - 'Queue depth > 50', - 'Trigger when concurrency queue depth exceeds 50 for 2 consecutive minutes.', - TRUE, - 'concurrency_queue_depth', - '>', - 50, - 1, - 2, - 'P2', - FALSE, - CASE - WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE - ELSE TRUE - END, - (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1), - 10 -WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Queue depth > 50'); diff --git a/backend/migrations/022_enable_ops_alert_webhook.sql 
b/backend/migrations/022_enable_ops_alert_webhook.sql deleted file mode 100644 index 13d73c51..00000000 --- a/backend/migrations/022_enable_ops_alert_webhook.sql +++ /dev/null @@ -1,7 +0,0 @@ --- Enable webhook notifications for rules with webhook_url configured - -UPDATE ops_alert_rules -SET notify_webhook = TRUE -WHERE webhook_url IS NOT NULL - AND webhook_url <> '' - AND notify_webhook IS DISTINCT FROM TRUE; diff --git a/backend/migrations/023_ops_metrics_request_counts.sql b/backend/migrations/023_ops_metrics_request_counts.sql deleted file mode 100644 index ed515053..00000000 --- a/backend/migrations/023_ops_metrics_request_counts.sql +++ /dev/null @@ -1,6 +0,0 @@ --- Add request counts to ops_system_metrics so the UI/alerts can distinguish "no traffic" from "healthy". - -ALTER TABLE ops_system_metrics - ADD COLUMN IF NOT EXISTS request_count BIGINT NOT NULL DEFAULT 0, - ADD COLUMN IF NOT EXISTS success_count BIGINT NOT NULL DEFAULT 0, - ADD COLUMN IF NOT EXISTS error_count BIGINT NOT NULL DEFAULT 0; diff --git a/backend/migrations/025_enhance_ops_monitoring.sql b/backend/migrations/025_enhance_ops_monitoring.sql deleted file mode 100644 index 69259f69..00000000 --- a/backend/migrations/025_enhance_ops_monitoring.sql +++ /dev/null @@ -1,272 +0,0 @@ --- Ops Monitoring Center 2.0 - database schema enhancements --- Created: 2026-01-02 --- Note: extends monitoring metrics to support multi-dimensional analysis and alert management - --- ============================================ --- 1. Extend the ops_system_metrics table --- ============================================ - --- Add RED metric columns -ALTER TABLE ops_system_metrics - ADD COLUMN IF NOT EXISTS qps DECIMAL(10,2) DEFAULT 0, - ADD COLUMN IF NOT EXISTS tps DECIMAL(10,2) DEFAULT 0, - - -- Error classification - ADD COLUMN IF NOT EXISTS error_4xx_count BIGINT DEFAULT 0, - ADD COLUMN IF NOT EXISTS error_5xx_count BIGINT DEFAULT 0, - ADD COLUMN IF NOT EXISTS error_timeout_count BIGINT DEFAULT 0, - - -- Extended latency metrics - ADD COLUMN IF NOT EXISTS latency_p50 DECIMAL(10,2), - ADD COLUMN IF NOT EXISTS latency_p999 DECIMAL(10,2), - ADD COLUMN IF NOT EXISTS latency_avg DECIMAL(10,2), - ADD COLUMN IF NOT EXISTS latency_max DECIMAL(10,2), - - -- Upstream latency - ADD COLUMN IF NOT EXISTS upstream_latency_avg DECIMAL(10,2), - - -- Resource metrics - ADD COLUMN IF NOT EXISTS disk_used BIGINT, - ADD COLUMN IF NOT EXISTS disk_total BIGINT, - ADD COLUMN IF NOT EXISTS disk_iops BIGINT, - ADD COLUMN IF NOT EXISTS network_in_bytes BIGINT, - ADD COLUMN IF NOT EXISTS network_out_bytes BIGINT, - - -- Saturation metrics - ADD COLUMN IF NOT EXISTS goroutine_count INT, - ADD COLUMN IF NOT EXISTS db_conn_active INT, - ADD COLUMN IF NOT EXISTS db_conn_idle INT, - ADD COLUMN IF NOT EXISTS db_conn_waiting INT, - - -- Business metrics - ADD COLUMN IF NOT EXISTS token_consumed BIGINT DEFAULT 0, - ADD COLUMN IF NOT EXISTS token_rate DECIMAL(10,2) DEFAULT 0, - ADD COLUMN IF NOT EXISTS active_subscriptions INT DEFAULT 0, - - -- Dimension tags (for multi-dimensional analysis) - ADD COLUMN IF NOT EXISTS tags JSONB; - --- Add a JSONB index to speed up tag queries -CREATE INDEX IF NOT EXISTS idx_ops_metrics_tags ON ops_system_metrics USING GIN(tags); - --- Add column comments -COMMENT ON COLUMN ops_system_metrics.qps IS 'Queries per second (QPS)'; -COMMENT ON COLUMN ops_system_metrics.tps IS 'Transactions per second (TPS)'; -COMMENT ON COLUMN ops_system_metrics.error_4xx_count IS 'Client error count (4xx)'; -COMMENT ON COLUMN ops_system_metrics.error_5xx_count IS 'Server error count (5xx)'; -COMMENT ON COLUMN ops_system_metrics.error_timeout_count IS 'Timeout error count'; -COMMENT ON COLUMN ops_system_metrics.upstream_latency_avg IS 'Average upstream API latency (ms)'; -COMMENT ON COLUMN ops_system_metrics.goroutine_count IS 'Goroutine count (leak detection)'; -COMMENT ON COLUMN
ops_system_metrics.tags IS 'Dimension tags (JSON), e.g.: {"account_id": "123", "api_path": "/v1/chat"}'; - --- ============================================ --- 2. Create the dimension stats table --- ============================================ - -CREATE TABLE IF NOT EXISTS ops_dimension_stats ( - id BIGSERIAL PRIMARY KEY, - timestamp TIMESTAMPTZ NOT NULL, - - -- Dimension type: account, api_path, provider, region - dimension_type VARCHAR(50) NOT NULL, - dimension_value VARCHAR(255) NOT NULL, - - -- Statistics - request_count BIGINT DEFAULT 0, - success_count BIGINT DEFAULT 0, - error_count BIGINT DEFAULT 0, - success_rate DECIMAL(5,2), - error_rate DECIMAL(5,2), - - -- Performance metrics - latency_p50 DECIMAL(10,2), - latency_p95 DECIMAL(10,2), - latency_p99 DECIMAL(10,2), - - -- Business metrics - token_consumed BIGINT DEFAULT 0, - cost_usd DECIMAL(10,4) DEFAULT 0, - - created_at TIMESTAMPTZ DEFAULT NOW() -); - --- Composite index to speed up dimension queries -CREATE INDEX IF NOT EXISTS idx_ops_dim_type_value_time - ON ops_dimension_stats(dimension_type, dimension_value, timestamp DESC); - --- Separate time index for range queries -CREATE INDEX IF NOT EXISTS idx_ops_dim_timestamp - ON ops_dimension_stats(timestamp DESC); - --- Add comments -COMMENT ON TABLE ops_dimension_stats IS 'Multi-dimensional stats table supporting drill-down by account/API/provider and other dimensions'; -COMMENT ON COLUMN ops_dimension_stats.dimension_type IS 'Dimension type: account, api_path (endpoint), provider (upstream), region'; -COMMENT ON COLUMN ops_dimension_stats.dimension_value IS 'Dimension value, e.g.: account ID, /v1/chat, openai, us-east-1'; - --- ============================================ --- 3. Extend the alert rules table --- ============================================ - -ALTER TABLE ops_alert_rules - ADD COLUMN IF NOT EXISTS dimension_filters JSONB, - ADD COLUMN IF NOT EXISTS notify_channels JSONB, - ADD COLUMN IF NOT EXISTS notify_config JSONB, - ADD COLUMN IF NOT EXISTS created_by VARCHAR(100), - ADD COLUMN IF NOT EXISTS last_triggered_at TIMESTAMPTZ; - --- ============================================ --- 4. Alert history table (reuses the existing ops_alert_events) --- ============================================ --- Note: the backend uses the ops_alert_events table; no new table is created - --- ============================================ --- 5. Create the data retention config table --- ============================================ - -CREATE TABLE IF NOT EXISTS ops_data_retention_config ( - id SERIAL PRIMARY KEY, - table_name VARCHAR(100) NOT NULL UNIQUE, - retention_days INT NOT NULL, -- retention period in days - enabled BOOLEAN DEFAULT true, - last_cleanup_at TIMESTAMPTZ, - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW() -); - --- Insert default configuration -INSERT INTO ops_data_retention_config (table_name, retention_days) VALUES - ('ops_system_metrics', 30), -- keep system metrics for 30 days - ('ops_dimension_stats', 30), -- keep dimension stats for 30 days - ('ops_error_logs', 30), -- keep error logs for 30 days - ('ops_alert_events', 90), -- keep alert events for 90 days - ('usage_logs', 90) -- keep usage logs for 90 days -ON CONFLICT (table_name) DO NOTHING; - -COMMENT ON TABLE ops_data_retention_config IS 'Data retention policy configuration table'; -COMMENT ON COLUMN ops_data_retention_config.retention_days IS 'Number of days to retain data; rows older than this are cleaned up automatically';
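The config table above only stores the retention policy; the job that enforces it is not part of this migration. A rough sketch of how such a cleaner could consume the table follows (the table and column names come from the SQL above; the package, the function name, and the assumption that every listed table is pruned by created_at are hypothetical):

```go
package opsretention

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"time"
)

// purgeExpiredOpsData deletes rows older than each table's configured
// retention window. Table and column names follow 025_enhance_ops_monitoring.sql;
// the cutoff column is assumed to be created_at for every listed table.
func purgeExpiredOpsData(ctx context.Context, db *sql.DB) error {
	rows, err := db.QueryContext(ctx, `
		SELECT table_name, retention_days
		FROM ops_data_retention_config
		WHERE enabled = true`)
	if err != nil {
		return err
	}
	defer rows.Close()

	type policy struct {
		table string
		days  int
	}
	var policies []policy
	for rows.Next() {
		var p policy
		if err := rows.Scan(&p.table, &p.days); err != nil {
			return err
		}
		policies = append(policies, p)
	}
	if err := rows.Err(); err != nil {
		return err
	}

	// Only the tables seeded by the migration are eligible, so the
	// dynamically built DELETE cannot be steered by arbitrary table_name values.
	allowed := map[string]bool{
		"ops_system_metrics":  true,
		"ops_dimension_stats": true,
		"ops_error_logs":      true,
		"ops_alert_events":    true,
		"usage_logs":          true,
	}

	for _, p := range policies {
		if !allowed[p.table] {
			continue
		}
		cutoff := time.Now().AddDate(0, 0, -p.days)
		query := fmt.Sprintf(`DELETE FROM %s WHERE created_at < $1`, p.table)
		if _, err := db.ExecContext(ctx, query, cutoff); err != nil {
			log.Printf("[OpsRetention] purge %s failed: %v", p.table, err)
			continue
		}
		_, _ = db.ExecContext(ctx,
			`UPDATE ops_data_retention_config SET last_cleanup_at = NOW() WHERE table_name = $1`,
			p.table)
	}
	return nil
}
```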
- --- ============================================ --- 6. Create helper functions --- ============================================ - --- Function: compute the health score --- Weights: SLA (40%) + error rate (30%) + latency (20%) + resources (10%) -CREATE OR REPLACE FUNCTION calculate_health_score( - p_success_rate DECIMAL, - p_error_rate DECIMAL, - p_latency_p99 DECIMAL, - p_cpu_usage DECIMAL -) RETURNS INT AS $$ -DECLARE - sla_score INT; - error_score INT; - latency_score INT; - resource_score INT; -BEGIN - -- SLA score (40 points) - sla_score := CASE - WHEN p_success_rate >= 99.9 THEN 40 - WHEN p_success_rate >= 99.5 THEN 35 - WHEN p_success_rate >= 99.0 THEN 30 - WHEN p_success_rate >= 95.0 THEN 20 - ELSE 10 - END; - - -- Error rate score (30 points) - error_score := CASE - WHEN p_error_rate <= 0.1 THEN 30 - WHEN p_error_rate <= 0.5 THEN 25 - WHEN p_error_rate <= 1.0 THEN 20 - WHEN p_error_rate <= 5.0 THEN 10 - ELSE 5 - END; - - -- Latency score (20 points) - latency_score := CASE - WHEN p_latency_p99 <= 500 THEN 20 - WHEN p_latency_p99 <= 1000 THEN 15 - WHEN p_latency_p99 <= 3000 THEN 10 - WHEN p_latency_p99 <= 5000 THEN 5 - ELSE 0 - END; - - -- Resource score (10 points) - resource_score := CASE - WHEN p_cpu_usage <= 50 THEN 10 - WHEN p_cpu_usage <= 70 THEN 7 - WHEN p_cpu_usage <= 85 THEN 5 - ELSE 2 - END; - - RETURN sla_score + error_score + latency_score + resource_score; -END; -$$ LANGUAGE plpgsql IMMUTABLE; - -COMMENT ON FUNCTION calculate_health_score IS 'Computes the system health score (0-100); weights: SLA 40% + error rate 30% + latency 20% + resources 10%'; - --- ============================================ --- 7. Create view: latest metrics snapshot --- ============================================ - -CREATE OR REPLACE VIEW ops_latest_metrics AS -SELECT - m.*, - calculate_health_score( - m.success_rate::DECIMAL, - m.error_rate::DECIMAL, - m.p99_latency_ms::DECIMAL, - m.cpu_usage_percent::DECIMAL - ) AS health_score -FROM ops_system_metrics m -WHERE m.window_minutes = 1 - AND m.created_at = (SELECT MAX(created_at) FROM ops_system_metrics WHERE window_minutes = 1) -LIMIT 1; - -COMMENT ON VIEW ops_latest_metrics IS 'Latest system metrics snapshot, including the health score'; - --- ============================================ --- 8. Create view: active alerts list --- ============================================ - -CREATE OR REPLACE VIEW ops_active_alerts AS -SELECT - e.id, - e.rule_id, - r.name AS rule_name, - r.metric_type, - e.fired_at, - e.metric_value, - e.threshold_value, - r.severity, - EXTRACT(EPOCH FROM (NOW() - e.fired_at))::INT AS duration_seconds -FROM ops_alert_events e -JOIN ops_alert_rules r ON e.rule_id = r.id -WHERE e.status = 'firing' -ORDER BY e.fired_at DESC; - -COMMENT ON VIEW ops_active_alerts IS 'Currently active alerts'; - --- ============================================ --- 9. Permissions (optional) --- ============================================ - --- If a dedicated ops user exists, grants can be applied --- GRANT SELECT, INSERT, UPDATE ON ops_system_metrics TO ops_user; --- GRANT SELECT, INSERT ON ops_dimension_stats TO ops_user; --- GRANT ALL ON ops_alert_rules TO ops_user; --- GRANT ALL ON ops_alert_events TO ops_user;
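To make the weighting of calculate_health_score (section 6 above) concrete with hypothetical inputs: a window with a 99.2% success rate, a 0.8% error rate, a 900 ms P99 latency and 60% CPU usage scores 30 + 20 + 15 + 7 = 72. This additive SQL scoring is independent of the penalty-based calculateHealthScore in the removed ops_service.go, which starts from 100 and subtracts capped penalties.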
- --- ============================================ --- 10. Data integrity check --- ============================================ - --- Ensure compatibility of existing data -UPDATE ops_system_metrics -SET - qps = COALESCE(request_count / (window_minutes * 60.0), 0), - error_rate = COALESCE((error_count::DECIMAL / NULLIF(request_count, 0)) * 100, 0) -WHERE qps = 0 AND request_count > 0; - --- ============================================ --- Done --- ============================================ diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts deleted file mode 100644 index 5b06532f..00000000 --- a/frontend/src/api/admin/ops.ts +++ /dev/null @@ -1,324 +0,0 @@ -/** - * Admin Ops API endpoints - * Provides stability metrics and error logs for ops dashboard - */ - -import { apiClient } from '../client' - -export type OpsSeverity = 'P0' | 'P1' | 'P2' | 'P3' -export type OpsPhase = - | 'auth' - | 'concurrency' - | 'billing' - | 'scheduling' - | 'network' - | 'upstream' - | 'response' - | 'internal' -export type OpsPlatform = 'gemini' | 'openai' | 'anthropic' | 'antigravity' - -export interface OpsMetrics { - window_minutes: number - request_count: number - success_count: number - error_count: number - success_rate: number - error_rate: number - p95_latency_ms: number - p99_latency_ms: number - http2_errors: number - active_alerts: number - cpu_usage_percent?: number - memory_used_mb?: number - memory_total_mb?: number - memory_usage_percent?: number - heap_alloc_mb?: number - gc_pause_ms?: number - concurrency_queue_depth?: number - updated_at?: string -} - -export interface OpsErrorLog { - id: number - created_at: string - phase: OpsPhase - type: string - severity: OpsSeverity - status_code: number - platform: OpsPlatform - model: string - latency_ms: number | null - request_id: string - message: string - user_id?: number | null - api_key_id?: number | null - account_id?: number | null - group_id?: number | null - client_ip?: string - request_path?: string - stream?: boolean -} - -export interface OpsErrorListParams { - start_time?: string - end_time?: string - platform?: OpsPlatform - phase?: OpsPhase - severity?: OpsSeverity - q?: string - /** - * Max 500 (legacy endpoint uses a hard cap); use paginated /admin/ops/errors for larger result sets.
- */ - limit?: number -} - -export interface OpsErrorListResponse { - items: OpsErrorLog[] - total?: number -} - -export interface OpsMetricsHistoryParams { - window_minutes?: number - minutes?: number - start_time?: string - end_time?: string - limit?: number -} - -export interface OpsMetricsHistoryResponse { - items: OpsMetrics[] -} - -/** - * Get latest ops metrics snapshot - */ -export async function getMetrics(): Promise { - const { data } = await apiClient.get('/admin/ops/metrics') - return data -} - -/** - * List metrics history for charts - */ -export async function listMetricsHistory(params?: OpsMetricsHistoryParams): Promise { - const { data } = await apiClient.get('/admin/ops/metrics/history', { params }) - return data -} - -/** - * List recent error logs with optional filters - */ -export async function listErrors(params?: OpsErrorListParams): Promise { - const { data } = await apiClient.get('/admin/ops/error-logs', { params }) - return data -} - -export interface OpsDashboardOverview { - timestamp: string - health_score: number - sla: { - current: number - threshold: number - status: string - trend: string - change_24h: number - } - qps: { - current: number - peak_1h: number - avg_1h: number - change_vs_yesterday: number - } - tps: { - current: number - peak_1h: number - avg_1h: number - } - latency: { - p50: number - p95: number - p99: number - p999: number - avg: number - max: number - threshold_p99: number - status: string - } - errors: { - total_count: number - error_rate: number - '4xx_count': number - '5xx_count': number - timeout_count: number - top_error?: { - code: string - message: string - count: number - } - } - resources: { - cpu_usage: number - memory_usage: number - disk_usage: number - goroutines: number - db_connections: { - active: number - idle: number - waiting: number - max: number - } - } - system_status: { - redis: string - database: string - background_jobs: string - } -} - -export interface ProviderHealthData { - name: string - request_count: number - success_rate: number - error_rate: number - latency_avg: number - latency_p99: number - status: string - errors_by_type: { - '4xx': number - '5xx': number - timeout: number - } -} - -export interface ProviderHealthResponse { - providers: ProviderHealthData[] - summary: { - total_requests: number - avg_success_rate: number - best_provider: string - worst_provider: string - } -} - -export interface LatencyHistogramResponse { - buckets: { - range: string - count: number - percentage: number - }[] - total_requests: number - slow_request_threshold: number -} - -export interface ErrorDistributionResponse { - items: { - code: string - message: string - count: number - percentage: number - }[] -} - -/** - * Get realtime ops dashboard overview - */ -export async function getDashboardOverview(timeRange = '1h'): Promise { - const { data } = await apiClient.get('/admin/ops/dashboard/overview', { - params: { time_range: timeRange } - }) - return data -} - -/** - * Get provider health comparison - */ -export async function getProviderHealth(timeRange = '1h'): Promise { - const { data } = await apiClient.get('/admin/ops/dashboard/providers', { - params: { time_range: timeRange } - }) - return data -} - -/** - * Get latency histogram - */ -export async function getLatencyHistogram(timeRange = '1h'): Promise { - const { data } = await apiClient.get('/admin/ops/dashboard/latency-histogram', { - params: { time_range: timeRange } - }) - return data -} - -/** - * Get error distribution - */ -export async function 
getErrorDistribution(timeRange = '1h'): Promise { - const { data } = await apiClient.get('/admin/ops/dashboard/errors/distribution', { - params: { time_range: timeRange } - }) - return data -} - -/** - * Subscribe to realtime QPS updates via WebSocket - */ -export function subscribeQPS(onMessage: (data: any) => void): () => void { - let ws: WebSocket | null = null - let reconnectAttempts = 0 - const maxReconnectAttempts = 5 - let reconnectTimer: ReturnType | null = null - let shouldReconnect = true - - const connect = () => { - const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' - const host = window.location.host - ws = new WebSocket(`${protocol}//${host}/api/v1/admin/ops/ws/qps`) - - ws.onopen = () => { - console.log('[OpsWS] Connected') - reconnectAttempts = 0 - } - - ws.onmessage = (e) => { - const data = JSON.parse(e.data) - onMessage(data) - } - - ws.onerror = (error) => { - console.error('[OpsWS] Connection error:', error) - } - - ws.onclose = () => { - console.log('[OpsWS] Connection closed') - if (shouldReconnect && reconnectAttempts < maxReconnectAttempts) { - const delay = Math.min(1000 * Math.pow(2, reconnectAttempts), 30000) - console.log(`[OpsWS] Reconnecting in ${delay}ms...`) - reconnectTimer = setTimeout(() => { - reconnectAttempts++ - connect() - }, delay) - } - } - } - - connect() - - return () => { - shouldReconnect = false - if (reconnectTimer) clearTimeout(reconnectTimer) - if (ws) ws.close() - } -} - -export const opsAPI = { - getMetrics, - listMetricsHistory, - listErrors, - getDashboardOverview, - getProviderHealth, - getLatencyHistogram, - getErrorDistribution, - subscribeQPS -} - -export default opsAPI diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue deleted file mode 100644 index 2762400e..00000000 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ /dev/null @@ -1,417 +0,0 @@ - - - - -