package service import ( "context" "database/sql" "errors" "fmt" "log" "math" "runtime" "strings" "sync" "time" "github.com/shirou/gopsutil/v4/disk" ) type OpsMetrics struct { WindowMinutes int `json:"window_minutes"` RequestCount int64 `json:"request_count"` SuccessCount int64 `json:"success_count"` ErrorCount int64 `json:"error_count"` SuccessRate float64 `json:"success_rate"` ErrorRate float64 `json:"error_rate"` P95LatencyMs int `json:"p95_latency_ms"` P99LatencyMs int `json:"p99_latency_ms"` HTTP2Errors int `json:"http2_errors"` ActiveAlerts int `json:"active_alerts"` CPUUsagePercent float64 `json:"cpu_usage_percent"` MemoryUsedMB int64 `json:"memory_used_mb"` MemoryTotalMB int64 `json:"memory_total_mb"` MemoryUsagePercent float64 `json:"memory_usage_percent"` HeapAllocMB int64 `json:"heap_alloc_mb"` GCPauseMs float64 `json:"gc_pause_ms"` ConcurrencyQueueDepth int `json:"concurrency_queue_depth"` UpdatedAt time.Time `json:"updated_at,omitempty"` } type OpsErrorLog struct { ID int64 `json:"id"` CreatedAt time.Time `json:"created_at"` Phase string `json:"phase"` Type string `json:"type"` Severity string `json:"severity"` StatusCode int `json:"status_code"` Platform string `json:"platform"` Model string `json:"model"` LatencyMs *int `json:"latency_ms"` RequestID string `json:"request_id"` Message string `json:"message"` UserID *int64 `json:"user_id,omitempty"` APIKeyID *int64 `json:"api_key_id,omitempty"` AccountID *int64 `json:"account_id,omitempty"` GroupID *int64 `json:"group_id,omitempty"` ClientIP string `json:"client_ip,omitempty"` RequestPath string `json:"request_path,omitempty"` Stream bool `json:"stream"` } type OpsErrorLogFilters struct { StartTime *time.Time EndTime *time.Time Platform string Phase string Severity string Query string Limit int } type OpsWindowStats struct { SuccessCount int64 ErrorCount int64 P95LatencyMs int P99LatencyMs int HTTP2Errors int } type ProviderStats struct { Platform string RequestCount int64 SuccessCount int64 ErrorCount int64 AvgLatencyMs int P99LatencyMs int Error4xxCount int64 Error5xxCount int64 TimeoutCount int64 } type ProviderHealthErrorsByType struct { HTTP4xx int64 `json:"4xx"` HTTP5xx int64 `json:"5xx"` Timeout int64 `json:"timeout"` } type ProviderHealthData struct { Name string `json:"name"` RequestCount int64 `json:"request_count"` SuccessRate float64 `json:"success_rate"` ErrorRate float64 `json:"error_rate"` LatencyAvg int `json:"latency_avg"` LatencyP99 int `json:"latency_p99"` Status string `json:"status"` ErrorsByType ProviderHealthErrorsByType `json:"errors_by_type"` } type LatencyHistogramItem struct { Range string `json:"range"` Count int64 `json:"count"` Percentage float64 `json:"percentage"` } type ErrorDistributionItem struct { Code string `json:"code"` Message string `json:"message"` Count int64 `json:"count"` Percentage float64 `json:"percentage"` } type OpsRepository interface { CreateErrorLog(ctx context.Context, log *OpsErrorLog) error // ListErrorLogsLegacy keeps the original non-paginated query API used by the // existing /api/v1/admin/ops/error-logs endpoint (limit is capped at 500; for // stable pagination use /api/v1/admin/ops/errors). ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) // ListErrorLogs provides a paginated error-log query API (with total count). ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error) GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error CountActiveAlerts(ctx context.Context) (int, error) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error) // Redis-backed cache/health (best-effort; implementation lives in repository layer). GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error PingRedis(ctx context.Context) error } type OpsService struct { repo OpsRepository sqlDB *sql.DB redisNilWarnOnce sync.Once dbNilWarnOnce sync.Once } const opsDBQueryTimeout = 5 * time.Second func NewOpsService(repo OpsRepository, sqlDB *sql.DB) *OpsService { svc := &OpsService{repo: repo, sqlDB: sqlDB} // Best-effort startup health checks: log warnings if Redis/DB is unavailable, // but never fail service startup (graceful degradation). log.Printf("[OpsService] Performing startup health checks...") ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() redisStatus := svc.checkRedisHealth(ctx) dbStatus := svc.checkDatabaseHealth(ctx) log.Printf("[OpsService] Startup health check complete: Redis=%s, Database=%s", redisStatus, dbStatus) if redisStatus == "critical" || dbStatus == "critical" { log.Printf("[OpsService][WARN] Service starting with degraded dependencies - some features may be unavailable") } return svc } func (s *OpsService) RecordError(ctx context.Context, log *OpsErrorLog) error { if log == nil { return nil } if log.CreatedAt.IsZero() { log.CreatedAt = time.Now() } if log.Severity == "" { log.Severity = "P2" } if log.Phase == "" { log.Phase = "internal" } if log.Type == "" { log.Type = "unknown_error" } if log.Message == "" { log.Message = "Unknown error" } ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() return s.repo.CreateErrorLog(ctxDB, log) } func (s *OpsService) RecordMetrics(ctx context.Context, metric *OpsMetrics) error { if metric == nil { return nil } if metric.UpdatedAt.IsZero() { metric.UpdatedAt = time.Now() } ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() if err := s.repo.CreateSystemMetric(ctxDB, metric); err != nil { return err } // Latest metrics snapshot is queried frequently by the ops dashboard; keep a short-lived cache // to avoid unnecessary DB pressure. Only cache the default (1-minute) window metrics. windowMinutes := metric.WindowMinutes if windowMinutes == 0 { windowMinutes = 1 } if windowMinutes == 1 { if repo := s.repo; repo != nil { _ = repo.SetCachedLatestSystemMetric(ctx, metric) } } return nil } func (s *OpsService) ListErrorLogs(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, int, error) { ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() logs, err := s.repo.ListErrorLogsLegacy(ctxDB, filters) if err != nil { return nil, 0, err } return logs, len(logs), nil } func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) { ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() return s.repo.GetWindowStats(ctxDB, startTime, endTime) } func (s *OpsService) GetLatestMetrics(ctx context.Context) (*OpsMetrics, error) { // Cache first (best-effort): cache errors should not break the dashboard. if s != nil { if repo := s.repo; repo != nil { if cached, err := repo.GetCachedLatestSystemMetric(ctx); err == nil && cached != nil { if cached.WindowMinutes == 0 { cached.WindowMinutes = 1 } return cached, nil } } } ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() metric, err := s.repo.GetLatestSystemMetric(ctxDB) if err != nil { if errors.Is(err, sql.ErrNoRows) { return &OpsMetrics{WindowMinutes: 1}, nil } return nil, err } if metric == nil { return &OpsMetrics{WindowMinutes: 1}, nil } if metric.WindowMinutes == 0 { metric.WindowMinutes = 1 } // Backfill cache (best-effort). if s != nil { if repo := s.repo; repo != nil { _ = repo.SetCachedLatestSystemMetric(ctx, metric) } } return metric, nil } func (s *OpsService) ListMetricsHistory(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) { if s == nil || s.repo == nil { return nil, nil } if windowMinutes <= 0 { windowMinutes = 1 } if limit <= 0 || limit > 5000 { limit = 300 } if endTime.IsZero() { endTime = time.Now() } if startTime.IsZero() { startTime = endTime.Add(-time.Duration(limit) * opsMetricsInterval) } if startTime.After(endTime) { startTime, endTime = endTime, startTime } ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() return s.repo.ListSystemMetricsRange(ctxDB, windowMinutes, startTime, endTime, limit) } // DashboardOverviewData represents aggregated metrics for the ops dashboard overview. type DashboardOverviewData struct { Timestamp time.Time `json:"timestamp"` HealthScore int `json:"health_score"` SLA SLAData `json:"sla"` QPS QPSData `json:"qps"` TPS TPSData `json:"tps"` Latency LatencyData `json:"latency"` Errors ErrorData `json:"errors"` Resources ResourceData `json:"resources"` SystemStatus SystemStatusData `json:"system_status"` } type SLAData struct { Current float64 `json:"current"` Threshold float64 `json:"threshold"` Status string `json:"status"` Trend string `json:"trend"` Change24h float64 `json:"change_24h"` } type QPSData struct { Current float64 `json:"current"` Peak1h float64 `json:"peak_1h"` Avg1h float64 `json:"avg_1h"` ChangeVsYesterday float64 `json:"change_vs_yesterday"` } type TPSData struct { Current float64 `json:"current"` Peak1h float64 `json:"peak_1h"` Avg1h float64 `json:"avg_1h"` } type LatencyData struct { P50 int `json:"p50"` P95 int `json:"p95"` P99 int `json:"p99"` P999 int `json:"p999"` Avg int `json:"avg"` Max int `json:"max"` ThresholdP99 int `json:"threshold_p99"` Status string `json:"status"` } type ErrorData struct { TotalCount int64 `json:"total_count"` ErrorRate float64 `json:"error_rate"` Count4xx int64 `json:"4xx_count"` Count5xx int64 `json:"5xx_count"` TimeoutCount int64 `json:"timeout_count"` TopError *TopError `json:"top_error,omitempty"` } type TopError struct { Code string `json:"code"` Message string `json:"message"` Count int64 `json:"count"` } type ResourceData struct { CPUUsage float64 `json:"cpu_usage"` MemoryUsage float64 `json:"memory_usage"` DiskUsage float64 `json:"disk_usage"` Goroutines int `json:"goroutines"` DBConnections DBConnectionsData `json:"db_connections"` } type DBConnectionsData struct { Active int `json:"active"` Idle int `json:"idle"` Waiting int `json:"waiting"` Max int `json:"max"` } type SystemStatusData struct { Redis string `json:"redis"` Database string `json:"database"` BackgroundJobs string `json:"background_jobs"` } type OverviewStats struct { RequestCount int64 SuccessCount int64 ErrorCount int64 Error4xxCount int64 Error5xxCount int64 TimeoutCount int64 LatencyP50 int LatencyP95 int LatencyP99 int LatencyP999 int LatencyAvg int LatencyMax int TopErrorCode string TopErrorMsg string TopErrorCount int64 CPUUsage float64 MemoryUsage float64 MemoryUsedMB int64 MemoryTotalMB int64 ConcurrencyQueueDepth int } func (s *OpsService) GetDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) { if s == nil { return nil, errors.New("ops service not initialized") } repo := s.repo if repo == nil { return nil, errors.New("ops repository not initialized") } if s.sqlDB == nil { return nil, errors.New("ops service not initialized") } if strings.TrimSpace(timeRange) == "" { timeRange = "1h" } duration, err := parseTimeRange(timeRange) if err != nil { return nil, err } if cached, err := repo.GetCachedDashboardOverview(ctx, timeRange); err == nil && cached != nil { return cached, nil } now := time.Now().UTC() startTime := now.Add(-duration) ctxStats, cancelStats := context.WithTimeout(ctx, opsDBQueryTimeout) stats, err := repo.GetOverviewStats(ctxStats, startTime, now) cancelStats() if err != nil { return nil, fmt.Errorf("get overview stats: %w", err) } if stats == nil { return nil, errors.New("get overview stats returned nil") } var statsYesterday *OverviewStats { yesterdayEnd := now.Add(-24 * time.Hour) yesterdayStart := yesterdayEnd.Add(-duration) ctxYesterday, cancelYesterday := context.WithTimeout(ctx, opsDBQueryTimeout) ys, err := repo.GetOverviewStats(ctxYesterday, yesterdayStart, yesterdayEnd) cancelYesterday() if err != nil { // Best-effort: overview should still work when historical comparison fails. log.Printf("[OpsOverview] get yesterday overview stats failed: %v", err) } else { statsYesterday = ys } } totalReqs := stats.SuccessCount + stats.ErrorCount successRate, errorRate := calculateRates(stats.SuccessCount, stats.ErrorCount, totalReqs) successRateYesterday := 0.0 totalReqsYesterday := int64(0) if statsYesterday != nil { totalReqsYesterday = statsYesterday.SuccessCount + statsYesterday.ErrorCount successRateYesterday, _ = calculateRates(statsYesterday.SuccessCount, statsYesterday.ErrorCount, totalReqsYesterday) } slaThreshold := 99.9 slaChange24h := roundTo2DP(successRate - successRateYesterday) slaTrend := classifyTrend(slaChange24h, 0.05) slaStatus := classifySLAStatus(successRate, slaThreshold) latencyThresholdP99 := 1000 latencyStatus := classifyLatencyStatus(stats.LatencyP99, latencyThresholdP99) qpsCurrent := 0.0 { ctxWindow, cancelWindow := context.WithTimeout(ctx, opsDBQueryTimeout) windowStats, err := repo.GetWindowStats(ctxWindow, now.Add(-1*time.Minute), now) cancelWindow() if err == nil && windowStats != nil { qpsCurrent = roundTo1DP(float64(windowStats.SuccessCount+windowStats.ErrorCount) / 60) } else if err != nil { log.Printf("[OpsOverview] get realtime qps failed: %v", err) } } qpsAvg := roundTo1DP(safeDivide(float64(totalReqs), duration.Seconds())) qpsPeak := qpsAvg { limit := int(duration.Minutes()) + 5 if limit < 10 { limit = 10 } if limit > 5000 { limit = 5000 } ctxMetrics, cancelMetrics := context.WithTimeout(ctx, opsDBQueryTimeout) items, err := repo.ListSystemMetricsRange(ctxMetrics, 1, startTime, now, limit) cancelMetrics() if err != nil { log.Printf("[OpsOverview] get metrics range for peak qps failed: %v", err) } else { maxQPS := 0.0 for _, item := range items { v := float64(item.RequestCount) / 60 if v > maxQPS { maxQPS = v } } if maxQPS > 0 { qpsPeak = roundTo1DP(maxQPS) } } } qpsAvgYesterday := 0.0 if duration.Seconds() > 0 && totalReqsYesterday > 0 { qpsAvgYesterday = float64(totalReqsYesterday) / duration.Seconds() } qpsChangeVsYesterday := roundTo1DP(percentChange(qpsAvgYesterday, float64(totalReqs)/duration.Seconds())) tpsCurrent, tpsPeak, tpsAvg := 0.0, 0.0, 0.0 if current, peak, avg, err := s.getTokenTPS(ctx, now, startTime, duration); err != nil { log.Printf("[OpsOverview] get token tps failed: %v", err) } else { tpsCurrent, tpsPeak, tpsAvg = roundTo1DP(current), roundTo1DP(peak), roundTo1DP(avg) } diskUsage := 0.0 if v, err := getDiskUsagePercent(ctx, "/"); err != nil { log.Printf("[OpsOverview] get disk usage failed: %v", err) } else { diskUsage = roundTo1DP(v) } redisStatus := s.checkRedisHealth(ctx) dbStatus := s.checkDatabaseHealth(ctx) healthScore := calculateHealthScore(successRate, stats.LatencyP99, errorRate, redisStatus, dbStatus) data := &DashboardOverviewData{ Timestamp: now, HealthScore: healthScore, SLA: SLAData{ Current: successRate, Threshold: slaThreshold, Status: slaStatus, Trend: slaTrend, Change24h: slaChange24h, }, QPS: QPSData{ Current: qpsCurrent, Peak1h: qpsPeak, Avg1h: qpsAvg, ChangeVsYesterday: qpsChangeVsYesterday, }, TPS: TPSData{ Current: tpsCurrent, Peak1h: tpsPeak, Avg1h: tpsAvg, }, Latency: LatencyData{ P50: stats.LatencyP50, P95: stats.LatencyP95, P99: stats.LatencyP99, P999: stats.LatencyP999, Avg: stats.LatencyAvg, Max: stats.LatencyMax, ThresholdP99: latencyThresholdP99, Status: latencyStatus, }, Errors: ErrorData{ TotalCount: stats.ErrorCount, ErrorRate: errorRate, Count4xx: stats.Error4xxCount, Count5xx: stats.Error5xxCount, TimeoutCount: stats.TimeoutCount, }, Resources: ResourceData{ CPUUsage: roundTo1DP(stats.CPUUsage), MemoryUsage: roundTo1DP(stats.MemoryUsage), DiskUsage: diskUsage, Goroutines: runtime.NumGoroutine(), DBConnections: s.getDBConnections(), }, SystemStatus: SystemStatusData{ Redis: redisStatus, Database: dbStatus, BackgroundJobs: "healthy", }, } if stats.TopErrorCount > 0 { data.Errors.TopError = &TopError{ Code: stats.TopErrorCode, Message: stats.TopErrorMsg, Count: stats.TopErrorCount, } } _ = repo.SetCachedDashboardOverview(ctx, timeRange, data, 10*time.Second) return data, nil } func (s *OpsService) GetProviderHealth(ctx context.Context, timeRange string) ([]*ProviderHealthData, error) { if s == nil || s.repo == nil { return nil, nil } if strings.TrimSpace(timeRange) == "" { timeRange = "1h" } window, err := parseTimeRange(timeRange) if err != nil { return nil, err } endTime := time.Now() startTime := endTime.Add(-window) ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) stats, err := s.repo.GetProviderStats(ctxDB, startTime, endTime) cancel() if err != nil { return nil, err } results := make([]*ProviderHealthData, 0, len(stats)) for _, item := range stats { if item == nil { continue } successRate, errorRate := calculateRates(item.SuccessCount, item.ErrorCount, item.RequestCount) results = append(results, &ProviderHealthData{ Name: formatPlatformName(item.Platform), RequestCount: item.RequestCount, SuccessRate: successRate, ErrorRate: errorRate, LatencyAvg: item.AvgLatencyMs, LatencyP99: item.P99LatencyMs, Status: classifyProviderStatus(successRate, item.P99LatencyMs, item.TimeoutCount, item.RequestCount), ErrorsByType: ProviderHealthErrorsByType{ HTTP4xx: item.Error4xxCount, HTTP5xx: item.Error5xxCount, Timeout: item.TimeoutCount, }, }) } return results, nil } func (s *OpsService) GetLatencyHistogram(ctx context.Context, timeRange string) ([]*LatencyHistogramItem, error) { if s == nil || s.repo == nil { return nil, nil } duration, err := parseTimeRange(timeRange) if err != nil { return nil, err } endTime := time.Now() startTime := endTime.Add(-duration) ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() return s.repo.GetLatencyHistogram(ctxDB, startTime, endTime) } func (s *OpsService) GetErrorDistribution(ctx context.Context, timeRange string) ([]*ErrorDistributionItem, error) { if s == nil || s.repo == nil { return nil, nil } duration, err := parseTimeRange(timeRange) if err != nil { return nil, err } endTime := time.Now() startTime := endTime.Add(-duration) ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) defer cancel() return s.repo.GetErrorDistribution(ctxDB, startTime, endTime) } func parseTimeRange(timeRange string) (time.Duration, error) { value := strings.TrimSpace(timeRange) if value == "" { return 0, errors.New("invalid time range") } // Support "7d" style day ranges for convenience. if strings.HasSuffix(value, "d") { numberPart := strings.TrimSuffix(value, "d") if numberPart == "" { return 0, errors.New("invalid time range") } days := 0 for _, ch := range numberPart { if ch < '0' || ch > '9' { return 0, errors.New("invalid time range") } days = days*10 + int(ch-'0') } if days <= 0 { return 0, errors.New("invalid time range") } return time.Duration(days) * 24 * time.Hour, nil } dur, err := time.ParseDuration(value) if err != nil || dur <= 0 { return 0, errors.New("invalid time range") } // Cap to avoid unbounded queries. const maxWindow = 30 * 24 * time.Hour if dur > maxWindow { dur = maxWindow } return dur, nil } func calculateHealthScore(successRate float64, p99Latency int, errorRate float64, redisStatus, dbStatus string) int { score := 100.0 // SLA impact (max -45 points) if successRate < 99.9 { score -= math.Min(45, (99.9-successRate)*12) } // Latency impact (max -35 points) if p99Latency > 1000 { score -= math.Min(35, float64(p99Latency-1000)/80) } // Error rate impact (max -20 points) if errorRate > 0.1 { score -= math.Min(20, (errorRate-0.1)*60) } // Infra status impact if redisStatus != "healthy" { score -= 15 } if dbStatus != "healthy" { score -= 20 } if score < 0 { score = 0 } if score > 100 { score = 100 } return int(math.Round(score)) } func calculateRates(successCount, errorCount, requestCount int64) (successRate float64, errorRate float64) { if requestCount <= 0 { return 0, 0 } successRate = (float64(successCount) / float64(requestCount)) * 100 errorRate = (float64(errorCount) / float64(requestCount)) * 100 return roundTo2DP(successRate), roundTo2DP(errorRate) } func roundTo2DP(v float64) float64 { return math.Round(v*100) / 100 } func roundTo1DP(v float64) float64 { return math.Round(v*10) / 10 } func safeDivide(numerator float64, denominator float64) float64 { if denominator <= 0 { return 0 } return numerator / denominator } func percentChange(previous float64, current float64) float64 { if previous == 0 { if current > 0 { return 100.0 } return 0 } return (current - previous) / previous * 100 } func classifyTrend(delta float64, deadband float64) string { if delta > deadband { return "up" } if delta < -deadband { return "down" } return "stable" } func classifySLAStatus(successRate float64, threshold float64) string { if successRate >= threshold { return "healthy" } if successRate >= threshold-0.5 { return "warning" } return "critical" } func classifyLatencyStatus(p99LatencyMs int, thresholdP99 int) string { if thresholdP99 <= 0 { return "healthy" } if p99LatencyMs <= thresholdP99 { return "healthy" } if p99LatencyMs <= thresholdP99*2 { return "warning" } return "critical" } func getDiskUsagePercent(ctx context.Context, path string) (float64, error) { usage, err := disk.UsageWithContext(ctx, path) if err != nil { return 0, err } if usage == nil { return 0, nil } return usage.UsedPercent, nil } func (s *OpsService) checkRedisHealth(ctx context.Context) string { if s == nil { log.Printf("[OpsOverview][WARN] ops service is nil; redis health check skipped") return "critical" } if s.repo == nil { s.redisNilWarnOnce.Do(func() { log.Printf("[OpsOverview][WARN] ops repository is nil; redis health check skipped") }) return "critical" } ctxPing, cancel := context.WithTimeout(ctx, 800*time.Millisecond) defer cancel() if err := s.repo.PingRedis(ctxPing); err != nil { log.Printf("[OpsOverview][WARN] redis ping failed: %v", err) return "critical" } return "healthy" } func (s *OpsService) checkDatabaseHealth(ctx context.Context) string { if s == nil { log.Printf("[OpsOverview][WARN] ops service is nil; db health check skipped") return "critical" } if s.sqlDB == nil { s.dbNilWarnOnce.Do(func() { log.Printf("[OpsOverview][WARN] database is nil; db health check skipped") }) return "critical" } ctxPing, cancel := context.WithTimeout(ctx, 800*time.Millisecond) defer cancel() if err := s.sqlDB.PingContext(ctxPing); err != nil { log.Printf("[OpsOverview][WARN] db ping failed: %v", err) return "critical" } return "healthy" } func (s *OpsService) getDBConnections() DBConnectionsData { if s == nil || s.sqlDB == nil { return DBConnectionsData{} } stats := s.sqlDB.Stats() maxOpen := stats.MaxOpenConnections if maxOpen < 0 { maxOpen = 0 } return DBConnectionsData{ Active: stats.InUse, Idle: stats.Idle, Waiting: 0, Max: maxOpen, } } func (s *OpsService) getTokenTPS(ctx context.Context, endTime time.Time, startTime time.Time, duration time.Duration) (current float64, peak float64, avg float64, err error) { if s == nil || s.sqlDB == nil { return 0, 0, 0, nil } if duration <= 0 { return 0, 0, 0, nil } // Current TPS: last 1 minute. var tokensLastMinute int64 { lastMinuteStart := endTime.Add(-1 * time.Minute) ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) row := s.sqlDB.QueryRowContext(ctxQuery, ` SELECT COALESCE(SUM(input_tokens + output_tokens), 0) FROM usage_logs WHERE created_at >= $1 AND created_at < $2 `, lastMinuteStart, endTime) scanErr := row.Scan(&tokensLastMinute) cancel() if scanErr != nil { return 0, 0, 0, scanErr } } var totalTokens int64 var maxTokensPerMinute int64 { ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout) row := s.sqlDB.QueryRowContext(ctxQuery, ` WITH buckets AS ( SELECT date_trunc('minute', created_at) AS bucket, SUM(input_tokens + output_tokens) AS tokens FROM usage_logs WHERE created_at >= $1 AND created_at < $2 GROUP BY 1 ) SELECT COALESCE(SUM(tokens), 0) AS total_tokens, COALESCE(MAX(tokens), 0) AS max_tokens_per_minute FROM buckets `, startTime, endTime) scanErr := row.Scan(&totalTokens, &maxTokensPerMinute) cancel() if scanErr != nil { return 0, 0, 0, scanErr } } current = safeDivide(float64(tokensLastMinute), 60) peak = safeDivide(float64(maxTokensPerMinute), 60) avg = safeDivide(float64(totalTokens), duration.Seconds()) return current, peak, avg, nil } func formatPlatformName(platform string) string { switch strings.ToLower(strings.TrimSpace(platform)) { case PlatformOpenAI: return "OpenAI" case PlatformAnthropic: return "Anthropic" case PlatformGemini: return "Gemini" case PlatformAntigravity: return "Antigravity" default: if platform == "" { return "Unknown" } if len(platform) == 1 { return strings.ToUpper(platform) } return strings.ToUpper(platform[:1]) + platform[1:] } } func classifyProviderStatus(successRate float64, p99LatencyMs int, timeoutCount int64, requestCount int64) string { if requestCount <= 0 { return "healthy" } if successRate < 98 { return "critical" } if successRate < 99.5 { return "warning" } // Heavy timeout volume should be highlighted even if the overall success rate is okay. if timeoutCount >= 10 && requestCount >= 100 { return "warning" } if p99LatencyMs > 0 && p99LatencyMs >= 5000 { return "warning" } return "healthy" }