diff --git a/backend/internal/handler/admin/ops_realtime_handler.go b/backend/internal/handler/admin/ops_realtime_handler.go index 0c23c13b..4f15ec57 100644 --- a/backend/internal/handler/admin/ops_realtime_handler.go +++ b/backend/internal/handler/admin/ops_realtime_handler.go @@ -118,3 +118,96 @@ func (h *OpsHandler) GetAccountAvailability(c *gin.Context) { } response.Success(c, payload) } + +func parseOpsRealtimeWindow(v string) (time.Duration, string, bool) { + switch strings.ToLower(strings.TrimSpace(v)) { + case "", "1min", "1m": + return 1 * time.Minute, "1min", true + case "5min", "5m": + return 5 * time.Minute, "5min", true + case "30min", "30m": + return 30 * time.Minute, "30min", true + case "1h", "60m", "60min": + return 1 * time.Hour, "1h", true + default: + return 0, "", false + } +} + +// GetRealtimeTrafficSummary returns QPS/TPS current/peak/avg for the selected window. +// GET /api/v1/admin/ops/realtime-traffic +// +// Query params: +// - window: 1min|5min|30min|1h (default: 1min) +// - platform: optional +// - group_id: optional +func (h *OpsHandler) GetRealtimeTrafficSummary(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + windowDur, windowLabel, ok := parseOpsRealtimeWindow(c.Query("window")) + if !ok { + response.BadRequest(c, "Invalid window") + return + } + + platform := strings.TrimSpace(c.Query("platform")) + var groupID *int64 + if v := strings.TrimSpace(c.Query("group_id")); v != "" { + id, err := strconv.ParseInt(v, 10, 64) + if err != nil || id <= 0 { + response.BadRequest(c, "Invalid group_id") + return + } + groupID = &id + } + + endTime := time.Now().UTC() + startTime := endTime.Add(-windowDur) + + if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) { + disabledSummary := &service.OpsRealtimeTrafficSummary{ + Window: windowLabel, + StartTime: startTime, + EndTime: endTime, + Platform: platform, + GroupID: groupID, + QPS: service.OpsRateSummary{}, + TPS: service.OpsRateSummary{}, + } + response.Success(c, gin.H{ + "enabled": false, + "summary": disabledSummary, + "timestamp": endTime, + }) + return + } + + filter := &service.OpsDashboardFilter{ + StartTime: startTime, + EndTime: endTime, + Platform: platform, + GroupID: groupID, + QueryMode: service.OpsQueryModeRaw, + } + + summary, err := h.opsService.GetRealtimeTrafficSummary(c.Request.Context(), filter) + if err != nil { + response.ErrorFrom(c, err) + return + } + if summary != nil { + summary.Window = windowLabel + } + response.Success(c, gin.H{ + "enabled": true, + "summary": summary, + "timestamp": endTime, + }) +} diff --git a/backend/internal/handler/admin/ops_settings_handler.go b/backend/internal/handler/admin/ops_settings_handler.go index 0e0ecb72..ebc8bf49 100644 --- a/backend/internal/handler/admin/ops_settings_handler.go +++ b/backend/internal/handler/admin/ops_settings_handler.go @@ -146,3 +146,49 @@ func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) { } response.Success(c, updated) } + +// GetMetricThresholds returns Ops metric thresholds (DB-backed). 
+// GET /api/v1/admin/ops/settings/metric-thresholds +func (h *OpsHandler) GetMetricThresholds(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + cfg, err := h.opsService.GetMetricThresholds(c.Request.Context()) + if err != nil { + response.Error(c, http.StatusInternalServerError, "Failed to get metric thresholds") + return + } + response.Success(c, cfg) +} + +// UpdateMetricThresholds updates Ops metric thresholds (DB-backed). +// PUT /api/v1/admin/ops/settings/metric-thresholds +func (h *OpsHandler) UpdateMetricThresholds(c *gin.Context) { + if h.opsService == nil { + response.Error(c, http.StatusServiceUnavailable, "Ops service not available") + return + } + if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil { + response.ErrorFrom(c, err) + return + } + + var req service.OpsMetricThresholds + if err := c.ShouldBindJSON(&req); err != nil { + response.BadRequest(c, "Invalid request body") + return + } + + updated, err := h.opsService.UpdateMetricThresholds(c.Request.Context(), &req) + if err != nil { + response.Error(c, http.StatusBadRequest, err.Error()) + return + } + response.Success(c, updated) +} diff --git a/backend/internal/repository/ops_repo_realtime_traffic.go b/backend/internal/repository/ops_repo_realtime_traffic.go new file mode 100644 index 00000000..a9b0b929 --- /dev/null +++ b/backend/internal/repository/ops_repo_realtime_traffic.go @@ -0,0 +1,129 @@ +package repository + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/Wei-Shaw/sub2api/internal/service" +) + +func (r *opsRepository) GetRealtimeTrafficSummary(ctx context.Context, filter *service.OpsDashboardFilter) (*service.OpsRealtimeTrafficSummary, error) { + if r == nil || r.db == nil { + return nil, fmt.Errorf("nil ops repository") + } + if filter == nil { + return nil, fmt.Errorf("nil filter") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, fmt.Errorf("start_time/end_time required") + } + + start := filter.StartTime.UTC() + end := filter.EndTime.UTC() + if start.After(end) { + return nil, fmt.Errorf("start_time must be <= end_time") + } + + window := end.Sub(start) + if window <= 0 { + return nil, fmt.Errorf("invalid time window") + } + if window > time.Hour { + return nil, fmt.Errorf("window too large") + } + + usageJoin, usageWhere, usageArgs, next := buildUsageWhere(filter, start, end, 1) + errorWhere, errorArgs, _ := buildErrorWhere(filter, start, end, next) + + q := ` +WITH usage_buckets AS ( + SELECT + date_trunc('minute', ul.created_at) AS bucket, + COALESCE(COUNT(*), 0) AS success_count, + COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_sum + FROM usage_logs ul + ` + usageJoin + ` + ` + usageWhere + ` + GROUP BY 1 +), +error_buckets AS ( + SELECT + date_trunc('minute', created_at) AS bucket, + COALESCE(COUNT(*), 0) AS error_count + FROM ops_error_logs + ` + errorWhere + ` + AND COALESCE(status_code, 0) >= 400 + GROUP BY 1 +), +combined AS ( + SELECT + COALESCE(u.bucket, e.bucket) AS bucket, + COALESCE(u.success_count, 0) AS success_count, + COALESCE(u.token_sum, 0) AS token_sum, + COALESCE(e.error_count, 0) AS error_count, + COALESCE(u.success_count, 0) + COALESCE(e.error_count, 0) AS request_total + FROM usage_buckets u + FULL OUTER JOIN error_buckets e ON u.bucket = 
e.bucket +) +SELECT + COALESCE(SUM(success_count), 0) AS success_total, + COALESCE(SUM(error_count), 0) AS error_total, + COALESCE(SUM(token_sum), 0) AS token_total, + COALESCE(MAX(request_total), 0) AS peak_requests_per_min, + COALESCE(MAX(token_sum), 0) AS peak_tokens_per_min +FROM combined` + + args := append(usageArgs, errorArgs...) + var successCount int64 + var errorTotal int64 + var tokenConsumed int64 + var peakRequestsPerMin int64 + var peakTokensPerMin int64 + if err := r.db.QueryRowContext(ctx, q, args...).Scan( + &successCount, + &errorTotal, + &tokenConsumed, + &peakRequestsPerMin, + &peakTokensPerMin, + ); err != nil { + return nil, err + } + + windowSeconds := window.Seconds() + if windowSeconds <= 0 { + windowSeconds = 1 + } + + requestCountTotal := successCount + errorTotal + qpsAvg := roundTo1DP(float64(requestCountTotal) / windowSeconds) + tpsAvg := roundTo1DP(float64(tokenConsumed) / windowSeconds) + + // Keep "current" consistent with the dashboard overview semantics: last 1 minute. + // This remains "within the selected window" since end=start+window. + qpsCurrent, tpsCurrent, err := r.queryCurrentRates(ctx, filter, end) + if err != nil { + return nil, err + } + + qpsPeak := roundTo1DP(float64(peakRequestsPerMin) / 60.0) + tpsPeak := roundTo1DP(float64(peakTokensPerMin) / 60.0) + + return &service.OpsRealtimeTrafficSummary{ + StartTime: start, + EndTime: end, + Platform: strings.TrimSpace(filter.Platform), + GroupID: filter.GroupID, + QPS: service.OpsRateSummary{ + Current: qpsCurrent, + Peak: qpsPeak, + Avg: qpsAvg, + }, + TPS: service.OpsRateSummary{ + Current: tpsCurrent, + Peak: tpsPeak, + Avg: tpsAvg, + }, + }, nil +} diff --git a/backend/internal/server/routes/admin.go b/backend/internal/server/routes/admin.go index 111e4578..9bb019bb 100644 --- a/backend/internal/server/routes/admin.go +++ b/backend/internal/server/routes/admin.go @@ -73,6 +73,7 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { // Realtime ops signals ops.GET("/concurrency", h.Admin.Ops.GetConcurrencyStats) ops.GET("/account-availability", h.Admin.Ops.GetAccountAvailability) + ops.GET("/realtime-traffic", h.Admin.Ops.GetRealtimeTrafficSummary) // Alerts (rules + events) ops.GET("/alert-rules", h.Admin.Ops.ListAlertRules) @@ -96,6 +97,13 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) { ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings) ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings) + // Settings group (DB-backed) + settings := ops.Group("/settings") + { + settings.GET("/metric-thresholds", h.Admin.Ops.GetMetricThresholds) + settings.PUT("/metric-thresholds", h.Admin.Ops.UpdateMetricThresholds) + } + // WebSocket realtime (QPS/TPS) ws := ops.Group("/ws") { diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index 39f3aaf2..4549214d 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -17,6 +17,8 @@ type OpsRepository interface { // Lightweight window stats (for realtime WS / quick sampling). GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error) + // Lightweight realtime traffic summary (for the Ops dashboard header card). 
+ GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) diff --git a/backend/internal/service/ops_realtime_traffic.go b/backend/internal/service/ops_realtime_traffic.go new file mode 100644 index 00000000..458905c5 --- /dev/null +++ b/backend/internal/service/ops_realtime_traffic.go @@ -0,0 +1,36 @@ +package service + +import ( + "context" + "time" + + infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" +) + +// GetRealtimeTrafficSummary returns QPS/TPS current/peak/avg for the provided window. +// This is used by the Ops dashboard "Realtime Traffic" card and is intentionally lightweight. +func (s *OpsService) GetRealtimeTrafficSummary(ctx context.Context, filter *OpsDashboardFilter) (*OpsRealtimeTrafficSummary, error) { + if err := s.RequireMonitoringEnabled(ctx); err != nil { + return nil, err + } + if s.opsRepo == nil { + return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available") + } + if filter == nil { + return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required") + } + if filter.StartTime.IsZero() || filter.EndTime.IsZero() { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required") + } + if filter.StartTime.After(filter.EndTime) { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time") + } + if filter.EndTime.Sub(filter.StartTime) > time.Hour { + return nil, infraerrors.BadRequest("OPS_TIME_RANGE_TOO_LARGE", "invalid time range: max window is 1 hour") + } + + // Realtime traffic summary always uses raw logs (minute granularity peaks). + filter.QueryMode = OpsQueryModeRaw + + return s.opsRepo.GetRealtimeTrafficSummary(ctx, filter) +} diff --git a/backend/internal/service/ops_realtime_traffic_models.go b/backend/internal/service/ops_realtime_traffic_models.go new file mode 100644 index 00000000..e88a890b --- /dev/null +++ b/backend/internal/service/ops_realtime_traffic_models.go @@ -0,0 +1,19 @@ +package service + +import "time" + +// OpsRealtimeTrafficSummary is a lightweight summary used by the Ops dashboard "Realtime Traffic" card. +// It reports QPS/TPS current/peak/avg for the requested time window. +type OpsRealtimeTrafficSummary struct { + // Window is a normalized label (e.g. "1min", "5min", "30min", "1h"). 
+ Window string `json:"window"` + + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + + Platform string `json:"platform"` + GroupID *int64 `json:"group_id"` + + QPS OpsRateSummary `json:"qps"` + TPS OpsRateSummary `json:"tps"` +} diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go index fbf8f069..bb8052bb 100644 --- a/backend/internal/service/ops_settings.go +++ b/backend/internal/service/ops_settings.go @@ -463,3 +463,93 @@ func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdva _ = json.Unmarshal(raw, updated) return updated, nil } + +// ========================= +// Metric thresholds +// ========================= + +const SettingKeyOpsMetricThresholds = "ops_metric_thresholds" + +func defaultOpsMetricThresholds() *OpsMetricThresholds { + slaMin := 99.5 + latencyMax := 2000.0 + ttftMax := 500.0 + reqErrMax := 5.0 + upstreamErrMax := 5.0 + return &OpsMetricThresholds{ + SLAPercentMin: &slaMin, + LatencyP99MsMax: &latencyMax, + TTFTp99MsMax: &ttftMax, + RequestErrorRatePercentMax: &reqErrMax, + UpstreamErrorRatePercentMax: &upstreamErrMax, + } +} + +func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) { + defaultCfg := defaultOpsMetricThresholds() + if s == nil || s.settingRepo == nil { + return defaultCfg, nil + } + if ctx == nil { + ctx = context.Background() + } + + raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds) + if err != nil { + if errors.Is(err, ErrSettingNotFound) { + if b, mErr := json.Marshal(defaultCfg); mErr == nil { + _ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(b)) + } + return defaultCfg, nil + } + return nil, err + } + + cfg := &OpsMetricThresholds{} + if err := json.Unmarshal([]byte(raw), cfg); err != nil { + return defaultCfg, nil + } + + return cfg, nil +} + +func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) { + if s == nil || s.settingRepo == nil { + return nil, errors.New("setting repository not initialized") + } + if ctx == nil { + ctx = context.Background() + } + if cfg == nil { + return nil, errors.New("invalid config") + } + + // Validate thresholds + if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) { + return nil, errors.New("sla_percent_min must be between 0 and 100") + } + if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 { + return nil, errors.New("latency_p99_ms_max must be >= 0") + } + if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 { + return nil, errors.New("ttft_p99_ms_max must be >= 0") + } + if cfg.RequestErrorRatePercentMax != nil && (*cfg.RequestErrorRatePercentMax < 0 || *cfg.RequestErrorRatePercentMax > 100) { + return nil, errors.New("request_error_rate_percent_max must be between 0 and 100") + } + if cfg.UpstreamErrorRatePercentMax != nil && (*cfg.UpstreamErrorRatePercentMax < 0 || *cfg.UpstreamErrorRatePercentMax > 100) { + return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100") + } + + raw, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + if err := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(raw)); err != nil { + return nil, err + } + + updated := &OpsMetricThresholds{} + _ = json.Unmarshal(raw, updated) + return updated, nil +} diff --git a/backend/internal/service/ops_settings_models.go b/backend/internal/service/ops_settings_models.go index 7d9a823c..0de28358 100644 --- 
a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -61,11 +61,20 @@ type OpsAlertSilencingSettings struct {
 	Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
 }
 
+type OpsMetricThresholds struct {
+	SLAPercentMin               *float64 `json:"sla_percent_min,omitempty"`                 // SLA below this value turns red
+	LatencyP99MsMax             *float64 `json:"latency_p99_ms_max,omitempty"`              // Latency P99 above this value turns red
+	TTFTp99MsMax                *float64 `json:"ttft_p99_ms_max,omitempty"`                 // TTFT P99 above this value turns red
+	RequestErrorRatePercentMax  *float64 `json:"request_error_rate_percent_max,omitempty"`  // Request error rate above this value turns red
+	UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // Upstream error rate above this value turns red
+}
+
 type OpsAlertRuntimeSettings struct {
 	EvaluationIntervalSeconds int                        `json:"evaluation_interval_seconds"`
 	DistributedLock           OpsDistributedLockSettings `json:"distributed_lock"`
 	Silencing                 OpsAlertSilencingSettings  `json:"silencing"`
+	Thresholds                OpsMetricThresholds        `json:"thresholds"` // metric threshold configuration
 }
 
 // OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
diff --git a/frontend/.eslintignore b/frontend/.eslintignore
new file mode 100644
index 00000000..d8682246
--- /dev/null
+++ b/frontend/.eslintignore
@@ -0,0 +1,14 @@
+# Ignore compiled files
+vite.config.js
+vite.config.d.ts
+
+# Ignore dependencies
+node_modules/
+
+# Ignore build output
+dist/
+../backend/internal/web/dist/
+
+# Ignore caches
+.cache/
+.vite/
diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts
index 1d1453f5..3a5484df 100644
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -362,6 +362,45 @@ export async function getAccountAvailabilityStats(platform?: string, groupId?: n
   return data
 }
 
+export interface OpsRateSummary {
+  current: number
+  peak: number
+  avg: number
+}
+
+export interface OpsRealtimeTrafficSummary {
+  window: string
+  start_time: string
+  end_time: string
+  platform: string
+  group_id?: number | null
+  qps: OpsRateSummary
+  tps: OpsRateSummary
+}
+
+export interface OpsRealtimeTrafficSummaryResponse {
+  enabled: boolean
+  summary: OpsRealtimeTrafficSummary | null
+  timestamp?: string
+}
+
+export async function getRealtimeTrafficSummary(
+  window: string,
+  platform?: string,
+  groupId?: number | null
+): Promise<OpsRealtimeTrafficSummaryResponse> {
+  const params: Record<string, string | number> = { window }
+  if (platform) {
+    params.platform = platform
+  }
+  if (typeof groupId === 'number' && groupId > 0) {
+    params.group_id = groupId
+  }
+
+  const { data } = await apiClient.get('/admin/ops/realtime-traffic', { params })
+  return data
+}
+
 /**
  * Subscribe to realtime QPS updates via WebSocket.
* @@ -661,6 +700,14 @@ export interface EmailNotificationConfig { } } +export interface OpsMetricThresholds { + sla_percent_min?: number | null // SLA低于此值变红 + latency_p99_ms_max?: number | null // 延迟P99高于此值变红 + ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 + request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 + upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红 +} + export interface OpsDistributedLockSettings { enabled: boolean key: string @@ -681,6 +728,7 @@ export interface OpsAlertRuntimeSettings { reason: string }> } + thresholds: OpsMetricThresholds // 指标阈值配置 } export interface OpsAdvancedSettings { @@ -929,6 +977,17 @@ export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promi return data } +// ==================== Metric Thresholds ==================== + +async function getMetricThresholds(): Promise { + const { data } = await apiClient.get('/admin/ops/settings/metric-thresholds') + return data +} + +async function updateMetricThresholds(thresholds: OpsMetricThresholds): Promise { + await apiClient.put('/admin/ops/settings/metric-thresholds', thresholds) +} + export const opsAPI = { getDashboardOverview, getThroughputTrend, @@ -937,6 +996,7 @@ export const opsAPI = { getErrorDistribution, getConcurrencyStats, getAccountAvailabilityStats, + getRealtimeTrafficSummary, subscribeQPS, listErrorLogs, getErrorLogDetail, @@ -952,7 +1012,9 @@ export const opsAPI = { getAlertRuntimeSettings, updateAlertRuntimeSettings, getAdvancedSettings, - updateAdvancedSettings + updateAdvancedSettings, + getMetricThresholds, + updateMetricThresholds } export default opsAPI diff --git a/frontend/src/components/icons/Icon.vue b/frontend/src/components/icons/Icon.vue index ec3c9a1b..c8ab8aed 100644 --- a/frontend/src/components/icons/Icon.vue +++ b/frontend/src/components/icons/Icon.vue @@ -124,7 +124,8 @@ const icons = { chatBubble: 'M8 10h.01M12 10h.01M16 10h.01M9 16H5a2 2 0 01-2-2V6a2 2 0 012-2h14a2 2 0 012 2v8a2 2 0 01-2 2h-5l-5 5v-5z', calculator: 'M9 7h6m0 10v-3m-3 3h.01M9 17h.01M9 14h.01M12 14h.01M15 11h.01M12 11h.01M9 11h.01M7 21h10a2 2 0 002-2V5a2 2 0 00-2-2H7a2 2 0 00-2 2v14a2 2 0 002 2z', fire: 'M17.657 18.657A8 8 0 016.343 7.343S7 9 9 10c0-2 .5-5 2.986-7C14 5 16.09 5.777 17.656 7.343A7.975 7.975 0 0120 13a7.975 7.975 0 01-2.343 5.657z', - badge: 'M9 12.75L11.25 15 15 9.75M21 12c0 1.268-.63 2.39-1.593 3.068a3.745 3.745 0 01-1.043 3.296 3.745 3.745 0 01-3.296 1.043A3.745 3.745 0 0112 21c-1.268 0-2.39-.63-3.068-1.593a3.746 3.746 0 01-3.296-1.043 3.745 3.745 0 01-1.043-3.296A3.745 3.745 0 013 12c0-1.268.63-2.39 1.593-3.068a3.745 3.745 0 011.043-3.296 3.746 3.746 0 013.296-1.043A3.746 3.746 0 0112 3c1.268 0 2.39.63 3.068 1.593a3.746 3.746 0 013.296 1.043 3.746 3.746 0 011.043 3.296A3.745 3.745 0 0121 12z' + badge: 'M9 12.75L11.25 15 15 9.75M21 12c0 1.268-.63 2.39-1.593 3.068a3.745 3.745 0 01-1.043 3.296 3.745 3.745 0 01-3.296 1.043A3.745 3.745 0 0112 21c-1.268 0-2.39-.63-3.068-1.593a3.746 3.746 0 01-3.296-1.043 3.745 3.745 0 01-1.043-3.296A3.745 3.745 0 013 12c0-1.268.63-2.39 1.593-3.068a3.745 3.745 0 011.043-3.296 3.746 3.746 0 013.296-1.043A3.746 3.746 0 0112 3c1.268 0 2.39.63 3.068 1.593a3.746 3.746 0 013.296 1.043 3.746 3.746 0 011.043 3.296A3.745 3.745 0 0121 12z', + brain: 'M9.75 3.104v5.714a2.25 2.25 0 01-.659 1.591L5 14.5M9.75 3.104c-.251.023-.501.05-.75.082m.75-.082a24.301 24.301 0 014.5 0m0 0v5.714c0 .597.237 1.17.659 1.591L19.8 15.3M14.25 3.104c.251.023.501.05.75.082M19.8 15.3l-1.57.393A9.065 9.065 0 0112 15a9.065 9.065 0 00-6.23.693L5 14.5m0 
0l-2.69 2.689c-1.232 1.232-.65 3.318 1.067 3.611A48.309 48.309 0 0012 21c2.773 0 5.491-.235 8.135-.687 1.718-.293 2.3-2.379 1.067-3.61L19.8 15.3M12 8.25a1.5 1.5 0 100-3 1.5 1.5 0 000 3zm0 0v3m-3-1.5a1.5 1.5 0 100-3 1.5 1.5 0 000 3zm0 0h6m-3 4.5a1.5 1.5 0 100-3 1.5 1.5 0 000 3z' } as const const iconPath = computed(() => icons[props.name]) diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index 9a9fcea9..7ecbb977 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -156,6 +156,7 @@ export default { unknownError: 'Unknown error occurred', saving: 'Saving...', selectedCount: '({count} selected)', refresh: 'Refresh', + settings: 'Settings', notAvailable: 'N/A', now: 'Now', unknown: 'Unknown', @@ -1906,6 +1907,7 @@ export default { max: 'max:', qps: 'QPS', requests: 'Requests', + requestsTitle: 'Requests', upstream: 'Upstream', client: 'Client', system: 'System', @@ -2118,7 +2120,10 @@ export default { empty: 'No alert rules', loadFailed: 'Failed to load alert rules', saveFailed: 'Failed to save alert rule', + saveSuccess: 'Alert rule saved successfully', deleteFailed: 'Failed to delete alert rule', + deleteSuccess: 'Alert rule deleted successfully', + manage: 'Manage Alert Rules', create: 'Create Rule', createTitle: 'Create Alert Rule', editTitle: 'Edit Alert Rule', @@ -2301,6 +2306,54 @@ export default { accountHealthThresholdRange: 'Account health threshold must be between 0 and 100' } }, + settings: { + title: 'Ops Monitoring Settings', + loadFailed: 'Failed to load settings', + saveSuccess: 'Ops monitoring settings saved successfully', + saveFailed: 'Failed to save settings', + dataCollection: 'Data Collection', + evaluationInterval: 'Evaluation Interval (seconds)', + evaluationIntervalHint: 'Frequency of detection tasks, recommended to keep default', + alertConfig: 'Alert Configuration', + enableAlert: 'Enable Alerts', + alertRecipients: 'Alert Recipient Emails', + emailPlaceholder: 'Enter email address', + recipientsHint: 'If empty, the system will use the first admin email as default recipient', + minSeverity: 'Minimum Severity', + reportConfig: 'Report Configuration', + enableReport: 'Enable Reports', + reportRecipients: 'Report Recipient Emails', + dailySummary: 'Daily Summary', + weeklySummary: 'Weekly Summary', + metricThresholds: 'Metric Thresholds', + metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red', + slaMinPercent: 'SLA Minimum Percentage', + slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)', + latencyP99MaxMs: 'Latency P99 Maximum (ms)', + latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)', + ttftP99MaxMs: 'TTFT P99 Maximum (ms)', + ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)', + requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)', + requestErrorRateMaxPercentHint: 'Request error rate above this value will be displayed in red (default: 5%)', + upstreamErrorRateMaxPercent: 'Upstream Error Rate Maximum (%)', + upstreamErrorRateMaxPercentHint: 'Upstream error rate above this value will be displayed in red (default: 5%)', + advancedSettings: 'Advanced Settings', + dataRetention: 'Data Retention Policy', + enableCleanup: 'Enable Data Cleanup', + cleanupSchedule: 'Cleanup Schedule (Cron)', + cleanupScheduleHint: 'Example: 0 2 * * * means 2 AM daily', + errorLogRetentionDays: 'Error Log Retention Days', + 
minuteMetricsRetentionDays: 'Minute Metrics Retention Days', + hourlyMetricsRetentionDays: 'Hourly Metrics Retention Days', + retentionDaysHint: 'Recommended 7-90 days, longer periods will consume more storage', + aggregation: 'Pre-aggregation Tasks', + enableAggregation: 'Enable Pre-aggregation', + aggregationHint: 'Pre-aggregation improves query performance for long time windows', + validation: { + title: 'Please fix the following issues', + retentionDaysRange: 'Retention days must be between 1-365 days' + } + }, concurrency: { title: 'Concurrency / Queue', byPlatform: 'By Platform', @@ -2334,12 +2387,13 @@ export default { accountError: 'Error' }, tooltips: { + totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorDistribution: 'Error distribution by status code.', goroutines: - 'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.', + 'Number of Go runtime goroutines (lightweight threads). There is no absolute "safe" number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.', cpu: 'CPU usage percentage, showing system processor load.', memory: 'Memory usage, including used and total available memory.', db: 'Database connection pool status, including active, idle, and waiting connections.', @@ -2349,6 +2403,7 @@ export default { tokens: 'Total number of tokens processed in the current time window.', sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).', errors: 'Error statistics, including total errors, error rate, and upstream error rate.', + upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).', latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.', ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.', health: 'System health score (0-100), considering SLA, error rate, and resource usage.' 
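For illustration only (not part of this diff): the settings strings above label a thresholds form that would talk to the new `/admin/ops/settings/metric-thresholds` endpoints through the `opsAPI` helpers added in `ops.ts`. A minimal sketch, assuming an update is a whole-object replace (as `UpdateMetricThresholds` stores the full config) and that the numeric values below are arbitrary examples rather than defaults from this PR:

```ts
import { opsAPI, type OpsMetricThresholds } from '@/api/admin/ops'

// Illustrative helper: read the stored thresholds (the backend falls back to
// defaults such as SLA 99.5% when nothing has been saved yet), tighten two of
// them, and write the whole object back.
async function tightenThresholds(): Promise<OpsMetricThresholds> {
  const current = await opsAPI.getMetricThresholds()
  const next: OpsMetricThresholds = {
    ...current,
    sla_percent_min: 99.9, // example value, not a default from this PR
    latency_p99_ms_max: 1500 // example value, not a default from this PR
  }
  // updateMetricThresholds resolves to void; re-read to confirm what was stored.
  await opsAPI.updateMetricThresholds(next)
  return opsAPI.getMetricThresholds()
}
```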
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index 5d41d6f8..4535ca88 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -2022,7 +2022,7 @@ export default { ready: '就绪', requestsTotal: '请求(总计)', slaScope: 'SLA 范围:', - tokens: 'Token', + tokens: 'Token数', tps: 'TPS', current: '当前', peak: '峰值', @@ -2051,7 +2051,8 @@ export default { avg: 'avg', max: 'max', qps: 'QPS', - requests: '请求', + requests: '请求数', + requestsTitle: '请求', upstream: '上游', client: '客户端', system: '系统', @@ -2469,6 +2470,18 @@ export default { reportRecipients: '评估报告接收邮箱', dailySummary: '每日摘要', weeklySummary: '每周摘要', + metricThresholds: '指标阈值配置', + metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示', + slaMinPercent: 'SLA最低百分比', + slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)', + latencyP99MaxMs: '延迟P99最大值(毫秒)', + latencyP99MaxMsHint: '延迟P99高于此值时显示为红色(默认:2000ms)', + ttftP99MaxMs: 'TTFT P99最大值(毫秒)', + ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色(默认:500ms)', + requestErrorRateMaxPercent: '请求错误率最大值(%)', + requestErrorRateMaxPercentHint: '请求错误率高于此值时显示为红色(默认:5%)', + upstreamErrorRateMaxPercent: '上游错误率最大值(%)', + upstreamErrorRateMaxPercentHint: '上游错误率高于此值时显示为红色(默认:5%)', advancedSettings: '高级设置', dataRetention: '数据保留策略', enableCleanup: '启用数据清理', diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index e8fedc5a..f6712352 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -13,17 +13,13 @@ - + @@ -115,13 +111,12 @@ import AppLayout from '@/components/layout/AppLayout.vue' import BaseDialog from '@/components/common/BaseDialog.vue' import { opsAPI, - OPS_WS_CLOSE_CODES, - type OpsWSStatus, type OpsDashboardOverview, type OpsErrorDistributionResponse, type OpsErrorTrendResponse, type OpsLatencyHistogramResponse, - type OpsThroughputTrendResponse + type OpsThroughputTrendResponse, + type OpsMetricThresholds } from '@/api/admin/ops' import { useAdminSettingsStore, useAppStore } from '@/stores' import OpsDashboardHeader from './components/OpsDashboardHeader.vue' @@ -172,14 +167,6 @@ const QUERY_KEYS = { const isApplyingRouteQuery = ref(false) const isSyncingRouteQuery = ref(false) -// WebSocket for realtime QPS/TPS -const realTimeQPS = ref(0) -const realTimeTPS = ref(0) -const wsStatus = ref('closed') -const wsReconnectInMs = ref(null) -const wsHasData = ref(false) -let unsubscribeQPS: (() => void) | null = null - let dashboardFetchController: AbortController | null = null let dashboardFetchSeq = 0 @@ -199,50 +186,6 @@ function abortDashboardFetch() { } } -function stopQPSSubscription(options?: { resetMetrics?: boolean }) { - wsStatus.value = 'closed' - wsReconnectInMs.value = null - if (unsubscribeQPS) unsubscribeQPS() - unsubscribeQPS = null - - if (options?.resetMetrics) { - realTimeQPS.value = 0 - realTimeTPS.value = 0 - wsHasData.value = false - } -} - -function startQPSSubscription() { - stopQPSSubscription() - unsubscribeQPS = opsAPI.subscribeQPS( - (payload) => { - if (payload && typeof payload === 'object' && payload.type === 'qps_update' && payload.data) { - realTimeQPS.value = payload.data.qps || 0 - realTimeTPS.value = payload.data.tps || 0 - wsHasData.value = true - } - }, - { - onStatusChange: (status) => { - wsStatus.value = status - if (status === 'connected') wsReconnectInMs.value = null - }, - onReconnectScheduled: ({ delayMs }) => { - wsReconnectInMs.value = delayMs - }, - onFatalClose: (event) => { - // Server-side feature flag says realtime is 
disabled; keep UI consistent and avoid reconnect loops. - if (event && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) { - adminSettingsStore.setOpsRealtimeMonitoringEnabledLocal(false) - stopQPSSubscription({ resetMetrics: true }) - } - }, - // QPS updates may be sparse in idle periods; keep the timeout conservative. - staleTimeoutMs: 180_000 - } - ) -} - const readQueryString = (key: string): string => { const value = route.query[key] if (typeof value === 'string') return value @@ -314,6 +257,7 @@ const syncQueryToRoute = useDebounceFn(async () => { }, 250) const overview = ref(null) +const metricThresholds = ref(null) const throughputTrend = ref(null) const loadingTrend = ref(false) @@ -376,6 +320,11 @@ function onTimeRangeChange(v: string | number | boolean | null) { timeRange.value = v as TimeRange } +function onSettingsSaved() { + loadThresholds() + fetchData() +} + function onPlatformChange(v: string | number | boolean | null) { platform.value = typeof v === 'string' ? v : '' } @@ -615,31 +564,25 @@ onMounted(async () => { return } - if (adminSettingsStore.opsRealtimeMonitoringEnabled) { - startQPSSubscription() - } else { - stopQPSSubscription({ resetMetrics: true }) - } + // Load thresholds configuration + loadThresholds() if (opsEnabled.value) { await fetchData() } }) +async function loadThresholds() { + try { + const settings = await opsAPI.getAlertRuntimeSettings() + metricThresholds.value = settings.thresholds || null + } catch (err) { + console.warn('[OpsDashboard] Failed to load thresholds', err) + metricThresholds.value = null + } +} + onUnmounted(() => { - stopQPSSubscription() abortDashboardFetch() }) - -watch( - () => adminSettingsStore.opsRealtimeMonitoringEnabled, - (enabled) => { - if (!opsEnabled.value) return - if (enabled) { - startQPSSubscription() - } else { - stopQPSSubscription({ resetMetrics: true }) - } - } -) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index ccb5dac7..e2002b34 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -1,29 +1,28 @@
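For illustration only (not part of this diff): with the WebSocket QPS subscription removed from `OpsDashboard.vue`, a consumer of the new realtime-traffic endpoint could poll it instead. A minimal sketch, assuming a standalone helper and a 10-second poll interval, neither of which comes from this PR:

```ts
import { opsAPI, type OpsRealtimeTrafficSummaryResponse } from '@/api/admin/ops'

// Illustrative polling helper for the header card. Window labels follow the
// backend's parseOpsRealtimeWindow: '1min' | '5min' | '30min' | '1h'.
async function fetchHeaderTraffic(windowLabel: string): Promise<{ qps: number; tps: number }> {
  const res: OpsRealtimeTrafficSummaryResponse = await opsAPI.getRealtimeTrafficSummary(windowLabel)
  if (!res.enabled || !res.summary) {
    // Realtime monitoring disabled server-side: the handler still returns a
    // zeroed summary, so render zeros instead of treating this as an error.
    return { qps: 0, tps: 0 }
  }
  return { qps: res.summary.qps.current, tps: res.summary.tps.current }
}

// Poll every 10 seconds (interval is an assumption, not taken from this diff).
setInterval(() => {
  fetchHeaderTraffic('1min').then((t) => console.debug('realtime traffic', t))
}, 10_000)
```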