feat(运维监控): 增强监控功能和健康评分系统

后端改进:
- 新增健康评分计算服务(ops_health_score.go)
- 添加分布式锁支持(ops_advisory_lock.go)
- 优化指标采集和聚合逻辑
- 新增运维指标采集间隔配置(60-3600秒)
- 移除未使用的WebSocket查询token认证中间件
- 改进清理服务和告警评估逻辑

前端改进:
- 简化OpsDashboard组件结构
- 完善国际化文本(中英文)
- 新增运维监控相关API类型定义
- 添加运维指标采集间隔设置界面
- 优化错误详情模态框

测试:
- 添加健康评分单元测试
- 更新API契约测试
This commit is contained in:
IanShaw027
2026-01-10 01:38:47 +08:00
parent 8ae75e7f6e
commit 585257d340
25 changed files with 570 additions and 385 deletions

View File

@@ -68,6 +68,7 @@ func (h *SettingHandler) GetSettings(c *gin.Context) {
OpsMonitoringEnabled: settings.OpsMonitoringEnabled, OpsMonitoringEnabled: settings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled, OpsRealtimeMonitoringEnabled: settings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: settings.OpsQueryModeDefault, OpsQueryModeDefault: settings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: settings.OpsMetricsIntervalSeconds,
}) })
} }
@@ -115,9 +116,10 @@ type UpdateSettingsRequest struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext) // Ops monitoring (vNext)
OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"` OpsMonitoringEnabled *bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"` OpsRealtimeMonitoringEnabled *bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault *string `json:"ops_query_mode_default"` OpsQueryModeDefault *string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds *int `json:"ops_metrics_interval_seconds"`
} }
// UpdateSettings 更新系统设置 // UpdateSettings 更新系统设置
@@ -173,6 +175,18 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
} }
} }
// Ops metrics collector interval validation (seconds).
if req.OpsMetricsIntervalSeconds != nil {
v := *req.OpsMetricsIntervalSeconds
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
req.OpsMetricsIntervalSeconds = &v
}
settings := &service.SystemSettings{ settings := &service.SystemSettings{
RegistrationEnabled: req.RegistrationEnabled, RegistrationEnabled: req.RegistrationEnabled,
EmailVerifyEnabled: req.EmailVerifyEnabled, EmailVerifyEnabled: req.EmailVerifyEnabled,
@@ -219,6 +233,12 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
} }
return previousSettings.OpsQueryModeDefault return previousSettings.OpsQueryModeDefault
}(), }(),
OpsMetricsIntervalSeconds: func() int {
if req.OpsMetricsIntervalSeconds != nil {
return *req.OpsMetricsIntervalSeconds
}
return previousSettings.OpsMetricsIntervalSeconds
}(),
} }
if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil { if err := h.settingService.UpdateSettings(c.Request.Context(), settings); err != nil {
@@ -266,6 +286,7 @@ func (h *SettingHandler) UpdateSettings(c *gin.Context) {
OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled, OpsMonitoringEnabled: updatedSettings.OpsMonitoringEnabled,
OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled, OpsRealtimeMonitoringEnabled: updatedSettings.OpsRealtimeMonitoringEnabled,
OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault, OpsQueryModeDefault: updatedSettings.OpsQueryModeDefault,
OpsMetricsIntervalSeconds: updatedSettings.OpsMetricsIntervalSeconds,
}) })
} }
@@ -375,6 +396,9 @@ func diffSettings(before *service.SystemSettings, after *service.SystemSettings,
if before.OpsQueryModeDefault != after.OpsQueryModeDefault { if before.OpsQueryModeDefault != after.OpsQueryModeDefault {
changed = append(changed, "ops_query_mode_default") changed = append(changed, "ops_query_mode_default")
} }
if before.OpsMetricsIntervalSeconds != after.OpsMetricsIntervalSeconds {
changed = append(changed, "ops_metrics_interval_seconds")
}
return changed return changed
} }

View File

@@ -39,9 +39,10 @@ type SystemSettings struct {
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext) // Ops monitoring (vNext)
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"` OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"` OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault string `json:"ops_query_mode_default"` OpsQueryModeDefault string `json:"ops_query_mode_default"`
OpsMetricsIntervalSeconds int `json:"ops_metrics_interval_seconds"`
} }
type PublicSettings struct { type PublicSettings struct {

View File

@@ -68,6 +68,9 @@ INSERT INTO ops_system_metrics (
db_ok, db_ok,
redis_ok, redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active, db_conn_active,
db_conn_idle, db_conn_idle,
db_conn_waiting, db_conn_waiting,
@@ -83,8 +86,9 @@ INSERT INTO ops_system_metrics (
$21,$22,$23,$24,$25,$26, $21,$22,$23,$24,$25,$26,
$27,$28,$29,$30, $27,$28,$29,$30,
$31,$32, $31,$32,
$33,$34,$35, $33,$34,
$36,$37 $35,$36,$37,
$38,$39
)` )`
_, err := r.db.ExecContext( _, err := r.db.ExecContext(
@@ -130,6 +134,9 @@ INSERT INTO ops_system_metrics (
opsNullBool(input.DBOK), opsNullBool(input.DBOK),
opsNullBool(input.RedisOK), opsNullBool(input.RedisOK),
opsNullInt(input.RedisConnTotal),
opsNullInt(input.RedisConnIdle),
opsNullInt(input.DBConnActive), opsNullInt(input.DBConnActive),
opsNullInt(input.DBConnIdle), opsNullInt(input.DBConnIdle),
opsNullInt(input.DBConnWaiting), opsNullInt(input.DBConnWaiting),
@@ -162,6 +169,9 @@ SELECT
db_ok, db_ok,
redis_ok, redis_ok,
redis_conn_total,
redis_conn_idle,
db_conn_active, db_conn_active,
db_conn_idle, db_conn_idle,
db_conn_waiting, db_conn_waiting,
@@ -182,6 +192,8 @@ LIMIT 1`
var memPct sql.NullFloat64 var memPct sql.NullFloat64
var dbOK sql.NullBool var dbOK sql.NullBool
var redisOK sql.NullBool var redisOK sql.NullBool
var redisTotal sql.NullInt64
var redisIdle sql.NullInt64
var dbActive sql.NullInt64 var dbActive sql.NullInt64
var dbIdle sql.NullInt64 var dbIdle sql.NullInt64
var dbWaiting sql.NullInt64 var dbWaiting sql.NullInt64
@@ -198,6 +210,8 @@ LIMIT 1`
&memPct, &memPct,
&dbOK, &dbOK,
&redisOK, &redisOK,
&redisTotal,
&redisIdle,
&dbActive, &dbActive,
&dbIdle, &dbIdle,
&dbWaiting, &dbWaiting,
@@ -231,6 +245,14 @@ LIMIT 1`
v := redisOK.Bool v := redisOK.Bool
out.RedisOK = &v out.RedisOK = &v
} }
if redisTotal.Valid {
v := int(redisTotal.Int64)
out.RedisConnTotal = &v
}
if redisIdle.Valid {
v := int(redisIdle.Int64)
out.RedisConnIdle = &v
}
if dbActive.Valid { if dbActive.Valid {
v := int(dbActive.Int64) v := int(dbActive.Int64)
out.DBConnActive = &v out.DBConnActive = &v
@@ -398,4 +420,3 @@ func opsNullTime(v *time.Time) any {
} }
return sql.NullTime{Time: *v, Valid: true} return sql.NullTime{Time: *v, Valid: true}
} }

View File

@@ -319,7 +319,9 @@ func TestAPIContracts(t *testing.T) {
"enable_identity_patch": true, "enable_identity_patch": true,
"identity_patch_prompt": "", "identity_patch_prompt": "",
"ops_monitoring_enabled": true, "ops_monitoring_enabled": true,
"ops_realtime_monitoring_enabled": true "ops_realtime_monitoring_enabled": true,
"ops_query_mode_default": "auto",
"ops_metrics_interval_seconds": 60
} }
}`, }`,
}, },

View File

@@ -1,54 +0,0 @@
package middleware
import (
"net/http"
"strings"
"github.com/gin-gonic/gin"
)
// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header
// for WebSocket handshake requests on a small allow-list of endpoints.
//
// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes
// are protected by header-based auth. This keeps the token support scoped to WS only.
func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc {
return func(c *gin.Context) {
if c == nil || c.Request == nil {
if c != nil {
c.Next()
}
return
}
// Only GET websocket upgrades.
if c.Request.Method != http.MethodGet {
c.Next()
return
}
if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") {
c.Next()
return
}
// If caller already supplied auth headers, don't override.
if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" {
c.Next()
return
}
// Allow-list ops websocket endpoints.
path := strings.TrimSpace(c.Request.URL.Path)
if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") {
c.Next()
return
}
token := strings.TrimSpace(c.Query("token"))
if token != "" {
c.Request.Header.Set("Authorization", "Bearer "+token)
}
c.Next()
}
}

View File

@@ -25,8 +25,6 @@ func SetupRouter(
) *gin.Engine { ) *gin.Engine {
// 应用中间件 // 应用中间件
r.Use(middleware2.Logger()) r.Use(middleware2.Logger())
// WebSocket handshake auth helper (token via query param, WS endpoints only).
r.Use(middleware2.InjectBearerTokenFromQueryForWebSocket())
r.Use(middleware2.CORS(cfg.CORS)) r.Use(middleware2.CORS(cfg.CORS))
r.Use(middleware2.SecurityHeaders(cfg.Security.CSP)) r.Use(middleware2.SecurityHeaders(cfg.Security.CSP))

View File

@@ -0,0 +1,46 @@
package service
import (
"context"
"database/sql"
"hash/fnv"
"time"
)
// hashAdvisoryLockID maps an arbitrary lock key to a 64-bit Postgres advisory
// lock identifier using the FNV-1a hash. The signed reinterpretation is fine:
// advisory locks only need a stable bigint, not an ordered one.
func hashAdvisoryLockID(key string) int64 {
	digest := fnv.New64a()
	// Write on an fnv hash never returns an error.
	_, _ = digest.Write([]byte(key))
	return int64(digest.Sum64())
}
// tryAcquireDBAdvisoryLock attempts to grab a Postgres session-level advisory lock
// identified by lockID. On success it returns (release, true); the release function
// unlocks and returns the pinned connection to the pool and must be called exactly
// once. Any failure (nil db, connection error, lock held elsewhere) yields (nil, false).
func tryAcquireDBAdvisoryLock(ctx context.Context, db *sql.DB, lockID int64) (func(), bool) {
	if db == nil {
		return nil, false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	// Advisory locks are session-scoped, so pin a dedicated connection for the
	// lifetime of the lock.
	conn, err := db.Conn(ctx)
	if err != nil {
		return nil, false
	}
	var got bool
	err = conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", lockID).Scan(&got)
	if err != nil || !got {
		_ = conn.Close()
		return nil, false
	}
	return func() {
		// Best-effort unlock with a short deadline; closing the connection would
		// drop the session lock anyway.
		unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", lockID)
		_ = conn.Close()
	}, true
}

View File

@@ -376,28 +376,37 @@ return 0
`) `)
func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) { func (s *OpsAggregationService) tryAcquireLeaderLock(ctx context.Context, key string, ttl time.Duration, logPrefix string) (func(), bool) {
if s == nil || s.redisClient == nil { if s == nil {
return nil, true return nil, false
} }
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
} }
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() // Prefer Redis leader lock when available (multi-instance), but avoid stampeding
if err != nil { // the DB when Redis is flaky by falling back to a DB advisory lock.
// Fail-open: do not block single-instance deployments. if s.redisClient != nil {
return nil, true ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err == nil {
if !ok {
s.maybeLogSkip(logPrefix)
return nil, false
}
release := func() {
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
}
return release, true
}
// Redis error: fall through to DB advisory lock.
} }
release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok { if !ok {
s.maybeLogSkip(logPrefix) s.maybeLogSkip(logPrefix)
return nil, false return nil, false
} }
release := func() {
ctx2, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsAggReleaseScript.Run(ctx2, s.redisClient, []string{key}, s.instanceID).Result()
}
return release, true return release, true
} }

View File

@@ -720,11 +720,12 @@ func (s *OpsAlertEvaluatorService) tryAcquireLeaderLock(ctx context.Context, loc
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil { if err != nil {
// Fail-open for single-node environments, but warn. // Prefer fail-closed to avoid duplicate evaluators stampeding the DB when Redis is flaky.
// Single-node deployments can disable the distributed lock via runtime settings.
s.warnNoRedisOnce.Do(func() { s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; running without lock: %v", err) log.Printf("[OpsAlertEvaluator] leader lock SetNX failed; skipping this cycle: %v", err)
}) })
return nil, true return nil, false
} }
if !ok { if !ok {
s.maybeLogSkip(key) s.maybeLogSkip(key)

View File

@@ -300,30 +300,36 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b
return nil, true return nil, true
} }
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] redis not configured; running without distributed lock")
})
return nil, true
}
key := opsCleanupLeaderLockKeyDefault key := opsCleanupLeaderLockKeyDefault
ttl := opsCleanupLeaderLockTTLDefault ttl := opsCleanupLeaderLockTTLDefault
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result() // Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
if err != nil { // falling back to a DB advisory lock.
if s.redisClient != nil {
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err == nil {
if !ok {
return nil, false
}
return func() {
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
// Redis error: fall back to DB advisory lock.
s.warnNoRedisOnce.Do(func() { s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] leader lock SetNX failed; running without lock: %v", err) log.Printf("[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
})
} else {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsCleanup] redis not configured; using DB advisory lock")
}) })
return nil, true
} }
release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
if !ok { if !ok {
return nil, false return nil, false
} }
return release, true
return func() {
_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
} }
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {

View File

@@ -5,6 +5,7 @@ import (
"database/sql" "database/sql"
"errors" "errors"
"log" "log"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors" infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
) )
@@ -39,6 +40,16 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
// Best-effort system health + jobs; dashboard metrics should still render if these are missing. // Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil { if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
// Attach config-derived limits so the UI can show "current / max" for connection pools.
// These are best-effort and should never block the dashboard rendering.
if s != nil && s.cfg != nil {
if s.cfg.Database.MaxOpenConns > 0 {
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
}
if s.cfg.Redis.PoolSize > 0 {
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
}
}
overview.SystemMetrics = metrics overview.SystemMetrics = metrics
} else if err != nil && !errors.Is(err, sql.ErrNoRows) { } else if err != nil && !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err) log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
@@ -50,6 +61,8 @@ func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashbo
log.Printf("[Ops] ListJobHeartbeats failed: %v", err) log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
} }
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
return overview, nil return overview, nil
} }

View File

@@ -35,6 +35,10 @@ type OpsDashboardOverview struct {
Platform string `json:"platform"` Platform string `json:"platform"`
GroupID *int64 `json:"group_id"` GroupID *int64 `json:"group_id"`
// HealthScore is a backend-computed overall health score (0-100).
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
HealthScore int `json:"health_score"`
// Latest system-level snapshot (window=1m, global). // Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"` SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`

View File

@@ -0,0 +1,126 @@
package service
import (
"math"
"time"
)
// computeDashboardHealthScore derives a 0-100 health score from the dashboard
// overview metrics.
//
// Scoring philosophy:
//   - The backend owns the score; the UI only renders it.
//   - Business signals (SLA, error rates, latency tails) carry most of the
//     weight, with infra signals (db/redis/cpu/mem/jobs) layered on top.
//   - Stay conservative: punish clear degradations, stay calm on idle or
//     missing data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
	if overview == nil {
		return 0
	}

	// No traffic at all: report a perfect score rather than a scary one.
	// The UI can still render a gray/idle state from QPS + error rate.
	noTraffic := overview.RequestCountSLA <= 0 &&
		overview.RequestCountTotal <= 0 &&
		overview.ErrorCountTotal <= 0
	if noTraffic {
		return 100
	}

	score := 100.0

	// SLA is the primary signal. The 99.5% target is deliberately modest for
	// LLM gateways and can be tuned later. Costs up to -45 points.
	if slaPct := clampFloat64(overview.SLA*100, 0, 100); slaPct < 99.5 {
		score -= math.Min(45, (99.5-slaPct)*12)
	}

	// Overall error rate: capped at -20 points (cap reached at a 6% rate).
	if errPct := clampFloat64(overview.ErrorRate*100, 0, 100); errPct > 1 {
		score -= math.Min(20, (errPct-1)*4)
	}

	// Upstream instability gets its own weight, but less than SLA/errors.
	if upPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100); upPct > 1 {
		score -= math.Min(15, (upPct-1)*3)
	}

	// Tail latency (p99 of duration + TTFT): penalize only when clearly elevated.
	if d99 := overview.Duration.P99; d99 != nil {
		if ms := float64(*d99); ms > 2000 {
			score -= math.Min(20, (ms-2000)/900) // ~20s => ~-20
		}
	}
	if t99 := overview.TTFT.P99; t99 != nil {
		if ms := float64(*t99); ms > 500 {
			score -= math.Min(10, (ms-500)/200) // 2.5s => -10
		}
	}

	// Best-effort infra snapshot.
	if sm := overview.SystemMetrics; sm != nil {
		if sm.DBOK != nil && !*sm.DBOK {
			score -= 20
		}
		if sm.RedisOK != nil && !*sm.RedisOK {
			score -= 15
		}
		if sm.CPUUsagePercent != nil {
			if cpu := clampFloat64(*sm.CPUUsagePercent, 0, 100); cpu > 85 {
				score -= math.Min(10, (cpu-85)*1.5)
			}
		}
		if sm.MemoryUsagePercent != nil {
			if mem := clampFloat64(*sm.MemoryUsagePercent, 0, 100); mem > 90 {
				score -= math.Min(10, (mem-90)*1.0)
			}
		}
		if sm.DBConnWaiting != nil && *sm.DBConnWaiting > 0 {
			score -= math.Min(10, float64(*sm.DBConnWaiting)*2)
		}
		if sm.ConcurrencyQueueDepth != nil && *sm.ConcurrencyQueueDepth > 0 {
			score -= math.Min(10, float64(*sm.ConcurrencyQueueDepth)*0.5)
		}
	}

	// Job heartbeats (best-effort): only clear "error after last success"
	// signals count fully; stale successes count a little. Combined impact
	// is capped at -15.
	var jobPenalty float64
	for _, hb := range overview.JobHeartbeats {
		switch {
		case hb == nil:
			// Skip missing heartbeat entries.
		case hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)):
			jobPenalty += 5
		case hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute:
			jobPenalty += 2
		}
	}
	score -= math.Min(15, jobPenalty)

	return int(math.Round(clampFloat64(score, 0, 100)))
}
// clampFloat64 restricts v to the inclusive interval [lo, hi].
// NaN inputs fail both comparisons and are returned unchanged.
func clampFloat64(v float64, lo float64, hi float64) float64 {
	switch {
	case v < lo:
		return lo
	case v > hi:
		return hi
	default:
		return v
	}
}

View File

@@ -0,0 +1,60 @@
//go:build unit
package service
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
require.Equal(t, 100, score)
}
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
t.Parallel()
ov := &OpsDashboardOverview{
RequestCountTotal: 100,
RequestCountSLA: 100,
SuccessCount: 90,
ErrorCountTotal: 10,
ErrorCountSLA: 10,
SLA: 0.90,
ErrorRate: 0.10,
UpstreamErrorRate: 0.08,
Duration: OpsPercentiles{P99: intPtr(20_000)},
TTFT: OpsPercentiles{P99: intPtr(2_000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(98.0),
MemoryUsagePercent: float64Ptr(97.0),
DBConnWaiting: intPtr(3),
ConcurrencyQueueDepth: intPtr(10),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "job-a",
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
LastError: stringPtr("boom"),
},
},
}
score := computeDashboardHealthScore(time.Now().UTC(), ov)
require.Less(t, score, 80)
require.GreaterOrEqual(t, score, 0)
}
// timePtr returns a pointer to a copy of v; a test-fixture helper for optional fields.
func timePtr(v time.Time) *time.Time {
	out := v
	return &out
}

// stringPtr returns a pointer to a copy of v; a test-fixture helper for optional fields.
func stringPtr(v string) *string {
	out := v
	return &out
}

View File

@@ -5,7 +5,6 @@ import (
"database/sql" "database/sql"
"errors" "errors"
"fmt" "fmt"
"hash/fnv"
"log" "log"
"math" "math"
"os" "os"
@@ -262,6 +261,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
dbOK := c.checkDB(ctx) dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx) redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats() active, idle := c.dbPoolStats()
redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd) successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil { if err != nil {
@@ -327,6 +327,19 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
DBOK: boolPtr(dbOK), DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK), RedisOK: boolPtr(redisOK),
RedisConnTotal: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisTotal)
}(),
RedisConnIdle: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisIdle)
}(),
DBConnActive: intPtr(active), DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle), DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines), GoroutineCount: intPtr(goroutines),
@@ -722,6 +735,17 @@ func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
return c.redisClient.Ping(ctx).Err() == nil return c.redisClient.Ping(ctx).Err() == nil
} }
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
if c == nil || c.redisClient == nil {
return 0, 0, false
}
stats := c.redisClient.PoolStats()
if stats == nil {
return 0, 0, false
}
return int(stats.TotalConns), int(stats.IdleConns), true
}
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) { func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil { if c == nil || c.db == nil {
return 0, 0 return 0, 0
@@ -749,7 +773,7 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
if err != nil { if err != nil {
// Prefer fail-closed to avoid stampeding the database when Redis is flaky. // Prefer fail-closed to avoid stampeding the database when Redis is flaky.
// Fallback to a DB advisory lock when Redis is present but unavailable. // Fallback to a DB advisory lock when Redis is present but unavailable.
release, ok := c.tryAcquireDBAdvisoryLock(ctx) release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
if !ok { if !ok {
c.maybeLogSkip() c.maybeLogSkip()
return nil, false return nil, false
@@ -769,38 +793,6 @@ func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(),
return release, true return release, true
} }
func (c *OpsMetricsCollector) tryAcquireDBAdvisoryLock(ctx context.Context) (func(), bool) {
if c == nil || c.db == nil {
return nil, false
}
if ctx == nil {
ctx = context.Background()
}
conn, err := c.db.Conn(ctx)
if err != nil {
return nil, false
}
acquired := false
if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", opsMetricsCollectorAdvisoryLockID).Scan(&acquired); err != nil {
_ = conn.Close()
return nil, false
}
if !acquired {
_ = conn.Close()
return nil, false
}
release := func() {
unlockCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", opsMetricsCollectorAdvisoryLockID)
_ = conn.Close()
}
return release, true
}
func (c *OpsMetricsCollector) maybeLogSkip() { func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock() c.skipLogMu.Lock()
defer c.skipLogMu.Unlock() defer c.skipLogMu.Unlock()
@@ -853,9 +845,3 @@ func float64Ptr(v float64) *float64 {
out := v out := v
return &out return &out
} }
func hashAdvisoryLockID(s string) int64 {
h := fnv.New64a()
_, _ = h.Write([]byte(s))
return int64(h.Sum64())
}

View File

@@ -165,6 +165,9 @@ type OpsInsertSystemMetricsInput struct {
DBOK *bool DBOK *bool
RedisOK *bool RedisOK *bool
RedisConnTotal *int
RedisConnIdle *int
DBConnActive *int DBConnActive *int
DBConnIdle *int DBConnIdle *int
DBConnWaiting *int DBConnWaiting *int
@@ -186,6 +189,13 @@ type OpsSystemMetricsSnapshot struct {
DBOK *bool `json:"db_ok"` DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"` RedisOK *bool `json:"redis_ok"`
// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
DBMaxOpenConns *int `json:"db_max_open_conns"`
RedisPoolSize *int `json:"redis_pool_size"`
RedisConnTotal *int `json:"redis_conn_total"`
RedisConnIdle *int `json:"redis_conn_idle"`
DBConnActive *int `json:"db_conn_active"` DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"` DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"` DBConnWaiting *int `json:"db_conn_waiting"`

View File

@@ -139,6 +139,9 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled) updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled) updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault)) updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
return s.settingRepo.SetMultiple(ctx, updates) return s.settingRepo.SetMultiple(ctx, updates)
} }
@@ -231,6 +234,7 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
SettingKeyOpsMonitoringEnabled: "true", SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true", SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto", SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
@@ -301,6 +305,18 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled]) result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled]) result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault])) result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
result.OpsMetricsIntervalSeconds = 60
if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
if v, err := strconv.Atoi(raw); err == nil {
if v < 60 {
v = 60
}
if v > 3600 {
v = 3600
}
result.OpsMetricsIntervalSeconds = v
}
}
return result return result
} }

View File

@@ -43,6 +43,7 @@ type SystemSettings struct {
OpsMonitoringEnabled bool OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {

View File

@@ -46,6 +46,8 @@ export interface OpsDashboardOverview {
platform: string platform: string
group_id?: number | null group_id?: number | null
health_score?: number
system_metrics?: OpsSystemMetricsSnapshot | null system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null job_heartbeats?: OpsJobHeartbeat[] | null
@@ -228,6 +230,9 @@ export interface OpsSystemMetricsSnapshot {
db_ok?: boolean | null db_ok?: boolean | null
redis_ok?: boolean | null redis_ok?: boolean | null
redis_conn_total?: number | null
redis_conn_idle?: number | null
db_conn_active?: number | null db_conn_active?: number | null
db_conn_idle?: number | null db_conn_idle?: number | null
db_conn_waiting?: number | null db_conn_waiting?: number | null

View File

@@ -50,6 +50,7 @@ export interface SystemSettings {
ops_monitoring_enabled: boolean ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
@@ -83,6 +84,7 @@ export interface UpdateSettingsRequest {
ops_monitoring_enabled?: boolean ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**

View File

@@ -1733,8 +1733,10 @@ export default {
redis: 'Redis', redis: 'Redis',
goroutines: 'Goroutines', goroutines: 'Goroutines',
jobs: 'Jobs', jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active', active: 'active',
idle: 'idle', idle: 'idle',
waiting: 'waiting',
ok: 'ok', ok: 'ok',
lastRun: 'last_run:', lastRun: 'last_run:',
lastSuccess: 'last_success:', lastSuccess: 'last_success:',
@@ -1770,12 +1772,50 @@ export default {
errorsSla: 'Errors (SLA scope)', errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)', upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.', failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)', tpsK: 'TPS (K)',
top: 'Top:', top: 'Top:',
throughputTrend: 'Throughput Trend', throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram', latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend', errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution', errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log // Error Log
errorLog: { errorLog: {
timeId: 'Time / ID', timeId: 'Time / ID',
@@ -2069,7 +2109,21 @@ export default {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.' errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
}, },
charts: { charts: {
emptyRequest: 'No requests in this window.', emptyRequest: 'No requests in this window.',
@@ -2183,7 +2237,9 @@ export default {
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)', queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)', queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)', queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)' queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
}, },
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',

View File

@@ -1878,8 +1878,10 @@ export default {
redis: 'Redis', redis: 'Redis',
goroutines: '协程', goroutines: '协程',
jobs: '后台任务', jobs: '后台任务',
jobsHelp: '点击“明细”查看任务心跳与报错信息',
active: '活跃', active: '活跃',
idle: '空闲', idle: '空闲',
waiting: '等待',
ok: '正常', ok: '正常',
lastRun: '最近运行', lastRun: '最近运行',
lastSuccess: '最近成功', lastSuccess: '最近成功',
@@ -1898,8 +1900,8 @@ export default {
errors: '错误', errors: '错误',
errorRate: '错误率:', errorRate: '错误率:',
upstreamRate: '上游错误率:', upstreamRate: '上游错误率:',
latencyDuration: '延迟 (duration_ms)', latencyDuration: '延迟(毫秒)',
ttftLabel: 'TTFT (first_token_ms)', ttftLabel: '首字延迟(毫秒)',
p50: 'p50', p50: 'p50',
p90: 'p90', p90: 'p90',
p95: 'p95', p95: 'p95',
@@ -1915,12 +1917,50 @@ export default {
errorsSla: '错误SLA范围', errorsSla: '错误SLA范围',
upstreamExcl429529: '上游排除429/529', upstreamExcl429529: '上游排除429/529',
failedToLoadData: '加载运维数据失败', failedToLoadData: '加载运维数据失败',
tpsK: 'TPS (K)', failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败',
retryFailed: '重试失败',
tpsK: 'TPS',
top: '最高:', top: '最高:',
throughputTrend: '吞吐趋势', throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布', latencyHistogram: '延迟分布',
errorTrend: '错误趋势', errorTrend: '错误趋势',
errorDistribution: '错误分布', errorDistribution: '错误分布',
// Health Score & Diagnosis
health: '健康',
healthCondition: '健康状况',
healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分',
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',
idle: '系统当前处于待机状态',
idleImpact: '无活跃流量',
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
upstreamCriticalImpact: '可能影响大量用户请求',
upstreamHigh: '上游错误率偏高 ({rate}%)',
upstreamHighImpact: '建议检查上游服务状态',
slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损',
slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量',
errorHigh: '错误率过高 ({rate}%)',
errorHighImpact: '大量请求失败',
errorElevated: '错误率偏高 ({rate}%)',
errorElevatedImpact: '建议检查错误日志',
healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
healthy: '所有系统指标正常',
healthyImpact: '服务运行稳定'
},
// Error Log // Error Log
errorLog: { errorLog: {
timeId: '时间 / ID', timeId: '时间 / ID',
@@ -2212,9 +2252,23 @@ export default {
}, },
tooltips: { tooltips: {
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(duration_ms)。', latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。', errorTrend: '错误趋势SLA 口径排除业务限制;上游错误率排除 429/529。',
errorDistribution: '按状态码统计的错误分布。' errorDistribution: '按状态码统计的错误分布。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
redis: 'Redis 连接池状态,显示活跃和空闲的连接数。',
jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。',
qps: '每秒查询数QPS和每秒Token数TPS实时显示系统吞吐量。',
tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟Time To First Token衡量流式响应的首字节返回速度。',
health: '系统健康评分0-100综合考虑 SLA、错误率和资源使用情况。'
}, },
charts: { charts: {
emptyRequest: '该时间窗口内暂无请求。', emptyRequest: '该时间窗口内暂无请求。',
@@ -2320,14 +2374,16 @@ export default {
description: '启用运维监控模块,用于排障与健康可视化', description: '启用运维监控模块,用于排障与健康可视化',
disabled: '运维监控已关闭', disabled: '运维监控已关闭',
enabled: '启用运维监控', enabled: '启用运维监控',
enabledHint: '启用 Ops 运维监控模块(仅管理员可见)', enabledHint: '启用运维监控模块(仅管理员可见)',
realtimeEnabled: '启用实时监控', realtimeEnabled: '启用实时监控',
realtimeEnabledHint: '启用实时 QPS/指标推送WebSocket', realtimeEnabledHint: '启用实时请求速率和指标推送WebSocket',
queryMode: '默认查询模式', queryMode: '默认查询模式',
queryModeHint: 'Ops Dashboard 默认查询模式auto/raw/preagg', queryModeHint: '运维监控默认查询模式(自动/原始/预聚合',
queryModeAuto: '自动(推荐)', queryModeAuto: '自动(推荐)',
queryModeRaw: 'Raw(最准,但较慢)', queryModeRaw: '原始(最准,但较慢)',
queryModePreagg: 'Preagg(最快,需预聚合)' queryModePreagg: '预聚合(最快,需预聚合)',
metricsInterval: '采集频率(秒)',
metricsIntervalHint: '系统/请求指标采集频率60-3600 秒)'
}, },
adminApiKey: { adminApiKey: {
title: '管理员 API Key', title: '管理员 API Key',

View File

@@ -715,6 +715,25 @@
class="w-[220px]" class="w-[220px]"
/> />
</div> </div>
<div v-if="form.ops_monitoring_enabled" class="mt-5 flex items-center justify-between">
<div>
<label class="font-medium text-gray-900 dark:text-white">{{
t('admin.settings.opsMonitoring.metricsInterval')
}}</label>
<p class="text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.settings.opsMonitoring.metricsIntervalHint') }}
</p>
</div>
<input
v-model.number="form.ops_metrics_interval_seconds"
type="number"
min="60"
max="3600"
step="10"
class="w-[220px] rounded-lg border border-gray-300 bg-white px-3 py-2 text-sm text-gray-900 shadow-sm focus:border-primary-500 focus:outline-none focus:ring-1 focus:ring-primary-500 dark:border-dark-600 dark:bg-dark-800 dark:text-white"
/>
</div>
</div> </div>
</div> </div>
@@ -824,7 +843,8 @@ const form = reactive<SettingsForm>({
// Ops Monitoring (vNext) // Ops Monitoring (vNext)
ops_monitoring_enabled: true, ops_monitoring_enabled: true,
ops_realtime_monitoring_enabled: true, ops_realtime_monitoring_enabled: true,
ops_query_mode_default: 'auto' ops_query_mode_default: 'auto',
ops_metrics_interval_seconds: 60
}) })
const opsQueryModeOptions = computed(() => [ const opsQueryModeOptions = computed(() => [
@@ -922,7 +942,8 @@ async function saveSettings() {
identity_patch_prompt: form.identity_patch_prompt, identity_patch_prompt: form.identity_patch_prompt,
ops_monitoring_enabled: form.ops_monitoring_enabled, ops_monitoring_enabled: form.ops_monitoring_enabled,
ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled, ops_realtime_monitoring_enabled: form.ops_realtime_monitoring_enabled,
ops_query_mode_default: form.ops_query_mode_default ops_query_mode_default: form.ops_query_mode_default,
ops_metrics_interval_seconds: form.ops_metrics_interval_seconds
} }
const updated = await adminAPI.settings.updateSettings(payload) const updated = await adminAPI.settings.updateSettings(payload)
Object.assign(form, updated) Object.assign(form, updated)

View File

@@ -33,190 +33,6 @@
@open-error-details="openErrorDetails" @open-error-details="openErrorDetails"
/> />
<!-- Overview -->
<div
v-if="opsEnabled && !(loading && !hasLoadedOnce)"
class="overflow-hidden rounded-3xl bg-white shadow-sm ring-1 ring-gray-900/5 dark:bg-dark-800 dark:ring-dark-700"
>
<div class="border-b border-gray-100 px-6 py-4 dark:border-dark-700">
<h3 class="text-base font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.systemHealth') }}</h3>
</div>
<div class="p-6">
<div v-if="loadingOverview" class="flex items-center justify-center py-10">
<div class="h-8 w-8 animate-spin rounded-full border-b-2 border-primary-600"></div>
</div>
<div v-else-if="!overview?.system_metrics" class="py-6 text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.ops.noSystemMetrics') }}
</div>
<div v-else class="space-y-6">
<div class="text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.collectedAt') }} {{ formatDateTime(overview.system_metrics.created_at) }} ({{ t('admin.ops.window') }}
{{ overview.system_metrics.window_minutes }}m)
</div>
<div class="grid grid-cols-1 gap-4 md:grid-cols-5">
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.cpu') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent0to100(overview.system_metrics.cpu_usage_percent) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.memory') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent0to100(overview.system_metrics.memory_usage_percent) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ formatMBPair(overview.system_metrics.memory_used_mb, overview.system_metrics.memory_total_mb) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.db') }}</div>
<div class="mt-1 text-xl font-semibold" :class="boolOkClass(overview.system_metrics.db_ok)">
{{ boolOkLabel(overview.system_metrics.db_ok) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.active') }}: {{ overview.system_metrics.db_conn_active ?? '-' }}, {{ t('admin.ops.idle') }}:
{{ overview.system_metrics.db_conn_idle ?? '-' }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.redis') }}</div>
<div class="mt-1 text-xl font-semibold" :class="boolOkClass(overview.system_metrics.redis_ok)">
{{ boolOkLabel(overview.system_metrics.redis_ok) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.goroutines') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ overview.system_metrics.goroutine_count ?? '-' }}
</div>
</div>
</div>
<div v-if="overview?.job_heartbeats?.length" class="rounded-xl border border-gray-100 dark:border-dark-700">
<div class="border-b border-gray-100 px-4 py-3 text-sm font-semibold text-gray-900 dark:border-dark-700 dark:text-white">
{{ t('admin.ops.jobs') }}
</div>
<div class="divide-y divide-gray-100 dark:divide-dark-700">
<div
v-for="job in overview.job_heartbeats"
:key="job.job_name"
class="flex flex-col gap-1 px-4 py-3 md:flex-row md:items-center md:justify-between"
>
<div class="text-sm font-medium text-gray-900 dark:text-white">
{{ job.job_name }}
</div>
<div class="text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.lastRun') }}: {{ job.last_run_at ? formatDateTime(job.last_run_at) : '-' }} · {{ t('admin.ops.lastSuccess') }}:
{{ job.last_success_at ? formatDateTime(job.last_success_at) : '-' }} ·
<span v-if="job.last_error" class="text-rose-600 dark:text-rose-400">
{{ t('admin.ops.lastError') }}: {{ job.last_error }}
</span>
<span v-else class="text-emerald-600 dark:text-emerald-400">{{ t('admin.ops.ok') }}</span>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="card">
<div class="border-b border-gray-100 px-6 py-4 dark:border-dark-700">
<h3 class="text-base font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.overview') }}</h3>
</div>
<div class="p-6">
<div v-if="loadingOverview" class="flex items-center justify-center py-10">
<div class="h-8 w-8 animate-spin rounded-full border-b-2 border-primary-600"></div>
</div>
<div v-else-if="!overview" class="py-6 text-sm text-gray-500 dark:text-gray-400">
{{ t('admin.ops.noData') }}
</div>
<div v-else class="space-y-6">
<div class="grid grid-cols-1 gap-4 md:grid-cols-4">
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.requestsTotal') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatInt(overview.request_count_total) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.slaScope') }} {{ formatInt(overview.request_count_sla) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.tokens') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatInt(overview.token_consumed) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.tps') }} {{ overview.tps.current }} ({{ t('admin.ops.peak') }} {{ overview.tps.peak }})
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.sla') }}</div>
<div class="mt-1 text-xl font-semibold text-gray-900 dark:text-white">
{{ formatPercent(overview.sla) }}
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
{{ t('admin.ops.businessLimited') }}: {{ formatInt(overview.business_limited_count) }}
</div>
</div>
<div class="rounded-xl bg-gray-50 p-4 dark:bg-dark-800/50">
<div class="text-xs text-gray-500 dark:text-gray-400">{{ t('admin.ops.errors') }}</div>
<div class="mt-1 text-xs text-gray-600 dark:text-gray-300">
{{ t('admin.ops.errorRate') }}: <span class="font-semibold">{{ formatPercent(overview.error_rate) }}</span>
</div>
<div class="mt-1 text-xs text-gray-600 dark:text-gray-300">
{{ t('admin.ops.upstreamRate') }}: <span class="font-semibold">{{ formatPercent(overview.upstream_error_rate) }}</span>
</div>
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
429: {{ formatInt(overview.upstream_429_count) }} · 529:
{{ formatInt(overview.upstream_529_count) }}
</div>
</div>
</div>
<div class="grid grid-cols-1 gap-4 md:grid-cols-2">
<div class="rounded-xl border border-gray-200 bg-white p-4 dark:border-dark-700 dark:bg-dark-900">
<div class="text-sm font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.latencyDuration') }}</div>
<div class="mt-3 grid grid-cols-2 gap-2 text-xs text-gray-600 dark:text-gray-300 md:grid-cols-3">
<div>{{ t('admin.ops.p50') }}: <span class="font-mono">{{ formatMs(overview.duration.p50_ms) }}</span></div>
<div>{{ t('admin.ops.p90') }}: <span class="font-mono">{{ formatMs(overview.duration.p90_ms) }}</span></div>
<div>{{ t('admin.ops.p95') }}: <span class="font-mono">{{ formatMs(overview.duration.p95_ms) }}</span></div>
<div>{{ t('admin.ops.p99') }}: <span class="font-mono">{{ formatMs(overview.duration.p99_ms) }}</span></div>
<div>{{ t('admin.ops.avg') }}: <span class="font-mono">{{ formatMs(overview.duration.avg_ms) }}</span></div>
<div>{{ t('admin.ops.max') }}: <span class="font-mono">{{ formatMs(overview.duration.max_ms) }}</span></div>
</div>
</div>
<div class="rounded-xl border border-gray-200 bg-white p-4 dark:border-dark-700 dark:bg-dark-900">
<div class="text-sm font-semibold text-gray-900 dark:text-white">{{ t('admin.ops.ttftLabel') }}</div>
<div class="mt-3 grid grid-cols-2 gap-2 text-xs text-gray-600 dark:text-gray-300 md:grid-cols-3">
<div>{{ t('admin.ops.p50') }}: <span class="font-mono">{{ formatMs(overview.ttft.p50_ms) }}</span></div>
<div>{{ t('admin.ops.p90') }}: <span class="font-mono">{{ formatMs(overview.ttft.p90_ms) }}</span></div>
<div>{{ t('admin.ops.p95') }}: <span class="font-mono">{{ formatMs(overview.ttft.p95_ms) }}</span></div>
<div>{{ t('admin.ops.p99') }}: <span class="font-mono">{{ formatMs(overview.ttft.p99_ms) }}</span></div>
<div>{{ t('admin.ops.avg') }}: <span class="font-mono">{{ formatMs(overview.ttft.avg_ms) }}</span></div>
<div>{{ t('admin.ops.max') }}: <span class="font-mono">{{ formatMs(overview.ttft.max_ms) }}</span></div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Row: Concurrency + Throughput --> <!-- Row: Concurrency + Throughput -->
<div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-3"> <div v-if="opsEnabled && !(loading && !hasLoadedOnce)" class="grid grid-cols-1 gap-6 lg:grid-cols-3">
<div class="lg:col-span-1 min-h-[360px]"> <div class="lg:col-span-1 min-h-[360px]">
@@ -308,7 +124,6 @@ import OpsLatencyChart from './components/OpsLatencyChart.vue'
import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue' import OpsThroughputTrendChart from './components/OpsThroughputTrendChart.vue'
import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue' import OpsAlertEventsCard from './components/OpsAlertEventsCard.vue'
import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue' import OpsRequestDetailsModal, { type OpsRequestDetailsPreset } from './components/OpsRequestDetailsModal.vue'
import { formatDateTime, formatNumberLocaleString } from '@/utils/format'
const route = useRoute() const route = useRoute()
const router = useRouter() const router = useRouter()
@@ -486,7 +301,6 @@ const syncQueryToRoute = useDebounceFn(async () => {
}, 250) }, 250)
const overview = ref<OpsDashboardOverview | null>(null) const overview = ref<OpsDashboardOverview | null>(null)
const loadingOverview = ref(false)
const throughputTrend = ref<OpsThroughputTrendResponse | null>(null) const throughputTrend = ref<OpsThroughputTrendResponse | null>(null)
const loadingTrend = ref(false) const loadingTrend = ref(false)
@@ -523,12 +337,15 @@ function handleThroughputSelectGroup(nextGroupId: number) {
groupId.value = id groupId.value = id
} }
function handleOpenRequestDetails() { function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) {
requestDetailsPreset.value = { const basePreset: OpsRequestDetailsPreset = {
title: t('admin.ops.requestDetails.title'), title: t('admin.ops.requestDetails.title'),
kind: 'all', kind: 'all',
sort: 'created_at_desc' sort: 'created_at_desc'
} }
requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) }
if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title
showRequestDetails.value = true showRequestDetails.value = true
} }
@@ -573,46 +390,8 @@ function openError(id: number) {
showErrorModal.value = true showErrorModal.value = true
} }
function formatInt(v: number | null | undefined): string {
if (typeof v !== 'number') return '0'
return formatNumberLocaleString(v)
}
function formatPercent(v: number | null | undefined): string {
if (typeof v !== 'number') return '-'
return `${(v * 100).toFixed(2)}%`
}
function formatPercent0to100(v: number | null | undefined): string {
if (typeof v !== 'number') return '-'
return `${v.toFixed(1)}%`
}
function formatMBPair(used: number | null | undefined, total: number | null | undefined): string {
if (typeof used !== 'number' || typeof total !== 'number') return '-'
return `${formatNumberLocaleString(used)} / ${formatNumberLocaleString(total)} MB`
}
function boolOkLabel(v: boolean | null | undefined): string {
if (v === true) return 'OK'
if (v === false) return 'FAIL'
return '-'
}
function boolOkClass(v: boolean | null | undefined): string {
if (v === true) return 'text-emerald-600 dark:text-emerald-400'
if (v === false) return 'text-rose-600 dark:text-rose-400'
return 'text-gray-900 dark:text-white'
}
function formatMs(v: number | null | undefined): string {
if (v == null) return '-'
return `${v}ms`
}
async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) { async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) {
if (!opsEnabled.value) return if (!opsEnabled.value) return
loadingOverview.value = true
try { try {
const data = await opsAPI.getDashboardOverview( const data = await opsAPI.getDashboardOverview(
{ {
@@ -628,11 +407,7 @@ async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal)
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
overview.value = null overview.value = null
appStore.showError(err?.message || 'Failed to load overview') appStore.showError(err?.message || t('admin.ops.failedToLoadOverview'))
} finally {
if (fetchSeq === dashboardFetchSeq) {
loadingOverview.value = false
}
} }
} }
@@ -654,7 +429,7 @@ async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortS
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
throughputTrend.value = null throughputTrend.value = null
appStore.showError(err?.message || 'Failed to load throughput trend') appStore.showError(err?.message || t('admin.ops.failedToLoadThroughputTrend'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingTrend.value = false loadingTrend.value = false
@@ -680,7 +455,7 @@ async function refreshLatencyHistogramWithCancel(fetchSeq: number, signal: Abort
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
latencyHistogram.value = null latencyHistogram.value = null
appStore.showError(err?.message || 'Failed to load latency histogram') appStore.showError(err?.message || t('admin.ops.failedToLoadLatencyHistogram'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingLatency.value = false loadingLatency.value = false
@@ -706,7 +481,7 @@ async function refreshErrorTrendWithCancel(fetchSeq: number, signal: AbortSignal
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorTrend.value = null errorTrend.value = null
appStore.showError(err?.message || 'Failed to load error trend') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorTrend'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingErrorTrend.value = false loadingErrorTrend.value = false
@@ -732,7 +507,7 @@ async function refreshErrorDistributionWithCancel(fetchSeq: number, signal: Abor
} catch (err: any) { } catch (err: any) {
if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return if (fetchSeq !== dashboardFetchSeq || isCanceledRequest(err)) return
errorDistribution.value = null errorDistribution.value = null
appStore.showError(err?.message || 'Failed to load error distribution') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDistribution'))
} finally { } finally {
if (fetchSeq === dashboardFetchSeq) { if (fetchSeq === dashboardFetchSeq) {
loadingErrorDistribution.value = false loadingErrorDistribution.value = false

View File

@@ -286,7 +286,7 @@ async function fetchDetail(id: number) {
} }
} catch (err: any) { } catch (err: any) {
detail.value = null detail.value = null
appStore.showError(err?.message || 'Failed to load error detail') appStore.showError(err?.message || t('admin.ops.failedToLoadErrorDetail'))
} finally { } finally {
loading.value = false loading.value = false
} }
@@ -348,7 +348,7 @@ async function runConfirmedRetry() {
const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed') const summary = res.status === 'succeeded' ? t('admin.ops.errorDetail.retrySuccess') : t('admin.ops.errorDetail.retryFailed')
appStore.showSuccess(summary) appStore.showSuccess(summary)
} catch (err: any) { } catch (err: any) {
appStore.showError(err?.message || 'Retry failed') appStore.showError(err?.message || t('admin.ops.retryFailed'))
} finally { } finally {
retrying.value = false retrying.value = false
} }