refactor(ops): 移除duration相关告警指标,简化监控配置
主要改动:
- 移除 p95_latency_ms 和 p99_latency_ms 告警指标类型
- 移除配置中的 latency_p99_ms_max 阈值设置
- 简化健康分数计算(移除latency权重,重新归一化SLA和错误率)
- 移除duration相关的诊断规则和阈值检查
- 统一术语:延迟 → 请求时长
- 保留duration数据展示,但不再用于告警判断
- 聚焦TTFT作为主要的响应速度告警指标

影响范围:
- Backend: handler, service, models, tests
- Frontend: API types, i18n, components
This commit is contained in:
@@ -20,8 +20,6 @@ var validOpsAlertMetricTypes = []string{
|
||||
"success_rate",
|
||||
"error_rate",
|
||||
"upstream_error_rate",
|
||||
"p95_latency_ms",
|
||||
"p99_latency_ms",
|
||||
"cpu_usage_percent",
|
||||
"memory_usage_percent",
|
||||
"concurrency_queue_depth",
|
||||
|
||||
@@ -523,16 +523,6 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric(
|
||||
return 0, false
|
||||
}
|
||||
return overview.UpstreamErrorRate * 100, true
|
||||
case "p95_latency_ms":
|
||||
if overview.Duration.P95 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P95), true
|
||||
case "p99_latency_ms":
|
||||
if overview.Duration.P99 == nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(*overview.Duration.P99), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
|
||||
}
|
||||
|
||||
// computeBusinessHealth calculates business health score (0-100)
|
||||
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
|
||||
// Components: SLA (50%) + Error Rate (30%)
|
||||
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||
// SLA score: 99.5% → 100, 95% → 0 (linear)
|
||||
slaScore := 100.0
|
||||
@@ -59,22 +59,9 @@ func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
|
||||
}
|
||||
}
|
||||
|
||||
// Latency score: 1s → 100, 10s → 0 (linear)
|
||||
// Uses P99 of duration (TTFT is less critical for overall health)
|
||||
latencyScore := 100.0
|
||||
if overview.Duration.P99 != nil {
|
||||
p99 := float64(*overview.Duration.P99)
|
||||
if p99 > 1000 {
|
||||
if p99 <= 10000 {
|
||||
latencyScore = (10000 - p99) / 9000 * 100
|
||||
} else {
|
||||
latencyScore = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Weighted combination
|
||||
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
|
||||
// Weighted combination (renormalized after removing duration)
|
||||
const weightSum = 0.8
|
||||
return (slaScore*0.5 + errorScore*0.3) / weightSum
|
||||
}
|
||||
|
||||
// computeInfraHealth calculates infrastructure health score (0-100)
|
||||
|
||||
@@ -291,17 +291,6 @@ func TestComputeBusinessHealth(t *testing.T) {
|
||||
wantMin: 95,
|
||||
wantMax: 100,
|
||||
},
|
||||
{
|
||||
name: "latency boundary 1000ms",
|
||||
overview: &OpsDashboardOverview{
|
||||
SLA: 0.995,
|
||||
ErrorRate: 0,
|
||||
UpstreamErrorRate: 0,
|
||||
Duration: OpsPercentiles{P99: intPtr(1000)},
|
||||
},
|
||||
wantMin: 95,
|
||||
wantMax: 100,
|
||||
},
|
||||
{
|
||||
name: "upstream error dominates",
|
||||
overview: &OpsDashboardOverview{
|
||||
|
||||
@@ -482,13 +482,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
|
||||
|
||||
func defaultOpsMetricThresholds() *OpsMetricThresholds {
|
||||
slaMin := 99.5
|
||||
latencyMax := 2000.0
|
||||
ttftMax := 500.0
|
||||
reqErrMax := 5.0
|
||||
upstreamErrMax := 5.0
|
||||
return &OpsMetricThresholds{
|
||||
SLAPercentMin: &slaMin,
|
||||
LatencyP99MsMax: &latencyMax,
|
||||
TTFTp99MsMax: &ttftMax,
|
||||
RequestErrorRatePercentMax: &reqErrMax,
|
||||
UpstreamErrorRatePercentMax: &upstreamErrMax,
|
||||
@@ -538,9 +536,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
|
||||
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
|
||||
return nil, errors.New("sla_percent_min must be between 0 and 100")
|
||||
}
|
||||
if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
|
||||
return nil, errors.New("latency_p99_ms_max must be >= 0")
|
||||
}
|
||||
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
|
||||
return nil, errors.New("ttft_p99_ms_max must be >= 0")
|
||||
}
|
||||
|
||||
@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {
|
||||
|
||||
type OpsMetricThresholds struct {
|
||||
SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红
|
||||
LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // 延迟P99高于此值变红
|
||||
TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红
|
||||
RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红
|
||||
UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红
|
||||
|
||||
@@ -653,8 +653,6 @@ export type MetricType =
|
||||
| 'success_rate'
|
||||
| 'error_rate'
|
||||
| 'upstream_error_rate'
|
||||
| 'p95_latency_ms'
|
||||
| 'p99_latency_ms'
|
||||
| 'cpu_usage_percent'
|
||||
| 'memory_usage_percent'
|
||||
| 'concurrency_queue_depth'
|
||||
@@ -729,7 +727,6 @@ export interface EmailNotificationConfig {
|
||||
|
||||
export interface OpsMetricThresholds {
|
||||
sla_percent_min?: number | null // SLA低于此值变红
|
||||
latency_p99_ms_max?: number | null // 延迟 P99 高于此值变红
|
||||
ttft_p99_ms_max?: number | null // TTFT P99高于此值变红
|
||||
request_error_rate_percent_max?: number | null // 请求错误率高于此值变红
|
||||
upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红
|
||||
|
||||
@@ -1887,7 +1887,7 @@ export default {
|
||||
totalRequests: 'Total Requests',
|
||||
avgQps: 'Avg QPS',
|
||||
avgTps: 'Avg TPS',
|
||||
avgLatency: 'Avg Latency',
|
||||
avgLatency: 'Avg Request Duration',
|
||||
avgTtft: 'Avg TTFT',
|
||||
exceptions: 'Exceptions',
|
||||
requestErrors: 'Request Errors',
|
||||
@@ -1899,7 +1899,7 @@ export default {
|
||||
errors: 'Errors',
|
||||
errorRate: 'error_rate:',
|
||||
upstreamRate: 'upstream_rate:',
|
||||
latencyDuration: 'Latency (duration_ms)',
|
||||
latencyDuration: 'Request Duration (ms)',
|
||||
ttftLabel: 'TTFT (first_token_ms)',
|
||||
p50: 'p50:',
|
||||
p90: 'p90:',
|
||||
@@ -1919,7 +1919,7 @@ export default {
|
||||
failedToLoadData: 'Failed to load ops data.',
|
||||
failedToLoadOverview: 'Failed to load overview',
|
||||
failedToLoadThroughputTrend: 'Failed to load throughput trend',
|
||||
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
|
||||
failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
|
||||
failedToLoadErrorTrend: 'Failed to load error trend',
|
||||
failedToLoadErrorDistribution: 'Failed to load error distribution',
|
||||
failedToLoadErrorDetail: 'Failed to load error detail',
|
||||
@@ -1927,7 +1927,7 @@ export default {
|
||||
tpsK: 'TPS (K)',
|
||||
top: 'Top:',
|
||||
throughputTrend: 'Throughput Trend',
|
||||
latencyHistogram: 'Latency Histogram',
|
||||
latencyHistogram: 'Request Duration Histogram',
|
||||
errorTrend: 'Error Trend',
|
||||
errorDistribution: 'Error Distribution',
|
||||
// Health Score & Diagnosis
|
||||
@@ -1973,14 +1973,7 @@ export default {
|
||||
memoryHigh: 'Memory usage elevated ({usage}%)',
|
||||
memoryHighImpact: 'Memory pressure is high, needs attention',
|
||||
memoryHighAction: 'Monitor memory trends, check for memory leaks',
|
||||
// Latency diagnostics
|
||||
latencyCritical: 'Response latency critically high ({latency}ms)',
|
||||
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
|
||||
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
|
||||
latencyHigh: 'Response latency elevated ({latency}ms)',
|
||||
latencyHighImpact: 'User experience degraded, needs optimization',
|
||||
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
|
||||
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
|
||||
ttftHigh: 'Time to first token elevated ({ttft}ms)',
|
||||
ttftHighImpact: 'User perceived latency increased',
|
||||
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
|
||||
// Error rate diagnostics
|
||||
@@ -2020,7 +2013,7 @@ export default {
|
||||
context: 'Context',
|
||||
status: 'Status',
|
||||
message: 'Message',
|
||||
latency: 'Latency',
|
||||
latency: 'Request Duration',
|
||||
action: 'Action',
|
||||
noErrors: 'No errors in this window.',
|
||||
grp: 'GRP:',
|
||||
@@ -2049,7 +2042,7 @@ export default {
|
||||
basicInfo: 'Basic Info',
|
||||
platform: 'Platform',
|
||||
model: 'Model',
|
||||
latency: 'Latency',
|
||||
latency: 'Request Duration',
|
||||
ttft: 'TTFT',
|
||||
businessLimited: 'Business Limited',
|
||||
requestPath: 'Request Path',
|
||||
@@ -2398,8 +2391,6 @@ export default {
|
||||
metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red',
|
||||
slaMinPercent: 'SLA Minimum Percentage',
|
||||
slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
|
||||
latencyP99MaxMs: 'Latency P99 Maximum (ms)',
|
||||
latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
|
||||
ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
|
||||
ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
|
||||
requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
|
||||
@@ -2458,7 +2449,7 @@ export default {
|
||||
tooltips: {
|
||||
totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
|
||||
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
|
||||
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
|
||||
latencyHistogram: 'Request duration distribution (ms) for successful requests.',
|
||||
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
|
||||
errorDistribution: 'Error distribution by status code.',
|
||||
goroutines:
|
||||
@@ -2473,7 +2464,7 @@ export default {
|
||||
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
|
||||
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
|
||||
upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
|
||||
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
|
||||
latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
|
||||
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
|
||||
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
|
||||
},
|
||||
|
||||
@@ -2031,7 +2031,7 @@ export default {
|
||||
totalRequests: '总请求',
|
||||
avgQps: '平均 QPS',
|
||||
avgTps: '平均 TPS',
|
||||
avgLatency: '平均延迟',
|
||||
avgLatency: '平均请求时长',
|
||||
avgTtft: '平均首字延迟',
|
||||
exceptions: '异常数',
|
||||
requestErrors: '请求错误',
|
||||
@@ -2043,7 +2043,7 @@ export default {
|
||||
errors: '错误',
|
||||
errorRate: '错误率:',
|
||||
upstreamRate: '上游错误率:',
|
||||
latencyDuration: '延迟(毫秒)',
|
||||
latencyDuration: '请求时长(毫秒)',
|
||||
ttftLabel: '首字延迟(毫秒)',
|
||||
p50: 'p50',
|
||||
p90: 'p90',
|
||||
@@ -2063,7 +2063,7 @@ export default {
|
||||
failedToLoadData: '加载运维数据失败',
|
||||
failedToLoadOverview: '加载概览数据失败',
|
||||
failedToLoadThroughputTrend: '加载吞吐趋势失败',
|
||||
failedToLoadLatencyHistogram: '加载延迟分布失败',
|
||||
failedToLoadLatencyHistogram: '加载请求时长分布失败',
|
||||
failedToLoadErrorTrend: '加载错误趋势失败',
|
||||
failedToLoadErrorDistribution: '加载错误分布失败',
|
||||
failedToLoadErrorDetail: '加载错误详情失败',
|
||||
@@ -2071,7 +2071,7 @@ export default {
|
||||
tpsK: 'TPS(千)',
|
||||
top: '最高:',
|
||||
throughputTrend: '吞吐趋势',
|
||||
latencyHistogram: '延迟分布',
|
||||
latencyHistogram: '请求时长分布',
|
||||
errorTrend: '错误趋势',
|
||||
errorDistribution: '错误分布',
|
||||
// Health Score & Diagnosis
|
||||
@@ -2117,15 +2117,8 @@ export default {
|
||||
memoryHigh: '内存使用率偏高 ({usage}%)',
|
||||
memoryHighImpact: '内存压力较大,需要关注',
|
||||
memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
|
||||
// Latency diagnostics
|
||||
latencyCritical: '响应延迟严重过高 ({latency}ms)',
|
||||
latencyCriticalImpact: '用户体验极差,大量请求超时',
|
||||
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
|
||||
latencyHigh: '响应延迟偏高 ({latency}ms)',
|
||||
latencyHighImpact: '用户体验下降,需要优化',
|
||||
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
|
||||
ttftHigh: '首字节时间偏高 ({ttft}ms)',
|
||||
ttftHighImpact: '用户感知延迟增加',
|
||||
ttftHighImpact: '用户感知时长增加',
|
||||
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
|
||||
// Error rate diagnostics
|
||||
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
|
||||
@@ -2143,13 +2136,13 @@ export default {
|
||||
// SLA diagnostics
|
||||
slaCritical: 'SLA 严重低于目标 ({sla}%)',
|
||||
slaCriticalImpact: '用户体验严重受损',
|
||||
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
|
||||
slaCriticalAction: '紧急排查错误原因,必要时采取限流保护',
|
||||
slaLow: 'SLA 低于目标 ({sla}%)',
|
||||
slaLowImpact: '需要关注服务质量',
|
||||
slaLowAction: '分析SLA下降原因,优化系统性能',
|
||||
// Health score diagnostics
|
||||
healthCritical: '综合健康评分过低 ({score})',
|
||||
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
|
||||
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与资源使用情况',
|
||||
healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
|
||||
healthLow: '综合健康评分偏低 ({score})',
|
||||
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
|
||||
@@ -2164,7 +2157,7 @@ export default {
|
||||
context: '上下文',
|
||||
status: '状态码',
|
||||
message: '消息',
|
||||
latency: '延迟',
|
||||
latency: '请求时长',
|
||||
action: '操作',
|
||||
noErrors: '该窗口内暂无错误。',
|
||||
grp: 'GRP:',
|
||||
@@ -2193,7 +2186,7 @@ export default {
|
||||
basicInfo: '基本信息',
|
||||
platform: '平台',
|
||||
model: '模型',
|
||||
latency: '延迟',
|
||||
latency: '请求时长',
|
||||
ttft: 'TTFT',
|
||||
businessLimited: '业务限制',
|
||||
requestPath: '请求路径',
|
||||
@@ -2351,8 +2344,8 @@ export default {
|
||||
successRate: '成功率 (%)',
|
||||
errorRate: '错误率 (%)',
|
||||
upstreamErrorRate: '上游错误率 (%)',
|
||||
p95: 'P95 延迟 (ms)',
|
||||
p99: 'P99 延迟 (ms)',
|
||||
p95: 'P95 请求时长 (ms)',
|
||||
p99: 'P99 请求时长 (ms)',
|
||||
cpu: 'CPU 使用率 (%)',
|
||||
memory: '内存使用率 (%)',
|
||||
queueDepth: '并发排队深度',
|
||||
@@ -2542,8 +2535,6 @@ export default {
|
||||
metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示',
|
||||
slaMinPercent: 'SLA最低百分比',
|
||||
slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)',
|
||||
latencyP99MaxMs: '延迟P99最大值(毫秒)',
|
||||
latencyP99MaxMsHint: '延迟P99高于此值时显示为红色(默认:2000ms)',
|
||||
ttftP99MaxMs: 'TTFT P99最大值(毫秒)',
|
||||
ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色(默认:500ms)',
|
||||
requestErrorRateMaxPercent: '请求错误率最大值(%)',
|
||||
@@ -2602,12 +2593,12 @@ export default {
|
||||
tooltips: {
|
||||
totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
|
||||
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
|
||||
latencyHistogram: '成功请求的延迟分布(毫秒)。',
|
||||
latencyHistogram: '成功请求的请求时长分布(毫秒)。',
|
||||
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
|
||||
errorDistribution: '按状态码统计的错误分布。',
|
||||
upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。',
|
||||
goroutines:
|
||||
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
|
||||
'Go 运行时的协程数量(轻量级线程)。没有绝对"安全值",建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列上升时,优先排查阻塞/泄漏。',
|
||||
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
|
||||
memory: '内存使用率,包括已使用和总可用内存。',
|
||||
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
|
||||
@@ -2617,7 +2608,7 @@ export default {
|
||||
tokens: '当前时间窗口内处理的总Token数量。',
|
||||
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
|
||||
errors: '错误统计,包括总错误数、错误率和上游错误率。',
|
||||
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
|
||||
latency: '请求时长统计,包括 p50、p90、p95、p99 等百分位数。',
|
||||
ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。',
|
||||
health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。'
|
||||
},
|
||||
|
||||
@@ -140,24 +140,6 @@ const metricDefinitions = computed(() => {
|
||||
recommendedThreshold: 1,
|
||||
unit: '%'
|
||||
},
|
||||
{
|
||||
type: 'p95_latency_ms',
|
||||
group: 'system',
|
||||
label: t('admin.ops.alertRules.metrics.p95'),
|
||||
description: t('admin.ops.alertRules.metricDescriptions.p95'),
|
||||
recommendedOperator: '>',
|
||||
recommendedThreshold: 1000,
|
||||
unit: 'ms'
|
||||
},
|
||||
{
|
||||
type: 'p99_latency_ms',
|
||||
group: 'system',
|
||||
label: t('admin.ops.alertRules.metrics.p99'),
|
||||
description: t('admin.ops.alertRules.metricDescriptions.p99'),
|
||||
recommendedOperator: '>',
|
||||
recommendedThreshold: 2000,
|
||||
unit: 'ms'
|
||||
},
|
||||
{
|
||||
type: 'cpu_usage_percent',
|
||||
group: 'system',
|
||||
|
||||
@@ -169,8 +169,8 @@ const updatedAtLabel = computed(() => {
|
||||
return props.lastUpdated.toLocaleTimeString()
|
||||
})
|
||||
|
||||
// --- Color coding for latency/TTFT ---
|
||||
function getLatencyColor(ms: number | null | undefined): string {
|
||||
// --- Color coding for TTFT ---
|
||||
function getTTFTColor(ms: number | null | undefined): string {
|
||||
if (ms == null) return 'text-gray-900 dark:text-white'
|
||||
if (ms < 500) return 'text-green-600 dark:text-green-400'
|
||||
if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400'
|
||||
@@ -186,13 +186,6 @@ function isSLABelowThreshold(slaPercent: number | null): boolean {
|
||||
return slaPercent < threshold
|
||||
}
|
||||
|
||||
function isLatencyAboveThreshold(latencyP99Ms: number | null): boolean {
|
||||
if (latencyP99Ms == null) return false
|
||||
const threshold = props.thresholds?.latency_p99_ms_max
|
||||
if (threshold == null) return false
|
||||
return latencyP99Ms > threshold
|
||||
}
|
||||
|
||||
function isTTFTAboveThreshold(ttftP99Ms: number | null): boolean {
|
||||
if (ttftP99Ms == null) return false
|
||||
const threshold = props.thresholds?.ttft_p99_ms_max
|
||||
@@ -482,24 +475,6 @@ const diagnosisReport = computed<DiagnosisItem[]>(() => {
|
||||
}
|
||||
}
|
||||
|
||||
// Latency diagnostics
|
||||
const durationP99 = ov.duration?.p99_ms ?? 0
|
||||
if (durationP99 > 2000) {
|
||||
report.push({
|
||||
type: 'critical',
|
||||
message: t('admin.ops.diagnosis.latencyCritical', { latency: durationP99.toFixed(0) }),
|
||||
impact: t('admin.ops.diagnosis.latencyCriticalImpact'),
|
||||
action: t('admin.ops.diagnosis.latencyCriticalAction')
|
||||
})
|
||||
} else if (durationP99 > 1000) {
|
||||
report.push({
|
||||
type: 'warning',
|
||||
message: t('admin.ops.diagnosis.latencyHigh', { latency: durationP99.toFixed(0) }),
|
||||
impact: t('admin.ops.diagnosis.latencyHighImpact'),
|
||||
action: t('admin.ops.diagnosis.latencyHighAction')
|
||||
})
|
||||
}
|
||||
|
||||
const ttftP99 = ov.ttft?.p99_ms ?? 0
|
||||
if (ttftP99 > 500) {
|
||||
report.push({
|
||||
@@ -1181,7 +1156,7 @@ function handleToolbarRefresh() {
|
||||
<!-- Right: 6 cards (3 cols x 2 rows) -->
|
||||
<div class="grid h-full grid-cols-1 content-center gap-4 sm:grid-cols-2 lg:col-span-7 lg:grid-cols-3">
|
||||
<!-- Card 1: Requests -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 1;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-1">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestsTitle') }}</span>
|
||||
@@ -1217,7 +1192,7 @@ function handleToolbarRefresh() {
|
||||
</div>
|
||||
|
||||
<!-- Card 2: SLA -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 2;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-2">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">SLA</span>
|
||||
@@ -1247,8 +1222,8 @@ function handleToolbarRefresh() {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Card 3: Latency (Duration) -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<!-- Card 4: Request Duration -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 4;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-1">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.latencyDuration') }}</span>
|
||||
@@ -1264,7 +1239,7 @@ function handleToolbarRefresh() {
|
||||
</button>
|
||||
</div>
|
||||
<div class="mt-2 flex items-baseline gap-2">
|
||||
<div class="text-3xl font-black" :class="isLatencyAboveThreshold(durationP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(durationP99Ms)">
|
||||
<div class="text-3xl font-black text-gray-900 dark:text-white">
|
||||
{{ durationP99Ms ?? '-' }}
|
||||
</div>
|
||||
<span class="text-xs font-bold text-gray-400">ms (P99)</span>
|
||||
@@ -1272,34 +1247,34 @@ function handleToolbarRefresh() {
|
||||
<div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P95:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(durationP95Ms)">{{ durationP95Ms ?? '-' }}</span>
|
||||
<span class="font-bold text-gray-900 dark:text-white">{{ durationP95Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P90:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(durationP90Ms)">{{ durationP90Ms ?? '-' }}</span>
|
||||
<span class="font-bold text-gray-900 dark:text-white">{{ durationP90Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P50:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(durationP50Ms)">{{ durationP50Ms ?? '-' }}</span>
|
||||
<span class="font-bold text-gray-900 dark:text-white">{{ durationP50Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">Avg:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(durationAvgMs)">{{ durationAvgMs ?? '-' }}</span>
|
||||
<span class="font-bold text-gray-900 dark:text-white">{{ durationAvgMs ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">Max:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(durationMaxMs)">{{ durationMaxMs ?? '-' }}</span>
|
||||
<span class="font-bold text-gray-900 dark:text-white">{{ durationMaxMs ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Card 4: TTFT -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<!-- Card 5: TTFT -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 5;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-1">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">TTFT</span>
|
||||
@@ -1315,7 +1290,7 @@ function handleToolbarRefresh() {
|
||||
</button>
|
||||
</div>
|
||||
<div class="mt-2 flex items-baseline gap-2">
|
||||
<div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(ttftP99Ms)">
|
||||
<div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getTTFTColor(ttftP99Ms)">
|
||||
{{ ttftP99Ms ?? '-' }}
|
||||
</div>
|
||||
<span class="text-xs font-bold text-gray-400">ms (P99)</span>
|
||||
@@ -1323,34 +1298,34 @@ function handleToolbarRefresh() {
|
||||
<div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P95:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span>
|
||||
<span class="font-bold" :class="getTTFTColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P90:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span>
|
||||
<span class="font-bold" :class="getTTFTColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">P50:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span>
|
||||
<span class="font-bold" :class="getTTFTColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">Avg:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span>
|
||||
<span class="font-bold" :class="getTTFTColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
|
||||
<span class="text-gray-500">Max:</span>
|
||||
<span class="font-bold" :class="getLatencyColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span>
|
||||
<span class="font-bold" :class="getTTFTColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span>
|
||||
<span class="text-gray-400">ms</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Card 5: Request Errors -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<!-- Card 3: Request Errors -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 3;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-1">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestErrors') }}</span>
|
||||
@@ -1376,7 +1351,7 @@ function handleToolbarRefresh() {
|
||||
</div>
|
||||
|
||||
<!-- Card 6: Upstream Errors -->
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
|
||||
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 6;">
|
||||
<div class="flex items-center justify-between">
|
||||
<div class="flex items-center gap-1">
|
||||
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.upstreamErrors') }}</span>
|
||||
|
||||
@@ -205,12 +205,13 @@ watch(
|
||||
<div class="flex h-full min-h-0 flex-col">
|
||||
<!-- Filters -->
|
||||
<div class="mb-4 flex-shrink-0 border-b border-gray-200 pb-4 dark:border-dark-700">
|
||||
<div class="grid grid-cols-1 gap-4 lg:grid-cols-14">
|
||||
<div class="lg:col-span-4">
|
||||
<div class="relative group">
|
||||
<div class="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3.5">
|
||||
<div class="flex flex-col gap-2">
|
||||
<!-- 第一行: 搜索框 -->
|
||||
<div class="flex items-center gap-2">
|
||||
<div class="relative flex-1 group">
|
||||
<div class="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3">
|
||||
<svg
|
||||
class="h-4 w-4 text-gray-400 transition-colors group-focus-within:text-blue-500"
|
||||
class="h-3.5 w-3.5 text-gray-400 transition-colors group-focus-within:text-blue-500"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
@@ -221,42 +222,45 @@ watch(
|
||||
<input
|
||||
v-model="q"
|
||||
type="text"
|
||||
class="w-full rounded-2xl border-gray-200 bg-gray-50/50 py-2 pl-10 pr-4 text-sm font-medium text-gray-700 transition-all focus:border-blue-500 focus:bg-white focus:ring-4 focus:ring-blue-500/10 dark:border-dark-700 dark:bg-dark-900 dark:text-gray-300 dark:focus:bg-dark-800"
|
||||
class="w-full rounded-lg border-gray-200 bg-gray-50/50 py-1.5 pl-9 pr-3 text-xs font-medium text-gray-700 transition-all focus:border-blue-500 focus:bg-white focus:ring-2 focus:ring-blue-500/10 dark:border-dark-700 dark:bg-dark-900 dark:text-gray-300 dark:focus:bg-dark-800"
|
||||
:placeholder="t('admin.ops.errorDetails.searchPlaceholder')"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-2">
|
||||
<Select :model-value="statusCode" :options="statusCodeSelectOptions" class="w-full" @update:model-value="statusCode = $event as any" />
|
||||
</div>
|
||||
<!-- 第二行: 筛选选项 -->
|
||||
<div class="grid grid-cols-6 gap-2">
|
||||
<div class="col-span-1">
|
||||
<Select :model-value="statusCode" :options="statusCodeSelectOptions" size="sm" @update:model-value="statusCode = $event as any" />
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-2">
|
||||
<Select :model-value="phase" :options="phaseSelectOptions" class="w-full" @update:model-value="phase = String($event ?? '')" />
|
||||
</div>
|
||||
<div class="col-span-1">
|
||||
<Select :model-value="phase" :options="phaseSelectOptions" size="sm" @update:model-value="phase = String($event ?? '')" />
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-2">
|
||||
<Select :model-value="errorOwner" :options="ownerSelectOptions" class="w-full" @update:model-value="errorOwner = String($event ?? '')" />
|
||||
</div>
|
||||
<div class="col-span-1">
|
||||
<Select :model-value="errorOwner" :options="ownerSelectOptions" size="sm" @update:model-value="errorOwner = String($event ?? '')" />
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-2">
|
||||
<Select :model-value="resolvedStatus" :options="resolvedSelectOptions" class="w-full" @update:model-value="resolvedStatus = String($event ?? 'unresolved')" />
|
||||
</div>
|
||||
<div class="col-span-1">
|
||||
<Select :model-value="resolvedStatus" :options="resolvedSelectOptions" size="sm" @update:model-value="resolvedStatus = String($event ?? 'unresolved')" />
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-1">
|
||||
<input
|
||||
v-model="accountIdInput"
|
||||
type="text"
|
||||
inputmode="numeric"
|
||||
class="input w-full text-sm"
|
||||
:placeholder="t('admin.ops.errorDetails.accountIdPlaceholder')"
|
||||
/>
|
||||
</div>
|
||||
<div class="col-span-1">
|
||||
<input
|
||||
v-model="accountIdInput"
|
||||
type="text"
|
||||
inputmode="numeric"
|
||||
class="w-full rounded-lg border-gray-200 bg-gray-50/50 py-1.5 px-3 text-xs font-medium text-gray-700 transition-all focus:border-blue-500 focus:bg-white focus:ring-2 focus:ring-blue-500/10 dark:border-dark-700 dark:bg-dark-900 dark:text-gray-300 dark:focus:bg-dark-800"
|
||||
:placeholder="t('admin.ops.errorDetails.accountIdPlaceholder')"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="lg:col-span-1 flex items-center justify-end">
|
||||
<button type="button" class="btn btn-secondary btn-sm" @click="resetFilters">
|
||||
{{ t('common.reset') }}
|
||||
</button>
|
||||
<div class="col-span-1 flex items-center justify-end">
|
||||
<button type="button" class="rounded-lg bg-gray-100 px-3 py-1.5 text-xs font-semibold text-gray-700 transition-colors hover:bg-gray-200 dark:bg-dark-700 dark:text-gray-300 dark:hover:bg-dark-600" @click="resetFilters">
|
||||
{{ t('common.reset') }}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -53,11 +53,6 @@ function validateRuntimeSettings(settings: OpsAlertRuntimeSettings): ValidationR
|
||||
errors.push('SLA 最低值必须在 0-100 之间')
|
||||
}
|
||||
}
|
||||
if (thresholds.latency_p99_ms_max != null) {
|
||||
if (!Number.isFinite(thresholds.latency_p99_ms_max) || thresholds.latency_p99_ms_max < 0) {
|
||||
errors.push('延迟 P99 最大值必须大于或等于 0')
|
||||
}
|
||||
}
|
||||
if (thresholds.ttft_p99_ms_max != null) {
|
||||
if (!Number.isFinite(thresholds.ttft_p99_ms_max) || thresholds.ttft_p99_ms_max < 0) {
|
||||
errors.push('TTFT P99 最大值必须大于或等于 0')
|
||||
@@ -163,7 +158,6 @@ function openAlertEditor() {
|
||||
if (!draftAlert.value.thresholds) {
|
||||
draftAlert.value.thresholds = {
|
||||
sla_percent_min: 99.5,
|
||||
latency_p99_ms_max: 2000,
|
||||
ttft_p99_ms_max: 500,
|
||||
request_error_rate_percent_max: 5,
|
||||
upstream_error_rate_percent_max: 5
|
||||
@@ -353,18 +347,7 @@ onMounted(() => {
|
||||
<p class="mt-1 text-xs text-gray-500 dark:text-gray-400">SLA 低于此值时将显示为红色</p>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<div class="mb-1 text-xs font-medium text-gray-600 dark:text-gray-300">延迟 P99 最大值 (ms)</div>
|
||||
<input
|
||||
v-model.number="draftAlert.thresholds.latency_p99_ms_max"
|
||||
type="number"
|
||||
min="0"
|
||||
step="100"
|
||||
class="input"
|
||||
placeholder="2000"
|
||||
/>
|
||||
<p class="mt-1 text-xs text-gray-500 dark:text-gray-400">延迟 P99 高于此值时将显示为红色</p>
|
||||
</div>
|
||||
|
||||
|
||||
<div>
|
||||
<div class="mb-1 text-xs font-medium text-gray-600 dark:text-gray-300">TTFT P99 最大值 (ms)</div>
|
||||
|
||||
@@ -32,7 +32,6 @@ const advancedSettings = ref<OpsAdvancedSettings | null>(null)
|
||||
// 指标阈值配置
|
||||
const metricThresholds = ref<OpsMetricThresholds>({
|
||||
sla_percent_min: 99.5,
|
||||
latency_p99_ms_max: 2000,
|
||||
ttft_p99_ms_max: 500,
|
||||
request_error_rate_percent_max: 5,
|
||||
upstream_error_rate_percent_max: 5
|
||||
@@ -53,13 +52,12 @@ async function loadAllSettings() {
|
||||
advancedSettings.value = advanced
|
||||
// 如果后端返回了阈值,使用后端的值;否则保持默认值
|
||||
if (thresholds && Object.keys(thresholds).length > 0) {
|
||||
metricThresholds.value = {
|
||||
sla_percent_min: thresholds.sla_percent_min ?? 99.5,
|
||||
latency_p99_ms_max: thresholds.latency_p99_ms_max ?? 2000,
|
||||
ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500,
|
||||
request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5,
|
||||
upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 5
|
||||
}
|
||||
metricThresholds.value = {
|
||||
sla_percent_min: thresholds.sla_percent_min ?? 99.5,
|
||||
ttft_p99_ms_max: thresholds.ttft_p99_ms_max ?? 500,
|
||||
request_error_rate_percent_max: thresholds.request_error_rate_percent_max ?? 5,
|
||||
upstream_error_rate_percent_max: thresholds.upstream_error_rate_percent_max ?? 5
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.error('[OpsSettingsDialog] Failed to load settings', err)
|
||||
@@ -161,9 +159,6 @@ const validation = computed(() => {
|
||||
if (metricThresholds.value.sla_percent_min != null && (metricThresholds.value.sla_percent_min < 0 || metricThresholds.value.sla_percent_min > 100)) {
|
||||
errors.push('SLA最低百分比必须在0-100之间')
|
||||
}
|
||||
if (metricThresholds.value.latency_p99_ms_max != null && metricThresholds.value.latency_p99_ms_max < 0) {
|
||||
errors.push('延迟P99最大值必须大于等于0')
|
||||
}
|
||||
if (metricThresholds.value.ttft_p99_ms_max != null && metricThresholds.value.ttft_p99_ms_max < 0) {
|
||||
errors.push('TTFT P99最大值必须大于等于0')
|
||||
}
|
||||
@@ -362,17 +357,6 @@ async function saveAllSettings() {
|
||||
<p class="mt-1 text-xs text-gray-500">{{ t('admin.ops.settings.slaMinPercentHint') }}</p>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<label class="input-label">{{ t('admin.ops.settings.latencyP99MaxMs') }}</label>
|
||||
<input
|
||||
v-model.number="metricThresholds.latency_p99_ms_max"
|
||||
type="number"
|
||||
min="0"
|
||||
step="100"
|
||||
class="input"
|
||||
/>
|
||||
<p class="mt-1 text-xs text-gray-500">{{ t('admin.ops.settings.latencyP99MaxMsHint') }}</p>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<label class="input-label">{{ t('admin.ops.settings.ttftP99MaxMs') }}</label>
|
||||
|
||||
Reference in New Issue
Block a user