feat(ops): 优化健康评分算法和智能诊断机制
- 采用分层加权评分(业务70% + 基础设施30%),避免重复扣分
- 新增延迟诊断(P99 > 2s critical, > 1s warning)
- 新增资源诊断(CPU/内存/DB/Redis状态)
- 调整诊断阈值(上游错误率5% critical,请求错误率3% critical)
- 为每个诊断项添加可操作建议
- 添加完整的单元测试覆盖(30+测试用例)
- 完善中英文国际化文本
This commit is contained in:
@@ -1928,22 +1928,62 @@ export default {
|
||||
footer: 'Automated diagnostic suggestions based on current metrics',
|
||||
idle: 'System is currently idle',
|
||||
idleImpact: 'No active traffic',
|
||||
// Resource diagnostics
|
||||
dbDown: 'Database connection failed',
|
||||
dbDownImpact: 'All database operations will fail',
|
||||
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
|
||||
redisDown: 'Redis connection failed',
|
||||
redisDownImpact: 'Cache functionality degraded, performance may decline',
|
||||
redisDownAction: 'Check Redis service status and network connectivity',
|
||||
cpuCritical: 'CPU usage critically high ({usage}%)',
|
||||
cpuCriticalImpact: 'System response slowing, may affect all requests',
|
||||
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
|
||||
cpuHigh: 'CPU usage elevated ({usage}%)',
|
||||
cpuHighImpact: 'System load is high, needs attention',
|
||||
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
|
||||
memoryCritical: 'Memory usage critically high ({usage}%)',
|
||||
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
|
||||
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
|
||||
memoryHigh: 'Memory usage elevated ({usage}%)',
|
||||
memoryHighImpact: 'Memory pressure is high, needs attention',
|
||||
memoryHighAction: 'Monitor memory trends, check for memory leaks',
|
||||
// Latency diagnostics
|
||||
latencyCritical: 'Response latency critically high ({latency}ms)',
|
||||
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
|
||||
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
|
||||
latencyHigh: 'Response latency elevated ({latency}ms)',
|
||||
latencyHighImpact: 'User experience degraded, needs optimization',
|
||||
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
|
||||
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
|
||||
ttftHighImpact: 'User perceived latency increased',
|
||||
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
|
||||
// Error rate diagnostics
|
||||
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
|
||||
upstreamCriticalImpact: 'May affect many user requests',
|
||||
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
|
||||
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
|
||||
upstreamHighImpact: 'Recommend checking upstream service status',
|
||||
slaCritical: 'SLA critically below target ({sla}%)',
|
||||
slaCriticalImpact: 'User experience severely degraded',
|
||||
slaLow: 'SLA below target ({sla}%)',
|
||||
slaLowImpact: 'Service quality needs attention',
|
||||
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
|
||||
errorHigh: 'Error rate too high ({rate}%)',
|
||||
errorHighImpact: 'Many requests failing',
|
||||
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
|
||||
errorElevated: 'Error rate elevated ({rate}%)',
|
||||
errorElevatedImpact: 'Recommend checking error logs',
|
||||
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
|
||||
// SLA diagnostics
|
||||
slaCritical: 'SLA critically below target ({sla}%)',
|
||||
slaCriticalImpact: 'User experience severely degraded',
|
||||
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
|
||||
slaLow: 'SLA below target ({sla}%)',
|
||||
slaLowImpact: 'Service quality needs attention',
|
||||
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
|
||||
// Health score diagnostics
|
||||
healthCritical: 'Overall health score critically low ({score})',
|
||||
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
|
||||
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
|
||||
healthLow: 'Overall health score low ({score})',
|
||||
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
|
||||
healthLowAction: 'Monitor metric trends, prevent issue escalation',
|
||||
healthy: 'All system metrics normal',
|
||||
healthyImpact: 'Service running stable'
|
||||
},
|
||||
|
||||
@@ -2074,22 +2074,62 @@ export default {
|
||||
footer: '基于当前指标的自动诊断建议',
|
||||
idle: '系统当前处于待机状态',
|
||||
idleImpact: '无活跃流量',
|
||||
// Resource diagnostics
|
||||
dbDown: '数据库连接失败',
|
||||
dbDownImpact: '所有数据库操作将失败',
|
||||
dbDownAction: '检查数据库服务状态、网络连接和连接配置',
|
||||
redisDown: 'Redis连接失败',
|
||||
redisDownImpact: '缓存功能降级,性能可能下降',
|
||||
redisDownAction: '检查Redis服务状态和网络连接',
|
||||
cpuCritical: 'CPU使用率严重过高 ({usage}%)',
|
||||
cpuCriticalImpact: '系统响应变慢,可能影响所有请求',
|
||||
cpuCriticalAction: '检查CPU密集型任务,考虑扩容或优化代码',
|
||||
cpuHigh: 'CPU使用率偏高 ({usage}%)',
|
||||
cpuHighImpact: '系统负载较高,需要关注',
|
||||
cpuHighAction: '监控CPU趋势,准备扩容方案',
|
||||
memoryCritical: '内存使用率严重过高 ({usage}%)',
|
||||
memoryCriticalImpact: '可能触发OOM,系统稳定性受威胁',
|
||||
memoryCriticalAction: '检查内存泄漏,考虑增加内存或优化内存使用',
|
||||
memoryHigh: '内存使用率偏高 ({usage}%)',
|
||||
memoryHighImpact: '内存压力较大,需要关注',
|
||||
memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
|
||||
// Latency diagnostics
|
||||
latencyCritical: '响应延迟严重过高 ({latency}ms)',
|
||||
latencyCriticalImpact: '用户体验极差,大量请求超时',
|
||||
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
|
||||
latencyHigh: '响应延迟偏高 ({latency}ms)',
|
||||
latencyHighImpact: '用户体验下降,需要优化',
|
||||
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
|
||||
ttftHigh: '首字节时间偏高 ({ttft}ms)',
|
||||
ttftHighImpact: '用户感知延迟增加',
|
||||
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
|
||||
// Error rate diagnostics
|
||||
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
|
||||
upstreamCriticalImpact: '可能影响大量用户请求',
|
||||
upstreamCriticalAction: '检查上游服务健康状态,启用降级策略',
|
||||
upstreamHigh: '上游错误率偏高 ({rate}%)',
|
||||
upstreamHighImpact: '建议检查上游服务状态',
|
||||
slaCritical: 'SLA 严重低于目标 ({sla}%)',
|
||||
slaCriticalImpact: '用户体验严重受损',
|
||||
slaLow: 'SLA 低于目标 ({sla}%)',
|
||||
slaLowImpact: '需要关注服务质量',
|
||||
upstreamHighAction: '联系上游服务团队,准备降级方案',
|
||||
errorHigh: '错误率过高 ({rate}%)',
|
||||
errorHighImpact: '大量请求失败',
|
||||
errorHighAction: '查看错误日志,定位错误根因,紧急修复',
|
||||
errorElevated: '错误率偏高 ({rate}%)',
|
||||
errorElevatedImpact: '建议检查错误日志',
|
||||
errorElevatedAction: '分析错误类型和分布,制定修复计划',
|
||||
// SLA diagnostics
|
||||
slaCritical: 'SLA 严重低于目标 ({sla}%)',
|
||||
slaCriticalImpact: '用户体验严重受损',
|
||||
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
|
||||
slaLow: 'SLA 低于目标 ({sla}%)',
|
||||
slaLowImpact: '需要关注服务质量',
|
||||
slaLowAction: '分析SLA下降原因,优化系统性能',
|
||||
// Health score diagnostics
|
||||
healthCritical: '综合健康评分过低 ({score})',
|
||||
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
|
||||
healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
|
||||
healthLow: '综合健康评分偏低 ({score})',
|
||||
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
|
||||
healthLowAction: '监控指标趋势,预防问题恶化',
|
||||
healthy: '所有系统指标正常',
|
||||
healthyImpact: '服务运行稳定'
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user