后端改进: - 新增健康评分计算服务(ops_health_score.go) - 添加分布式锁支持(ops_advisory_lock.go) - 优化指标采集和聚合逻辑 - 新增运维指标采集间隔配置(60-3600秒) - 移除未使用的WebSocket查询token认证中间件 - 改进清理服务和告警评估逻辑 前端改进: - 简化OpsDashboard组件结构 - 完善国际化文本(中英文) - 新增运维监控相关API类型定义 - 添加运维指标采集间隔设置界面 - 优化错误详情模态框 测试: - 添加健康评分单元测试 - 更新API契约测试
88 lines
2.7 KiB
Go
88 lines
2.7 KiB
Go
package service
|
|
|
|
import "time"
|
|
|
|
type OpsDashboardFilter struct {
|
|
StartTime time.Time
|
|
EndTime time.Time
|
|
|
|
Platform string
|
|
GroupID *int64
|
|
|
|
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
|
|
// Expected values: auto/raw/preagg (see OpsQueryMode).
|
|
QueryMode OpsQueryMode
|
|
}
|
|
|
|
// OpsRateSummary summarizes a rate metric as its current, peak and average
// values. OpsDashboardOverview uses it for both QPS and TPS.
type OpsRateSummary struct {
	Current float64 `json:"current"`
	Peak    float64 `json:"peak"`
	Avg     float64 `json:"avg"`
}
|
|
|
|
// OpsPercentiles holds a set of latency statistics (p50/p90/p95/p99 plus
// average and maximum). The JSON keys carry an "_ms" suffix, so values are
// expected to be in milliseconds.
//
// Every field is a pointer, so a nil field is serialized as JSON null rather
// than 0.
type OpsPercentiles struct {
	P50 *int `json:"p50_ms"`
	P90 *int `json:"p90_ms"`
	P95 *int `json:"p95_ms"`
	P99 *int `json:"p99_ms"`
	Avg *int `json:"avg_ms"`
	Max *int `json:"max_ms"`
}
|
|
|
|
type OpsDashboardOverview struct {
|
|
StartTime time.Time `json:"start_time"`
|
|
EndTime time.Time `json:"end_time"`
|
|
Platform string `json:"platform"`
|
|
GroupID *int64 `json:"group_id"`
|
|
|
|
// HealthScore is a backend-computed overall health score (0-100).
|
|
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
|
|
HealthScore int `json:"health_score"`
|
|
|
|
// Latest system-level snapshot (window=1m, global).
|
|
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
|
|
|
|
// Background jobs health (heartbeats).
|
|
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
|
|
|
|
SuccessCount int64 `json:"success_count"`
|
|
ErrorCountTotal int64 `json:"error_count_total"`
|
|
BusinessLimitedCount int64 `json:"business_limited_count"`
|
|
|
|
ErrorCountSLA int64 `json:"error_count_sla"`
|
|
RequestCountTotal int64 `json:"request_count_total"`
|
|
RequestCountSLA int64 `json:"request_count_sla"`
|
|
|
|
TokenConsumed int64 `json:"token_consumed"`
|
|
|
|
SLA float64 `json:"sla"`
|
|
ErrorRate float64 `json:"error_rate"`
|
|
UpstreamErrorRate float64 `json:"upstream_error_rate"`
|
|
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
|
|
Upstream429Count int64 `json:"upstream_429_count"`
|
|
Upstream529Count int64 `json:"upstream_529_count"`
|
|
|
|
QPS OpsRateSummary `json:"qps"`
|
|
TPS OpsRateSummary `json:"tps"`
|
|
|
|
Duration OpsPercentiles `json:"duration"`
|
|
TTFT OpsPercentiles `json:"ttft"`
|
|
}
|
|
|
|
// OpsLatencyHistogramBucket is one bucket of a latency histogram: a label for
// the bucket's range and the count that fell into it. The label format is
// chosen by the code that builds the histogram, not by this type.
type OpsLatencyHistogramBucket struct {
	Range string `json:"range"`
	Count int64  `json:"count"`
}
|
|
|
|
// OpsLatencyHistogramResponse is a coarse latency distribution histogram (success requests only).
|
|
// It is used by the Ops dashboard to quickly identify tail latency regressions.
|
|
type OpsLatencyHistogramResponse struct {
|
|
StartTime time.Time `json:"start_time"`
|
|
EndTime time.Time `json:"end_time"`
|
|
Platform string `json:"platform"`
|
|
GroupID *int64 `json:"group_id"`
|
|
|
|
TotalRequests int64 `json:"total_requests"`
|
|
Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
|
|
}
|