feat(ops): 后端添加指标阈值管理API
- 新增GetMetricThresholds和UpdateMetricThresholds接口 - 支持配置SLA、延迟P99、TTFT P99、请求错误率、上游错误率阈值 - 添加参数验证逻辑 - 提供默认阈值配置
This commit is contained in:
@@ -146,3 +146,50 @@ func (h *OpsHandler) UpdateAdvancedSettings(c *gin.Context) {
|
||||
}
|
||||
response.Success(c, updated)
|
||||
}
|
||||
|
||||
// GetMetricThresholds returns Ops metric thresholds (DB-backed).
|
||||
// GET /api/v1/admin/ops/settings/metric-thresholds
|
||||
func (h *OpsHandler) GetMetricThresholds(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
cfg, err := h.opsService.GetMetricThresholds(c.Request.Context())
|
||||
if err != nil {
|
||||
response.Error(c, http.StatusInternalServerError, "Failed to get metric thresholds")
|
||||
return
|
||||
}
|
||||
response.Success(c, cfg)
|
||||
}
|
||||
|
||||
// UpdateMetricThresholds updates Ops metric thresholds (DB-backed).
|
||||
// PUT /api/v1/admin/ops/settings/metric-thresholds
|
||||
func (h *OpsHandler) UpdateMetricThresholds(c *gin.Context) {
|
||||
if h.opsService == nil {
|
||||
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
|
||||
return
|
||||
}
|
||||
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
|
||||
response.ErrorFrom(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
var req service.OpsMetricThresholds
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
response.BadRequest(c, "Invalid request body")
|
||||
return
|
||||
}
|
||||
|
||||
updated, err := h.opsService.UpdateMetricThresholds(c.Request.Context(), &req)
|
||||
if err != nil {
|
||||
response.Error(c, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
response.Success(c, updated)
|
||||
}
|
||||
|
||||
|
||||
@@ -96,6 +96,13 @@ func registerOpsRoutes(admin *gin.RouterGroup, h *handler.Handlers) {
|
||||
ops.GET("/advanced-settings", h.Admin.Ops.GetAdvancedSettings)
|
||||
ops.PUT("/advanced-settings", h.Admin.Ops.UpdateAdvancedSettings)
|
||||
|
||||
// Settings group (DB-backed)
|
||||
settings := ops.Group("/settings")
|
||||
{
|
||||
settings.GET("/metric-thresholds", h.Admin.Ops.GetMetricThresholds)
|
||||
settings.PUT("/metric-thresholds", h.Admin.Ops.UpdateMetricThresholds)
|
||||
}
|
||||
|
||||
// WebSocket realtime (QPS/TPS)
|
||||
ws := ops.Group("/ws")
|
||||
{
|
||||
|
||||
@@ -463,3 +463,94 @@ func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdva
|
||||
_ = json.Unmarshal(raw, updated)
|
||||
return updated, nil
|
||||
}
|
||||
|
||||
// =========================
|
||||
// Metric thresholds
|
||||
// =========================
|
||||
|
||||
const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
|
||||
|
||||
func defaultOpsMetricThresholds() *OpsMetricThresholds {
|
||||
slaMin := 99.5
|
||||
latencyMax := 2000.0
|
||||
ttftMax := 500.0
|
||||
reqErrMax := 5.0
|
||||
upstreamErrMax := 5.0
|
||||
return &OpsMetricThresholds{
|
||||
SLAPercentMin: &slaMin,
|
||||
LatencyP99MsMax: &latencyMax,
|
||||
TTFTp99MsMax: &ttftMax,
|
||||
RequestErrorRatePercentMax: &reqErrMax,
|
||||
UpstreamErrorRatePercentMax: &upstreamErrMax,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
|
||||
defaultCfg := defaultOpsMetricThresholds()
|
||||
if s == nil || s.settingRepo == nil {
|
||||
return defaultCfg, nil
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
|
||||
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrSettingNotFound) {
|
||||
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
|
||||
_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(b))
|
||||
}
|
||||
return defaultCfg, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cfg := &OpsMetricThresholds{}
|
||||
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
|
||||
return defaultCfg, nil
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
|
||||
if s == nil || s.settingRepo == nil {
|
||||
return nil, errors.New("setting repository not initialized")
|
||||
}
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if cfg == nil {
|
||||
return nil, errors.New("invalid config")
|
||||
}
|
||||
|
||||
// Validate thresholds
|
||||
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
|
||||
return nil, errors.New("sla_percent_min must be between 0 and 100")
|
||||
}
|
||||
if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
|
||||
return nil, errors.New("latency_p99_ms_max must be >= 0")
|
||||
}
|
||||
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
|
||||
return nil, errors.New("ttft_p99_ms_max must be >= 0")
|
||||
}
|
||||
if cfg.RequestErrorRatePercentMax != nil && (*cfg.RequestErrorRatePercentMax < 0 || *cfg.RequestErrorRatePercentMax > 100) {
|
||||
return nil, errors.New("request_error_rate_percent_max must be between 0 and 100")
|
||||
}
|
||||
if cfg.UpstreamErrorRatePercentMax != nil && (*cfg.UpstreamErrorRatePercentMax < 0 || *cfg.UpstreamErrorRatePercentMax > 100) {
|
||||
return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100")
|
||||
}
|
||||
|
||||
raw, err := json.Marshal(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(raw)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
updated := &OpsMetricThresholds{}
|
||||
_ = json.Unmarshal(raw, updated)
|
||||
return updated, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -61,11 +61,20 @@ type OpsAlertSilencingSettings struct {
|
||||
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
|
||||
}
|
||||
|
||||
type OpsMetricThresholds struct {
|
||||
SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红
|
||||
LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // 延迟P99高于此值变红
|
||||
TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红
|
||||
RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红
|
||||
UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红
|
||||
}
|
||||
|
||||
type OpsAlertRuntimeSettings struct {
|
||||
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
|
||||
|
||||
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
|
||||
Silencing OpsAlertSilencingSettings `json:"silencing"`
|
||||
Thresholds OpsMetricThresholds `json:"thresholds"` // 指标阈值配置
|
||||
}
|
||||
|
||||
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
|
||||
|
||||
Reference in New Issue
Block a user