feat(ops): 后端添加指标阈值管理API

- 新增GetMetricThresholds和UpdateMetricThresholds接口
- 支持配置SLA、延迟P99、TTFT P99、请求错误率、上游错误率阈值
- 添加参数验证逻辑
- 提供默认阈值配置
This commit is contained in:
IanShaw027
2026-01-12 11:42:56 +08:00
parent cc4910dd30
commit 7536dbfee5
4 changed files with 154 additions and 0 deletions

View File

@@ -463,3 +463,94 @@ func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdva
_ = json.Unmarshal(raw, updated)
return updated, nil
}
// =========================
// Metric thresholds
// =========================
// SettingKeyOpsMetricThresholds is the setting-repository key under which the
// JSON-encoded OpsMetricThresholds value is read and written.
const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
// defaultOpsMetricThresholds returns the built-in threshold set used when no
// configuration is available: SLA minimum 99.5, latency P99 maximum 2000,
// TTFT P99 maximum 500, and request/upstream error-rate maximum 5 each.
//
// Every call yields a new struct whose fields point at newly allocated values,
// so the result can be modified without affecting later calls.
func defaultOpsMetricThresholds() *OpsMetricThresholds {
	// All fields are *float64; the helper gives each default its own address.
	ptr := func(v float64) *float64 { return &v }

	defaults := new(OpsMetricThresholds)
	defaults.SLAPercentMin = ptr(99.5)
	defaults.LatencyP99MsMax = ptr(2000.0)
	defaults.TTFTp99MsMax = ptr(500.0)
	defaults.RequestErrorRatePercentMax = ptr(5.0)
	defaults.UpstreamErrorRatePercentMax = ptr(5.0)
	return defaults
}
// GetMetricThresholds loads the metric thresholds stored under
// SettingKeyOpsMetricThresholds.
//
// Behaviour by case:
//   - nil receiver or nil setting repository: the defaults are returned.
//   - key not found: the defaults are returned and, on a best-effort basis,
//     written back to the repository (a failed write is ignored).
//   - any other repository error: (nil, err).
//   - stored value is not valid JSON for OpsMetricThresholds: the defaults are
//     returned with a nil error.
//   - otherwise: the decoded stored value, exactly as persisted (fields absent
//     from the stored JSON stay nil; they are not filled from the defaults).
//
// A nil ctx is replaced with context.Background().
func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
	fallback := defaultOpsMetricThresholds()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
	switch {
	case err == nil:
		var parsed OpsMetricThresholds
		if json.Unmarshal([]byte(stored), &parsed) != nil {
			// Corrupt stored value: serve the defaults rather than failing.
			return fallback, nil
		}
		return &parsed, nil

	case errors.Is(err, ErrSettingNotFound):
		// First read: seed the repository with the defaults. The write is
		// best-effort, so its error is intentionally dropped.
		encoded, encErr := json.Marshal(fallback)
		if encErr == nil {
			_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(encoded))
		}
		return fallback, nil

	default:
		return nil, err
	}
}
// UpdateMetricThresholds validates cfg and stores it as JSON under
// SettingKeyOpsMetricThresholds.
//
// Validation applies only to fields that are set (non-nil), in this order:
// SLA minimum in [0, 100]; latency P99 maximum >= 0; TTFT P99 maximum >= 0;
// request error-rate maximum in [0, 100]; upstream error-rate maximum in
// [0, 100]. The first violation is returned as an error and nothing is stored.
//
// An error is also returned when the receiver or its setting repository is
// nil, when cfg is nil, when cfg cannot be JSON-encoded, or when the
// repository write fails. A nil ctx is replaced with context.Background().
//
// On success the returned value is a separate copy decoded from the stored
// JSON, not the cfg pointer that was passed in.
func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	// Range rules in evaluation order. Every value must be >= 0; entries with
	// upper100 set must additionally be <= 100.
	rules := []struct {
		value    *float64
		upper100 bool
		message  string
	}{
		{cfg.SLAPercentMin, true, "sla_percent_min must be between 0 and 100"},
		{cfg.LatencyP99MsMax, false, "latency_p99_ms_max must be >= 0"},
		{cfg.TTFTp99MsMax, false, "ttft_p99_ms_max must be >= 0"},
		{cfg.RequestErrorRatePercentMax, true, "request_error_rate_percent_max must be between 0 and 100"},
		{cfg.UpstreamErrorRatePercentMax, true, "upstream_error_rate_percent_max must be between 0 and 100"},
	}
	for _, rule := range rules {
		if rule.value == nil {
			continue // unset thresholds are not validated
		}
		v := *rule.value
		if v < 0 || (rule.upper100 && v > 100) {
			return nil, errors.New(rule.message)
		}
	}

	encoded, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(encoded)); err != nil {
		return nil, err
	}

	// Decode the bytes that were just produced to hand back an independent
	// copy; the decode error is discarded.
	saved := new(OpsMetricThresholds)
	_ = json.Unmarshal(encoded, saved)
	return saved, nil
}

View File

@@ -61,11 +61,20 @@ type OpsAlertSilencingSettings struct {
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}
// OpsMetricThresholds holds the per-metric thresholds at which a value is
// highlighted in red. Every field is an optional pointer; a nil field is
// omitted from the JSON encoding (omitempty).
type OpsMetricThresholds struct {
SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA below this value turns red
LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // latency P99 above this value turns red
TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99 above this value turns red
RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // request error rate above this value turns red
UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // upstream error rate above this value turns red
}
// OpsAlertRuntimeSettings groups the runtime settings for ops alerting: the
// evaluation interval in seconds, distributed-lock settings, silencing
// settings, and the metric thresholds. All fields are always present in the
// JSON encoding (no omitempty).
type OpsAlertRuntimeSettings struct {
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
Silencing OpsAlertSilencingSettings `json:"silencing"`
Thresholds OpsMetricThresholds `json:"thresholds"` // metric threshold configuration
}
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).