feat(ops): 增强告警评估和指标收集服务功能
This commit is contained in:
@@ -423,6 +423,55 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric(
|
|||||||
return float64(*systemMetrics.ConcurrencyQueueDepth), true
|
return float64(*systemMetrics.ConcurrencyQueueDepth), true
|
||||||
}
|
}
|
||||||
return 0, false
|
return 0, false
|
||||||
|
case "group_available_accounts":
|
||||||
|
if groupID == nil || *groupID <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
|
||||||
|
if err != nil || availability == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if availability.Group == nil {
|
||||||
|
return 0, true
|
||||||
|
}
|
||||||
|
return float64(availability.Group.AvailableCount), true
|
||||||
|
case "group_available_ratio":
|
||||||
|
if groupID == nil || *groupID <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
|
||||||
|
if err != nil || availability == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return computeGroupAvailableRatio(availability.Group), true
|
||||||
|
case "account_rate_limited_count":
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
|
||||||
|
if err != nil || availability == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
|
||||||
|
return acc.IsRateLimited
|
||||||
|
})), true
|
||||||
|
case "account_error_count":
|
||||||
|
if s == nil || s.opsService == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
availability, err := s.opsService.GetAccountAvailability(ctx, platform, groupID)
|
||||||
|
if err != nil || availability == nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return float64(countAccountsByCondition(availability.Accounts, func(acc *AccountAvailability) bool {
|
||||||
|
return acc.HasError && acc.TempUnschedulableUntil == nil
|
||||||
|
})), true
|
||||||
}
|
}
|
||||||
|
|
||||||
overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
overview, err := s.opsRepo.GetDashboardOverview(ctx, &OpsDashboardFilter{
|
||||||
@@ -849,18 +898,7 @@ func computeGroupAvailableRatio(group *GroupAvailability) float64 {
|
|||||||
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
|
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
|
||||||
}
|
}
|
||||||
|
|
||||||
// computeGroupRateLimitRatio returns the rate-limited percentage for a group.
|
|
||||||
// Formula: (RateLimitCount / TotalAccounts) * 100.
|
|
||||||
// Returns 0 when TotalAccounts is 0.
|
|
||||||
func computeGroupRateLimitRatio(group *GroupAvailability) float64 {
|
|
||||||
if group == nil || group.TotalAccounts <= 0 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return (float64(group.RateLimitCount) / float64(group.TotalAccounts)) * 100
|
|
||||||
}
|
|
||||||
|
|
||||||
// countAccountsByCondition counts accounts that satisfy the given condition.
|
// countAccountsByCondition counts accounts that satisfy the given condition.
|
||||||
// It iterates over accounts and applies the predicate to each entry.
|
|
||||||
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
|
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
|
||||||
if len(accounts) == 0 || condition == nil {
|
if len(accounts) == 0 || condition == nil {
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@@ -44,6 +44,9 @@ type OpsMetricsCollector struct {
|
|||||||
settingRepo SettingRepository
|
settingRepo SettingRepository
|
||||||
cfg *config.Config
|
cfg *config.Config
|
||||||
|
|
||||||
|
accountRepo AccountRepository
|
||||||
|
concurrencyService *ConcurrencyService
|
||||||
|
|
||||||
db *sql.DB
|
db *sql.DB
|
||||||
redisClient *redis.Client
|
redisClient *redis.Client
|
||||||
instanceID string
|
instanceID string
|
||||||
@@ -62,17 +65,21 @@ type OpsMetricsCollector struct {
|
|||||||
func NewOpsMetricsCollector(
|
func NewOpsMetricsCollector(
|
||||||
opsRepo OpsRepository,
|
opsRepo OpsRepository,
|
||||||
settingRepo SettingRepository,
|
settingRepo SettingRepository,
|
||||||
|
accountRepo AccountRepository,
|
||||||
|
concurrencyService *ConcurrencyService,
|
||||||
db *sql.DB,
|
db *sql.DB,
|
||||||
redisClient *redis.Client,
|
redisClient *redis.Client,
|
||||||
cfg *config.Config,
|
cfg *config.Config,
|
||||||
) *OpsMetricsCollector {
|
) *OpsMetricsCollector {
|
||||||
return &OpsMetricsCollector{
|
return &OpsMetricsCollector{
|
||||||
opsRepo: opsRepo,
|
opsRepo: opsRepo,
|
||||||
settingRepo: settingRepo,
|
settingRepo: settingRepo,
|
||||||
cfg: cfg,
|
cfg: cfg,
|
||||||
db: db,
|
accountRepo: accountRepo,
|
||||||
redisClient: redisClient,
|
concurrencyService: concurrencyService,
|
||||||
instanceID: uuid.NewString(),
|
db: db,
|
||||||
|
redisClient: redisClient,
|
||||||
|
instanceID: uuid.NewString(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -287,6 +294,7 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
|
|||||||
tps := float64(tokenConsumed) / windowSeconds
|
tps := float64(tokenConsumed) / windowSeconds
|
||||||
|
|
||||||
goroutines := runtime.NumGoroutine()
|
goroutines := runtime.NumGoroutine()
|
||||||
|
concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)
|
||||||
|
|
||||||
input := &OpsInsertSystemMetricsInput{
|
input := &OpsInsertSystemMetricsInput{
|
||||||
CreatedAt: windowEnd,
|
CreatedAt: windowEnd,
|
||||||
@@ -340,14 +348,79 @@ func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
|
|||||||
return intPtr(redisIdle)
|
return intPtr(redisIdle)
|
||||||
}(),
|
}(),
|
||||||
|
|
||||||
DBConnActive: intPtr(active),
|
DBConnActive: intPtr(active),
|
||||||
DBConnIdle: intPtr(idle),
|
DBConnIdle: intPtr(idle),
|
||||||
GoroutineCount: intPtr(goroutines),
|
GoroutineCount: intPtr(goroutines),
|
||||||
|
ConcurrencyQueueDepth: concurrencyQueueDepth,
|
||||||
}
|
}
|
||||||
|
|
||||||
return c.opsRepo.InsertSystemMetrics(ctx, input)
|
return c.opsRepo.InsertSystemMetrics(ctx, input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
|
||||||
|
if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if parentCtx == nil {
|
||||||
|
parentCtx = context.Background()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Best-effort: never let concurrency sampling break the metrics collector.
|
||||||
|
ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
accounts, err := c.accountRepo.ListSchedulable(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if len(accounts) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := make([]AccountWithConcurrency, 0, len(accounts))
|
||||||
|
for _, acc := range accounts {
|
||||||
|
if acc.ID <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxConc := acc.Concurrency
|
||||||
|
if maxConc < 0 {
|
||||||
|
maxConc = 0
|
||||||
|
}
|
||||||
|
batch = append(batch, AccountWithConcurrency{
|
||||||
|
ID: acc.ID,
|
||||||
|
MaxConcurrency: maxConc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if len(batch) == 0 {
|
||||||
|
zero := 0
|
||||||
|
return &zero
|
||||||
|
}
|
||||||
|
|
||||||
|
loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var total int64
|
||||||
|
for _, info := range loadMap {
|
||||||
|
if info == nil || info.WaitingCount <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += int64(info.WaitingCount)
|
||||||
|
}
|
||||||
|
if total < 0 {
|
||||||
|
total = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
maxInt := int64(^uint(0) >> 1)
|
||||||
|
if total > maxInt {
|
||||||
|
total = maxInt
|
||||||
|
}
|
||||||
|
v := int(total)
|
||||||
|
return &v
|
||||||
|
}
|
||||||
|
|
||||||
type opsCollectedPercentiles struct {
|
type opsCollectedPercentiles struct {
|
||||||
p50 *int
|
p50 *int
|
||||||
p90 *int
|
p90 *int
|
||||||
@@ -459,9 +532,9 @@ SELECT
|
|||||||
COALESCE(COUNT(*), 0) AS error_total,
|
COALESCE(COUNT(*), 0) AS error_total,
|
||||||
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
|
COALESCE(COUNT(*) FILTER (WHERE is_business_limited), 0) AS business_limited,
|
||||||
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
|
COALESCE(COUNT(*) FILTER (WHERE NOT is_business_limited), 0) AS error_sla,
|
||||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
|
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
|
||||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 429), 0) AS upstream_429,
|
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
|
||||||
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(status_code, 0) = 529), 0) AS upstream_529
|
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
|
||||||
FROM ops_error_logs
|
FROM ops_error_logs
|
||||||
WHERE created_at >= $1 AND created_at < $2`
|
WHERE created_at >= $1 AND created_at < $2`
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user