feat(ops): 添加分组和账号级别监控指标

- 后端新增 GetAccountAvailability 方法获取账号可用性数据
- 添加分组可用率和限流率计算辅助函数
- 前端支持分组和账号级别的监控指标类型
- 优化警报规则指标选择器，按类别分组显示
This commit is contained in:
IanShaw027
2026-01-11 20:33:52 +08:00
parent c1a3dd41dd
commit dd59e872ff
5 changed files with 126 additions and 11 deletions

View File

@@ -2,6 +2,7 @@ package service
import (
"context"
"errors"
"time"
)
@@ -155,3 +156,39 @@ func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFi
return platform, group, account, &collectedAt, nil
}
// OpsAccountAvailability bundles the group-level and account-level
// availability data returned by OpsService.GetAccountAvailability.
type OpsAccountAvailability struct {
// Group is the availability entry for the requested group. On the default
// (non-hook) lookup path it is only set when a positive group ID filter was
// supplied; otherwise it is nil. It is also nil when the stats contain no
// entry for that group ID.
Group *GroupAvailability
// Accounts maps an int64 key to that account's availability entry
// (presumably keyed by account ID — confirm against
// GetAccountAvailabilityStats). The default lookup path never leaves this
// map nil.
Accounts map[int64]*AccountAvailability
// CollectedAt is the collection timestamp passed through from
// GetAccountAvailabilityStats.
CollectedAt *time.Time
}
// GetAccountAvailability returns the availability data for the given platform
// and optional group filter as a single OpsAccountAvailability value.
//
// A nil receiver yields an error. When the getAccountAvailability hook is set,
// the call is delegated to it unchanged. Otherwise the data comes from
// GetAccountAvailabilityStats: the per-account map is returned as-is (replaced
// by an empty map when nil), and Group is filled only when groupIDFilter points
// at a positive ID.
func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) {
	if s == nil {
		return nil, errors.New("ops service is nil")
	}

	// The hook takes precedence over the real lookup.
	if hook := s.getAccountAvailability; hook != nil {
		return hook(ctx, platformFilter, groupIDFilter)
	}

	// The platform-level stats (first return value) are not needed here.
	_, byGroup, byAccount, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter)
	if err != nil {
		return nil, err
	}

	result := &OpsAccountAvailability{
		Accounts:    byAccount,
		CollectedAt: collectedAt,
	}

	// Callers always receive a non-nil map, even when no account stats exist.
	if result.Accounts == nil {
		result.Accounts = map[int64]*AccountAvailability{}
	}

	// Only a positive group ID selects a group entry; a missing key leaves
	// Group nil.
	if groupIDFilter != nil && *groupIDFilter > 0 {
		result.Group = byGroup[*groupIDFilter]
	}

	return result, nil
}

View File

@@ -838,3 +838,38 @@ func (l *slidingWindowLimiter) Allow(now time.Time) bool {
l.sent = append(l.sent, now)
return true
}
// computeGroupAvailableRatio reports what percentage of a group's accounts are
// available: (AvailableCount / TotalAccounts) * 100.
// The result is 0 when group is nil or TotalAccounts is not positive.
func computeGroupAvailableRatio(group *GroupAvailability) float64 {
	if group != nil && group.TotalAccounts > 0 {
		available := float64(group.AvailableCount)
		total := float64(group.TotalAccounts)
		return (available / total) * 100
	}
	return 0
}
// computeGroupRateLimitRatio reports what percentage of a group's accounts are
// rate limited: (RateLimitCount / TotalAccounts) * 100.
// The result is 0 when group is nil or TotalAccounts is not positive.
func computeGroupRateLimitRatio(group *GroupAvailability) float64 {
	if group != nil && group.TotalAccounts > 0 {
		limited := float64(group.RateLimitCount)
		total := float64(group.TotalAccounts)
		return (limited / total) * 100
	}
	return 0
}
// countAccountsByCondition returns how many non-nil entries in accounts make
// condition report true. A nil condition, or an empty or nil map, yields 0.
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
	if condition == nil {
		return 0
	}

	var matched int64
	// Ranging over an empty or nil map runs zero iterations, so no separate
	// length check is needed.
	for _, entry := range accounts {
		if entry == nil {
			continue
		}
		if condition(entry) {
			matched++
		}
	}
	return matched
}

View File

@@ -28,6 +28,9 @@ type OpsService struct {
accountRepo AccountRepository
// getAccountAvailability is a unit-test hook for overriding account availability lookup.
getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)
concurrencyService *ConcurrencyService
gatewayService *GatewayService
openAIGatewayService *OpenAIGatewayService

View File

@@ -592,6 +592,13 @@ export type MetricType =
| 'cpu_usage_percent'
| 'memory_usage_percent'
| 'concurrency_queue_depth'
| 'group_available_accounts'
| 'group_available_ratio'
| 'group_rate_limit_ratio'
| 'account_rate_limited_count'
| 'account_error_count'
| 'account_error_ratio'
| 'overload_account_count'
/** Comparison operator literals. */
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
export interface AlertRule {

View File

@@ -4,7 +4,7 @@ import { useI18n } from 'vue-i18n'
import { useAppStore } from '@/stores/app'
import BaseDialog from '@/components/common/BaseDialog.vue'
import ConfirmDialog from '@/components/common/ConfirmDialog.vue'
import Select from '@/components/common/Select.vue'
import Select, { type SelectOption } from '@/components/common/Select.vue'
import { opsAPI } from '@/api/admin/ops'
import type { AlertRule, MetricType, Operator } from '../types'
import type { OpsSeverity } from '@/api/admin/ops'
@@ -42,17 +42,50 @@ const saving = ref(false)
const editingId = ref<number | null>(null)
const draft = ref<AlertRule | null>(null)
/** Category a metric is listed under: system-wide, per group, or per account. */
type MetricGroup = 'system' | 'group' | 'account'
/**
 * Every metric an alert rule can target, tagged with the category it belongs to
 * and its translated label. Wrapped in `computed` and resolved through `t()` on
 * each evaluation.
 */
const metricDefinitions = computed(() => {
  type MetricDefinition = { type: MetricType; group: MetricGroup; label: string }

  // Typed constructor: keeps each row short and lets the compiler check the
  // metric and group literals without per-row assertions.
  const define = (group: MetricGroup, type: MetricType, label: string): MetricDefinition => ({
    type,
    group,
    label
  })

  const definitions: MetricDefinition[] = [
    // System-level metrics
    define('system', 'success_rate', t('admin.ops.alertRules.metrics.successRate')),
    define('system', 'error_rate', t('admin.ops.alertRules.metrics.errorRate')),
    define('system', 'upstream_error_rate', t('admin.ops.alertRules.metrics.upstreamErrorRate')),
    define('system', 'p95_latency_ms', t('admin.ops.alertRules.metrics.p95')),
    define('system', 'p99_latency_ms', t('admin.ops.alertRules.metrics.p99')),
    define('system', 'cpu_usage_percent', t('admin.ops.alertRules.metrics.cpu')),
    define('system', 'memory_usage_percent', t('admin.ops.alertRules.metrics.memory')),
    define('system', 'concurrency_queue_depth', t('admin.ops.alertRules.metrics.queueDepth')),
    // Group-level metrics (requires group_id filter)
    define('group', 'group_available_accounts', t('admin.ops.alertRules.metrics.groupAvailableAccounts')),
    define('group', 'group_available_ratio', t('admin.ops.alertRules.metrics.groupAvailableRatio')),
    define('group', 'group_rate_limit_ratio', t('admin.ops.alertRules.metrics.groupRateLimitRatio')),
    // Account-level metrics
    define('account', 'account_rate_limited_count', t('admin.ops.alertRules.metrics.accountRateLimitedCount')),
    define('account', 'account_error_count', t('admin.ops.alertRules.metrics.accountErrorCount')),
    define('account', 'account_error_ratio', t('admin.ops.alertRules.metrics.accountErrorRatio')),
    define('account', 'overload_account_count', t('admin.ops.alertRules.metrics.overloadAccountCount'))
  ]
  return definitions
})
/**
 * Options for the alert-rule metric selector, grouped by category.
 *
 * Each non-empty category contributes a disabled header entry (value
 * `__group__<category>`, `kind: 'group'`) followed by one selectable entry per
 * metric in `metricDefinitions`. Categories are emitted in the fixed order
 * system, group, account; a category with no metrics is omitted entirely.
 *
 * The previous flat list and its early `return` are removed: they made the
 * grouping code unreachable and hid every group- and account-level metric (and
 * `upstream_error_rate`) from the selector.
 */
const metricOptions = computed(() => {
  const buildGroup = (group: MetricGroup): SelectOption[] => {
    const items = metricDefinitions.value.filter((m) => m.group === group)
    // No header for an empty category.
    if (items.length === 0) return []
    const headerValue = `__group__${group}`
    return [
      {
        // Non-selectable header row; its value can never collide with a MetricType.
        value: headerValue,
        label: t(`admin.ops.alertRules.metricGroups.${group}`),
        disabled: true,
        kind: 'group'
      },
      ...items.map((m) => ({ value: m.type, label: m.label }))
    ]
  }
  return [...buildGroup('system'), ...buildGroup('group'), ...buildGroup('account')]
})
const operatorOptions = computed(() => {