feat(ops): 添加分组和账号级别监控指标
- 后端新增 GetAccountAvailability 方法获取账号可用性数据
- 添加分组可用率和限流率计算辅助函数
- 前端支持分组和账号级别的监控指标类型
- 优化警报规则指标选择器，按类别分组显示
This commit is contained in:
@@ -2,6 +2,7 @@ package service
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -155,3 +156,39 @@ func (s *OpsService) GetAccountAvailabilityStats(ctx context.Context, platformFi
|
|||||||
|
|
||||||
return platform, group, account, &collectedAt, nil
|
return platform, group, account, &collectedAt, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// OpsAccountAvailability is the result returned by OpsService.GetAccountAvailability.
//
// Group is the availability entry of the requested group; it stays nil when the
// caller passed no positive group ID filter or the group has no entry in the stats.
// Accounts holds per-account availability keyed by an int64 (presumably the
// account ID; confirm against GetAccountAvailabilityStats) and is never nil when
// produced by GetAccountAvailability's default path.
// CollectedAt is the collection timestamp reported by GetAccountAvailabilityStats.
type OpsAccountAvailability struct {
|
||||||
|
Group *GroupAvailability
|
||||||
|
Accounts map[int64]*AccountAvailability
|
||||||
|
CollectedAt *time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *OpsService) GetAccountAvailability(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error) {
|
||||||
|
if s == nil {
|
||||||
|
return nil, errors.New("ops service is nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.getAccountAvailability != nil {
|
||||||
|
return s.getAccountAvailability(ctx, platformFilter, groupIDFilter)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, groupStats, accountStats, collectedAt, err := s.GetAccountAvailabilityStats(ctx, platformFilter, groupIDFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var group *GroupAvailability
|
||||||
|
if groupIDFilter != nil && *groupIDFilter > 0 {
|
||||||
|
group = groupStats[*groupIDFilter]
|
||||||
|
}
|
||||||
|
|
||||||
|
if accountStats == nil {
|
||||||
|
accountStats = map[int64]*AccountAvailability{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &OpsAccountAvailability{
|
||||||
|
Group: group,
|
||||||
|
Accounts: accountStats,
|
||||||
|
CollectedAt: collectedAt,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -838,3 +838,38 @@ func (l *slidingWindowLimiter) Allow(now time.Time) bool {
|
|||||||
l.sent = append(l.sent, now)
|
l.sent = append(l.sent, now)
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// computeGroupAvailableRatio returns the available percentage for a group.
|
||||||
|
// Formula: (AvailableCount / TotalAccounts) * 100.
|
||||||
|
// Returns 0 when TotalAccounts is 0.
|
||||||
|
func computeGroupAvailableRatio(group *GroupAvailability) float64 {
|
||||||
|
if group == nil || group.TotalAccounts <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return (float64(group.AvailableCount) / float64(group.TotalAccounts)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// computeGroupRateLimitRatio returns the rate-limited percentage for a group.
|
||||||
|
// Formula: (RateLimitCount / TotalAccounts) * 100.
|
||||||
|
// Returns 0 when TotalAccounts is 0.
|
||||||
|
func computeGroupRateLimitRatio(group *GroupAvailability) float64 {
|
||||||
|
if group == nil || group.TotalAccounts <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return (float64(group.RateLimitCount) / float64(group.TotalAccounts)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
// countAccountsByCondition counts accounts that satisfy the given condition.
|
||||||
|
// It iterates over accounts and applies the predicate to each entry.
|
||||||
|
func countAccountsByCondition(accounts map[int64]*AccountAvailability, condition func(*AccountAvailability) bool) int64 {
|
||||||
|
if len(accounts) == 0 || condition == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
var count int64
|
||||||
|
for _, account := range accounts {
|
||||||
|
if account != nil && condition(account) {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count
|
||||||
|
}
|
||||||
|
|||||||
@@ -28,6 +28,9 @@ type OpsService struct {
|
|||||||
|
|
||||||
accountRepo AccountRepository
|
accountRepo AccountRepository
|
||||||
|
|
||||||
|
// getAccountAvailability is a unit-test hook for overriding account availability lookup.
|
||||||
|
getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)
|
||||||
|
|
||||||
concurrencyService *ConcurrencyService
|
concurrencyService *ConcurrencyService
|
||||||
gatewayService *GatewayService
|
gatewayService *GatewayService
|
||||||
openAIGatewayService *OpenAIGatewayService
|
openAIGatewayService *OpenAIGatewayService
|
||||||
|
|||||||
@@ -592,6 +592,13 @@ export type MetricType =
|
|||||||
| 'cpu_usage_percent'
|
| 'cpu_usage_percent'
|
||||||
| 'memory_usage_percent'
|
| 'memory_usage_percent'
|
||||||
| 'concurrency_queue_depth'
|
| 'concurrency_queue_depth'
|
||||||
|
| 'group_available_accounts'
|
||||||
|
| 'group_available_ratio'
|
||||||
|
| 'group_rate_limit_ratio'
|
||||||
|
| 'account_rate_limited_count'
|
||||||
|
| 'account_error_count'
|
||||||
|
| 'account_error_ratio'
|
||||||
|
| 'overload_account_count'
|
||||||
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
|
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
|
||||||
|
|
||||||
export interface AlertRule {
|
export interface AlertRule {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import { useI18n } from 'vue-i18n'
|
|||||||
import { useAppStore } from '@/stores/app'
|
import { useAppStore } from '@/stores/app'
|
||||||
import BaseDialog from '@/components/common/BaseDialog.vue'
|
import BaseDialog from '@/components/common/BaseDialog.vue'
|
||||||
import ConfirmDialog from '@/components/common/ConfirmDialog.vue'
|
import ConfirmDialog from '@/components/common/ConfirmDialog.vue'
|
||||||
import Select from '@/components/common/Select.vue'
|
import Select, { type SelectOption } from '@/components/common/Select.vue'
|
||||||
import { opsAPI } from '@/api/admin/ops'
|
import { opsAPI } from '@/api/admin/ops'
|
||||||
import type { AlertRule, MetricType, Operator } from '../types'
|
import type { AlertRule, MetricType, Operator } from '../types'
|
||||||
import type { OpsSeverity } from '@/api/admin/ops'
|
import type { OpsSeverity } from '@/api/admin/ops'
|
||||||
@@ -42,17 +42,50 @@ const saving = ref(false)
|
|||||||
const editingId = ref<number | null>(null)
|
const editingId = ref<number | null>(null)
|
||||||
const draft = ref<AlertRule | null>(null)
|
const draft = ref<AlertRule | null>(null)
|
||||||
|
|
||||||
|
// Category a metric definition is tagged with; the metric selector builds one
// option section per category (system, group, account).
type MetricGroup = 'system' | 'group' | 'account'
|
||||||
|
|
||||||
|
/**
 * Every metric an alert rule can target, tagged with the category it belongs to
 * and carrying its translated label. Wrapped in `computed` so labels are
 * re-evaluated through `t()` whenever the computed value is recalculated.
 * The entries are listed system-level first, then group-level, then account-level.
 */
const metricDefinitions = computed(() => {
  type MetricDefinition = { type: MetricType; group: MetricGroup; label: string }

  // Small factory so each entry is checked against MetricType / MetricGroup
  // without per-entry type assertions.
  const define = (group: MetricGroup, type: MetricType, label: string): MetricDefinition => ({
    type,
    group,
    label
  })

  // System-level metrics
  const systemMetrics = [
    define('system', 'success_rate', t('admin.ops.alertRules.metrics.successRate')),
    define('system', 'error_rate', t('admin.ops.alertRules.metrics.errorRate')),
    define('system', 'upstream_error_rate', t('admin.ops.alertRules.metrics.upstreamErrorRate')),
    define('system', 'p95_latency_ms', t('admin.ops.alertRules.metrics.p95')),
    define('system', 'p99_latency_ms', t('admin.ops.alertRules.metrics.p99')),
    define('system', 'cpu_usage_percent', t('admin.ops.alertRules.metrics.cpu')),
    define('system', 'memory_usage_percent', t('admin.ops.alertRules.metrics.memory')),
    define('system', 'concurrency_queue_depth', t('admin.ops.alertRules.metrics.queueDepth'))
  ]

  // Group-level metrics (requires group_id filter)
  const groupMetrics = [
    define('group', 'group_available_accounts', t('admin.ops.alertRules.metrics.groupAvailableAccounts')),
    define('group', 'group_available_ratio', t('admin.ops.alertRules.metrics.groupAvailableRatio')),
    define('group', 'group_rate_limit_ratio', t('admin.ops.alertRules.metrics.groupRateLimitRatio'))
  ]

  // Account-level metrics
  const accountMetrics = [
    define('account', 'account_rate_limited_count', t('admin.ops.alertRules.metrics.accountRateLimitedCount')),
    define('account', 'account_error_count', t('admin.ops.alertRules.metrics.accountErrorCount')),
    define('account', 'account_error_ratio', t('admin.ops.alertRules.metrics.accountErrorRatio')),
    define('account', 'overload_account_count', t('admin.ops.alertRules.metrics.overloadAccountCount'))
  ]

  return [...systemMetrics, ...groupMetrics, ...accountMetrics]
})
|
||||||
|
|
||||||
// metricOptions: options fed to the metric selector of the alert-rule form.
// NOTE: this span is a side-by-side diff, so the removed implementation (a flat
// `items` array of seven hard-coded { value, label } entries) is interleaved line
// by line with the added one.
// The added implementation defines buildGroup(group), which filters
// metricDefinitions by category and, when the category is non-empty, emits a
// disabled header option (value `__group__<group>`, kind 'group', label taken
// from admin.ops.alertRules.metricGroups.<group>) followed by one option per
// metric. The final list concatenates the system, group and account sections
// in that order.
const metricOptions = computed(() => {
|
const metricOptions = computed(() => {
|
||||||
const items: Array<{ value: MetricType; label: string }> = [
|
const buildGroup = (group: MetricGroup): SelectOption[] => {
|
||||||
{ value: 'success_rate', label: t('admin.ops.alertRules.metrics.successRate') },
|
const items = metricDefinitions.value.filter((m) => m.group === group)
|
||||||
{ value: 'error_rate', label: t('admin.ops.alertRules.metrics.errorRate') },
|
if (items.length === 0) return []
|
||||||
{ value: 'p95_latency_ms', label: t('admin.ops.alertRules.metrics.p95') },
|
const headerValue = `__group__${group}`
|
||||||
{ value: 'p99_latency_ms', label: t('admin.ops.alertRules.metrics.p99') },
|
return [
|
||||||
{ value: 'cpu_usage_percent', label: t('admin.ops.alertRules.metrics.cpu') },
|
{
|
||||||
{ value: 'memory_usage_percent', label: t('admin.ops.alertRules.metrics.memory') },
|
value: headerValue,
|
||||||
{ value: 'concurrency_queue_depth', label: t('admin.ops.alertRules.metrics.queueDepth') }
|
label: t(`admin.ops.alertRules.metricGroups.${group}`),
|
||||||
]
|
disabled: true,
|
||||||
return items
|
kind: 'group'
|
||||||
|
},
|
||||||
|
...items.map((m) => ({ value: m.type, label: m.label }))
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
return [...buildGroup('system'), ...buildGroup('group'), ...buildGroup('account')]
|
||||||
})
|
})
|
||||||
|
|
||||||
const operatorOptions = computed(() => {
|
const operatorOptions = computed(() => {
|
||||||
|
|||||||
Reference in New Issue
Block a user