Files
yinghuoapi/frontend/src/api/admin/ops.ts
IanShaw027 23aa69f56f refactor(ops): 优化任务心跳和组件刷新机制
后端改动:
- 添加 ops_job_heartbeats.last_result 字段记录任务执行结果
- 优化告警评估器统计信息(规则数/事件数/邮件数)
- 统一各定时任务的心跳记录格式

前端改动:
- 重构 OpsConcurrencyCard 使用父组件统一控制刷新节奏
- 移除独立的 5 秒刷新定时器,改用 refreshToken 机制
- 修复 TypeScript 类型错误
2026-01-15 21:31:55 +08:00

1211 lines
32 KiB
TypeScript

/**
* Admin Ops API endpoints (vNext)
* - Error logs list/detail + retry (client/upstream)
* - Dashboard overview (raw path)
*/
import { apiClient } from '../client'
import type { PaginatedResponse } from '@/types'
export type OpsRetryMode = 'client' | 'upstream'
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
export interface OpsRequestOptions {
signal?: AbortSignal
}
export interface OpsRetryRequest {
mode: OpsRetryMode
pinned_account_id?: number
force?: boolean
}
export interface OpsRetryAttempt {
id: number
created_at: string
requested_by_user_id: number
source_error_id: number
mode: string
pinned_account_id?: number | null
pinned_account_name?: string
status: string
started_at?: string | null
finished_at?: string | null
duration_ms?: number | null
success?: boolean | null
http_status_code?: number | null
upstream_request_id?: string | null
used_account_id?: number | null
used_account_name?: string
response_preview?: string | null
response_truncated?: boolean | null
result_request_id?: string | null
result_error_id?: number | null
error_message?: string | null
}
export type OpsUpstreamErrorEvent = {
at_unix_ms?: number
platform?: string
account_id?: number
account_name?: string
upstream_status_code?: number
upstream_request_id?: string
upstream_request_body?: string
kind?: string
message?: string
detail?: string
}
export interface OpsRetryResult {
attempt_id: number
mode: OpsRetryMode
status: 'running' | 'succeeded' | 'failed' | string
pinned_account_id?: number | null
used_account_id?: number | null
http_status_code: number
upstream_request_id: string
response_preview: string
response_truncated: boolean
error_message: string
started_at: string
finished_at: string
duration_ms: number
}
export interface OpsDashboardOverview {
start_time: string
end_time: string
platform: string
group_id?: number | null
health_score?: number
system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null
success_count: number
error_count_total: number
business_limited_count: number
error_count_sla: number
request_count_total: number
request_count_sla: number
token_consumed: number
sla: number
error_rate: number
upstream_error_rate: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
qps: {
current: number
peak: number
avg: number
}
tps: {
current: number
peak: number
avg: number
}
duration: OpsPercentiles
ttft: OpsPercentiles
}
export interface OpsPercentiles {
p50_ms?: number | null
p90_ms?: number | null
p95_ms?: number | null
p99_ms?: number | null
avg_ms?: number | null
max_ms?: number | null
}
export interface OpsThroughputTrendPoint {
bucket_start: string
request_count: number
token_consumed: number
qps: number
tps: number
}
export interface OpsThroughputPlatformBreakdownItem {
platform: string
request_count: number
token_consumed: number
}
export interface OpsThroughputGroupBreakdownItem {
group_id: number
group_name: string
request_count: number
token_consumed: number
}
export interface OpsThroughputTrendResponse {
bucket: string
points: OpsThroughputTrendPoint[]
by_platform?: OpsThroughputPlatformBreakdownItem[]
top_groups?: OpsThroughputGroupBreakdownItem[]
}
export type OpsRequestKind = 'success' | 'error'
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
export interface OpsRequestDetail {
kind: OpsRequestKind
created_at: string
request_id: string
platform?: string
model?: string
duration_ms?: number | null
status_code?: number | null
error_id?: number | null
phase?: string
severity?: string
message?: string
user_id?: number | null
api_key_id?: number | null
account_id?: number | null
group_id?: number | null
stream?: boolean
}
export interface OpsRequestDetailsParams {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
kind?: OpsRequestDetailsKind
platform?: string
group_id?: number | null
user_id?: number
api_key_id?: number
account_id?: number
model?: string
request_id?: string
q?: string
min_duration_ms?: number
max_duration_ms?: number
sort?: OpsRequestDetailsSort
page?: number
page_size?: number
}
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
export interface OpsLatencyHistogramBucket {
range: string
count: number
}
export interface OpsLatencyHistogramResponse {
start_time: string
end_time: string
platform: string
group_id?: number | null
total_requests: number
buckets: OpsLatencyHistogramBucket[]
}
export interface OpsErrorTrendPoint {
bucket_start: string
error_count_total: number
business_limited_count: number
error_count_sla: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
}
export interface OpsErrorTrendResponse {
bucket: string
points: OpsErrorTrendPoint[]
}
export interface OpsErrorDistributionItem {
status_code: number
total: number
sla: number
business_limited: number
}
export interface OpsErrorDistributionResponse {
total: number
items: OpsErrorDistributionItem[]
}
export interface OpsSystemMetricsSnapshot {
id: number
created_at: string
window_minutes: number
cpu_usage_percent?: number | null
memory_used_mb?: number | null
memory_total_mb?: number | null
memory_usage_percent?: number | null
db_ok?: boolean | null
redis_ok?: boolean | null
// Config-derived limits (best-effort) for rendering "current vs max".
db_max_open_conns?: number | null
redis_pool_size?: number | null
redis_conn_total?: number | null
redis_conn_idle?: number | null
db_conn_active?: number | null
db_conn_idle?: number | null
db_conn_waiting?: number | null
goroutine_count?: number | null
concurrency_queue_depth?: number | null
}
export interface OpsJobHeartbeat {
job_name: string
last_run_at?: string | null
last_success_at?: string | null
last_error_at?: string | null
last_error?: string | null
last_duration_ms?: number | null
last_result?: string | null
updated_at: string
}
export interface PlatformConcurrencyInfo {
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
export interface GroupConcurrencyInfo {
group_id: number
group_name: string
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
export interface AccountConcurrencyInfo {
account_id: number
account_name?: string
platform: string
group_id: number
group_name: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
export interface OpsConcurrencyStatsResponse {
enabled: boolean
platform: Record<string, PlatformConcurrencyInfo>
group: Record<string, GroupConcurrencyInfo>
account: Record<string, AccountConcurrencyInfo>
timestamp?: string
}
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
const params: Record<string, any> = {}
if (platform) {
params.platform = platform
}
if (typeof groupId === 'number' && groupId > 0) {
params.group_id = groupId
}
const { data } = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
return data
}
export interface PlatformAvailability {
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
export interface GroupAvailability {
group_id: number
group_name: string
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
export interface AccountAvailability {
account_id: number
account_name: string
platform: string
group_id: number
group_name: string
status: string
is_available: boolean
is_rate_limited: boolean
rate_limit_reset_at?: string
rate_limit_remaining_sec?: number
is_overloaded: boolean
overload_until?: string
overload_remaining_sec?: number
has_error: boolean
error_message?: string
}
export interface OpsAccountAvailabilityStatsResponse {
enabled: boolean
platform: Record<string, PlatformAvailability>
group: Record<string, GroupAvailability>
account: Record<string, AccountAvailability>
timestamp?: string
}
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
const params: Record<string, any> = {}
if (platform) {
params.platform = platform
}
if (typeof groupId === 'number' && groupId > 0) {
params.group_id = groupId
}
const { data } = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
return data
}
export interface OpsRateSummary {
current: number
peak: number
avg: number
}
export interface OpsRealtimeTrafficSummary {
window: string
start_time: string
end_time: string
platform: string
group_id?: number | null
qps: OpsRateSummary
tps: OpsRateSummary
}
export interface OpsRealtimeTrafficSummaryResponse {
enabled: boolean
summary: OpsRealtimeTrafficSummary | null
timestamp?: string
}
export async function getRealtimeTrafficSummary(
window: string,
platform?: string,
groupId?: number | null
): Promise<OpsRealtimeTrafficSummaryResponse> {
const params: Record<string, any> = { window }
if (platform) {
params.platform = platform
}
if (typeof groupId === 'number' && groupId > 0) {
params.group_id = groupId
}
const { data } = await apiClient.get<OpsRealtimeTrafficSummaryResponse>('/admin/ops/realtime-traffic', { params })
return data
}
/**
* Subscribe to realtime QPS updates via WebSocket.
*
* Note: browsers cannot set Authorization headers for WebSockets.
* We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
* ["sub2api-admin", "jwt.<token>"]
*/
export interface SubscribeQPSOptions {
token?: string | null
onOpen?: () => void
onClose?: (event: CloseEvent) => void
onError?: (event: Event) => void
/**
* Called when the server closes with an application close code that indicates
* reconnecting is not useful (e.g. feature flag disabled).
*/
onFatalClose?: (event: CloseEvent) => void
/**
* More granular status updates for UI (connecting/reconnecting/offline/etc).
*/
onStatusChange?: (status: OpsWSStatus) => void
/**
* Called when a reconnect is scheduled (helps display "retry in Xs").
*/
onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
wsBaseUrl?: string
/**
* Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
* Set to 0 to disable reconnect.
*/
maxReconnectAttempts?: number
reconnectBaseDelayMs?: number
reconnectMaxDelayMs?: number
/**
* Stale connection detection (heartbeat-by-observation).
* If no messages are received within this window, the socket is closed to trigger a reconnect.
* Set to 0 to disable.
*/
staleTimeoutMs?: number
/**
* How often to check staleness. Only used when `staleTimeoutMs > 0`.
*/
staleCheckIntervalMs?: number
}
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
export const OPS_WS_CLOSE_CODES = {
REALTIME_DISABLED: 4001
} as const
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
let ws: WebSocket | null = null
let reconnectAttempts = 0
const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
? (options.maxReconnectAttempts as number)
: Infinity
const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
let reconnectTimer: ReturnType<typeof setTimeout> | null = null
let shouldReconnect = true
let isConnecting = false
let hasConnectedOnce = false
let lastMessageAt = 0
const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
let staleTimer: ReturnType<typeof setInterval> | null = null
const setStatus = (status: OpsWSStatus) => {
options.onStatusChange?.(status)
}
const clearReconnectTimer = () => {
if (reconnectTimer) {
clearTimeout(reconnectTimer)
reconnectTimer = null
}
}
const clearStaleTimer = () => {
if (staleTimer) {
clearInterval(staleTimer)
staleTimer = null
}
}
const startStaleTimer = () => {
clearStaleTimer()
if (!staleTimeoutMs || staleTimeoutMs <= 0) return
staleTimer = setInterval(() => {
if (!shouldReconnect) return
if (!ws || ws.readyState !== WebSocket.OPEN) return
if (!lastMessageAt) return
const ageMs = Date.now() - lastMessageAt
if (ageMs > staleTimeoutMs) {
// Treat as a half-open connection; closing triggers the normal reconnect path.
ws.close()
}
}, staleCheckIntervalMs)
}
const scheduleReconnect = () => {
if (!shouldReconnect) return
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
// If we're offline, wait for the browser to come back online.
if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
setStatus('offline')
return
}
const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
const delay = Math.min(expDelay, maxDelayMs)
const jitter = Math.floor(Math.random() * 250)
clearReconnectTimer()
reconnectTimer = setTimeout(() => {
reconnectAttempts++
connect()
}, delay + jitter)
options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
}
const handleOnline = () => {
if (!shouldReconnect) return
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
connect()
}
const handleOffline = () => {
setStatus('offline')
}
const connect = () => {
if (!shouldReconnect) return
if (isConnecting) return
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
isConnecting = true
setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
// Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
// Browsers cannot set Authorization headers for WebSockets, so we pass the token via
// Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
if (rawToken) protocols.push(`jwt.${rawToken}`)
ws = new WebSocket(wsURL.toString(), protocols)
ws.onopen = () => {
reconnectAttempts = 0
isConnecting = false
hasConnectedOnce = true
clearReconnectTimer()
lastMessageAt = Date.now()
startStaleTimer()
setStatus('connected')
options.onOpen?.()
}
ws.onmessage = (e) => {
try {
const data = JSON.parse(e.data)
lastMessageAt = Date.now()
onMessage(data)
} catch (err) {
console.warn('[OpsWS] Failed to parse message:', err)
}
}
ws.onerror = (error) => {
console.error('[OpsWS] Connection error:', error)
options.onError?.(error)
}
ws.onclose = (event) => {
isConnecting = false
options.onClose?.(event)
clearStaleTimer()
ws = null
// If the server explicitly tells us to stop reconnecting, honor it.
if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
shouldReconnect = false
clearReconnectTimer()
setStatus('closed')
options.onFatalClose?.(event)
return
}
scheduleReconnect()
}
}
window.addEventListener('online', handleOnline)
window.addEventListener('offline', handleOffline)
connect()
return () => {
shouldReconnect = false
window.removeEventListener('online', handleOnline)
window.removeEventListener('offline', handleOffline)
clearReconnectTimer()
clearStaleTimer()
if (ws) ws.close()
ws = null
setStatus('closed')
}
}
export type OpsSeverity = string
export type OpsPhase = string
export type AlertSeverity = 'critical' | 'warning' | 'info'
export type ThresholdMode = 'count' | 'percentage' | 'both'
export type MetricType =
| 'success_rate'
| 'error_rate'
| 'upstream_error_rate'
| 'cpu_usage_percent'
| 'memory_usage_percent'
| 'concurrency_queue_depth'
| 'group_available_accounts'
| 'group_available_ratio'
| 'group_rate_limit_ratio'
| 'account_rate_limited_count'
| 'account_error_count'
| 'account_error_ratio'
| 'overload_account_count'
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
export interface AlertRule {
id?: number
name: string
description?: string
enabled: boolean
metric_type: MetricType
operator: Operator
threshold: number
window_minutes: number
sustained_minutes: number
severity: OpsSeverity
cooldown_minutes: number
notify_email: boolean
filters?: Record<string, any>
created_at?: string
updated_at?: string
last_triggered_at?: string | null
}
export interface AlertEvent {
id: number
rule_id: number
severity: OpsSeverity | string
status: 'firing' | 'resolved' | 'manual_resolved' | string
title?: string
description?: string
metric_value?: number
threshold_value?: number
dimensions?: Record<string, any>
fired_at: string
resolved_at?: string | null
email_sent: boolean
created_at: string
}
export interface EmailNotificationConfig {
alert: {
enabled: boolean
recipients: string[]
min_severity: AlertSeverity | ''
rate_limit_per_hour: number
batching_window_seconds: number
include_resolved_alerts: boolean
}
report: {
enabled: boolean
recipients: string[]
daily_summary_enabled: boolean
daily_summary_schedule: string
weekly_summary_enabled: boolean
weekly_summary_schedule: string
error_digest_enabled: boolean
error_digest_schedule: string
error_digest_min_count: number
account_health_enabled: boolean
account_health_schedule: string
account_health_error_rate_threshold: number
}
}
export interface OpsMetricThresholds {
sla_percent_min?: number | null // SLA低于此值变红
ttft_p99_ms_max?: number | null // TTFT P99高于此值变红
request_error_rate_percent_max?: number | null // 请求错误率高于此值变红
upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红
}
export interface OpsDistributedLockSettings {
enabled: boolean
key: string
ttl_seconds: number
}
export interface OpsAlertRuntimeSettings {
evaluation_interval_seconds: number
distributed_lock: OpsDistributedLockSettings
silencing: {
enabled: boolean
global_until_rfc3339: string
global_reason: string
entries?: Array<{
rule_id?: number
severities?: Array<OpsSeverity | string>
until_rfc3339: string
reason: string
}>
}
thresholds: OpsMetricThresholds // 指标阈值配置
}
export interface OpsAdvancedSettings {
data_retention: OpsDataRetentionSettings
aggregation: OpsAggregationSettings
ignore_count_tokens_errors: boolean
ignore_context_canceled: boolean
ignore_no_available_accounts: boolean
auto_refresh_enabled: boolean
auto_refresh_interval_seconds: number
}
export interface OpsDataRetentionSettings {
cleanup_enabled: boolean
cleanup_schedule: string
error_log_retention_days: number
minute_metrics_retention_days: number
hourly_metrics_retention_days: number
}
export interface OpsAggregationSettings {
aggregation_enabled: boolean
}
export interface OpsErrorLog {
id: number
created_at: string
// Standardized classification
phase: OpsPhase
type: string
error_owner: 'client' | 'provider' | 'platform' | string
error_source: 'client_request' | 'upstream_http' | 'gateway' | string
severity: OpsSeverity
status_code: number
platform: string
model: string
is_retryable: boolean
retry_count: number
resolved: boolean
resolved_at?: string | null
resolved_by_user_id?: number | null
resolved_retry_id?: number | null
client_request_id: string
request_id: string
message: string
user_id?: number | null
user_email: string
api_key_id?: number | null
account_id?: number | null
account_name: string
group_id?: number | null
group_name: string
client_ip?: string | null
request_path?: string
stream?: boolean
}
export interface OpsErrorDetail extends OpsErrorLog {
error_body: string
user_agent: string
// Upstream context (optional; enriched by gateway services)
upstream_status_code?: number | null
upstream_error_message?: string
upstream_error_detail?: string
upstream_errors?: string
auth_latency_ms?: number | null
routing_latency_ms?: number | null
upstream_latency_ms?: number | null
response_latency_ms?: number | null
time_to_first_token_ms?: number | null
request_body: string
request_body_truncated: boolean
request_body_bytes?: number | null
is_business_limited: boolean
}
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
export async function getDashboardOverview(
params: {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
mode?: OpsQueryMode
},
options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
const { data } = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
params,
signal: options.signal
})
return data
}
export async function getThroughputTrend(
params: {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
mode?: OpsQueryMode
},
options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
const { data } = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
params,
signal: options.signal
})
return data
}
export async function getLatencyHistogram(
params: {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
mode?: OpsQueryMode
},
options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
const { data } = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
params,
signal: options.signal
})
return data
}
export async function getErrorTrend(
params: {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
mode?: OpsQueryMode
},
options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
const { data } = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', {
params,
signal: options.signal
})
return data
}
export async function getErrorDistribution(
params: {
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
mode?: OpsQueryMode
},
options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
const { data } = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
params,
signal: options.signal
})
return data
}
export type OpsErrorListView = 'errors' | 'excluded' | 'all'
export type OpsErrorListQueryParams = {
page?: number
page_size?: number
time_range?: string
start_time?: string
end_time?: string
platform?: string
group_id?: number | null
account_id?: number | null
phase?: string
error_owner?: string
error_source?: string
resolved?: string
view?: OpsErrorListView
q?: string
status_codes?: string
status_codes_other?: string
}
// Legacy unified endpoints
export async function listErrorLogs(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
return data
}
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
return data
}
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
return data
}
export async function listRetryAttempts(errorId: number, limit = 50): Promise<OpsRetryAttempt[]> {
const { data } = await apiClient.get<OpsRetryAttempt[]>(`/admin/ops/errors/${errorId}/retries`, { params: { limit } })
return data
}
export async function updateErrorResolved(errorId: number, resolved: boolean): Promise<void> {
await apiClient.put(`/admin/ops/errors/${errorId}/resolve`, { resolved })
}
// New split endpoints
export async function listRequestErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/request-errors', { params })
return data
}
export async function listUpstreamErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/upstream-errors', { params })
return data
}
export async function getRequestErrorDetail(id: number): Promise<OpsErrorDetail> {
const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/request-errors/${id}`)
return data
}
export async function getUpstreamErrorDetail(id: number): Promise<OpsErrorDetail> {
const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/upstream-errors/${id}`)
return data
}
export async function retryRequestErrorClient(id: number): Promise<OpsRetryResult> {
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/request-errors/${id}/retry-client`, {})
return data
}
export async function retryRequestErrorUpstreamEvent(id: number, idx: number): Promise<OpsRetryResult> {
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/request-errors/${id}/upstream-errors/${idx}/retry`, {})
return data
}
export async function retryUpstreamError(id: number): Promise<OpsRetryResult> {
const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/upstream-errors/${id}/retry`, {})
return data
}
export async function updateRequestErrorResolved(errorId: number, resolved: boolean): Promise<void> {
await apiClient.put(`/admin/ops/request-errors/${errorId}/resolve`, { resolved })
}
export async function updateUpstreamErrorResolved(errorId: number, resolved: boolean): Promise<void> {
await apiClient.put(`/admin/ops/upstream-errors/${errorId}/resolve`, { resolved })
}
export async function listRequestErrorUpstreamErrors(
id: number,
params: OpsErrorListQueryParams = {},
options: { include_detail?: boolean } = {}
): Promise<PaginatedResponse<OpsErrorDetail>> {
const query: Record<string, any> = { ...params }
if (options.include_detail) query.include_detail = '1'
const { data } = await apiClient.get<PaginatedResponse<OpsErrorDetail>>(`/admin/ops/request-errors/${id}/upstream-errors`, { params: query })
return data
}
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
return data
}
// Alert rules
export async function listAlertRules(): Promise<AlertRule[]> {
const { data } = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
return data
}
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
const { data } = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
return data
}
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
const { data } = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
return data
}
export async function deleteAlertRule(id: number): Promise<void> {
await apiClient.delete(`/admin/ops/alert-rules/${id}`)
}
export interface AlertEventsQuery {
limit?: number
status?: string
severity?: string
email_sent?: boolean
time_range?: string
start_time?: string
end_time?: string
before_fired_at?: string
before_id?: number
platform?: string
group_id?: number
}
export async function listAlertEvents(params: AlertEventsQuery = {}): Promise<AlertEvent[]> {
const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params })
return data
}
export async function getAlertEvent(id: number): Promise<AlertEvent> {
const { data } = await apiClient.get<AlertEvent>(`/admin/ops/alert-events/${id}`)
return data
}
export async function updateAlertEventStatus(id: number, status: 'resolved' | 'manual_resolved'): Promise<void> {
await apiClient.put(`/admin/ops/alert-events/${id}/status`, { status })
}
export async function createAlertSilence(payload: {
rule_id: number
platform: string
group_id?: number | null
region?: string | null
until: string
reason?: string
}): Promise<void> {
await apiClient.post('/admin/ops/alert-silences', payload)
}
// Email notification config
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
return data
}
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
const { data } = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
return data
}
// Runtime settings (DB-backed)
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
const { data } = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
return data
}
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
const { data } = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
return data
}
// Advanced settings (DB-backed)
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
const { data } = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
return data
}
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
const { data } = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
return data
}
// ==================== Metric Thresholds ====================
async function getMetricThresholds(): Promise<OpsMetricThresholds> {
const { data } = await apiClient.get<OpsMetricThresholds>('/admin/ops/settings/metric-thresholds')
return data
}
async function updateMetricThresholds(thresholds: OpsMetricThresholds): Promise<void> {
await apiClient.put('/admin/ops/settings/metric-thresholds', thresholds)
}
export const opsAPI = {
getDashboardOverview,
getThroughputTrend,
getLatencyHistogram,
getErrorTrend,
getErrorDistribution,
getConcurrencyStats,
getAccountAvailabilityStats,
getRealtimeTrafficSummary,
subscribeQPS,
// Legacy unified endpoints
listErrorLogs,
getErrorLogDetail,
retryErrorRequest,
listRetryAttempts,
updateErrorResolved,
// New split endpoints
listRequestErrors,
listUpstreamErrors,
getRequestErrorDetail,
getUpstreamErrorDetail,
retryRequestErrorClient,
retryRequestErrorUpstreamEvent,
retryUpstreamError,
updateRequestErrorResolved,
updateUpstreamErrorResolved,
listRequestErrorUpstreamErrors,
listRequestDetails,
listAlertRules,
createAlertRule,
updateAlertRule,
deleteAlertRule,
listAlertEvents,
getAlertEvent,
updateAlertEventStatus,
createAlertSilence,
getEmailNotificationConfig,
updateEmailNotificationConfig,
getAlertRuntimeSettings,
updateAlertRuntimeSettings,
getAdvancedSettings,
updateAdvancedSettings,
getMetricThresholds,
updateMetricThresholds
}
export default opsAPI