feat(前端API): 实现运维监控 API 客户端
- 新增 ops API 客户端(ops.ts) - 扩展 settings API 支持 ops 配置 - 更新 admin API 索引导出 ops 模块 - 扩展 API 客户端支持 WebSocket 连接
This commit is contained in:
@@ -16,6 +16,7 @@ import usageAPI from './usage'
|
|||||||
import geminiAPI from './gemini'
|
import geminiAPI from './gemini'
|
||||||
import antigravityAPI from './antigravity'
|
import antigravityAPI from './antigravity'
|
||||||
import userAttributesAPI from './userAttributes'
|
import userAttributesAPI from './userAttributes'
|
||||||
|
import opsAPI from './ops'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unified admin API object for convenient access
|
* Unified admin API object for convenient access
|
||||||
@@ -33,7 +34,8 @@ export const adminAPI = {
|
|||||||
usage: usageAPI,
|
usage: usageAPI,
|
||||||
gemini: geminiAPI,
|
gemini: geminiAPI,
|
||||||
antigravity: antigravityAPI,
|
antigravity: antigravityAPI,
|
||||||
userAttributes: userAttributesAPI
|
userAttributes: userAttributesAPI,
|
||||||
|
ops: opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export {
|
export {
|
||||||
@@ -49,7 +51,8 @@ export {
|
|||||||
usageAPI,
|
usageAPI,
|
||||||
geminiAPI,
|
geminiAPI,
|
||||||
antigravityAPI,
|
antigravityAPI,
|
||||||
userAttributesAPI
|
userAttributesAPI,
|
||||||
|
opsAPI
|
||||||
}
|
}
|
||||||
|
|
||||||
export default adminAPI
|
export default adminAPI
|
||||||
|
|||||||
906
frontend/src/api/admin/ops.ts
Normal file
906
frontend/src/api/admin/ops.ts
Normal file
@@ -0,0 +1,906 @@
|
|||||||
|
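/**
 * Admin Ops API endpoints (vNext)
 * - Dashboard overview and charts: throughput trend, latency histogram,
 *   error trend, error distribution (query mode: auto / raw / preagg)
 * - Concurrency and account-availability stats
 * - Realtime QPS subscription over WebSocket
 * - Error logs list/detail + retry (client/upstream), request details
 * - Alert rules and events, email notification config, alert runtime settings
 */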
|
||||||
|
|
||||||
|
import { apiClient } from '../client'
|
||||||
|
import type { PaginatedResponse } from '@/types'
|
||||||
|
|
||||||
|
export type OpsRetryMode = 'client' | 'upstream'
|
||||||
|
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
|
||||||
|
|
||||||
|
export interface OpsRequestOptions {
|
||||||
|
signal?: AbortSignal
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryRequest {
|
||||||
|
mode: OpsRetryMode
|
||||||
|
pinned_account_id?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRetryResult {
|
||||||
|
attempt_id: number
|
||||||
|
mode: OpsRetryMode
|
||||||
|
status: 'running' | 'succeeded' | 'failed' | string
|
||||||
|
|
||||||
|
pinned_account_id?: number | null
|
||||||
|
used_account_id?: number | null
|
||||||
|
|
||||||
|
http_status_code: number
|
||||||
|
upstream_request_id: string
|
||||||
|
|
||||||
|
response_preview: string
|
||||||
|
response_truncated: boolean
|
||||||
|
|
||||||
|
error_message: string
|
||||||
|
|
||||||
|
started_at: string
|
||||||
|
finished_at: string
|
||||||
|
duration_ms: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDashboardOverview {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
system_metrics?: OpsSystemMetricsSnapshot | null
|
||||||
|
job_heartbeats?: OpsJobHeartbeat[] | null
|
||||||
|
|
||||||
|
success_count: number
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
request_count_total: number
|
||||||
|
request_count_sla: number
|
||||||
|
|
||||||
|
token_consumed: number
|
||||||
|
|
||||||
|
sla: number
|
||||||
|
error_rate: number
|
||||||
|
upstream_error_rate: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
|
||||||
|
qps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
tps: {
|
||||||
|
current: number
|
||||||
|
peak: number
|
||||||
|
avg: number
|
||||||
|
}
|
||||||
|
|
||||||
|
duration: OpsPercentiles
|
||||||
|
ttft: OpsPercentiles
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsPercentiles {
|
||||||
|
p50_ms?: number | null
|
||||||
|
p90_ms?: number | null
|
||||||
|
p95_ms?: number | null
|
||||||
|
p99_ms?: number | null
|
||||||
|
avg_ms?: number | null
|
||||||
|
max_ms?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
qps: number
|
||||||
|
tps: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputPlatformBreakdownItem {
|
||||||
|
platform: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputGroupBreakdownItem {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
request_count: number
|
||||||
|
token_consumed: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsThroughputTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsThroughputTrendPoint[]
|
||||||
|
by_platform?: OpsThroughputPlatformBreakdownItem[]
|
||||||
|
top_groups?: OpsThroughputGroupBreakdownItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestKind = 'success' | 'error'
|
||||||
|
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
|
||||||
|
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
|
||||||
|
|
||||||
|
export interface OpsRequestDetail {
|
||||||
|
kind: OpsRequestKind
|
||||||
|
created_at: string
|
||||||
|
request_id: string
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
model?: string
|
||||||
|
duration_ms?: number | null
|
||||||
|
status_code?: number | null
|
||||||
|
|
||||||
|
error_id?: number | null
|
||||||
|
phase?: string
|
||||||
|
severity?: string
|
||||||
|
message?: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsRequestDetailsParams {
|
||||||
|
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
|
||||||
|
start_time?: string
|
||||||
|
end_time?: string
|
||||||
|
|
||||||
|
kind?: OpsRequestDetailsKind
|
||||||
|
|
||||||
|
platform?: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
user_id?: number
|
||||||
|
api_key_id?: number
|
||||||
|
account_id?: number
|
||||||
|
|
||||||
|
model?: string
|
||||||
|
request_id?: string
|
||||||
|
q?: string
|
||||||
|
|
||||||
|
min_duration_ms?: number
|
||||||
|
max_duration_ms?: number
|
||||||
|
|
||||||
|
sort?: OpsRequestDetailsSort
|
||||||
|
|
||||||
|
page?: number
|
||||||
|
page_size?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramBucket {
|
||||||
|
range: string
|
||||||
|
count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsLatencyHistogramResponse {
|
||||||
|
start_time: string
|
||||||
|
end_time: string
|
||||||
|
platform: string
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
total_requests: number
|
||||||
|
buckets: OpsLatencyHistogramBucket[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendPoint {
|
||||||
|
bucket_start: string
|
||||||
|
error_count_total: number
|
||||||
|
business_limited_count: number
|
||||||
|
error_count_sla: number
|
||||||
|
upstream_error_count_excl_429_529: number
|
||||||
|
upstream_429_count: number
|
||||||
|
upstream_529_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorTrendResponse {
|
||||||
|
bucket: string
|
||||||
|
points: OpsErrorTrendPoint[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionItem {
|
||||||
|
status_code: number
|
||||||
|
total: number
|
||||||
|
sla: number
|
||||||
|
business_limited: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDistributionResponse {
|
||||||
|
total: number
|
||||||
|
items: OpsErrorDistributionItem[]
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsSystemMetricsSnapshot {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
window_minutes: number
|
||||||
|
|
||||||
|
cpu_usage_percent?: number | null
|
||||||
|
memory_used_mb?: number | null
|
||||||
|
memory_total_mb?: number | null
|
||||||
|
memory_usage_percent?: number | null
|
||||||
|
|
||||||
|
db_ok?: boolean | null
|
||||||
|
redis_ok?: boolean | null
|
||||||
|
|
||||||
|
db_conn_active?: number | null
|
||||||
|
db_conn_idle?: number | null
|
||||||
|
db_conn_waiting?: number | null
|
||||||
|
|
||||||
|
goroutine_count?: number | null
|
||||||
|
concurrency_queue_depth?: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsJobHeartbeat {
|
||||||
|
job_name: string
|
||||||
|
last_run_at?: string | null
|
||||||
|
last_success_at?: string | null
|
||||||
|
last_error_at?: string | null
|
||||||
|
last_error?: string | null
|
||||||
|
last_duration_ms?: number | null
|
||||||
|
updated_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PlatformConcurrencyInfo {
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupConcurrencyInfo {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountConcurrencyInfo {
|
||||||
|
account_id: number
|
||||||
|
account_name?: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
current_in_use: number
|
||||||
|
max_capacity: number
|
||||||
|
load_percentage: number
|
||||||
|
waiting_in_queue: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsConcurrencyStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformConcurrencyInfo>
|
||||||
|
group: Record<string, GroupConcurrencyInfo>
|
||||||
|
account: Record<string, AccountConcurrencyInfo>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Fetch concurrency statistics from `/admin/ops/concurrency`.
 *
 * @param platform - Optional platform filter; left out of the query when empty or undefined.
 * @param groupId - Optional group filter; only sent when it is a number greater than 0.
 * @returns The `data` payload of the response.
 */
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
  // Precisely typed (instead of Record<string, any>) so a misspelled key or a
  // wrongly typed value is rejected at compile time.
  const params: { platform?: string, group_id?: number } = {}
  if (platform) params.platform = platform
  if (typeof groupId === 'number' && groupId > 0) params.group_id = groupId

  const { data } = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
  return data
}
|
||||||
|
|
||||||
|
export interface PlatformAvailability {
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface GroupAvailability {
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
platform: string
|
||||||
|
total_accounts: number
|
||||||
|
available_count: number
|
||||||
|
rate_limit_count: number
|
||||||
|
error_count: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AccountAvailability {
|
||||||
|
account_id: number
|
||||||
|
account_name: string
|
||||||
|
platform: string
|
||||||
|
group_id: number
|
||||||
|
group_name: string
|
||||||
|
status: string
|
||||||
|
is_available: boolean
|
||||||
|
is_rate_limited: boolean
|
||||||
|
rate_limit_reset_at?: string
|
||||||
|
rate_limit_remaining_sec?: number
|
||||||
|
is_overloaded: boolean
|
||||||
|
overload_until?: string
|
||||||
|
overload_remaining_sec?: number
|
||||||
|
has_error: boolean
|
||||||
|
error_message?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAccountAvailabilityStatsResponse {
|
||||||
|
enabled: boolean
|
||||||
|
platform: Record<string, PlatformAvailability>
|
||||||
|
group: Record<string, GroupAvailability>
|
||||||
|
account: Record<string, AccountAvailability>
|
||||||
|
timestamp?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Fetch account availability statistics from `/admin/ops/account-availability`.
 *
 * @param platform - Optional platform filter; left out of the query when empty or undefined.
 * @param groupId - Optional group filter; only sent when it is a number greater than 0.
 * @returns The `data` payload of the response.
 */
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
  // Precisely typed (instead of Record<string, any>) so a misspelled key or a
  // wrongly typed value is rejected at compile time.
  const params: { platform?: string, group_id?: number } = {}
  if (platform) params.platform = platform
  if (typeof groupId === 'number' && groupId > 0) params.group_id = groupId

  const { data } = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
  return data
}
|
||||||
|
|
||||||
|
/**
 * Options accepted by `subscribeQPS`, the realtime QPS WebSocket subscription.
 *
 * Note: browsers cannot set Authorization headers for WebSockets.
 * Authentication is sent via Sec-WebSocket-Protocol using a prefixed token item:
 *   ["sub2api-admin", "jwt.<token>"]
 */
export interface SubscribeQPSOptions {
  /**
   * Token sent as the `jwt.<token>` subprotocol item. When null or undefined,
   * the value stored under `auth_token` in localStorage is used instead.
   */
  token?: string | null
  /** Called after the socket has opened. */
  onOpen?: () => void
  /** Called with the close event whenever the socket closes. */
  onClose?: (event: CloseEvent) => void
  /** Called with the error event raised by the socket. */
  onError?: (event: Event) => void
  /**
   * Called when the server closes with an application close code that indicates
   * reconnecting is not useful (e.g. feature flag disabled).
   */
  onFatalClose?: (event: CloseEvent) => void
  /**
   * More granular status updates for UI (connecting/reconnecting/offline/etc).
   */
  onStatusChange?: (status: OpsWSStatus) => void
  /**
   * Called when a reconnect is scheduled (helps display "retry in Xs").
   */
  onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
  /**
   * Host part of the socket URL. Falls back to `VITE_WS_BASE_URL`, then to
   * `window.location.host`.
   */
  wsBaseUrl?: string
  /**
   * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
   * Set to 0 to disable reconnect.
   * The limit is only enforced once a connection has succeeded at least once.
   */
  maxReconnectAttempts?: number
  /** First backoff delay in milliseconds; doubled per attempt. Defaults to 1000. */
  reconnectBaseDelayMs?: number
  /** Upper bound for the backoff delay in milliseconds. Defaults to 30000. */
  reconnectMaxDelayMs?: number
  /**
   * Stale connection detection (heartbeat-by-observation).
   * If no messages are received within this window, the socket is closed to trigger a reconnect.
   * Set to 0 to disable. Defaults to 120000.
   */
  staleTimeoutMs?: number
  /**
   * How often to check staleness. Only used when `staleTimeoutMs > 0`. Defaults to 30000.
   */
  staleCheckIntervalMs?: number
}
|
||||||
|
|
||||||
|
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
|
||||||
|
|
||||||
|
export const OPS_WS_CLOSE_CODES = {
|
||||||
|
REALTIME_DISABLED: 4001
|
||||||
|
} as const
|
||||||
|
|
||||||
|
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
|
||||||
|
|
||||||
|
/**
 * Subscribe to realtime QPS updates over a WebSocket and keep the connection alive.
 *
 * The socket URL is `ws(s)://<host>/api/v1/admin/ops/ws/qps`. Browsers cannot set an
 * Authorization header on a WebSocket, so the token travels in Sec-WebSocket-Protocol as
 * ["sub2api-admin", "jwt.<token>"] rather than in the query string.
 *
 * Reconnects use exponential backoff: the base delay doubles per attempt, is capped at the
 * maximum delay, and gets up to 250 ms of jitter. `maxReconnectAttempts` is only enforced after
 * the first successful connection; until then the client keeps retrying. A close with
 * `OPS_WS_CLOSE_CODES.REALTIME_DISABLED` stops reconnecting permanently.
 *
 * @param onMessage - Receives every JSON-decoded frame from the current socket.
 * @param options - Auth token, lifecycle callbacks, and reconnect / stale-detection tuning.
 * @returns An unsubscribe function: stops reconnecting, removes the online/offline listeners,
 *          clears the timers and closes the socket.
 */
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
  let ws: WebSocket | null = null
  let reconnectAttempts = 0
  let reconnectTimer: ReturnType<typeof setTimeout> | null = null
  let staleTimer: ReturnType<typeof setInterval> | null = null
  let shouldReconnect = true
  let isConnecting = false
  let hasConnectedOnce = false
  let lastMessageAt = 0

  const requestedMaxAttempts = options.maxReconnectAttempts
  const maxReconnectAttempts =
    typeof requestedMaxAttempts === 'number' && Number.isFinite(requestedMaxAttempts)
      ? requestedMaxAttempts
      : Infinity
  const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
  const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
  const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
  const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000

  const setStatus = (status: OpsWSStatus) => {
    options.onStatusChange?.(status)
  }

  // The retry budget only applies after a first successful connection.
  const retriesExhausted = () => hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts

  const clearReconnectTimer = () => {
    if (reconnectTimer) {
      clearTimeout(reconnectTimer)
      reconnectTimer = null
    }
  }

  const clearStaleTimer = () => {
    if (staleTimer) {
      clearInterval(staleTimer)
      staleTimer = null
    }
  }

  const startStaleTimer = () => {
    clearStaleTimer()
    if (!staleTimeoutMs || staleTimeoutMs <= 0) return
    staleTimer = setInterval(() => {
      if (!shouldReconnect) return
      if (!ws || ws.readyState !== WebSocket.OPEN) return
      if (!lastMessageAt) return
      if (Date.now() - lastMessageAt > staleTimeoutMs) {
        // Treat as a half-open connection; closing triggers the normal reconnect path.
        ws.close()
      }
    }, staleCheckIntervalMs)
  }

  const scheduleReconnect = () => {
    if (!shouldReconnect) return
    if (retriesExhausted()) {
      // Giving up: tell the UI instead of leaving it on the last status.
      setStatus('closed')
      return
    }

    // If we're offline, wait for the browser to come back online.
    if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
      setStatus('offline')
      return
    }

    const backoffMs = Math.min(baseDelayMs * Math.pow(2, reconnectAttempts), maxDelayMs)
    const delayMs = backoffMs + Math.floor(Math.random() * 250)
    clearReconnectTimer()
    reconnectTimer = setTimeout(() => {
      reconnectTimer = null
      reconnectAttempts++
      connect()
    }, delayMs)
    options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs })
  }

  const handleOnline = () => {
    if (!shouldReconnect) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    if (retriesExhausted()) return
    connect()
  }

  const handleOffline = () => {
    setStatus('offline')
  }

  const connect = () => {
    if (!shouldReconnect || isConnecting) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    // No retry-budget check here: scheduleReconnect() and handleOnline() already gate on it.
    // Re-checking after the timer incremented the counter used to drop the last allowed attempt.

    isConnecting = true
    setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')

    let socket: WebSocket
    try {
      const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
      const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
      const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)

      // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
      // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
      // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
      const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
      const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
      if (rawToken) protocols.push(`jwt.${rawToken}`)

      socket = new WebSocket(wsURL.toString(), protocols)
    } catch (err) {
      // URL / WebSocket construction can throw synchronously (e.g. malformed host or
      // subprotocol). Treat it as a failed attempt so isConnecting cannot stay stuck.
      isConnecting = false
      console.error('[OpsWS] Failed to open WebSocket:', err)
      scheduleReconnect()
      return
    }
    ws = socket

    socket.onopen = () => {
      if (ws !== socket) return
      reconnectAttempts = 0
      isConnecting = false
      hasConnectedOnce = true
      clearReconnectTimer()
      lastMessageAt = Date.now()
      startStaleTimer()
      setStatus('connected')
      options.onOpen?.()
    }

    socket.onmessage = (e) => {
      if (ws !== socket) return
      // Any received frame proves the link is alive, even if its payload is not valid JSON.
      lastMessageAt = Date.now()

      let payload: unknown
      try {
        payload = JSON.parse(e.data)
      } catch (err) {
        console.warn('[OpsWS] Failed to parse message:', err)
        return
      }

      // Kept apart from the parse step so a consumer bug is not reported as a parse failure.
      try {
        onMessage(payload)
      } catch (err) {
        console.error('[OpsWS] onMessage handler failed:', err)
      }
    }

    socket.onerror = (error) => {
      console.error('[OpsWS] Connection error:', error)
      options.onError?.(error)
    }

    socket.onclose = (event) => {
      // A socket that has already been replaced must not clobber its successor's state.
      const superseded = ws !== null && ws !== socket
      if (!superseded) isConnecting = false
      options.onClose?.(event)
      if (superseded) return

      clearStaleTimer()
      ws = null

      // If the server explicitly tells us to stop reconnecting, honor it.
      if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
        shouldReconnect = false
        clearReconnectTimer()
        setStatus('closed')
        options.onFatalClose?.(event)
        return
      }

      scheduleReconnect()
    }
  }

  window.addEventListener('online', handleOnline)
  window.addEventListener('offline', handleOffline)
  connect()

  return () => {
    shouldReconnect = false
    window.removeEventListener('online', handleOnline)
    window.removeEventListener('offline', handleOffline)
    clearReconnectTimer()
    clearStaleTimer()
    if (ws) ws.close()
    ws = null
    setStatus('closed')
  }
}
|
||||||
|
|
||||||
|
export type OpsSeverity = string
|
||||||
|
export type OpsPhase = string
|
||||||
|
|
||||||
|
export type AlertSeverity = 'critical' | 'warning' | 'info'
|
||||||
|
export type ThresholdMode = 'count' | 'percentage' | 'both'
|
||||||
|
export type MetricType =
|
||||||
|
| 'success_rate'
|
||||||
|
| 'error_rate'
|
||||||
|
| 'upstream_error_rate'
|
||||||
|
| 'p95_latency_ms'
|
||||||
|
| 'p99_latency_ms'
|
||||||
|
| 'cpu_usage_percent'
|
||||||
|
| 'memory_usage_percent'
|
||||||
|
| 'concurrency_queue_depth'
|
||||||
|
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
|
||||||
|
|
||||||
|
export interface AlertRule {
|
||||||
|
id?: number
|
||||||
|
name: string
|
||||||
|
description?: string
|
||||||
|
enabled: boolean
|
||||||
|
metric_type: MetricType
|
||||||
|
operator: Operator
|
||||||
|
threshold: number
|
||||||
|
window_minutes: number
|
||||||
|
sustained_minutes: number
|
||||||
|
severity: OpsSeverity
|
||||||
|
cooldown_minutes: number
|
||||||
|
notify_email: boolean
|
||||||
|
filters?: Record<string, any>
|
||||||
|
created_at?: string
|
||||||
|
updated_at?: string
|
||||||
|
last_triggered_at?: string | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface AlertEvent {
|
||||||
|
id: number
|
||||||
|
rule_id: number
|
||||||
|
severity: OpsSeverity | string
|
||||||
|
status: 'firing' | 'resolved' | string
|
||||||
|
title?: string
|
||||||
|
description?: string
|
||||||
|
metric_value?: number
|
||||||
|
threshold_value?: number
|
||||||
|
dimensions?: Record<string, any>
|
||||||
|
fired_at: string
|
||||||
|
resolved_at?: string | null
|
||||||
|
email_sent: boolean
|
||||||
|
created_at: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface EmailNotificationConfig {
|
||||||
|
alert: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
min_severity: AlertSeverity | ''
|
||||||
|
rate_limit_per_hour: number
|
||||||
|
batching_window_seconds: number
|
||||||
|
include_resolved_alerts: boolean
|
||||||
|
}
|
||||||
|
report: {
|
||||||
|
enabled: boolean
|
||||||
|
recipients: string[]
|
||||||
|
daily_summary_enabled: boolean
|
||||||
|
daily_summary_schedule: string
|
||||||
|
weekly_summary_enabled: boolean
|
||||||
|
weekly_summary_schedule: string
|
||||||
|
error_digest_enabled: boolean
|
||||||
|
error_digest_schedule: string
|
||||||
|
error_digest_min_count: number
|
||||||
|
account_health_enabled: boolean
|
||||||
|
account_health_schedule: string
|
||||||
|
account_health_error_rate_threshold: number
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsDistributedLockSettings {
|
||||||
|
enabled: boolean
|
||||||
|
key: string
|
||||||
|
ttl_seconds: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsAlertRuntimeSettings {
|
||||||
|
evaluation_interval_seconds: number
|
||||||
|
distributed_lock: OpsDistributedLockSettings
|
||||||
|
silencing: {
|
||||||
|
enabled: boolean
|
||||||
|
global_until_rfc3339: string
|
||||||
|
global_reason: string
|
||||||
|
entries?: Array<{
|
||||||
|
rule_id?: number
|
||||||
|
severities?: Array<OpsSeverity | string>
|
||||||
|
until_rfc3339: string
|
||||||
|
reason: string
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorLog {
|
||||||
|
id: number
|
||||||
|
created_at: string
|
||||||
|
phase: OpsPhase
|
||||||
|
type: string
|
||||||
|
severity: OpsSeverity
|
||||||
|
status_code: number
|
||||||
|
platform: string
|
||||||
|
model: string
|
||||||
|
latency_ms?: number | null
|
||||||
|
client_request_id: string
|
||||||
|
request_id: string
|
||||||
|
message: string
|
||||||
|
|
||||||
|
user_id?: number | null
|
||||||
|
api_key_id?: number | null
|
||||||
|
account_id?: number | null
|
||||||
|
group_id?: number | null
|
||||||
|
|
||||||
|
client_ip?: string | null
|
||||||
|
request_path?: string
|
||||||
|
stream?: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface OpsErrorDetail extends OpsErrorLog {
|
||||||
|
error_body: string
|
||||||
|
user_agent: string
|
||||||
|
|
||||||
|
auth_latency_ms?: number | null
|
||||||
|
routing_latency_ms?: number | null
|
||||||
|
upstream_latency_ms?: number | null
|
||||||
|
response_latency_ms?: number | null
|
||||||
|
time_to_first_token_ms?: number | null
|
||||||
|
|
||||||
|
request_body: string
|
||||||
|
request_body_truncated: boolean
|
||||||
|
request_body_bytes?: number | null
|
||||||
|
|
||||||
|
is_business_limited: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
|
||||||
|
|
||||||
|
/** Preset lookback windows accepted by the dashboard endpoints. */
export type OpsDashboardTimeRange = '5m' | '30m' | '1h' | '6h' | '24h'

/**
 * Query parameters shared by every `/admin/ops/dashboard/*` request in this module.
 * Declared once so the five dashboard helpers cannot drift apart.
 */
export interface OpsDashboardQueryParams {
  time_range?: OpsDashboardTimeRange
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  mode?: OpsQueryMode
}

/**
 * Issue a GET against `/admin/ops/dashboard/<endpoint>` and unwrap the response payload.
 *
 * @param endpoint - Last path segment of the dashboard route.
 * @param params - Query parameters forwarded as-is.
 * @param options - Carries the optional AbortSignal for cancellation.
 */
async function fetchDashboard<T>(
  endpoint: string,
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions
): Promise<T> {
  const { data } = await apiClient.get<T>(`/admin/ops/dashboard/${endpoint}`, {
    params,
    signal: options.signal
  })
  return data
}

/**
 * Fetch the dashboard overview from `/admin/ops/dashboard/overview`.
 *
 * @param params - Time window, platform/group filter and query mode.
 * @param options - Optional `signal` to abort the request.
 */
export async function getDashboardOverview(
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
  return fetchDashboard<OpsDashboardOverview>('overview', params, options)
}

/**
 * Fetch the throughput trend from `/admin/ops/dashboard/throughput-trend`.
 *
 * @param params - Time window, platform/group filter and query mode.
 * @param options - Optional `signal` to abort the request.
 */
export async function getThroughputTrend(
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
  return fetchDashboard<OpsThroughputTrendResponse>('throughput-trend', params, options)
}

/**
 * Fetch the latency histogram from `/admin/ops/dashboard/latency-histogram`.
 *
 * @param params - Time window, platform/group filter and query mode.
 * @param options - Optional `signal` to abort the request.
 */
export async function getLatencyHistogram(
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
  return fetchDashboard<OpsLatencyHistogramResponse>('latency-histogram', params, options)
}

/**
 * Fetch the error trend from `/admin/ops/dashboard/error-trend`.
 *
 * @param params - Time window, platform/group filter and query mode.
 * @param options - Optional `signal` to abort the request.
 */
export async function getErrorTrend(
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
  return fetchDashboard<OpsErrorTrendResponse>('error-trend', params, options)
}

/**
 * Fetch the error distribution from `/admin/ops/dashboard/error-distribution`.
 *
 * @param params - Time window, platform/group filter and query mode.
 * @param options - Optional `signal` to abort the request.
 */
export async function getErrorDistribution(
  params: OpsDashboardQueryParams,
  options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
  return fetchDashboard<OpsErrorDistributionResponse>('error-distribution', params, options)
}
|
||||||
|
|
||||||
|
/**
 * Fetch a page of error logs from `/admin/ops/errors`.
 *
 * @param params - Paging and filter values, forwarded unchanged as the query string.
 * @returns The `data` payload of the response.
 */
export async function listErrorLogs(params: {
  page?: number
  page_size?: number
  time_range?: string
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
  q?: string
  status_codes?: string
}): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Fetch a single error log entry from `/admin/ops/errors/{id}`.
 *
 * @param id - Identifier interpolated into the request path.
 * @returns The `data` payload of the response.
 */
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
  const response = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * POST a retry request to `/admin/ops/errors/{id}/retry`.
 *
 * @param id - Identifier interpolated into the request path.
 * @param req - Request body (retry mode and optional pinned account).
 * @returns The `data` payload of the response.
 */
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
  const response = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Fetch a page of request details from `/admin/ops/requests`.
 *
 * @param params - Filter, sort and paging values, forwarded unchanged as the query string.
 * @returns The `data` payload of the response.
 */
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return response.data
}
|
||||||
|
|
||||||
|
// Alert rules
|
||||||
|
/**
 * Fetch all alert rules from `/admin/ops/alert-rules`.
 *
 * @returns The `data` payload of the response.
 */
export async function listAlertRules(): Promise<AlertRule[]> {
  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * POST a new alert rule to `/admin/ops/alert-rules`.
 *
 * @param rule - Rule sent as the request body.
 * @returns The `data` payload of the response.
 */
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * PUT changes for an alert rule to `/admin/ops/alert-rules/{id}`.
 *
 * @param id - Identifier interpolated into the request path.
 * @param rule - Fields sent as the request body; any subset of `AlertRule` is accepted by the type.
 * @returns The `data` payload of the response.
 */
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
  const response = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Send a DELETE to `/admin/ops/alert-rules/{id}`; the response body is ignored.
 *
 * @param id - Identifier interpolated into the request path.
 */
export async function deleteAlertRule(id: number): Promise<void> {
  const url = `/admin/ops/alert-rules/${id}`
  await apiClient.delete(url)
}
|
||||||
|
|
||||||
|
/**
 * Fetch alert events from `/admin/ops/alert-events`.
 *
 * @param limit - Sent as the `limit` query parameter; defaults to 100.
 * @returns The `data` payload of the response.
 */
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
  const query = { params: { limit } }
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', query)
  return response.data
}
|
||||||
|
|
||||||
|
// Email notification config
|
||||||
|
/**
 * Fetch the email notification config from `/admin/ops/email-notification/config`.
 *
 * @returns The `data` payload of the response.
 */
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Update the email notification configuration.
 *
 * Issues `PUT /admin/ops/email-notification/config` with `config` as the
 * request body.
 *
 * @param config - Configuration sent as the body.
 * @returns The response payload, typed as `EmailNotificationConfig`.
 */
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
  const response = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
  return response.data
}
|
||||||
|
|
||||||
|
// Runtime settings (DB-backed)
|
||||||
|
/**
 * Fetch the alert runtime settings.
 *
 * Issues `GET /admin/ops/runtime/alert`.
 *
 * @returns The response payload, typed as `OpsAlertRuntimeSettings`.
 */
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Update the alert runtime settings.
 *
 * Issues `PUT /admin/ops/runtime/alert` with `config` as the request body.
 *
 * @param config - Settings sent as the body.
 * @returns The response payload, typed as `OpsAlertRuntimeSettings`.
 */
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
  return response.data
}
|
||||||
|
|
||||||
|
/**
 * Aggregate object bundling this module's endpoint functions, exported both
 * by name and as the module default.
 */
export const opsAPI = {
  // Dashboard and statistics
  getDashboardOverview,
  getThroughputTrend,
  getLatencyHistogram,
  getErrorTrend,
  getErrorDistribution,
  getConcurrencyStats,
  getAccountAvailabilityStats,
  subscribeQPS,

  // Error logs and request details
  listErrorLogs,
  getErrorLogDetail,
  retryErrorRequest,
  listRequestDetails,

  // Alert rules and events
  listAlertRules,
  createAlertRule,
  updateAlertRule,
  deleteAlertRule,
  listAlertEvents,

  // Email notification config
  getEmailNotificationConfig,
  updateEmailNotificationConfig,

  // Runtime settings
  getAlertRuntimeSettings,
  updateAlertRuntimeSettings
}

export default opsAPI
|
||||||
@@ -34,9 +34,22 @@ export interface SystemSettings {
|
|||||||
turnstile_enabled: boolean
|
turnstile_enabled: boolean
|
||||||
turnstile_site_key: string
|
turnstile_site_key: string
|
||||||
turnstile_secret_key_configured: boolean
|
turnstile_secret_key_configured: boolean
|
||||||
|
|
||||||
|
// Model fallback configuration
|
||||||
|
enable_model_fallback: boolean
|
||||||
|
fallback_model_anthropic: string
|
||||||
|
fallback_model_openai: string
|
||||||
|
fallback_model_gemini: string
|
||||||
|
fallback_model_antigravity: string
|
||||||
|
|
||||||
// Identity patch configuration (Claude -> Gemini)
|
// Identity patch configuration (Claude -> Gemini)
|
||||||
enable_identity_patch: boolean
|
enable_identity_patch: boolean
|
||||||
identity_patch_prompt: string
|
identity_patch_prompt: string
|
||||||
|
|
||||||
|
// Ops Monitoring (vNext)
|
||||||
|
ops_monitoring_enabled: boolean
|
||||||
|
ops_realtime_monitoring_enabled: boolean
|
||||||
|
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface UpdateSettingsRequest {
|
export interface UpdateSettingsRequest {
|
||||||
@@ -60,8 +73,16 @@ export interface UpdateSettingsRequest {
|
|||||||
turnstile_enabled?: boolean
|
turnstile_enabled?: boolean
|
||||||
turnstile_site_key?: string
|
turnstile_site_key?: string
|
||||||
turnstile_secret_key?: string
|
turnstile_secret_key?: string
|
||||||
|
enable_model_fallback?: boolean
|
||||||
|
fallback_model_anthropic?: string
|
||||||
|
fallback_model_openai?: string
|
||||||
|
fallback_model_gemini?: string
|
||||||
|
fallback_model_antigravity?: string
|
||||||
enable_identity_patch?: boolean
|
enable_identity_patch?: boolean
|
||||||
identity_patch_prompt?: string
|
identity_patch_prompt?: string
|
||||||
|
ops_monitoring_enabled?: boolean
|
||||||
|
ops_realtime_monitoring_enabled?: boolean
|
||||||
|
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
|
|||||||
return response
|
return response
|
||||||
},
|
},
|
||||||
(error: AxiosError<ApiResponse<unknown>>) => {
|
(error: AxiosError<ApiResponse<unknown>>) => {
|
||||||
|
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
|
||||||
|
// Otherwise we'd misclassify it as a generic "network error".
|
||||||
|
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
|
||||||
|
return Promise.reject(error)
|
||||||
|
}
|
||||||
|
|
||||||
// Handle common errors
|
// Handle common errors
|
||||||
if (error.response) {
|
if (error.response) {
|
||||||
const { status, data } = error.response
|
const { status, data } = error.response
|
||||||
|
const url = String(error.config?.url || '')
|
||||||
|
|
||||||
|
// Validate `data` shape to avoid HTML error pages breaking our error handling.
|
||||||
|
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
|
||||||
|
|
||||||
|
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
|
||||||
|
// from ops pages to avoid broken UI states.
|
||||||
|
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
|
||||||
|
try {
|
||||||
|
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
|
||||||
|
} catch {
|
||||||
|
// ignore localStorage failures
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
|
||||||
|
} catch {
|
||||||
|
// ignore event failures
|
||||||
|
}
|
||||||
|
|
||||||
|
if (window.location.pathname.startsWith('/admin/ops')) {
|
||||||
|
window.location.href = '/admin/settings'
|
||||||
|
}
|
||||||
|
|
||||||
|
return Promise.reject({
|
||||||
|
status,
|
||||||
|
code: 'OPS_DISABLED',
|
||||||
|
message: apiData.message || error.message,
|
||||||
|
url
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// 401: Unauthorized - clear token and redirect to login
|
// 401: Unauthorized - clear token and redirect to login
|
||||||
if (status === 401) {
|
if (status === 401) {
|
||||||
@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
|
|||||||
// Return structured error
|
// Return structured error
|
||||||
return Promise.reject({
|
return Promise.reject({
|
||||||
status,
|
status,
|
||||||
code: data?.code,
|
code: apiData.code,
|
||||||
message: data?.message || error.message
|
message: apiData.message || apiData.detail || error.message
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user