feat(前端API): 实现运维监控 API 客户端

- 新增 ops API 客户端（ops.ts）
- 扩展 settings API 支持 ops 配置
- 更新 admin API 索引导出 ops 模块
- 扩展 API 客户端支持 WebSocket 连接
2026-01-09 20:58:33 +08:00
parent e846458009
commit 11d063e3c4
4 changed files with 970 additions and 4 deletions
--- a/frontend/src/api/admin/index.ts
+++ b/frontend/src/api/admin/index.ts
@@ -16,6 +16,7 @@ import usageAPI from './usage'
 import geminiAPI from './gemini'
 import antigravityAPI from './antigravity'
 import userAttributesAPI from './userAttributes'
+import opsAPI from './ops'

 /**
 * Unified admin API object for convenient access
@@ -33,7 +34,8 @@ export const adminAPI = {
  usage: usageAPI,
  gemini: geminiAPI,
  antigravity: antigravityAPI,
-  userAttributes: userAttributesAPI
+  userAttributes: userAttributesAPI,
+  ops: opsAPI
 }

 export {
@@ -49,7 +51,8 @@ export {
  usageAPI,
  geminiAPI,
  antigravityAPI,
-  userAttributesAPI
+  userAttributesAPI,
+  opsAPI
 }

 export default adminAPI
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -0,0 +1,906 @@
+/**
+ * Admin Ops API endpoints (vNext)
+ * - Error logs list/detail + retry (client/upstream)
+ * - Dashboard overview (raw path)
+ */
+
+import { apiClient } from '../client'
+import type { PaginatedResponse } from '@/types'
+
+/** Retry strategy carried in `OpsRetryRequest.mode` and reported back in `OpsRetryResult.mode`. */
+export type OpsRetryMode = 'client' | 'upstream'
+/** Value of the optional `mode` query parameter accepted by the dashboard getters in this module. */
+export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
+
+/** Per-call options accepted by the dashboard getters. */
+export interface OpsRequestOptions {
+  // Forwarded to the HTTP client as the request's abort signal.
+  signal?: AbortSignal
+}
+
+/** Body posted by `retryErrorRequest` to `/admin/ops/errors/{id}/retry`. */
+export interface OpsRetryRequest {
+  mode: OpsRetryMode
+  // NOTE(review): presumably forces the retry onto a specific account; confirm with the backend.
+  pinned_account_id?: number
+}
+
+/** Response payload returned by `retryErrorRequest`. */
+export interface OpsRetryResult {
+  attempt_id: number
+  mode: OpsRetryMode
+  // The trailing `| string` keeps the type open: values outside the three listed literals are accepted.
+  status: 'running' | 'succeeded' | 'failed' | string
+
+  pinned_account_id?: number | null
+  used_account_id?: number | null
+
+  http_status_code: number
+  upstream_request_id: string
+
+  response_preview: string
+  response_truncated: boolean
+
+  error_message: string
+
+  started_at: string
+  finished_at: string
+  duration_ms: number
+}
+
+/** Response payload of `GET /admin/ops/dashboard/overview`, as returned by `getDashboardOverview`. */
+export interface OpsDashboardOverview {
+  start_time: string
+  end_time: string
+  platform: string
+  group_id?: number | null
+
+  system_metrics?: OpsSystemMetricsSnapshot | null
+  job_heartbeats?: OpsJobHeartbeat[] | null
+
+  success_count: number
+  error_count_total: number
+  business_limited_count: number
+  error_count_sla: number
+  request_count_total: number
+  request_count_sla: number
+
+  token_consumed: number
+
+  // NOTE(review): units of the three rate fields below (fraction vs. percent) are not visible here; confirm.
+  sla: number
+  error_rate: number
+  upstream_error_rate: number
+  upstream_error_count_excl_429_529: number
+  upstream_429_count: number
+  upstream_529_count: number
+
+  qps: {
+    current: number
+    peak: number
+    avg: number
+  }
+  tps: {
+    current: number
+    peak: number
+    avg: number
+  }
+
+  duration: OpsPercentiles
+  ttft: OpsPercentiles
+}
+
+/** Latency summary (percentiles, average, maximum) in milliseconds; every field may be absent or null. */
+export interface OpsPercentiles {
+  p50_ms?: number | null
+  p90_ms?: number | null
+  p95_ms?: number | null
+  p99_ms?: number | null
+  avg_ms?: number | null
+  max_ms?: number | null
+}
+
+/** One time bucket in `OpsThroughputTrendResponse.points`. */
+export interface OpsThroughputTrendPoint {
+  bucket_start: string
+  request_count: number
+  token_consumed: number
+  qps: number
+  tps: number
+}
+
+/** One entry of `OpsThroughputTrendResponse.by_platform`. */
+export interface OpsThroughputPlatformBreakdownItem {
+  platform: string
+  request_count: number
+  token_consumed: number
+}
+
+/** One entry of `OpsThroughputTrendResponse.top_groups`. */
+export interface OpsThroughputGroupBreakdownItem {
+  group_id: number
+  group_name: string
+  request_count: number
+  token_consumed: number
+}
+
+/** Response payload of `GET /admin/ops/dashboard/throughput-trend`, as returned by `getThroughputTrend`. */
+export interface OpsThroughputTrendResponse {
+  bucket: string
+  points: OpsThroughputTrendPoint[]
+  by_platform?: OpsThroughputPlatformBreakdownItem[]
+  top_groups?: OpsThroughputGroupBreakdownItem[]
+}
+
+/** Kind tag carried by every `OpsRequestDetail` row. */
+export type OpsRequestKind = 'success' | 'error'
+/** Filter value for `OpsRequestDetailsParams.kind`; adds `'all'` to the row kinds. */
+export type OpsRequestDetailsKind = OpsRequestKind | 'all'
+/** Sort orders accepted by `OpsRequestDetailsParams.sort`. */
+export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
+
+/** One row of the paginated result returned by `listRequestDetails`. */
+export interface OpsRequestDetail {
+  kind: OpsRequestKind
+  created_at: string
+  request_id: string
+
+  platform?: string
+  model?: string
+  duration_ms?: number | null
+  status_code?: number | null
+
+  // NOTE(review): the next four fields look error-specific (presumably only set when kind is 'error'); confirm.
+  error_id?: number | null
+  phase?: string
+  severity?: string
+  message?: string
+
+  user_id?: number | null
+  api_key_id?: number | null
+  account_id?: number | null
+  group_id?: number | null
+
+  stream?: boolean
+}
+
+/** Query parameters sent by `listRequestDetails` to `GET /admin/ops/requests`. */
+export interface OpsRequestDetailsParams {
+  time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+  start_time?: string
+  end_time?: string
+
+  kind?: OpsRequestDetailsKind
+
+  platform?: string
+  group_id?: number | null
+
+  user_id?: number
+  api_key_id?: number
+  account_id?: number
+
+  model?: string
+  request_id?: string
+  q?: string
+
+  min_duration_ms?: number
+  max_duration_ms?: number
+
+  sort?: OpsRequestDetailsSort
+
+  page?: number
+  page_size?: number
+}
+
+/** Paginated wrapper around `OpsRequestDetail` rows. */
+export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
+
+/** One bar of the latency histogram: a range label and the number of requests in it. */
+export interface OpsLatencyHistogramBucket {
+  range: string
+  count: number
+}
+
+/** Response payload of `GET /admin/ops/dashboard/latency-histogram`, as returned by `getLatencyHistogram`. */
+export interface OpsLatencyHistogramResponse {
+  start_time: string
+  end_time: string
+  platform: string
+  group_id?: number | null
+
+  total_requests: number
+  buckets: OpsLatencyHistogramBucket[]
+}
+
+/** One time bucket in `OpsErrorTrendResponse.points`. */
+export interface OpsErrorTrendPoint {
+  bucket_start: string
+  error_count_total: number
+  business_limited_count: number
+  error_count_sla: number
+  upstream_error_count_excl_429_529: number
+  upstream_429_count: number
+  upstream_529_count: number
+}
+
+/** Response payload of `GET /admin/ops/dashboard/error-trend`, as returned by `getErrorTrend`. */
+export interface OpsErrorTrendResponse {
+  bucket: string
+  points: OpsErrorTrendPoint[]
+}
+
+/** Error counts for a single HTTP status code. */
+export interface OpsErrorDistributionItem {
+  status_code: number
+  total: number
+  sla: number
+  business_limited: number
+}
+
+/** Response payload of `GET /admin/ops/dashboard/error-distribution`, as returned by `getErrorDistribution`. */
+export interface OpsErrorDistributionResponse {
+  total: number
+  items: OpsErrorDistributionItem[]
+}
+
+/** System metrics sample embedded in `OpsDashboardOverview.system_metrics`. */
+export interface OpsSystemMetricsSnapshot {
+  id: number
+  created_at: string
+  window_minutes: number
+
+  cpu_usage_percent?: number | null
+  memory_used_mb?: number | null
+  memory_total_mb?: number | null
+  memory_usage_percent?: number | null
+
+  db_ok?: boolean | null
+  redis_ok?: boolean | null
+
+  db_conn_active?: number | null
+  db_conn_idle?: number | null
+  db_conn_waiting?: number | null
+
+  goroutine_count?: number | null
+  concurrency_queue_depth?: number | null
+}
+
+/** Background job status entry embedded in `OpsDashboardOverview.job_heartbeats`. */
+export interface OpsJobHeartbeat {
+  job_name: string
+  last_run_at?: string | null
+  last_success_at?: string | null
+  last_error_at?: string | null
+  last_error?: string | null
+  last_duration_ms?: number | null
+  updated_at: string
+}
+
+/** Concurrency figures for one platform (value type of `OpsConcurrencyStatsResponse.platform`). */
+export interface PlatformConcurrencyInfo {
+  platform: string
+  current_in_use: number
+  max_capacity: number
+  load_percentage: number
+  waiting_in_queue: number
+}
+
+/** Concurrency figures for one group (value type of `OpsConcurrencyStatsResponse.group`). */
+export interface GroupConcurrencyInfo {
+  group_id: number
+  group_name: string
+  platform: string
+  current_in_use: number
+  max_capacity: number
+  load_percentage: number
+  waiting_in_queue: number
+}
+
+/** Concurrency figures for one account (value type of `OpsConcurrencyStatsResponse.account`). */
+export interface AccountConcurrencyInfo {
+  account_id: number
+  account_name?: string
+  platform: string
+  group_id: number
+  group_name: string
+  current_in_use: number
+  max_capacity: number
+  load_percentage: number
+  waiting_in_queue: number
+}
+
+/** Response payload of `GET /admin/ops/concurrency`, as returned by `getConcurrencyStats`. */
+export interface OpsConcurrencyStatsResponse {
+  enabled: boolean
+  // The three maps are keyed by string; NOTE(review): presumably platform name / group id / account id, confirm.
+  platform: Record<string, PlatformConcurrencyInfo>
+  group: Record<string, GroupConcurrencyInfo>
+  account: Record<string, AccountConcurrencyInfo>
+  timestamp?: string
+}
+
+/**
+ * Fetch concurrency statistics from `GET /admin/ops/concurrency`.
+ *
+ * @param platform - Sent as the `platform` query parameter only when truthy.
+ * @param groupId - Sent as the `group_id` query parameter only when it is a number greater than 0.
+ * @returns The `data` payload of the response.
+ */
+export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
+  // Filters that are not set are left out of the query object entirely.
+  const query = {
+    ...(platform ? { platform } : {}),
+    ...(typeof groupId === 'number' && groupId > 0 ? { group_id: groupId } : {})
+  }
+
+  const response = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params: query })
+  return response.data
+}
+
+/** Account availability counters for one platform. */
+export interface PlatformAvailability {
+  platform: string
+  total_accounts: number
+  available_count: number
+  rate_limit_count: number
+  error_count: number
+}
+
+/** Account availability counters for one group. */
+export interface GroupAvailability {
+  group_id: number
+  group_name: string
+  platform: string
+  total_accounts: number
+  available_count: number
+  rate_limit_count: number
+  error_count: number
+}
+
+/** Availability state of a single account, including rate-limit, overload and error details. */
+export interface AccountAvailability {
+  account_id: number
+  account_name: string
+  platform: string
+  group_id: number
+  group_name: string
+  status: string
+  is_available: boolean
+  is_rate_limited: boolean
+  rate_limit_reset_at?: string
+  rate_limit_remaining_sec?: number
+  is_overloaded: boolean
+  overload_until?: string
+  overload_remaining_sec?: number
+  has_error: boolean
+  error_message?: string
+}
+
+/** Response payload of `GET /admin/ops/account-availability`, as returned by `getAccountAvailabilityStats`. */
+export interface OpsAccountAvailabilityStatsResponse {
+  enabled: boolean
+  platform: Record<string, PlatformAvailability>
+  group: Record<string, GroupAvailability>
+  account: Record<string, AccountAvailability>
+  timestamp?: string
+}
+
+/**
+ * Fetch account availability statistics from `GET /admin/ops/account-availability`.
+ *
+ * @param platform - Sent as the `platform` query parameter only when truthy.
+ * @param groupId - Sent as the `group_id` query parameter only when it is a number greater than 0.
+ * @returns The `data` payload of the response.
+ */
+export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
+  // Filters that are not set are left out of the query object entirely.
+  const query = {
+    ...(platform ? { platform } : {}),
+    ...(typeof groupId === 'number' && groupId > 0 ? { group_id: groupId } : {})
+  }
+  const response = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', {
+    params: query
+  })
+  return response.data
+}
+
+/**
+ * Options for `subscribeQPS`, which subscribes to realtime QPS updates via WebSocket.
+ *
+ * Note: browsers cannot set Authorization headers for WebSockets.
+ * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
+ *   ["sub2api-admin", "jwt.<token>"]
+ */
+export interface SubscribeQPSOptions {
+  // Admin token; when null/undefined, `subscribeQPS` reads `auth_token` from localStorage instead.
+  token?: string | null
+  // Invoked after the socket opens.
+  onOpen?: () => void
+  // Invoked with the close event each time a socket closes.
+  onClose?: (event: CloseEvent) => void
+  // Invoked with the socket's error event.
+  onError?: (event: Event) => void
+  /**
+   * Called when the server closes with an application close code that indicates
+   * reconnecting is not useful (e.g. feature flag disabled).
+   */
+  onFatalClose?: (event: CloseEvent) => void
+  /**
+   * More granular status updates for UI (connecting/reconnecting/offline/etc).
+   */
+  onStatusChange?: (status: OpsWSStatus) => void
+  /**
+   * Called when a reconnect is scheduled (helps display "retry in Xs").
+   */
+  onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
+  // Host (and optional port) of the WebSocket endpoint; falls back to VITE_WS_BASE_URL, then window.location.host.
+  wsBaseUrl?: string
+  /**
+   * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
+   * Set to 0 to disable reconnect.
+   *
+   * NOTE(review): `subscribeQPS` only enforces this limit after the first successful
+   * connection; until then it keeps retrying regardless of this value.
+   */
+  maxReconnectAttempts?: number
+  // First backoff delay in milliseconds; `subscribeQPS` defaults it to 1000 and doubles it per attempt.
+  reconnectBaseDelayMs?: number
+  // Upper bound for the backoff delay (before jitter) in milliseconds; defaults to 30000.
+  reconnectMaxDelayMs?: number
+  /**
+   * Stale connection detection (heartbeat-by-observation).
+   * If no messages are received within this window, the socket is closed to trigger a reconnect.
+   * Set to 0 to disable. Defaults to 120000 ms.
+   */
+  staleTimeoutMs?: number
+  /**
+   * How often to check staleness. Only used when `staleTimeoutMs > 0`. Defaults to 30000 ms.
+   */
+  staleCheckIntervalMs?: number
+}
+
+/** Connection states reported through `SubscribeQPSOptions.onStatusChange`. */
+export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
+
+/**
+ * Application-level WebSocket close codes.
+ * On `REALTIME_DISABLED`, `subscribeQPS` stops reconnecting and calls `onFatalClose`.
+ */
+export const OPS_WS_CLOSE_CODES = {
+  REALTIME_DISABLED: 4001
+} as const
+
+// First entry of the subprotocol list passed to the WebSocket constructor; the token entry follows it.
+const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
+
+/**
+ * Open a self-reconnecting WebSocket subscription to the realtime QPS feed.
+ *
+ * The socket targets `/api/v1/admin/ops/ws/qps` on `options.wsBaseUrl`, falling back to
+ * `VITE_WS_BASE_URL` and then `window.location.host`. The admin token is sent as a
+ * `jwt.<token>` subprotocol entry instead of a URL parameter. Each message is JSON-parsed and
+ * passed to `onMessage`; a message that fails to parse (or whose handler throws) is logged and dropped.
+ *
+ * Reconnects use exponential backoff plus jitter, pause while the browser reports being offline,
+ * and stop permanently when the server closes with `OPS_WS_CLOSE_CODES.REALTIME_DISABLED`.
+ *
+ * @param onMessage - Receives every successfully parsed message payload.
+ * @param options - Callbacks and tuning knobs; see `SubscribeQPSOptions`.
+ * @returns A function that stops reconnecting, removes the online/offline listeners,
+ *          clears all timers, closes the socket and reports status `closed`.
+ * @throws Whatever `new URL` / `new WebSocket` throws when the very first connection cannot be
+ *         constructed; in that case no listeners or timers are left behind.
+ */
+export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
+  let ws: WebSocket | null = null
+  let reconnectAttempts = 0
+  const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
+    ? (options.maxReconnectAttempts as number)
+    : Infinity
+  const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
+  const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
+  let reconnectTimer: ReturnType<typeof setTimeout> | null = null
+  let shouldReconnect = true
+  let isConnecting = false
+  let hasConnectedOnce = false
+  let lastMessageAt = 0
+  const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
+  const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
+  let staleTimer: ReturnType<typeof setInterval> | null = null
+
+  const setStatus = (status: OpsWSStatus) => {
+    options.onStatusChange?.(status)
+  }
+
+  const clearReconnectTimer = () => {
+    if (reconnectTimer) {
+      clearTimeout(reconnectTimer)
+      reconnectTimer = null
+    }
+  }
+
+  const clearStaleTimer = () => {
+    if (staleTimer) {
+      clearInterval(staleTimer)
+      staleTimer = null
+    }
+  }
+
+  const startStaleTimer = () => {
+    clearStaleTimer()
+    if (!staleTimeoutMs || staleTimeoutMs <= 0) return
+    staleTimer = setInterval(() => {
+      if (!shouldReconnect) return
+      if (!ws || ws.readyState !== WebSocket.OPEN) return
+      if (!lastMessageAt) return
+      const ageMs = Date.now() - lastMessageAt
+      if (ageMs > staleTimeoutMs) {
+        // Treat as a half-open connection; closing triggers the normal reconnect path.
+        ws.close()
+      }
+    }, staleCheckIntervalMs)
+  }
+
+  const scheduleReconnect = () => {
+    if (!shouldReconnect) return
+    // The attempt limit only applies once a connection has succeeded at least once.
+    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
+
+    // If we're offline, wait for the browser to come back online.
+    if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
+      setStatus('offline')
+      return
+    }
+
+    const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
+    const delay = Math.min(expDelay, maxDelayMs)
+    // Small random offset so many clients do not reconnect in lockstep.
+    const jitter = Math.floor(Math.random() * 250)
+    clearReconnectTimer()
+    reconnectTimer = setTimeout(() => {
+      reconnectAttempts++
+      connectOrReschedule()
+    }, delay + jitter)
+    options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
+  }
+
+  const handleOnline = () => {
+    if (!shouldReconnect) return
+    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
+    connectOrReschedule()
+  }
+
+  const handleOffline = () => {
+    setStatus('offline')
+  }
+
+  const connect = () => {
+    if (!shouldReconnect) return
+    if (isConnecting) return
+    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
+    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
+
+    isConnecting = true
+    setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
+
+    let socket: WebSocket
+    try {
+      const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
+      const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
+      const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
+
+      // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
+      // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
+      // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
+      const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
+      const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
+      if (rawToken) protocols.push(`jwt.${rawToken}`)
+
+      socket = new WebSocket(wsURL.toString(), protocols)
+    } catch (err) {
+      // URL/WebSocket construction can throw synchronously (malformed base URL, invalid
+      // subprotocol, storage access denied). Release the guard so later attempts are not
+      // blocked forever, then let the caller decide how to recover.
+      isConnecting = false
+      throw err
+    }
+    ws = socket
+
+    socket.onopen = () => {
+      reconnectAttempts = 0
+      isConnecting = false
+      hasConnectedOnce = true
+      clearReconnectTimer()
+      lastMessageAt = Date.now()
+      startStaleTimer()
+      setStatus('connected')
+      options.onOpen?.()
+    }
+
+    socket.onmessage = (e) => {
+      try {
+        const data = JSON.parse(e.data)
+        lastMessageAt = Date.now()
+        onMessage(data)
+      } catch (err) {
+        console.warn('[OpsWS] Failed to parse message:', err)
+      }
+    }
+
+    socket.onerror = (error) => {
+      console.error('[OpsWS] Connection error:', error)
+      options.onError?.(error)
+    }
+
+    socket.onclose = (event) => {
+      // A socket that is still CLOSING can be replaced by a new one (e.g. the `online`
+      // handler fires first). Its late close event must not touch the replacement's state.
+      const isCurrent = ws === socket
+      if (isCurrent) isConnecting = false
+      options.onClose?.(event)
+      if (!isCurrent) return
+
+      clearStaleTimer()
+      ws = null
+
+      // If the server explicitly tells us to stop reconnecting, honor it.
+      if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
+        shouldReconnect = false
+        clearReconnectTimer()
+        setStatus('closed')
+        options.onFatalClose?.(event)
+        return
+      }
+
+      scheduleReconnect()
+    }
+  }
+
+  // Used from asynchronous entry points (timer, `online` event) where a thrown error
+  // would otherwise go unhandled and silently end the reconnect loop.
+  const connectOrReschedule = () => {
+    try {
+      connect()
+    } catch (err) {
+      console.error('[OpsWS] Failed to open connection:', err)
+      scheduleReconnect()
+    }
+  }
+
+  window.addEventListener('online', handleOnline)
+  window.addEventListener('offline', handleOffline)
+  try {
+    connect()
+  } catch (err) {
+    // The caller never receives an unsubscribe handle when the first attempt throws,
+    // so undo the listener registration before propagating the error.
+    shouldReconnect = false
+    window.removeEventListener('online', handleOnline)
+    window.removeEventListener('offline', handleOffline)
+    throw err
+  }
+
+  return () => {
+    shouldReconnect = false
+    window.removeEventListener('online', handleOnline)
+    window.removeEventListener('offline', handleOffline)
+    clearReconnectTimer()
+    clearStaleTimer()
+    if (ws) ws.close()
+    ws = null
+    setStatus('closed')
+  }
+}
+
+/** Severity label; left as an open string. */
+export type OpsSeverity = string
+/** Processing phase label; left as an open string. */
+export type OpsPhase = string
+
+/** Closed set of alert severities; used by `EmailNotificationConfig.alert.min_severity`. */
+export type AlertSeverity = 'critical' | 'warning' | 'info'
+// NOTE(review): not referenced by any other declaration visible in this module.
+export type ThresholdMode = 'count' | 'percentage' | 'both'
+/** Metrics an `AlertRule` can be evaluated against. */
+export type MetricType =
+  | 'success_rate'
+  | 'error_rate'
+  | 'upstream_error_rate'
+  | 'p95_latency_ms'
+  | 'p99_latency_ms'
+  | 'cpu_usage_percent'
+  | 'memory_usage_percent'
+  | 'concurrency_queue_depth'
+/** Comparison operators available to an `AlertRule`. */
+export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
+
+/** Alert rule as exchanged with the `/admin/ops/alert-rules` endpoints. */
+export interface AlertRule {
+  id?: number
+  name: string
+  description?: string
+  enabled: boolean
+  metric_type: MetricType
+  operator: Operator
+  threshold: number
+  window_minutes: number
+  sustained_minutes: number
+  // NOTE(review): typed as the open `OpsSeverity` string rather than the closed `AlertSeverity` union.
+  severity: OpsSeverity
+  cooldown_minutes: number
+  notify_email: boolean
+  filters?: Record<string, any>
+  created_at?: string
+  updated_at?: string
+  last_triggered_at?: string | null
+}
+
+/** Alert event as returned by `listAlertEvents` (`GET /admin/ops/alert-events`). */
+export interface AlertEvent {
+  id: number
+  rule_id: number
+  severity: OpsSeverity | string
+  // The trailing `| string` keeps the type open: values other than the two listed literals are accepted.
+  status: 'firing' | 'resolved' | string
+  title?: string
+  description?: string
+  metric_value?: number
+  threshold_value?: number
+  dimensions?: Record<string, any>
+  fired_at: string
+  resolved_at?: string | null
+  email_sent: boolean
+  created_at: string
+}
+
+/** Email notification settings exchanged with `/admin/ops/email-notification/config`. */
+export interface EmailNotificationConfig {
+  alert: {
+    enabled: boolean
+    recipients: string[]
+    // The empty string is an allowed value in addition to the three severities.
+    min_severity: AlertSeverity | ''
+    rate_limit_per_hour: number
+    batching_window_seconds: number
+    include_resolved_alerts: boolean
+  }
+  report: {
+    enabled: boolean
+    recipients: string[]
+    daily_summary_enabled: boolean
+    // NOTE(review): format of the `*_schedule` strings (presumably cron-like) is not visible here; confirm.
+    daily_summary_schedule: string
+    weekly_summary_enabled: boolean
+    weekly_summary_schedule: string
+    error_digest_enabled: boolean
+    error_digest_schedule: string
+    error_digest_min_count: number
+    account_health_enabled: boolean
+    account_health_schedule: string
+    account_health_error_rate_threshold: number
+  }
+}
+
+/** Distributed lock settings nested in `OpsAlertRuntimeSettings.distributed_lock`. */
+export interface OpsDistributedLockSettings {
+  enabled: boolean
+  key: string
+  ttl_seconds: number
+}
+
+/** Alert runtime settings exchanged with `/admin/ops/runtime/alert`. */
+export interface OpsAlertRuntimeSettings {
+  evaluation_interval_seconds: number
+  distributed_lock: OpsDistributedLockSettings
+  silencing: {
+    enabled: boolean
+    // Timestamp fields are named `*_rfc3339`; NOTE(review): presumably RFC 3339 strings, confirm with the backend.
+    global_until_rfc3339: string
+    global_reason: string
+    entries?: Array<{
+      rule_id?: number
+      severities?: Array<OpsSeverity | string>
+      until_rfc3339: string
+      reason: string
+    }>
+  }
+}
+
+/** One row of the paginated error log list returned by `listErrorLogs`. */
+export interface OpsErrorLog {
+  id: number
+  created_at: string
+  phase: OpsPhase
+  type: string
+  severity: OpsSeverity
+  status_code: number
+  platform: string
+  model: string
+  latency_ms?: number | null
+  client_request_id: string
+  request_id: string
+  message: string
+
+  user_id?: number | null
+  api_key_id?: number | null
+  account_id?: number | null
+  group_id?: number | null
+
+  client_ip?: string | null
+  request_path?: string
+  stream?: boolean
+}
+
+/** Full error record returned by `getErrorLogDetail`; extends the list row with bodies and latency breakdown. */
+export interface OpsErrorDetail extends OpsErrorLog {
+  error_body: string
+  user_agent: string
+
+  auth_latency_ms?: number | null
+  routing_latency_ms?: number | null
+  upstream_latency_ms?: number | null
+  response_latency_ms?: number | null
+  time_to_first_token_ms?: number | null
+
+  request_body: string
+  request_body_truncated: boolean
+  request_body_bytes?: number | null
+
+  is_business_limited: boolean
+}
+
+/** Paginated wrapper around `OpsErrorLog` rows. */
+export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
+
+/**
+ * Fetch the ops dashboard overview.
+ *
+ * @param params - Query parameters forwarded unchanged to `GET /admin/ops/dashboard/overview`.
+ * @param options - Optional request settings; `options.signal` is passed through as the abort signal.
+ * @returns The `data` payload of the response.
+ */
+export async function getDashboardOverview(
+  params: {
+    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+    start_time?: string
+    end_time?: string
+    platform?: string
+    group_id?: number | null
+    mode?: OpsQueryMode
+  },
+  options: OpsRequestOptions = {}
+): Promise<OpsDashboardOverview> {
+  const { signal } = options
+  const response = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', { params, signal })
+  return response.data
+}
+
+/**
+ * Fetch the throughput trend series.
+ *
+ * @param params - Query parameters forwarded unchanged to `GET /admin/ops/dashboard/throughput-trend`.
+ * @param options - Optional request settings; `options.signal` is passed through as the abort signal.
+ * @returns The `data` payload of the response.
+ */
+export async function getThroughputTrend(
+  params: {
+    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+    start_time?: string
+    end_time?: string
+    platform?: string
+    group_id?: number | null
+    mode?: OpsQueryMode
+  },
+  options: OpsRequestOptions = {}
+): Promise<OpsThroughputTrendResponse> {
+  const { signal } = options
+  const response = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
+    params,
+    signal
+  })
+  return response.data
+}
+
+/**
+ * Fetch the latency histogram.
+ *
+ * @param params - Query parameters forwarded unchanged to `GET /admin/ops/dashboard/latency-histogram`.
+ * @param options - Optional request settings; `options.signal` is passed through as the abort signal.
+ * @returns The `data` payload of the response.
+ */
+export async function getLatencyHistogram(
+  params: {
+    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+    start_time?: string
+    end_time?: string
+    platform?: string
+    group_id?: number | null
+    mode?: OpsQueryMode
+  },
+  options: OpsRequestOptions = {}
+): Promise<OpsLatencyHistogramResponse> {
+  const { signal } = options
+  const response = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
+    params,
+    signal
+  })
+  return response.data
+}
+
+/**
+ * Fetch the error trend series.
+ *
+ * @param params - Query parameters forwarded unchanged to `GET /admin/ops/dashboard/error-trend`.
+ * @param options - Optional request settings; `options.signal` is passed through as the abort signal.
+ * @returns The `data` payload of the response.
+ */
+export async function getErrorTrend(
+  params: {
+    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+    start_time?: string
+    end_time?: string
+    platform?: string
+    group_id?: number | null
+    mode?: OpsQueryMode
+  },
+  options: OpsRequestOptions = {}
+): Promise<OpsErrorTrendResponse> {
+  const { signal } = options
+  const response = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', { params, signal })
+  return response.data
+}
+
+/**
+ * Fetch the error distribution by status code.
+ *
+ * @param params - Query parameters forwarded unchanged to `GET /admin/ops/dashboard/error-distribution`.
+ * @param options - Optional request settings; `options.signal` is passed through as the abort signal.
+ * @returns The `data` payload of the response.
+ */
+export async function getErrorDistribution(
+  params: {
+    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
+    start_time?: string
+    end_time?: string
+    platform?: string
+    group_id?: number | null
+    mode?: OpsQueryMode
+  },
+  options: OpsRequestOptions = {}
+): Promise<OpsErrorDistributionResponse> {
+  const { signal } = options
+  const response = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
+    params,
+    signal
+  })
+  return response.data
+}
+
+/**
+ * List ops error logs (paginated).
+ *
+ * @param params - Filters and paging values forwarded unchanged as the query string of `GET /admin/ops/errors`.
+ * @returns The `data` payload of the response.
+ */
+export async function listErrorLogs(params: {
+  page?: number
+  page_size?: number
+  time_range?: string
+  start_time?: string
+  end_time?: string
+  platform?: string
+  group_id?: number | null
+  account_id?: number | null
+  phase?: string
+  q?: string
+  status_codes?: string
+}): Promise<OpsErrorLogsResponse> {
+  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
+  return response.data
+}
+
+/**
+ * Fetch the full detail record of one ops error log entry.
+ *
+ * @param id - Identifier interpolated into the `/admin/ops/errors/{id}` path.
+ * @returns The `data` payload of the response.
+ */
+export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
+  const endpoint = `/admin/ops/errors/${id}`
+  const response = await apiClient.get<OpsErrorDetail>(endpoint)
+  return response.data
+}
+
+/**
+ * Ask the backend to retry the request behind an error log entry.
+ *
+ * @param id - Identifier interpolated into the `/admin/ops/errors/{id}/retry` path.
+ * @param req - Retry request sent as the POST body.
+ * @returns The `data` payload of the response.
+ */
+export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
+  const endpoint = `/admin/ops/errors/${id}/retry`
+  const response = await apiClient.post<OpsRetryResult>(endpoint, req)
+  return response.data
+}
+
+/**
+ * List request details (paginated).
+ *
+ * @param params - Filters, sort and paging values forwarded unchanged as the query string of `GET /admin/ops/requests`.
+ * @returns The `data` payload of the response.
+ */
+export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
+  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
+  return response.data
+}
+
+// Alert rules
+/**
+ * Fetch all alert rules from `GET /admin/ops/alert-rules`.
+ *
+ * @returns The `data` payload of the response.
+ */
+export async function listAlertRules(): Promise<AlertRule[]> {
+  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
+  return response.data
+}
+
+/**
+ * Create an alert rule via `POST /admin/ops/alert-rules`.
+ *
+ * @param rule - Rule sent as the request body.
+ * @returns The `data` payload of the response.
+ */
+export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
+  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
+  return response.data
+}
+
+/**
+ * Update an alert rule via `PUT /admin/ops/alert-rules/{id}`.
+ *
+ * @param id - Identifier interpolated into the request path.
+ * @param rule - Fields to send as the request body; any subset of `AlertRule`.
+ * @returns The `data` payload of the response.
+ */
+export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
+  const endpoint = `/admin/ops/alert-rules/${id}`
+  const response = await apiClient.put<AlertRule>(endpoint, rule)
+  return response.data
+}
+
+/**
+ * Delete an alert rule via `DELETE /admin/ops/alert-rules/{id}`.
+ *
+ * @param id - Identifier interpolated into the request path.
+ */
+export async function deleteAlertRule(id: number): Promise<void> {
+  const endpoint = `/admin/ops/alert-rules/${id}`
+  await apiClient.delete(endpoint)
+}
+
+/**
+ * Fetch alert events from `GET /admin/ops/alert-events`.
+ *
+ * @param limit - Sent as the `limit` query parameter; defaults to 100.
+ * @returns The `data` payload of the response.
+ */
+export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
+  const query = { limit }
+  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: query })
+  return response.data
+}
+
+// Email notification config
+/**
+ * Fetch the email notification configuration from `GET /admin/ops/email-notification/config`.
+ *
+ * @returns The `data` payload of the response.
+ */
+export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
+  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
+  return response.data
+}
+
+/**
+ * Save the email notification configuration via `PUT /admin/ops/email-notification/config`.
+ *
+ * @param config - Configuration sent as the request body.
+ * @returns The `data` payload of the response.
+ */
+export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
+  const response = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
+  return response.data
+}
+
+// Runtime settings (DB-backed)
+/**
+ * Fetch the alert runtime settings from `GET /admin/ops/runtime/alert`.
+ *
+ * @returns The `data` payload of the response.
+ */
+export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
+  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
+  return response.data
+}
+
+/**
+ * Save the alert runtime settings via `PUT /admin/ops/runtime/alert`.
+ *
+ * @param config - Settings sent as the request body.
+ * @returns The `data` payload of the response.
+ */
+export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
+  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
+  return response.data
+}
+
+/**
+ * Aggregate of every ops endpoint function defined in this module.
+ * Exported both by name and as the module's default export.
+ */
+export const opsAPI = {
+  // Dashboard queries
+  getDashboardOverview,
+  getThroughputTrend,
+  getLatencyHistogram,
+  getErrorTrend,
+  getErrorDistribution,
+  // Live state: concurrency, account availability, realtime QPS socket
+  getConcurrencyStats,
+  getAccountAvailabilityStats,
+  subscribeQPS,
+  // Error logs and request details
+  listErrorLogs,
+  getErrorLogDetail,
+  retryErrorRequest,
+  listRequestDetails,
+  // Alert rules and events
+  listAlertRules,
+  createAlertRule,
+  updateAlertRule,
+  deleteAlertRule,
+  listAlertEvents,
+  // Notification and runtime configuration
+  getEmailNotificationConfig,
+  updateEmailNotificationConfig,
+  getAlertRuntimeSettings,
+  updateAlertRuntimeSettings
+}
+
+export default opsAPI
--- a/frontend/src/api/admin/settings.ts
+++ b/frontend/src/api/admin/settings.ts
@@ -34,9 +34,22 @@ export interface SystemSettings {
  turnstile_enabled: boolean
  turnstile_site_key: string
  turnstile_secret_key_configured: boolean
+
+  // Model fallback configuration
+  enable_model_fallback: boolean
+  fallback_model_anthropic: string
+  fallback_model_openai: string
+  fallback_model_gemini: string
+  fallback_model_antigravity: string
+
  // Identity patch configuration (Claude -> Gemini)
  enable_identity_patch: boolean
  identity_patch_prompt: string
+
+  // Ops Monitoring (vNext)
+  ops_monitoring_enabled: boolean
+  ops_realtime_monitoring_enabled: boolean
+  ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
 }

 export interface UpdateSettingsRequest {
@@ -60,8 +73,16 @@ export interface UpdateSettingsRequest {
  turnstile_enabled?: boolean
  turnstile_site_key?: string
  turnstile_secret_key?: string
+  enable_model_fallback?: boolean
+  fallback_model_anthropic?: string
+  fallback_model_openai?: string
+  fallback_model_gemini?: string
+  fallback_model_antigravity?: string
  enable_identity_patch?: boolean
  identity_patch_prompt?: string
+  ops_monitoring_enabled?: boolean
+  ops_realtime_monitoring_enabled?: boolean
+  ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
 }

 /**
--- a/frontend/src/api/client.ts
+++ b/frontend/src/api/client.ts
@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
    return response
  },
  (error: AxiosError<ApiResponse<unknown>>) => {
+    // Request cancellation: keep the original axios cancellation error so callers can ignore it.
+    // Otherwise we'd misclassify it as a generic "network error".
+    if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
+      return Promise.reject(error)
+    }
+
    // Handle common errors
    if (error.response) {
      const { status, data } = error.response
+      const url = String(error.config?.url || '')
+
+      // Validate `data` shape to avoid HTML error pages breaking our error handling.
+      const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
+
+      // Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
+      // from ops pages to avoid broken UI states.
+      if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
+        try {
+          localStorage.setItem('ops_monitoring_enabled_cached', 'false')
+        } catch {
+          // ignore localStorage failures
+        }
+        try {
+          window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
+        } catch {
+          // ignore event failures
+        }
+
+        if (window.location.pathname.startsWith('/admin/ops')) {
+          window.location.href = '/admin/settings'
+        }
+
+        return Promise.reject({
+          status,
+          code: 'OPS_DISABLED',
+          message: apiData.message || error.message,
+          url
+        })
+      }

      // 401: Unauthorized - clear token and redirect to login
      if (status === 401) {
@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
      // Return structured error
      return Promise.reject({
        status,
-        code: data?.code,
-        message: data?.message || error.message
+        code: apiData.code,
+        message: apiData.message || apiData.detail || error.message
      })
    }