Merge pull request #285 from IanShaw027/fix/ops-bug

feat(ops): 增强错误日志管理、告警静默和前端 UI 优化
2026-01-15 11:26:16 +08:00
parent 28de614dfb 5354ba3662
commit 27214f8657
42 changed files with 4039 additions and 1065 deletions
--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -129,6 +129,8 @@ export default {
    all: 'All',
    none: 'None',
    noData: 'No data',
+    expand: 'Expand',
+    collapse: 'Collapse',
    success: 'Success',
    error: 'Error',
    critical: 'Critical',
@@ -150,12 +152,13 @@ export default {
    invalidEmail: 'Please enter a valid email address',
    optional: 'optional',
    selectOption: 'Select an option',
-    searchPlaceholder: 'Search...', 
-        noOptionsFound: 'No options found',
-        noGroupsAvailable: 'No groups available',
-        unknownError: 'Unknown error occurred',
-        saving: 'Saving...', 
-        selectedCount: '({count} selected)',    refresh: 'Refresh',
+    searchPlaceholder: 'Search...',
+    noOptionsFound: 'No options found',
+    noGroupsAvailable: 'No groups available',
+    unknownError: 'Unknown error occurred',
+    saving: 'Saving...',
+    selectedCount: '({count} selected)',
+    refresh: 'Refresh',
    settings: 'Settings',
    notAvailable: 'N/A',
    now: 'Now',
@@ -1882,10 +1885,8 @@ export default {
      noSystemMetrics: 'No system metrics collected yet.',
      collectedAt: 'Collected at:',
      window: 'window',
-      cpu: 'CPU',
      memory: 'Memory',
      db: 'DB',
-      redis: 'Redis',
      goroutines: 'Goroutines',
      jobs: 'Jobs',
      jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
@@ -1911,7 +1912,7 @@ export default {
      totalRequests: 'Total Requests',
      avgQps: 'Avg QPS',
      avgTps: 'Avg TPS',
-      avgLatency: 'Avg Latency',
+      avgLatency: 'Avg Request Duration',
      avgTtft: 'Avg TTFT',
      exceptions: 'Exceptions',
      requestErrors: 'Request Errors',
@@ -1923,7 +1924,7 @@ export default {
      errors: 'Errors',
      errorRate: 'error_rate:',
      upstreamRate: 'upstream_rate:',
-      latencyDuration: 'Latency (duration_ms)',
+      latencyDuration: 'Request Duration (ms)',
      ttftLabel: 'TTFT (first_token_ms)',
      p50: 'p50:',
      p90: 'p90:',
@@ -1931,7 +1932,6 @@ export default {
      p99: 'p99:',
      avg: 'avg:',
      max: 'max:',
-      qps: 'QPS',
      requests: 'Requests',
      requestsTitle: 'Requests',
      upstream: 'Upstream',
@@ -1943,7 +1943,7 @@ export default {
      failedToLoadData: 'Failed to load ops data.',
      failedToLoadOverview: 'Failed to load overview',
      failedToLoadThroughputTrend: 'Failed to load throughput trend',
-      failedToLoadLatencyHistogram: 'Failed to load latency histogram',
+      failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
      failedToLoadErrorTrend: 'Failed to load error trend',
      failedToLoadErrorDistribution: 'Failed to load error distribution',
      failedToLoadErrorDetail: 'Failed to load error detail',
@@ -1951,7 +1951,7 @@ export default {
      tpsK: 'TPS (K)',
      top: 'Top:',
      throughputTrend: 'Throughput Trend',
-      latencyHistogram: 'Latency Histogram',
+      latencyHistogram: 'Request Duration Histogram',
      errorTrend: 'Error Trend',
      errorDistribution: 'Error Distribution',
      // Health Score & Diagnosis
@@ -1966,7 +1966,9 @@ export default {
        '30m': 'Last 30 minutes',
        '1h': 'Last 1 hour',
        '6h': 'Last 6 hours',
-        '24h': 'Last 24 hours'
+        '24h': 'Last 24 hours',
+        '7d': 'Last 7 days',
+        '30d': 'Last 30 days'
      },
      fullscreen: {
        enter: 'Enter Fullscreen'
@@ -1995,14 +1997,7 @@ export default {
        memoryHigh: 'Memory usage elevated ({usage}%)',
        memoryHighImpact: 'Memory pressure is high, needs attention',
        memoryHighAction: 'Monitor memory trends, check for memory leaks',
-        // Latency diagnostics
-        latencyCritical: 'Response latency critically high ({latency}ms)',
-        latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
-        latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
-        latencyHigh: 'Response latency elevated ({latency}ms)',
-        latencyHighImpact: 'User experience degraded, needs optimization',
-        latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
-        ttftHigh: 'Time to first byte elevated ({ttft}ms)',
+        ttftHigh: 'Time to first token elevated ({ttft}ms)',
        ttftHighImpact: 'User perceived latency increased',
        ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
        // Error rate diagnostics
@@ -2038,27 +2033,106 @@ export default {
      // Error Log
      errorLog: {
        timeId: 'Time / ID',
+        commonErrors: {
+          contextDeadlineExceeded: 'context deadline exceeded',
+          connectionRefused: 'connection refused',
+          rateLimit: 'rate limit'
+        },
+        time: 'Time',
+        type: 'Type',
        context: 'Context',
+        platform: 'Platform',
+        model: 'Model',
+        group: 'Group',
+        user: 'User',
+        userId: 'User ID',
+        account: 'Account',
+        accountId: 'Account ID',
        status: 'Status',
        message: 'Message',
-        latency: 'Latency',
+        latency: 'Request Duration',
        action: 'Action',
        noErrors: 'No errors in this window.',
        grp: 'GRP:',
        acc: 'ACC:',
        details: 'Details',
-        phase: 'Phase'
+        phase: 'Phase',
+        id: 'ID:',
+        typeUpstream: 'Upstream',
+        typeRequest: 'Request',
+        typeAuth: 'Auth',
+        typeRouting: 'Routing',
+        typeInternal: 'Internal'
      },
      // Error Details Modal
      errorDetails: {
        upstreamErrors: 'Upstream Errors',
        requestErrors: 'Request Errors',
+        unresolved: 'Unresolved',
+        resolved: 'Resolved',
+        viewErrors: 'Errors',
+        viewExcluded: 'Excluded',
+        statusCodeOther: 'Other',
+        owner: {
+          provider: 'Provider',
+          client: 'Client',
+          platform: 'Platform'
+        },
+        phase: {
+          request: 'Request',
+          auth: 'Auth',
+          routing: 'Routing',
+          upstream: 'Upstream',
+          network: 'Network',
+          internal: 'Internal'
+        },
        total: 'Total:',
        searchPlaceholder: 'Search request_id / client_request_id / message',
-        accountIdPlaceholder: 'account_id'
      },
      // Error Detail Modal
      errorDetail: {
+        title: 'Error Detail',
+        titleWithId: 'Error #{id}',
+        noErrorSelected: 'No error selected.',
+        resolution: 'Resolved:',
+        pinnedToOriginalAccountId: 'Pinned to original account_id',
+        missingUpstreamRequestBody: 'Missing upstream request body',
+        failedToLoadRetryHistory: 'Failed to load retry history',
+        failedToUpdateResolvedStatus: 'Failed to update resolved status',
+        unsupportedRetryMode: 'Unsupported retry mode',
+        classificationKeys: {
+          phase: 'Phase',
+          owner: 'Owner',
+          source: 'Source',
+          retryable: 'Retryable',
+          resolvedAt: 'Resolved At',
+          resolvedBy: 'Resolved By',
+          resolvedRetryId: 'Resolved Retry',
+          retryCount: 'Retry Count'
+        },
+        source: {
+          upstream_http: 'Upstream HTTP'
+        },
+        upstreamKeys: {
+          status: 'Status',
+          message: 'Message',
+          detail: 'Detail',
+          upstreamErrors: 'Upstream Errors'
+        },
+        upstreamEvent: {
+          account: 'Account',
+          status: 'Status',
+          requestId: 'Request ID'
+        },
+        responsePreview: {
+          expand: 'Response (click to expand)',
+          collapse: 'Response (click to collapse)'
+        },
+        retryMeta: {
+          used: 'Used',
+          success: 'Success',
+          pinned: 'Pinned'
+        },
        loading: 'Loading…',
        requestId: 'Request ID',
        time: 'Time',
@@ -2068,8 +2142,10 @@ export default {
        basicInfo: 'Basic Info',
        platform: 'Platform',
        model: 'Model',
-        latency: 'Latency',
-        ttft: 'TTFT',
+        group: 'Group',
+        user: 'User',
+        account: 'Account',
+        latency: 'Request Duration',
        businessLimited: 'Business Limited',
        requestPath: 'Request Path',
        timings: 'Timings',
@@ -2077,6 +2153,8 @@ export default {
        routing: 'Routing',
        upstream: 'Upstream',
        response: 'Response',
+        classification: 'Classification',
+        notRetryable: 'Not recommended to retry',
        retry: 'Retry',
        retryClient: 'Retry (Client)',
        retryUpstream: 'Retry (Upstream pinned)',
@@ -2088,7 +2166,6 @@ export default {
        confirmRetry: 'Confirm Retry',
        retrySuccess: 'Retry succeeded',
        retryFailed: 'Retry failed',
-        na: 'N/A',
        retryHint: 'Retry will resend the request with the same parameters',
        retryClientHint: 'Use client retry (no account pinning)',
        retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
@@ -2096,8 +2173,33 @@ export default {
        retryNote1: 'Retry will use the same request body and parameters',
        retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
        retryNote3: 'Client retry will reselect an account',
+        retryNote4: 'You can force retry for non-retryable errors, but it is not recommended',
        confirmRetryMessage: 'Confirm retry this request?',
-        confirmRetryHint: 'Will resend with the same request parameters'
+        confirmRetryHint: 'Will resend with the same request parameters',
+        forceRetry: 'I understand and want to force retry',
+        forceRetryHint: 'This error usually cannot be fixed by retry; check to proceed',
+        forceRetryNeedAck: 'Please check to force retry',
+        markResolved: 'Mark resolved',
+        markUnresolved: 'Mark unresolved',
+        viewRetries: 'Retry history',
+        retryHistory: 'Retry History',
+        tabOverview: 'Overview',
+        tabRetries: 'Retries',
+        tabRequest: 'Request',
+        tabResponse: 'Response',
+        responseBody: 'Response',
+        compareA: 'Compare A',
+        compareB: 'Compare B',
+        retrySummary: 'Retry Summary',
+        responseHintSucceeded: 'Showing succeeded retry response_preview (#{id})',
+        responseHintFallback: 'No succeeded retry found; showing stored error_body',
+        suggestion: 'Suggestion',
+        suggestUpstreamResolved: '✓ Upstream error resolved by retry; no action needed',
+        suggestUpstream: 'Upstream instability: check account status, consider switching accounts, or retry',
+        suggestRequest: 'Client request error: ask customer to fix request parameters',
+        suggestAuth: 'Auth failed: verify API key/credentials',
+        suggestPlatform: 'Platform error: prioritize investigation and fix',
+        suggestGeneric: 'See details for more context'
      },
      requestDetails: {
        title: 'Request Details',
@@ -2133,13 +2235,46 @@ export default {
        loading: 'Loading...',
        empty: 'No alert events',
        loadFailed: 'Failed to load alert events',
+        status: {
+          firing: 'FIRING',
+          resolved: 'RESOLVED',
+          manualResolved: 'MANUAL RESOLVED'
+        },
+        detail: {
+          title: 'Alert Detail',
+          loading: 'Loading detail...',
+          empty: 'No detail',
+          loadFailed: 'Failed to load alert detail',
+          manualResolve: 'Mark as Resolved',
+          manualResolvedSuccess: 'Marked as manually resolved',
+          manualResolvedFailed: 'Failed to mark as manually resolved',
+          silence: 'Ignore Alert',
+          silenceSuccess: 'Alert silenced',
+          silenceFailed: 'Failed to silence alert',
+          viewRule: 'View Rule',
+          viewLogs: 'View Logs',
+          firedAt: 'Fired At',
+          resolvedAt: 'Resolved At',
+          ruleId: 'Rule ID',
+          dimensions: 'Dimensions',
+          historyTitle: 'History',
+          historyHint: 'Recent events with same rule + dimensions',
+          historyLoading: 'Loading history...',
+          historyEmpty: 'No history'
+        },
        table: {
          time: 'Time',
          status: 'Status',
          severity: 'Severity',
+          platform: 'Platform',
+          ruleId: 'Rule ID',
          title: 'Title',
+          duration: 'Duration',
          metric: 'Metric / Threshold',
-          email: 'Email Sent'
+          dimensions: 'Dimensions',
+          email: 'Email Sent',
+          emailSent: 'Sent',
+          emailIgnored: 'Ignored'
        }
      },
      alertRules: {
@@ -2253,7 +2388,6 @@ export default {
          title: 'Alert Silencing (Maintenance Mode)',
          enabled: 'Enable silencing',
          globalUntil: 'Silence until (RFC3339)',
-          untilPlaceholder: '2026-01-05T00:00:00Z',
          untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
          reason: 'Reason',
          reasonPlaceholder: 'e.g., planned maintenance',
@@ -2293,7 +2427,11 @@ export default {
          lockKeyRequired: 'Distributed lock key is required when lock is enabled',
          lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
          lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
-          lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
+          lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds',
+          slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
+          ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
+          requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
+          upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
        }
      },
      email: {
@@ -2358,8 +2496,6 @@ export default {
        metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red',
        slaMinPercent: 'SLA Minimum Percentage',
        slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
-        latencyP99MaxMs: 'Latency P99 Maximum (ms)',
-        latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
        ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
        ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
        requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
@@ -2378,9 +2514,28 @@ export default {
        aggregation: 'Pre-aggregation Tasks',
        enableAggregation: 'Enable Pre-aggregation',
        aggregationHint: 'Pre-aggregation improves query performance for long time windows',
+        errorFiltering: 'Error Filtering',
+        ignoreCountTokensErrors: 'Ignore count_tokens errors',
+        ignoreCountTokensErrorsHint: 'When enabled, errors from count_tokens requests will not be written to the error log.',
+        ignoreContextCanceled: 'Ignore client disconnect errors',
+        ignoreContextCanceledHint: 'When enabled, client disconnect (context canceled) errors will not be written to the error log.',
+        ignoreNoAvailableAccounts: 'Ignore no available accounts errors',
+        ignoreNoAvailableAccountsHint: 'When enabled, "No available accounts" errors will not be written to the error log (not recommended; usually a config issue).',
+        autoRefresh: 'Auto Refresh',
+        enableAutoRefresh: 'Enable auto refresh',
+        enableAutoRefreshHint: 'Automatically refresh dashboard data at a fixed interval.',
+        refreshInterval: 'Refresh Interval',
+        refreshInterval15s: '15 seconds',
+        refreshInterval30s: '30 seconds',
+        refreshInterval60s: '60 seconds',
+        autoRefreshCountdown: 'Auto refresh: {seconds}s',
        validation: {
          title: 'Please fix the following issues',
-          retentionDaysRange: 'Retention days must be between 1-365 days'
+          retentionDaysRange: 'Retention days must be between 1-365 days',
+          slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
+          ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
+          requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
+          upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
        }
      },
      concurrency: {
@@ -2418,7 +2573,7 @@ export default {
      tooltips: {
        totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
        throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
-        latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
+        latencyHistogram: 'Request duration distribution (ms) for successful requests.',
        errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
        errorDistribution: 'Error distribution by status code.',
        goroutines:
@@ -2433,7 +2588,7 @@ export default {
        sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
        errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
        upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
-        latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
+        latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
        ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
        health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
      },