Merge pull request #254 from IanShaw027/feat/ops-count-tokens-filter-and-auto-refresh

feat(ops): count_tokens 错误过滤和自动刷新功能
2026-01-12 17:31:54 +08:00
parent 4da681f58a 6ad29a470c
commit c206d12d5c
13 changed files with 188 additions and 6 deletions
--- a/backend/internal/handler/ops_error_logger.go
+++ b/backend/internal/handler/ops_error_logger.go
@@ -489,6 +489,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
 				Severity:          classifyOpsSeverity("upstream_error", effectiveUpstreamStatus),
 				StatusCode:        status,
 				IsBusinessLimited: false,
+				IsCountTokens:     isCountTokensRequest(c),

 				ErrorMessage: recoveredMsg,
 				ErrorBody:    "",
@@ -598,6 +599,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
 			Severity:          classifyOpsSeverity(parsed.ErrorType, status),
 			StatusCode:        status,
 			IsBusinessLimited: isBusinessLimited,
+			IsCountTokens:     isCountTokensRequest(c),

 			ErrorMessage: parsed.Message,
 			// Keep the full captured error body (capture is already capped at 64KB) so the
@@ -704,6 +706,14 @@ var opsRetryRequestHeaderAllowlist = []string{
 	"anthropic-version",
 }

+// isCountTokensRequest reports whether the request URL path contains "/count_tokens".
+func isCountTokensRequest(c *gin.Context) bool {
+	if c != nil && c.Request != nil && c.Request.URL != nil {
+		return strings.Contains(c.Request.URL.Path, "/count_tokens")
+	}
+	return false
+}
+
 func extractOpsRetryRequestHeaders(c *gin.Context) *string {
 	if c == nil || c.Request == nil {
 		return nil
--- a/backend/internal/repository/ops_repo.go
+++ b/backend/internal/repository/ops_repo.go
@@ -46,6 +46,7 @@ INSERT INTO ops_error_logs (
  severity,
  status_code,
  is_business_limited,
+  is_count_tokens,
  error_message,
  error_body,
  error_source,
@@ -64,7 +65,7 @@ INSERT INTO ops_error_logs (
  retry_count,
  created_at
 ) VALUES (
-  $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34
+  $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35
 ) RETURNING id`

 	var id int64
@@ -88,6 +89,7 @@ INSERT INTO ops_error_logs (
 		opsNullString(input.Severity),
 		opsNullInt(input.StatusCode),
 		input.IsBusinessLimited,
+		input.IsCountTokens,
 		opsNullString(input.ErrorMessage),
 		opsNullString(input.ErrorBody),
 		opsNullString(input.ErrorSource),
--- a/backend/internal/repository/ops_repo_dashboard.go
+++ b/backend/internal/repository/ops_repo_dashboard.go
@@ -964,8 +964,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
 	}

 	idx := startIndex
-	clauses := make([]string, 0, 4)
-	args = make([]any, 0, 4)
+	clauses := make([]string, 0, 5)
+	args = make([]any, 0, 5)

 	args = append(args, start)
 	clauses = append(clauses, fmt.Sprintf("created_at >= $%d", idx))
@@ -974,6 +974,8 @@ func buildErrorWhere(filter *service.OpsDashboardFilter, start, end time.Time, s
 	clauses = append(clauses, fmt.Sprintf("created_at < $%d", idx))
 	idx++

+	clauses = append(clauses, "is_count_tokens = FALSE")
+
 	if groupID != nil && *groupID > 0 {
 		args = append(args, *groupID)
 		clauses = append(clauses, fmt.Sprintf("group_id = $%d", idx))
--- a/backend/internal/repository/ops_repo_preagg.go
+++ b/backend/internal/repository/ops_repo_preagg.go
@@ -78,7 +78,9 @@ error_base AS (
    status_code AS client_status_code,
    COALESCE(upstream_status_code, status_code, 0) AS effective_status_code
  FROM ops_error_logs
+  -- Exclude count_tokens requests from error metrics as they are informational probes
  WHERE created_at >= $1 AND created_at < $2
+    AND is_count_tokens = FALSE
 ),
 error_agg AS (
  SELECT
--- a/backend/internal/repository/ops_repo_trends.go
+++ b/backend/internal/repository/ops_repo_trends.go
@@ -170,6 +170,7 @@ error_totals AS (
  FROM ops_error_logs
  WHERE created_at >= $1 AND created_at < $2
    AND COALESCE(status_code, 0) >= 400
+    AND is_count_tokens = FALSE  -- 排除 count_tokens 请求的错误
  GROUP BY 1
 ),
 combined AS (
@@ -243,6 +244,7 @@ error_totals AS (
    AND platform = $3
    AND group_id IS NOT NULL
    AND COALESCE(status_code, 0) >= 400
+    AND is_count_tokens = FALSE  -- 排除 count_tokens 请求的错误
  GROUP BY 1
 ),
 combined AS (
--- a/backend/internal/service/ops_port.go
+++ b/backend/internal/service/ops_port.go
@@ -73,6 +73,7 @@ type OpsInsertErrorLogInput struct {
 	Severity          string
 	StatusCode        int
 	IsBusinessLimited bool
+	IsCountTokens     bool // 是否为 count_tokens 请求

 	ErrorMessage string
 	ErrorBody    string
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -368,6 +368,9 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
 		Aggregation: OpsAggregationSettings{
 			AggregationEnabled: false,
 		},
+		IgnoreCountTokensErrors: false,
+		AutoRefreshEnabled:      false,
+		AutoRefreshIntervalSec:  30,
 	}
 }

@@ -388,6 +391,10 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
 	if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
 		cfg.DataRetention.HourlyMetricsRetentionDays = 30
 	}
+	// Normalize auto refresh interval (default 30 seconds)
+	if cfg.AutoRefreshIntervalSec <= 0 {
+		cfg.AutoRefreshIntervalSec = 30
+	}
 }

 func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
@@ -403,6 +410,9 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
 	if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
 		return errors.New("hourly_metrics_retention_days must be between 1 and 365")
 	}
+	if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
+		return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
+	}
 	return nil
 }

--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -79,8 +79,11 @@ type OpsAlertRuntimeSettings struct {

 // OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
 type OpsAdvancedSettings struct {
-	DataRetention OpsDataRetentionSettings `json:"data_retention"`
-	Aggregation   OpsAggregationSettings   `json:"aggregation"`
+	DataRetention           OpsDataRetentionSettings `json:"data_retention"`
+	Aggregation             OpsAggregationSettings   `json:"aggregation"`
+	IgnoreCountTokensErrors bool                     `json:"ignore_count_tokens_errors"`
+	AutoRefreshEnabled      bool                     `json:"auto_refresh_enabled"`
+	AutoRefreshIntervalSec  int                      `json:"auto_refresh_interval_seconds"`
 }

 type OpsDataRetentionSettings struct {
--- a/backend/migrations/036_ops_error_logs_add_is_count_tokens.sql
+++ b/backend/migrations/036_ops_error_logs_add_is_count_tokens.sql
@@ -0,0 +1,16 @@
+-- Migration: add the is_count_tokens column to the ops_error_logs table
+-- Purpose: flag errors raised by count_tokens requests so ops statistics queries can exclude them
+-- Author: System
+-- Date: 2026-01-12
+
+-- Add the column idempotently; existing rows take the FALSE default.
+ALTER TABLE ops_error_logs
+ADD COLUMN IF NOT EXISTS is_count_tokens BOOLEAN NOT NULL DEFAULT FALSE;
+
+-- Describe the column in the catalog.
+COMMENT ON COLUMN ops_error_logs.is_count_tokens IS '是否为 count_tokens 请求的错误（用于统计过滤）';
+
+-- Partial index covering only count_tokens rows (optional; helps lookups with is_count_tokens = TRUE).
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_is_count_tokens
+ON ops_error_logs(is_count_tokens)
+WHERE is_count_tokens = TRUE;
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -734,6 +734,9 @@ export interface OpsAlertRuntimeSettings {
 export interface OpsAdvancedSettings {
  data_retention: OpsDataRetentionSettings
  aggregation: OpsAggregationSettings
+  ignore_count_tokens_errors: boolean
+  auto_refresh_enabled: boolean
+  auto_refresh_interval_seconds: number
 }

 export interface OpsDataRetentionSettings {
--- a/frontend/src/views/admin/ops/OpsDashboard.vue
+++ b/frontend/src/views/admin/ops/OpsDashboard.vue
@@ -20,6 +20,8 @@
        :loading="loading"
        :last-updated="lastUpdated"
        :thresholds="metricThresholds"
+        :auto-refresh-enabled="autoRefreshEnabled"
+        :auto-refresh-countdown="autoRefreshCountdown"
        @update:time-range="onTimeRangeChange"
        @update:platform="onPlatformChange"
        @update:group="onGroupChange"
@@ -104,7 +106,7 @@

 <script setup lang="ts">
 import { computed, onMounted, onUnmounted, ref, watch } from 'vue'
-import { useDebounceFn } from '@vueuse/core'
+import { useDebounceFn, useIntervalFn } from '@vueuse/core'
 import { useI18n } from 'vue-i18n'
 import { useRoute, useRouter } from 'vue-router'
 import AppLayout from '@/components/layout/AppLayout.vue'
@@ -287,6 +289,45 @@ const requestDetailsPreset = ref<OpsRequestDetailsPreset>({
 const showSettingsDialog = ref(false)
 const showAlertRulesCard = ref(false)

+// Auto refresh state; the enabled flag and interval are overwritten by loadAutoRefreshSettings()
+const autoRefreshEnabled = ref(false)
+const autoRefreshIntervalMs = ref(30000) // interval in milliseconds; defaults to 30 seconds
+const autoRefreshCountdown = ref(0)
+
+// Auto refresh timer: created paused; each tick refetches only when auto refresh and ops are enabled and loading is false
+const { pause: pauseAutoRefresh, resume: resumeAutoRefresh } = useIntervalFn(
+  () => {
+    if (autoRefreshEnabled.value && opsEnabled.value && !loading.value) {
+      fetchData()
+    }
+  },
+  autoRefreshIntervalMs,
+  { immediate: false }
+)
+
+// Countdown timer: created paused; ticks once per second and stops decrementing at 0
+const { pause: pauseCountdown, resume: resumeCountdown } = useIntervalFn(
+  () => {
+    if (autoRefreshEnabled.value && autoRefreshCountdown.value > 0) {
+      autoRefreshCountdown.value--
+    }
+  },
+  1000,
+  { immediate: false }
+)
+
+// Fetch the advanced settings and apply the auto refresh flag, interval and countdown; failures are only logged
+async function loadAutoRefreshSettings() {
+  try {
+    const { auto_refresh_enabled: enabled, auto_refresh_interval_seconds: seconds } = await opsAPI.getAdvancedSettings()
+    autoRefreshEnabled.value = enabled
+    autoRefreshIntervalMs.value = seconds * 1000
+    autoRefreshCountdown.value = seconds
+  } catch (error) {
+    console.error('[OpsDashboard] Failed to load auto refresh settings', error)
+  }
+}
+
 function handleThroughputSelectPlatform(nextPlatform: string) {
  platform.value = nextPlatform || ''
  groupId.value = null
@@ -510,6 +551,10 @@ async function fetchData() {
    ])
    if (fetchSeq !== dashboardFetchSeq) return
    lastUpdated.value = new Date()
+    // Reset auto refresh countdown after successful fetch
+    if (autoRefreshEnabled.value) {
+      autoRefreshCountdown.value = Math.floor(autoRefreshIntervalMs.value / 1000)
+    }
  } catch (err) {
    if (!isOpsDisabledError(err)) {
      console.error('[ops] failed to fetch dashboard data', err)
@@ -567,9 +612,18 @@ onMounted(async () => {
  // Load thresholds configuration
  loadThresholds()

+  // Load auto refresh settings
+  await loadAutoRefreshSettings()
+
  if (opsEnabled.value) {
    await fetchData()
  }
+
+  // Start auto refresh if enabled
+  if (autoRefreshEnabled.value) {
+    resumeAutoRefresh()
+    resumeCountdown()
+  }
 })

 async function loadThresholds() {
@@ -584,5 +638,27 @@ async function loadThresholds() {

 onUnmounted(() => {
  abortDashboardFetch()
+  pauseAutoRefresh()
+  pauseCountdown()
+})
+
+// When auto refresh is toggled: on -> reset the countdown to the full interval and resume both timers; off -> pause both and zero the countdown
+watch(autoRefreshEnabled, (enabled) => {
+  if (enabled) {
+    autoRefreshCountdown.value = Math.floor(autoRefreshIntervalMs.value / 1000)
+    resumeAutoRefresh()
+    resumeCountdown()
+  } else {
+    pauseAutoRefresh()
+    pauseCountdown()
+    autoRefreshCountdown.value = 0
+  }
+})
+
+// Reload auto refresh settings whenever showSettingsDialog changes to false (settings dialog closed)
+watch(showSettingsDialog, async (show) => {
+  if (!show) {
+    await loadAutoRefreshSettings()
+  }
+})
 </script>
--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -23,6 +23,8 @@ interface Props {
  loading: boolean
  lastUpdated: Date | null
  thresholds?: OpsMetricThresholds | null // 阈值配置
+  autoRefreshEnabled?: boolean
+  autoRefreshCountdown?: number
 }

 interface Emits {
@@ -839,6 +841,17 @@ function handleToolbarRefresh() {
          <span>·</span>
          <span>{{ t('common.refresh') }}: {{ updatedAtLabel }}</span>

+          <template v-if="props.autoRefreshEnabled && props.autoRefreshCountdown !== undefined">
+            <span>·</span>
+            <span class="flex items-center gap-1">
+              <svg class="h-3 w-3 animate-spin text-blue-500" fill="none" viewBox="0 0 24 24">
+                <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
+                <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
+              </svg>
+              <span>自动刷新: {{ props.autoRefreshCountdown }}s</span>
+            </span>
+          </template>
+
          <template v-if="systemMetrics">
            <span>·</span>
            <span>
--- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
+++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
@@ -487,6 +487,48 @@ async function saveAllSettings() {
              <Toggle v-model="advancedSettings.aggregation.aggregation_enabled" />
            </div>
          </div>
+
+          <!-- 错误过滤 -->
+          <div class="space-y-3">
+            <h5 class="text-xs font-semibold text-gray-700 dark:text-gray-300">错误过滤</h5>
+
+            <div class="flex items-center justify-between">
+              <div>
+                <label class="text-sm font-medium text-gray-700 dark:text-gray-300">忽略 count_tokens 错误</label>
+                <p class="mt-1 text-xs text-gray-500">
+                  启用后，count_tokens 请求的错误将不计入运维监控的统计和告警中（但仍会存储在数据库中）
+                </p>
+              </div>
+              <Toggle v-model="advancedSettings.ignore_count_tokens_errors" />
+            </div>
+          </div>
+
+          <!-- 自动刷新 -->
+          <div class="space-y-3">
+            <h5 class="text-xs font-semibold text-gray-700 dark:text-gray-300">自动刷新</h5>
+
+            <div class="flex items-center justify-between">
+              <div>
+                <label class="text-sm font-medium text-gray-700 dark:text-gray-300">启用自动刷新</label>
+                <p class="mt-1 text-xs text-gray-500">
+                  自动刷新仪表板数据，启用后会定期拉取最新数据
+                </p>
+              </div>
+              <Toggle v-model="advancedSettings.auto_refresh_enabled" />
+            </div>
+
+            <div v-if="advancedSettings.auto_refresh_enabled">
+              <label class="input-label">刷新间隔</label>
+              <Select
+                v-model="advancedSettings.auto_refresh_interval_seconds"
+                :options="[
+                  { value: 15, label: '15 秒' },
+                  { value: 30, label: '30 秒' },
+                  { value: 60, label: '60 秒' }
+                ]"
+              />
+            </div>
+          </div>
        </div>
      </details>
    </div>