From 93b5b7474be49a5ad34e7f651861c8ea5ce46d3d Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Thu, 15 Jan 2026 19:50:02 +0800 Subject: [PATCH 1/8] =?UTF-8?q?refactor(ops):=20=E8=B0=83=E6=95=B4?= =?UTF-8?q?=E5=81=A5=E5=BA=B7=E5=BE=97=E5=88=86=E6=9D=83=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 业务健康权重从 70% 提升到 80% - 基础设施健康权重从 30% 降低到 20% - 更加关注业务指标(SLA、错误率等)对整体健康的影响 --- backend/internal/service/ops_health_score.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/internal/service/ops_health_score.go b/backend/internal/service/ops_health_score.go index 5efae870..9dd45eea 100644 --- a/backend/internal/service/ops_health_score.go +++ b/backend/internal/service/ops_health_score.go @@ -9,7 +9,7 @@ import ( // // Design goals: // - Backend-owned scoring (UI only displays). -// - Layered scoring: Business Health (70%) + Infrastructure Health (30%) +// - Layered scoring: Business Health (80%) + Infrastructure Health (20%) // - Avoids double-counting (e.g., DB failure affects both infra and business metrics) // - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data. func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int { @@ -26,8 +26,8 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) businessHealth := computeBusinessHealth(overview) infraHealth := computeInfraHealth(now, overview) - // Weighted combination: 70% business + 30% infrastructure - score := businessHealth*0.7 + infraHealth*0.3 + // Weighted combination: 80% business + 20% infrastructure + score := businessHealth*0.8 + infraHealth*0.2 return int(math.Round(clampFloat64(score, 0, 100))) } From 38961ba10e5ad4beff6dfc15890b7f4b9a78b868 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Thu, 15 Jan 2026 19:50:31 +0800 Subject: [PATCH 2/8] =?UTF-8?q?refactor(ops):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E9=98=88=E5=80=BC=E6=A3=80=E6=9F=A5=E7=B3=BB=E7=BB=9F=E5=92=8C?= =?UTF-8?q?=E5=B8=83=E5=B1=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 阈值检查系统优化: - 引入三级阈值系统(normal/warning/critical) - 统一阈值判断逻辑,支持警告和严重两个级别 - 移除硬编码的 TTFT 颜色判断,改用阈值配置 - 新增 getThresholdColorClass 统一颜色映射 布局优化: - 优化详细指标在卡片内的响应式布局 - 改进宽屏下的卡片布局显示 - 优化指标数值的对齐和间距 --- frontend/src/views/admin/ops/OpsDashboard.vue | 4 +- .../ops/components/OpsDashboardHeader.vue | 120 ++++++++++-------- 2 files changed, 68 insertions(+), 56 deletions(-) diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index ff2a434d..d33f0f64 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -693,8 +693,8 @@ onMounted(async () => { async function loadThresholds() { try { - const settings = await opsAPI.getAlertRuntimeSettings() - metricThresholds.value = settings.thresholds || null + const thresholds = await opsAPI.getMetricThresholds() + metricThresholds.value = thresholds || null } catch (err) { console.warn('[OpsDashboard] Failed to load thresholds', err) metricThresholds.value = null diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 2d52b6e8..8e868bba 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -169,42 +169,54 @@ const updatedAtLabel = computed(() => { return props.lastUpdated.toLocaleTimeString() }) -// --- Color coding for TTFT --- -function getTTFTColor(ms: number | null | undefined): string { - if (ms == null) return 'text-gray-900 dark:text-white' - if (ms < 500) return 'text-green-600 dark:text-green-400' - if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400' - if (ms < 2000) return 'text-orange-600 dark:text-orange-400' - return 'text-red-600 dark:text-red-400' -} - // --- Threshold checking helpers --- -function isSLABelowThreshold(slaPercent: number | null): boolean { - if (slaPercent == null) return false +type ThresholdLevel = 'normal' | 'warning' | 'critical' + +function getSLAThresholdLevel(slaPercent: number | null): ThresholdLevel { + if (slaPercent == null) return 'normal' const threshold = props.thresholds?.sla_percent_min - if (threshold == null) return false - return slaPercent < threshold + if (threshold == null) return 'normal' + if (slaPercent < threshold) return 'critical' + if (slaPercent < threshold / 0.8) return 'warning' + return 'normal' } -function isTTFTAboveThreshold(ttftP99Ms: number | null): boolean { - if (ttftP99Ms == null) return false +function getTTFTThresholdLevel(ttftMs: number | null): ThresholdLevel { + if (ttftMs == null) return 'normal' const threshold = props.thresholds?.ttft_p99_ms_max - if (threshold == null) return false - return ttftP99Ms > threshold + if (threshold == null) return 'normal' + if (ttftMs >= threshold) return 'critical' + if (ttftMs >= threshold * 0.8) return 'warning' + return 'normal' } -function isRequestErrorRateAboveThreshold(errorRatePercent: number | null): boolean { - if (errorRatePercent == null) return false +function getRequestErrorRateThresholdLevel(errorRatePercent: number | null): ThresholdLevel { + if (errorRatePercent == null) return 'normal' const threshold = props.thresholds?.request_error_rate_percent_max - if (threshold == null) return false - return errorRatePercent > threshold + if (threshold == null) return 'normal' + if (errorRatePercent >= threshold) return 'critical' + if (errorRatePercent >= threshold * 0.8) return 'warning' + return 'normal' } -function isUpstreamErrorRateAboveThreshold(upstreamErrorRatePercent: number | null): boolean { - if (upstreamErrorRatePercent == null) return false +function getUpstreamErrorRateThresholdLevel(upstreamErrorRatePercent: number | null): ThresholdLevel { + if (upstreamErrorRatePercent == null) return 'normal' const threshold = props.thresholds?.upstream_error_rate_percent_max - if (threshold == null) return false - return upstreamErrorRatePercent > threshold + if (threshold == null) return 'normal' + if (upstreamErrorRatePercent >= threshold) return 'critical' + if (upstreamErrorRatePercent >= threshold * 0.8) return 'warning' + return 'normal' +} + +function getThresholdColorClass(level: ThresholdLevel): string { + switch (level) { + case 'critical': + return 'text-red-600 dark:text-red-400' + case 'warning': + return 'text-yellow-600 dark:text-yellow-400' + default: + return 'text-green-600 dark:text-green-400' + } } // --- Realtime / Overview labels --- @@ -1197,7 +1209,7 @@ function handleToolbarRefresh() {
{{ t('admin.ops.sla') }} - +
-
+
{{ ttftP99Ms ?? '-' }}
ms (P99)
-
-
- {{ t('admin.ops.p95') }} - {{ ttftP95Ms ?? '-' }} +
+
+ P95: + {{ ttftP95Ms ?? '-' }} ms
-
- {{ t('admin.ops.p90') }} - {{ ttftP90Ms ?? '-' }} +
+ P90: + {{ ttftP90Ms ?? '-' }} ms
-
- {{ t('admin.ops.p50') }} - {{ ttftP50Ms ?? '-' }} +
+ P50: + {{ ttftP50Ms ?? '-' }} ms
-
+
Avg: - {{ ttftAvgMs ?? '-' }} + {{ ttftAvgMs ?? '-' }} ms
-
+
Max: - {{ ttftMaxMs ?? '-' }} + {{ ttftMaxMs ?? '-' }} ms
@@ -1335,7 +1347,7 @@ function handleToolbarRefresh() { {{ t('admin.ops.requestDetails.details') }}
-
+
{{ errorRatePercent == null ? '-' : `${errorRatePercent.toFixed(2)}%` }}
@@ -1361,7 +1373,7 @@ function handleToolbarRefresh() { {{ t('admin.ops.requestDetails.details') }}
-
+
{{ upstreamErrorRatePercent == null ? '-' : `${upstreamErrorRatePercent.toFixed(2)}%` }}
From 930e9ee55c283ce09a570f663026fc90afd48cf3 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Thu, 15 Jan 2026 19:50:47 +0800 Subject: [PATCH 3/8] =?UTF-8?q?feat(ops):=20=E6=B7=BB=E5=8A=A0=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E6=97=B6=E9=97=B4=E8=8C=83=E5=9B=B4=E9=80=89?= =?UTF-8?q?=E6=8B=A9=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 功能特性: - 在时间段选择器中增加"自定义"选项 - 点击后弹出对话框,支持选择任意时间范围 - 使用 HTML5 datetime-local 输入框,体验友好 - 自定义时显示格式化的时间范围标签(MM-DD HH:mm ~ MM-DD HH:mm) - 默认初始化为最近1小时 技术实现: - 扩展 TimeRange 类型支持 'custom' - 添加 customStartTime 和 customEndTime 状态管理 - 创建 buildApiParams 辅助函数统一处理 API 参数 - 当选择自定义时,使用 start_time 和 end_time 参数替代 time_range - 更新所有相关 API 调用支持自定义时间范围 国际化: - 添加"自定义"、"开始时间"、"结束时间"翻译 --- frontend/src/i18n/locales/zh.ts | 7 +- frontend/src/views/admin/ops/OpsDashboard.vue | 79 +++++------- .../ops/components/OpsDashboardHeader.vue | 120 ++++++++++++++---- 3 files changed, 135 insertions(+), 71 deletions(-) diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index daf39939..30f8df51 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -2117,7 +2117,12 @@ export default { '6h': '近6小时', '24h': '近24小时', '7d': '近7天', - '30d': '近30天' + '30d': '近30天', + custom: '自定义' + }, + customTimeRange: { + startTime: '开始时间', + endTime: '结束时间' }, fullscreen: { enter: '进入全屏' diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index d33f0f64..d8a31931 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -23,10 +23,13 @@ :auto-refresh-enabled="autoRefreshEnabled" :auto-refresh-countdown="autoRefreshCountdown" :fullscreen="isFullscreen" + :custom-start-time="customStartTime" + :custom-end-time="customEndTime" @update:time-range="onTimeRangeChange" @update:platform="onPlatformChange" @update:group="onGroupChange" @update:query-mode="onQueryModeChange" + @update:custom-time-range="onCustomTimeRangeChange" @refresh="fetchData" @open-request-details="handleOpenRequestDetails" @open-error-details="openErrorDetails" @@ -148,8 +151,8 @@ const { t } = useI18n() const opsEnabled = computed(() => adminSettingsStore.opsMonitoringEnabled) -type TimeRange = '5m' | '30m' | '1h' | '6h' | '24h' -const allowedTimeRanges = new Set(['5m', '30m', '1h', '6h', '24h']) +type TimeRange = '5m' | '30m' | '1h' | '6h' | '24h' | 'custom' +const allowedTimeRanges = new Set(['5m', '30m', '1h', '6h', '24h', 'custom']) type QueryMode = 'auto' | 'raw' | 'preagg' const allowedQueryModes = new Set(['auto', 'raw', 'preagg']) @@ -163,6 +166,8 @@ const timeRange = ref('1h') const platform = ref('') const groupId = ref(null) const queryMode = ref('auto') +const customStartTime = ref(null) +const customEndTime = ref(null) const QUERY_KEYS = { timeRange: 'tr', @@ -420,6 +425,11 @@ function onTimeRangeChange(v: string | number | boolean | null) { timeRange.value = v as TimeRange } +function onCustomTimeRangeChange(startTime: string, endTime: string) { + customStartTime.value = startTime + customEndTime.value = endTime +} + function onSettingsSaved() { loadThresholds() fetchData() @@ -458,18 +468,25 @@ function openError(id: number) { showErrorModal.value = true } +function buildApiParams() { + const params: any = { + platform: platform.value || undefined, + group_id: groupId.value ?? undefined, + mode: queryMode.value + } + if (timeRange.value === 'custom' && customStartTime.value && customEndTime.value) { + params.start_time = customStartTime.value + params.end_time = customEndTime.value + } else { + params.time_range = timeRange.value + } + return params +} + async function refreshOverviewWithCancel(fetchSeq: number, signal: AbortSignal) { if (!opsEnabled.value) return try { - const data = await opsAPI.getDashboardOverview( - { - time_range: timeRange.value, - platform: platform.value || undefined, - group_id: groupId.value ?? undefined, - mode: queryMode.value - }, - { signal } - ) + const data = await opsAPI.getDashboardOverview(buildApiParams(), { signal }) if (fetchSeq !== dashboardFetchSeq) return overview.value = data } catch (err: any) { @@ -483,15 +500,7 @@ async function refreshThroughputTrendWithCancel(fetchSeq: number, signal: AbortS if (!opsEnabled.value) return loadingTrend.value = true try { - const data = await opsAPI.getThroughputTrend( - { - time_range: timeRange.value, - platform: platform.value || undefined, - group_id: groupId.value ?? undefined, - mode: queryMode.value - }, - { signal } - ) + const data = await opsAPI.getThroughputTrend(buildApiParams(), { signal }) if (fetchSeq !== dashboardFetchSeq) return throughputTrend.value = data } catch (err: any) { @@ -509,15 +518,7 @@ async function refreshLatencyHistogramWithCancel(fetchSeq: number, signal: Abort if (!opsEnabled.value) return loadingLatency.value = true try { - const data = await opsAPI.getLatencyHistogram( - { - time_range: timeRange.value, - platform: platform.value || undefined, - group_id: groupId.value ?? undefined, - mode: queryMode.value - }, - { signal } - ) + const data = await opsAPI.getLatencyHistogram(buildApiParams(), { signal }) if (fetchSeq !== dashboardFetchSeq) return latencyHistogram.value = data } catch (err: any) { @@ -535,15 +536,7 @@ async function refreshErrorTrendWithCancel(fetchSeq: number, signal: AbortSignal if (!opsEnabled.value) return loadingErrorTrend.value = true try { - const data = await opsAPI.getErrorTrend( - { - time_range: timeRange.value, - platform: platform.value || undefined, - group_id: groupId.value ?? undefined, - mode: queryMode.value - }, - { signal } - ) + const data = await opsAPI.getErrorTrend(buildApiParams(), { signal }) if (fetchSeq !== dashboardFetchSeq) return errorTrend.value = data } catch (err: any) { @@ -561,15 +554,7 @@ async function refreshErrorDistributionWithCancel(fetchSeq: number, signal: Abor if (!opsEnabled.value) return loadingErrorDistribution.value = true try { - const data = await opsAPI.getErrorDistribution( - { - time_range: timeRange.value, - platform: platform.value || undefined, - group_id: groupId.value ?? undefined, - mode: queryMode.value - }, - { signal } - ) + const data = await opsAPI.getErrorDistribution(buildApiParams(), { signal }) if (fetchSeq !== dashboardFetchSeq) return errorDistribution.value = data } catch (err: any) { diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index 8e868bba..b36055e0 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -26,6 +26,8 @@ interface Props { autoRefreshEnabled?: boolean autoRefreshCountdown?: number fullscreen?: boolean + customStartTime?: string | null + customEndTime?: string | null } interface Emits { @@ -33,6 +35,7 @@ interface Emits { (e: 'update:group', value: number | null): void (e: 'update:timeRange', value: string): void (e: 'update:queryMode', value: string): void + (e: 'update:customTimeRange', startTime: string, endTime: string): void (e: 'refresh'): void (e: 'openRequestDetails', preset?: OpsRequestDetailsPreset): void (e: 'openErrorDetails', kind: 'request' | 'upstream'): void @@ -85,6 +88,23 @@ watch( // --- Filters --- +const showCustomTimeRangeDialog = ref(false) +const customStartTimeInput = ref('') +const customEndTimeInput = ref('') + +function formatCustomTimeRangeLabel(startTime: string, endTime: string): string { + const start = new Date(startTime) + const end = new Date(endTime) + const formatDate = (d: Date) => { + const month = String(d.getMonth() + 1).padStart(2, '0') + const day = String(d.getDate()).padStart(2, '0') + const hour = String(d.getHours()).padStart(2, '0') + const minute = String(d.getMinutes()).padStart(2, '0') + return `${month}-${day} ${hour}:${minute}` + } + return `${formatDate(start)} ~ ${formatDate(end)}` +} + const groups = ref>([]) const platformOptions = computed(() => [ @@ -100,7 +120,13 @@ const timeRangeOptions = computed(() => [ { value: '30m', label: t('admin.ops.timeRange.30m') }, { value: '1h', label: t('admin.ops.timeRange.1h') }, { value: '6h', label: t('admin.ops.timeRange.6h') }, - { value: '24h', label: t('admin.ops.timeRange.24h') } + { value: '24h', label: t('admin.ops.timeRange.24h') }, + { + value: 'custom', + label: props.timeRange === 'custom' && props.customStartTime && props.customEndTime + ? `${t('admin.ops.timeRange.custom')} (${formatCustomTimeRangeLabel(props.customStartTime, props.customEndTime)})` + : t('admin.ops.timeRange.custom') + } ]) const queryModeOptions = computed(() => [ @@ -149,7 +175,32 @@ function handleGroupChange(val: string | number | boolean | null) { } function handleTimeRangeChange(val: string | number | boolean | null) { - emit('update:timeRange', String(val || '1h')) + const newValue = String(val || '1h') + if (newValue === 'custom') { + // 初始化为最近1小时 + const now = new Date() + const oneHourAgo = new Date(now.getTime() - 60 * 60 * 1000) + customStartTimeInput.value = oneHourAgo.toISOString().slice(0, 16) + customEndTimeInput.value = now.toISOString().slice(0, 16) + showCustomTimeRangeDialog.value = true + } else { + emit('update:timeRange', newValue) + } +} + +function handleCustomTimeRangeConfirm() { + if (!customStartTimeInput.value || !customEndTimeInput.value) return + const startTime = new Date(customStartTimeInput.value).toISOString() + const endTime = new Date(customEndTimeInput.value).toISOString() + emit('update:timeRange', 'custom') + emit('update:customTimeRange', startTime, endTime) + showCustomTimeRangeDialog.value = false +} + +function handleCustomTimeRangeCancel() { + showCustomTimeRangeDialog.value = false + // 如果当前不是 custom,不需要做任何事 + // 如果当前是 custom,保持不变 } function handleQueryModeChange(val: string | number | boolean | null) { @@ -164,11 +215,6 @@ function openErrorDetails(kind: 'request' | 'upstream') { emit('openErrorDetails', kind) } -const updatedAtLabel = computed(() => { - if (!props.lastUpdated) return t('common.unknown') - return props.lastUpdated.toLocaleTimeString() -}) - // --- Threshold checking helpers --- type ThresholdLevel = 'normal' | 'warning' | 'critical' @@ -829,25 +875,11 @@ function handleToolbarRefresh() { · - {{ t('common.refresh') }}: {{ updatedAtLabel }} + {{ t('common.refresh') }}: {{ props.lastUpdated ? props.lastUpdated.toLocaleString('zh-CN', { year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', second: '2-digit' }).replace(/\//g, '-') : t('common.unknown') }} - -
@@ -1534,5 +1566,47 @@ function handleToolbarRefresh() {
+ + + +
+
+ + +
+
+ + +
+
+ + +
+
+
From e93f086485b5730f1fe9f48ac810228ccb0668e4 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Thu, 15 Jan 2026 19:57:19 +0800 Subject: [PATCH 4/8] =?UTF-8?q?fix(ops):=20=E8=AF=B7=E6=B1=82=E6=97=B6?= =?UTF-8?q?=E9=95=BF=E8=AF=A6=E6=83=85=E6=98=BE=E7=A4=BA=E6=89=80=E6=9C=89?= =?UTF-8?q?=E8=AF=B7=E6=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 移除请求时长卡片详情按钮的 min_duration_ms 参数限制 - 现在点击详情会显示所有请求,按时长倒序排列 - 不再只显示 P99 以上的请求 --- frontend/src/views/admin/ops/components/OpsDashboardHeader.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue index b36055e0..0f6d1124 100644 --- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue +++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue @@ -1277,7 +1277,7 @@ function handleToolbarRefresh() { v-if="!props.fullscreen" class="text-[10px] font-bold text-blue-500 hover:underline" type="button" - @click="openDetails({ title: t('admin.ops.latencyDuration'), sort: 'duration_desc', min_duration_ms: Math.max(Number(durationP99Ms ?? 0), 0) })" + @click="openDetails({ title: t('admin.ops.latencyDuration'), sort: 'duration_desc' })" > {{ t('admin.ops.requestDetails.details') }} From 23aa69f56f2fef4b48ef6376da68bb640ebe1287 Mon Sep 17 00:00:00 2001 From: IanShaw027 <131567472+IanShaw027@users.noreply.github.com> Date: Thu, 15 Jan 2026 21:31:55 +0800 Subject: [PATCH 5/8] =?UTF-8?q?refactor(ops):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E5=BF=83=E8=B7=B3=E5=92=8C=E7=BB=84=E4=BB=B6?= =?UTF-8?q?=E5=88=B7=E6=96=B0=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 后端改动: - 添加 ops_job_heartbeats.last_result 字段记录任务执行结果 - 优化告警评估器统计信息(规则数/事件数/邮件数) - 统一各定时任务的心跳记录格式 前端改动: - 重构 OpsConcurrencyCard 使用父组件统一控制刷新节奏 - 移除独立的 5 秒刷新定时器,改用 refreshToken 机制 - 修复 TypeScript 类型错误 --- .../internal/repository/ops_repo_metrics.go | 16 ++++++- .../service/ops_aggregation_service.go | 5 +++ .../service/ops_alert_evaluator_service.go | 44 +++++++++++++----- .../internal/service/ops_cleanup_service.go | 6 ++- backend/internal/service/ops_port.go | 4 ++ .../service/ops_scheduled_report_service.go | 34 ++++++++++---- ...039_ops_job_heartbeats_add_last_result.sql | 6 +++ frontend/src/api/admin/ops.ts | 1 + frontend/src/views/admin/AccountsView.vue | 2 +- frontend/src/views/admin/ops/OpsDashboard.vue | 10 ++++- .../ops/components/OpsConcurrencyCard.vue | 44 +++++++----------- .../ops/components/OpsDashboardHeader.vue | 45 +++++++++++-------- 12 files changed, 146 insertions(+), 71 deletions(-) create mode 100644 backend/migrations/039_ops_job_heartbeats_add_last_result.sql diff --git a/backend/internal/repository/ops_repo_metrics.go b/backend/internal/repository/ops_repo_metrics.go index bc80ed6e..713e0eb9 100644 --- a/backend/internal/repository/ops_repo_metrics.go +++ b/backend/internal/repository/ops_repo_metrics.go @@ -296,9 +296,10 @@ INSERT INTO ops_job_heartbeats ( last_error_at, last_error, last_duration_ms, + last_result, updated_at ) VALUES ( - $1,$2,$3,$4,$5,$6,NOW() + $1,$2,$3,$4,$5,$6,$7,NOW() ) ON CONFLICT (job_name) DO UPDATE SET last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at), @@ -312,6 +313,10 @@ ON CONFLICT (job_name) DO UPDATE SET ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error) END, last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms), + last_result = CASE + WHEN EXCLUDED.last_success_at IS NOT NULL THEN COALESCE(EXCLUDED.last_result, ops_job_heartbeats.last_result) + ELSE ops_job_heartbeats.last_result + END, updated_at = NOW()` _, err := r.db.ExecContext( @@ -323,6 +328,7 @@ ON CONFLICT (job_name) DO UPDATE SET opsNullTime(input.LastErrorAt), opsNullString(input.LastError), opsNullInt(input.LastDurationMs), + opsNullString(input.LastResult), ) return err } @@ -340,6 +346,7 @@ SELECT last_error_at, last_error, last_duration_ms, + last_result, updated_at FROM ops_job_heartbeats ORDER BY job_name ASC` @@ -359,6 +366,8 @@ ORDER BY job_name ASC` var lastError sql.NullString var lastDuration sql.NullInt64 + var lastResult sql.NullString + if err := rows.Scan( &item.JobName, &lastRun, @@ -366,6 +375,7 @@ ORDER BY job_name ASC` &lastErrorAt, &lastError, &lastDuration, + &lastResult, &item.UpdatedAt, ); err != nil { return nil, err @@ -391,6 +401,10 @@ ORDER BY job_name ASC` v := lastDuration.Int64 item.LastDurationMs = &v } + if lastResult.Valid { + v := lastResult.String + item.LastResult = &v + } out = append(out, &item) } diff --git a/backend/internal/service/ops_aggregation_service.go b/backend/internal/service/ops_aggregation_service.go index 2a6afbba..972462ec 100644 --- a/backend/internal/service/ops_aggregation_service.go +++ b/backend/internal/service/ops_aggregation_service.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "errors" + "fmt" "log" "strings" "sync" @@ -235,11 +236,13 @@ func (s *OpsAggregationService) aggregateHourly() { successAt := finishedAt hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) defer hbCancel() + result := truncateString(fmt.Sprintf("window=%s..%s", start.Format(time.RFC3339), end.Format(time.RFC3339)), 2048) _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ JobName: opsAggHourlyJobName, LastRunAt: &runAt, LastSuccessAt: &successAt, LastDurationMs: &dur, + LastResult: &result, }) } @@ -331,11 +334,13 @@ func (s *OpsAggregationService) aggregateDaily() { successAt := finishedAt hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second) defer hbCancel() + result := truncateString(fmt.Sprintf("window=%s..%s", start.Format(time.RFC3339), end.Format(time.RFC3339)), 2048) _ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{ JobName: opsAggDailyJobName, LastRunAt: &runAt, LastSuccessAt: &successAt, LastDurationMs: &dur, + LastResult: &result, }) } diff --git a/backend/internal/service/ops_alert_evaluator_service.go b/backend/internal/service/ops_alert_evaluator_service.go index 2b619f4d..7c62e247 100644 --- a/backend/internal/service/ops_alert_evaluator_service.go +++ b/backend/internal/service/ops_alert_evaluator_service.go @@ -190,6 +190,13 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { return } + rulesTotal := len(rules) + rulesEnabled := 0 + rulesEvaluated := 0 + eventsCreated := 0 + eventsResolved := 0 + emailsSent := 0 + now := time.Now().UTC() safeEnd := now.Truncate(time.Minute) if safeEnd.IsZero() { @@ -205,6 +212,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { if rule == nil || !rule.Enabled || rule.ID <= 0 { continue } + rulesEnabled++ scopePlatform, scopeGroupID, scopeRegion := parseOpsAlertRuleScope(rule.Filters) @@ -220,6 +228,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { s.resetRuleState(rule.ID, now) continue } + rulesEvaluated++ breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold) required := requiredSustainedBreaches(rule.SustainedMinutes, interval) @@ -278,8 +287,11 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { continue } + eventsCreated++ if created != nil && created.ID > 0 { - s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) + if s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) { + emailsSent++ + } } continue } @@ -289,11 +301,14 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) { resolvedAt := now if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil { log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err) + } else { + eventsResolved++ } } } - s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) + result := truncateString(fmt.Sprintf("rules=%d enabled=%d evaluated=%d created=%d resolved=%d emails_sent=%d", rulesTotal, rulesEnabled, rulesEvaluated, eventsCreated, eventsResolved, emailsSent), 2048) + s.recordHeartbeatSuccess(runAt, time.Since(startedAt), result) } func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) { @@ -585,32 +600,32 @@ func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes i ) } -func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) { +func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) bool { if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil { - return + return false } if event.EmailSent { - return + return false } if !rule.NotifyEmail { - return + return false } emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx) if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled { - return + return false } if len(emailCfg.Alert.Recipients) == 0 { - return + return false } if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) { - return + return false } if runtimeCfg != nil && runtimeCfg.Silencing.Enabled { if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) { - return + return false } } @@ -639,6 +654,7 @@ func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runt if anySent { _ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true) } + return anySent } func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string { @@ -806,7 +822,7 @@ func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) { log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key) } -func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { +func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, result string) { if s == nil || s.opsRepo == nil { return } @@ -814,11 +830,17 @@ func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, durat durMs := duration.Milliseconds() ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() + msg := strings.TrimSpace(result) + if msg == "" { + msg = "ok" + } + msg = truncateString(msg, 2048) _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ JobName: opsAlertEvaluatorJobName, LastRunAt: &runAt, LastSuccessAt: &now, LastDurationMs: &durMs, + LastResult: &msg, }) } diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go index afd2d22c..1ade7176 100644 --- a/backend/internal/service/ops_cleanup_service.go +++ b/backend/internal/service/ops_cleanup_service.go @@ -149,7 +149,7 @@ func (s *OpsCleanupService) runScheduled() { log.Printf("[OpsCleanup] cleanup failed: %v", err) return } - s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) + s.recordHeartbeatSuccess(runAt, time.Since(startedAt), counts) log.Printf("[OpsCleanup] cleanup complete: %s", counts) } @@ -330,12 +330,13 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b return release, true } -func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { +func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, counts opsCleanupDeletedCounts) { if s == nil || s.opsRepo == nil { return } now := time.Now().UTC() durMs := duration.Milliseconds() + result := truncateString(counts.String(), 2048) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() _ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ @@ -343,6 +344,7 @@ func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration tim LastRunAt: &runAt, LastSuccessAt: &now, LastDurationMs: &durMs, + LastResult: &result, }) } diff --git a/backend/internal/service/ops_port.go b/backend/internal/service/ops_port.go index cdeea241..515b47bb 100644 --- a/backend/internal/service/ops_port.go +++ b/backend/internal/service/ops_port.go @@ -235,6 +235,9 @@ type OpsUpsertJobHeartbeatInput struct { LastErrorAt *time.Time LastError *string LastDurationMs *int64 + + // LastResult is an optional human-readable summary of the last successful run. + LastResult *string } type OpsJobHeartbeat struct { @@ -245,6 +248,7 @@ type OpsJobHeartbeat struct { LastErrorAt *time.Time `json:"last_error_at"` LastError *string `json:"last_error"` LastDurationMs *int64 `json:"last_duration_ms"` + LastResult *string `json:"last_result"` UpdatedAt time.Time `json:"updated_at"` } diff --git a/backend/internal/service/ops_scheduled_report_service.go b/backend/internal/service/ops_scheduled_report_service.go index 28902cbc..98b2045d 100644 --- a/backend/internal/service/ops_scheduled_report_service.go +++ b/backend/internal/service/ops_scheduled_report_service.go @@ -177,6 +177,10 @@ func (s *OpsScheduledReportService) runOnce() { return } + reportsTotal := len(reports) + reportsDue := 0 + sentAttempts := 0 + for _, report := range reports { if report == nil || !report.Enabled { continue @@ -184,14 +188,18 @@ func (s *OpsScheduledReportService) runOnce() { if report.NextRunAt.After(now) { continue } + reportsDue++ - if err := s.runReport(ctx, report, now); err != nil { + attempts, err := s.runReport(ctx, report, now) + if err != nil { s.recordHeartbeatError(runAt, time.Since(startedAt), err) return } + sentAttempts += attempts } - s.recordHeartbeatSuccess(runAt, time.Since(startedAt)) + result := truncateString(fmt.Sprintf("reports=%d due=%d send_attempts=%d", reportsTotal, reportsDue, sentAttempts), 2048) + s.recordHeartbeatSuccess(runAt, time.Since(startedAt), result) } type opsScheduledReport struct { @@ -297,9 +305,9 @@ func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, no return out } -func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error { +func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) (int, error) { if s == nil || s.opsService == nil || s.emailService == nil || report == nil { - return nil + return 0, nil } if ctx == nil { ctx = context.Background() @@ -310,11 +318,11 @@ func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsSc content, err := s.generateReportHTML(ctx, report, now) if err != nil { - return err + return 0, err } if strings.TrimSpace(content) == "" { // Skip sending when the report decides not to emit content (e.g., digest below min count). - return nil + return 0, nil } recipients := report.Recipients @@ -325,22 +333,24 @@ func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsSc } } if len(recipients) == 0 { - return nil + return 0, nil } subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name)) + attempts := 0 for _, to := range recipients { addr := strings.TrimSpace(to) if addr == "" { continue } + attempts++ if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil { // Ignore per-recipient failures; continue best-effort. continue } } - return nil + return attempts, nil } func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) { @@ -650,7 +660,7 @@ func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType _ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err() } -func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) { +func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, result string) { if s == nil || s.opsService == nil || s.opsService.opsRepo == nil { return } @@ -658,11 +668,17 @@ func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, dura durMs := duration.Milliseconds() ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() + msg := strings.TrimSpace(result) + if msg == "" { + msg = "ok" + } + msg = truncateString(msg, 2048) _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{ JobName: opsScheduledReportJobName, LastRunAt: &runAt, LastSuccessAt: &now, LastDurationMs: &durMs, + LastResult: &msg, }) } diff --git a/backend/migrations/039_ops_job_heartbeats_add_last_result.sql b/backend/migrations/039_ops_job_heartbeats_add_last_result.sql new file mode 100644 index 00000000..7d6dc743 --- /dev/null +++ b/backend/migrations/039_ops_job_heartbeats_add_last_result.sql @@ -0,0 +1,6 @@ +-- Add last_result to ops_job_heartbeats for UI job details. + +ALTER TABLE IF EXISTS ops_job_heartbeats + ADD COLUMN IF NOT EXISTS last_result TEXT; + +COMMENT ON COLUMN ops_job_heartbeats.last_result IS 'Last successful run result summary (human readable).'; diff --git a/frontend/src/api/admin/ops.ts b/frontend/src/api/admin/ops.ts index 63b12cfb..6e048436 100644 --- a/frontend/src/api/admin/ops.ts +++ b/frontend/src/api/admin/ops.ts @@ -293,6 +293,7 @@ export interface OpsJobHeartbeat { last_error_at?: string | null last_error?: string | null last_duration_ms?: number | null + last_result?: string | null updated_at: string } diff --git a/frontend/src/views/admin/AccountsView.vue b/frontend/src/views/admin/AccountsView.vue index cf484303..42f38c74 100644 --- a/frontend/src/views/admin/AccountsView.vue +++ b/frontend/src/views/admin/AccountsView.vue @@ -414,7 +414,7 @@ const handleScroll = () => { menu.show = false } -onMounted(async () => { load(); try { const [p, g] = await Promise.all([adminAPI.proxies.getAll(), adminAPI.groups.getAll()]); proxies.value = p; groups.value = g } catch (error) { console.error('Failed to load proxies/groups:', error) }; window.addEventListener('scroll', handleScroll, true) }) +onMounted(async () => { load(); try { const [p, g] = await Promise.all([adminAPI.proxies.getAll(), adminAPI.groups.getAll()]); proxies.value = p; groups.value = g } catch (error) { console.error('Failed to load proxies/groups:', error) } window.addEventListener('scroll', handleScroll, true) }) onUnmounted(() => { window.removeEventListener('scroll', handleScroll, true) diff --git a/frontend/src/views/admin/ops/OpsDashboard.vue b/frontend/src/views/admin/ops/OpsDashboard.vue index d8a31931..033ef1da 100644 --- a/frontend/src/views/admin/ops/OpsDashboard.vue +++ b/frontend/src/views/admin/ops/OpsDashboard.vue @@ -42,7 +42,7 @@
- +
{ @@ -597,7 +600,12 @@ async function fetchData() { refreshErrorDistributionWithCancel(fetchSeq, dashboardFetchController.signal) ]) if (fetchSeq !== dashboardFetchSeq) return + lastUpdated.value = new Date() + + // Trigger child component refreshes using the same cadence as the header. + dashboardRefreshToken.value += 1 + // Reset auto refresh countdown after successful fetch if (autoRefreshEnabled.value) { autoRefreshCountdown.value = Math.floor(autoRefreshIntervalMs.value / 1000) diff --git a/frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue b/frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue index 2104d1f7..acb0de1b 100644 --- a/frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue +++ b/frontend/src/views/admin/ops/components/OpsConcurrencyCard.vue @@ -1,12 +1,12 @@