refactor(ops): improve job heartbeat and component refresh mechanisms

Backend changes:
- Add ops_job_heartbeats.last_result column to record each job's execution result
- Improve alert evaluator run statistics (rule/event/email counts)
- Unify the heartbeat record format across the scheduled jobs

Frontend changes:
- Refactor OpsConcurrencyCard so the parent component drives the refresh cadence
- Remove the standalone 5-second refresh timer in favor of the refreshToken mechanism
- Fix TypeScript type errors
Author: IanShaw027
Date: 2026-01-15 21:31:55 +08:00
parent e93f086485
commit 23aa69f56f
12 changed files with 146 additions and 71 deletions

View File

@@ -296,9 +296,10 @@ INSERT INTO ops_job_heartbeats (
last_error_at,
last_error,
last_duration_ms,
last_result,
updated_at
) VALUES (
$1,$2,$3,$4,$5,$6,NOW()
$1,$2,$3,$4,$5,$6,$7,NOW()
)
ON CONFLICT (job_name) DO UPDATE SET
last_run_at = COALESCE(EXCLUDED.last_run_at, ops_job_heartbeats.last_run_at),
@@ -312,6 +313,10 @@ ON CONFLICT (job_name) DO UPDATE SET
ELSE COALESCE(EXCLUDED.last_error, ops_job_heartbeats.last_error)
END,
last_duration_ms = COALESCE(EXCLUDED.last_duration_ms, ops_job_heartbeats.last_duration_ms),
last_result = CASE
WHEN EXCLUDED.last_success_at IS NOT NULL THEN COALESCE(EXCLUDED.last_result, ops_job_heartbeats.last_result)
ELSE ops_job_heartbeats.last_result
END,
updated_at = NOW()`
_, err := r.db.ExecContext(
@@ -323,6 +328,7 @@ ON CONFLICT (job_name) DO UPDATE SET
opsNullTime(input.LastErrorAt),
opsNullString(input.LastError),
opsNullInt(input.LastDurationMs),
opsNullString(input.LastResult),
)
return err
}
@@ -340,6 +346,7 @@ SELECT
last_error_at,
last_error,
last_duration_ms,
last_result,
updated_at
FROM ops_job_heartbeats
ORDER BY job_name ASC`
@@ -359,6 +366,8 @@ ORDER BY job_name ASC`
var lastError sql.NullString
var lastDuration sql.NullInt64
var lastResult sql.NullString
if err := rows.Scan(
&item.JobName,
&lastRun,
@@ -366,6 +375,7 @@ ORDER BY job_name ASC`
&lastErrorAt,
&lastError,
&lastDuration,
&lastResult,
&item.UpdatedAt,
); err != nil {
return nil, err
@@ -391,6 +401,10 @@ ORDER BY job_name ASC`
v := lastDuration.Int64
item.LastDurationMs = &v
}
if lastResult.Valid {
v := lastResult.String
item.LastResult = &v
}
out = append(out, &item)
}
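A note on the binding above: opsNullString (and the sibling opsNullTime/opsNullInt helpers) are not part of this diff, so their exact definitions are an assumption. The new $7 parameter presumably gets wrapped into a database/sql null type roughly like this minimal sketch:

package ops // assumed package name; not shown in the diff

import "database/sql"

// opsNullString maps an optional *string into sql.NullString so a nil
// pointer binds as SQL NULL (assumed behavior of the existing helper).
func opsNullString(v *string) sql.NullString {
    if v == nil {
        return sql.NullString{}
    }
    return sql.NullString{String: *v, Valid: true}
}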

View File

@@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"strings"
"sync"
@@ -235,11 +236,13 @@ func (s *OpsAggregationService) aggregateHourly() {
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
result := truncateString(fmt.Sprintf("window=%s..%s", start.Format(time.RFC3339), end.Format(time.RFC3339)), 2048)
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggHourlyJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
LastResult: &result,
})
}
@@ -331,11 +334,13 @@ func (s *OpsAggregationService) aggregateDaily() {
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), 2*time.Second)
defer hbCancel()
result := truncateString(fmt.Sprintf("window=%s..%s", start.Format(time.RFC3339), end.Format(time.RFC3339)), 2048)
_ = s.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsAggDailyJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
LastResult: &result,
})
}
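truncateString is called with a 2048-character cap here and in the other jobs below, but its definition is not touched by this commit. A minimal sketch, assuming plain byte-length truncation:

package ops // assumed package name; not shown in the diff

// truncateString caps s at maxLen bytes. Assumed behavior only; the real
// helper lives elsewhere in the codebase and may differ (e.g. rune-safe).
func truncateString(s string, maxLen int) string {
    if maxLen <= 0 || len(s) <= maxLen {
        return s
    }
    return s[:maxLen]
}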

View File

@@ -190,6 +190,13 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
return
}
rulesTotal := len(rules)
rulesEnabled := 0
rulesEvaluated := 0
eventsCreated := 0
eventsResolved := 0
emailsSent := 0
now := time.Now().UTC()
safeEnd := now.Truncate(time.Minute)
if safeEnd.IsZero() {
@@ -205,6 +212,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
if rule == nil || !rule.Enabled || rule.ID <= 0 {
continue
}
rulesEnabled++
scopePlatform, scopeGroupID, scopeRegion := parseOpsAlertRuleScope(rule.Filters)
@@ -220,6 +228,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
s.resetRuleState(rule.ID, now)
continue
}
rulesEvaluated++
breachedNow := compareMetric(metricValue, rule.Operator, rule.Threshold)
required := requiredSustainedBreaches(rule.SustainedMinutes, interval)
@@ -278,8 +287,11 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
continue
}
eventsCreated++
if created != nil && created.ID > 0 {
s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created)
if s.maybeSendAlertEmail(ctx, runtimeCfg, rule, created) {
emailsSent++
}
}
continue
}
@@ -289,11 +301,14 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
resolvedAt := now
if err := s.opsRepo.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
log.Printf("[OpsAlertEvaluator] resolve event failed (event=%d): %v", activeEvent.ID, err)
} else {
eventsResolved++
}
}
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
result := truncateString(fmt.Sprintf("rules=%d enabled=%d evaluated=%d created=%d resolved=%d emails_sent=%d", rulesTotal, rulesEnabled, rulesEvaluated, eventsCreated, eventsResolved, emailsSent), 2048)
s.recordHeartbeatSuccess(runAt, time.Since(startedAt), result)
}
func (s *OpsAlertEvaluatorService) pruneRuleStates(rules []*OpsAlertRule) {
@@ -585,32 +600,32 @@ func buildOpsAlertDescription(rule *OpsAlertRule, value float64, windowMinutes i
)
}
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) {
func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runtimeCfg *OpsAlertRuntimeSettings, rule *OpsAlertRule, event *OpsAlertEvent) bool {
if s == nil || s.emailService == nil || s.opsService == nil || event == nil || rule == nil {
return
return false
}
if event.EmailSent {
return
return false
}
if !rule.NotifyEmail {
return
return false
}
emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
if err != nil || emailCfg == nil || !emailCfg.Alert.Enabled {
return
return false
}
if len(emailCfg.Alert.Recipients) == 0 {
return
return false
}
if !shouldSendOpsAlertEmailByMinSeverity(strings.TrimSpace(emailCfg.Alert.MinSeverity), strings.TrimSpace(rule.Severity)) {
return
return false
}
if runtimeCfg != nil && runtimeCfg.Silencing.Enabled {
if isOpsAlertSilenced(time.Now().UTC(), rule, event, runtimeCfg.Silencing) {
return
return false
}
}
@@ -639,6 +654,7 @@ func (s *OpsAlertEvaluatorService) maybeSendAlertEmail(ctx context.Context, runt
if anySent {
_ = s.opsRepo.UpdateAlertEventEmailSent(context.Background(), event.ID, true)
}
return anySent
}
func buildOpsAlertEmailBody(rule *OpsAlertRule, event *OpsAlertEvent) string {
@@ -806,7 +822,7 @@ func (s *OpsAlertEvaluatorService) maybeLogSkip(key string) {
log.Printf("[OpsAlertEvaluator] leader lock held by another instance; skipping (key=%q)", key)
}
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, result string) {
if s == nil || s.opsRepo == nil {
return
}
@@ -814,11 +830,17 @@ func (s *OpsAlertEvaluatorService) recordHeartbeatSuccess(runAt time.Time, durat
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
msg := strings.TrimSpace(result)
if msg == "" {
msg = "ok"
}
msg = truncateString(msg, 2048)
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsAlertEvaluatorJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
LastResult: &msg,
})
}
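The evaluation loop above relies on requiredSustainedBreaches(rule.SustainedMinutes, interval), which is defined outside this diff. A sketch under the assumption that it converts the rule's sustained-minutes setting into a count of consecutive breached evaluation ticks, with a floor of one:

package ops // assumed package name; not shown in the diff

import "time"

// requiredSustainedBreaches: assumed behavior only. Translates sustained
// minutes into the number of consecutive breached ticks at the evaluator's
// interval; never less than one so a single breach can still open an event.
func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int {
    if sustainedMinutes <= 0 || interval <= 0 {
        return 1
    }
    ticks := int((time.Duration(sustainedMinutes) * time.Minute) / interval)
    if ticks < 1 {
        return 1
    }
    return ticks
}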

View File

@@ -149,7 +149,7 @@ func (s *OpsCleanupService) runScheduled() {
log.Printf("[OpsCleanup] cleanup failed: %v", err)
return
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
s.recordHeartbeatSuccess(runAt, time.Since(startedAt), counts)
log.Printf("[OpsCleanup] cleanup complete: %s", counts)
}
@@ -330,12 +330,13 @@ func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), b
return release, true
}
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, counts opsCleanupDeletedCounts) {
if s == nil || s.opsRepo == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
result := truncateString(counts.String(), 2048)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
@@ -343,6 +344,7 @@ func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration tim
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
LastResult: &result,
})
}
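opsCleanupDeletedCounts and its String method are referenced above but defined elsewhere. The sketch below uses hypothetical field names purely to show how counts.String() can serve as both the log line and the heartbeat last_result summary:

package ops // assumed package name; not shown in the diff

import "fmt"

// Hypothetical shape; the real struct's fields are not visible in this diff.
type opsCleanupDeletedCounts struct {
    MetricsDeleted int64
    EventsDeleted  int64
}

// String renders a compact, human-readable summary of what was deleted.
func (c opsCleanupDeletedCounts) String() string {
    return fmt.Sprintf("metrics=%d events=%d", c.MetricsDeleted, c.EventsDeleted)
}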

View File

@@ -235,6 +235,9 @@ type OpsUpsertJobHeartbeatInput struct {
LastErrorAt *time.Time
LastError *string
LastDurationMs *int64
// LastResult is an optional human-readable summary of the last successful run.
LastResult *string
}
type OpsJobHeartbeat struct {
@@ -245,6 +248,7 @@ type OpsJobHeartbeat struct {
LastErrorAt *time.Time `json:"last_error_at"`
LastError *string `json:"last_error"`
LastDurationMs *int64 `json:"last_duration_ms"`
LastResult *string `json:"last_result"`
UpdatedAt time.Time `json:"updated_at"`
}
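To make the contract concrete: LastResult only describes successful runs, and the upsert's CASE keeps the previous summary when a run fails. A purely illustrative caller (exampleHeartbeatInputs, the job name, and the literal strings are hypothetical) might build the two variants like this:

package ops // assumed package name; not shown in the diff

import "time"

// Illustrative only: how a job could fill the input on success vs. failure.
func exampleHeartbeatInputs() (success, failure *OpsUpsertJobHeartbeatInput) {
    now := time.Now().UTC()
    durMs := int64(1250)
    summary := "window=2026-01-15T12:00:00Z..2026-01-15T13:00:00Z" // hypothetical summary
    errMsg := "aggregate failed: context deadline exceeded"        // hypothetical error

    success = &OpsUpsertJobHeartbeatInput{
        JobName:        "example_job", // hypothetical job name
        LastRunAt:      &now,
        LastSuccessAt:  &now,
        LastDurationMs: &durMs,
        LastResult:     &summary, // persisted because LastSuccessAt is set
    }
    failure = &OpsUpsertJobHeartbeatInput{
        JobName:        "example_job",
        LastRunAt:      &now,
        LastErrorAt:    &now,
        LastError:      &errMsg,
        LastDurationMs: &durMs,
        // LastResult left nil: the upsert keeps the last successful summary.
    }
    return success, failure
}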

View File

@@ -177,6 +177,10 @@ func (s *OpsScheduledReportService) runOnce() {
return
}
reportsTotal := len(reports)
reportsDue := 0
sentAttempts := 0
for _, report := range reports {
if report == nil || !report.Enabled {
continue
@@ -184,14 +188,18 @@ func (s *OpsScheduledReportService) runOnce() {
if report.NextRunAt.After(now) {
continue
}
reportsDue++
if err := s.runReport(ctx, report, now); err != nil {
attempts, err := s.runReport(ctx, report, now)
if err != nil {
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
return
}
sentAttempts += attempts
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
result := truncateString(fmt.Sprintf("reports=%d due=%d send_attempts=%d", reportsTotal, reportsDue, sentAttempts), 2048)
s.recordHeartbeatSuccess(runAt, time.Since(startedAt), result)
}
type opsScheduledReport struct {
@@ -297,9 +305,9 @@ func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, no
return out
}
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) (int, error) {
if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
return nil
return 0, nil
}
if ctx == nil {
ctx = context.Background()
@@ -310,11 +318,11 @@ func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsSc
content, err := s.generateReportHTML(ctx, report, now)
if err != nil {
return err
return 0, err
}
if strings.TrimSpace(content) == "" {
// Skip sending when the report decides not to emit content (e.g., digest below min count).
return nil
return 0, nil
}
recipients := report.Recipients
@@ -325,22 +333,24 @@ func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsSc
}
}
if len(recipients) == 0 {
return nil
return 0, nil
}
subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))
attempts := 0
for _, to := range recipients {
addr := strings.TrimSpace(to)
if addr == "" {
continue
}
attempts++
if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
// Ignore per-recipient failures; continue best-effort.
continue
}
}
return nil
return attempts, nil
}
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
@@ -650,7 +660,7 @@ func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType
_ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err()
}
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, result string) {
if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
return
}
@@ -658,11 +668,17 @@ func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, dura
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
msg := strings.TrimSpace(result)
if msg == "" {
msg = "ok"
}
msg = truncateString(msg, 2048)
_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsScheduledReportJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
LastResult: &msg,
})
}
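recordHeartbeatError is invoked in runOnce above but is not touched by this commit. Assuming it mirrors recordHeartbeatSuccess, it would fill the error fields and leave LastResult unset so the previous successful summary survives the failed run. A sketch under that assumption:

package ops // assumed package name; not shown in the diff

import (
    "context"
    "time"
)

// Assumed shape only; the real method exists elsewhere and may differ.
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, runErr error) {
    if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || runErr == nil {
        return
    }
    now := time.Now().UTC()
    durMs := duration.Milliseconds()
    msg := truncateString(runErr.Error(), 2048)
    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()
    _ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
        JobName:        opsScheduledReportJobName,
        LastRunAt:      &runAt,
        LastErrorAt:    &now,
        LastError:      &msg,
        LastDurationMs: &durMs,
        // LastResult intentionally nil on failure.
    })
}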

View File

@@ -0,0 +1,6 @@
-- Add last_result to ops_job_heartbeats for UI job details.
ALTER TABLE IF EXISTS ops_job_heartbeats
ADD COLUMN IF NOT EXISTS last_result TEXT;
COMMENT ON COLUMN ops_job_heartbeats.last_result IS 'Last successful run result summary (human readable).';