Merge branch 'Wei-Shaw:main' into main
@@ -9,16 +9,19 @@ import (
 )
 
 type Account struct {
-	ID          int64
-	Name        string
-	Notes       *string
-	Platform    string
-	Type        string
-	Credentials map[string]any
-	Extra       map[string]any
-	ProxyID     *int64
-	Concurrency int
-	Priority    int
+	ID          int64
+	Name        string
+	Notes       *string
+	Platform    string
+	Type        string
+	Credentials map[string]any
+	Extra       map[string]any
+	ProxyID     *int64
+	Concurrency int
+	Priority    int
+	// RateMultiplier is the account billing rate multiplier (>= 0; 0 means the account bills at 0).
+	// It is a pointer for compatibility with older scheduler cache entries (Redis) that may lack
+	// the field: nil is treated as 1.0.
+	RateMultiplier *float64
 	Status       string
 	ErrorMessage string
 	LastUsedAt   *time.Time

@@ -57,6 +60,20 @@ func (a *Account) IsActive() bool {
 	return a.Status == StatusActive
 }
 
+// BillingRateMultiplier returns the account billing rate multiplier.
+// - nil means not configured, or missing from an old cache entry; treated as 1.0
+// - 0 is allowed and means the account bills at 0
+// - negative values are invalid data and fall back to 1.0 for safety
+func (a *Account) BillingRateMultiplier() float64 {
+	if a == nil || a.RateMultiplier == nil {
+		return 1.0
+	}
+	if *a.RateMultiplier < 0 {
+		return 1.0
+	}
+	return *a.RateMultiplier
+}
+
 func (a *Account) IsSchedulable() bool {
 	if !a.IsActive() || !a.Schedulable {
 		return false
@@ -0,0 +1,27 @@
+package service
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestAccount_BillingRateMultiplier_DefaultsToOneWhenNil(t *testing.T) {
+	var a Account
+	require.NoError(t, json.Unmarshal([]byte(`{"id":1,"name":"acc","status":"active"}`), &a))
+	require.Nil(t, a.RateMultiplier)
+	require.Equal(t, 1.0, a.BillingRateMultiplier())
+}
+
+func TestAccount_BillingRateMultiplier_AllowsZero(t *testing.T) {
+	v := 0.0
+	a := Account{RateMultiplier: &v}
+	require.Equal(t, 0.0, a.BillingRateMultiplier())
+}
+
+func TestAccount_BillingRateMultiplier_NegativeFallsBackToOne(t *testing.T) {
+	v := -1.0
+	a := Account{RateMultiplier: &v}
+	require.Equal(t, 1.0, a.BillingRateMultiplier())
+}
@@ -63,14 +63,15 @@ type AccountRepository interface {
 // AccountBulkUpdate describes the fields that can be updated in a bulk operation.
 // Nil pointers mean "do not change".
 type AccountBulkUpdate struct {
-	Name        *string
-	ProxyID     *int64
-	Concurrency *int
-	Priority    *int
-	Status      *string
-	Schedulable *bool
-	Credentials map[string]any
-	Extra       map[string]any
+	Name           *string
+	ProxyID        *int64
+	Concurrency    *int
+	Priority       *int
+	RateMultiplier *float64
+	Status         *string
+	Schedulable    *bool
+	Credentials    map[string]any
+	Extra          map[string]any
 }
 
 // CreateAccountRequest is the request payload for creating an account.
@@ -32,8 +32,8 @@ type UsageLogRepository interface {
 
 	// Admin dashboard stats
 	GetDashboardStats(ctx context.Context) (*usagestats.DashboardStats, error)
-	GetUsageTrendWithFilters(ctx context.Context, startTime, endTime time.Time, granularity string, userID, apiKeyID int64) ([]usagestats.TrendDataPoint, error)
-	GetModelStatsWithFilters(ctx context.Context, startTime, endTime time.Time, userID, apiKeyID, accountID int64) ([]usagestats.ModelStat, error)
+	GetUsageTrendWithFilters(ctx context.Context, startTime, endTime time.Time, granularity string, userID, apiKeyID, accountID, groupID int64, model string, stream *bool) ([]usagestats.TrendDataPoint, error)
+	GetModelStatsWithFilters(ctx context.Context, startTime, endTime time.Time, userID, apiKeyID, accountID, groupID int64, stream *bool) ([]usagestats.ModelStat, error)
 	GetAPIKeyUsageTrend(ctx context.Context, startTime, endTime time.Time, granularity string, limit int) ([]usagestats.APIKeyUsageTrendPoint, error)
 	GetUserUsageTrend(ctx context.Context, startTime, endTime time.Time, granularity string, limit int) ([]usagestats.UserUsageTrendPoint, error)
 	GetBatchUserUsageStats(ctx context.Context, userIDs []int64) (map[int64]*usagestats.BatchUserUsageStats, error)
@@ -96,10 +96,16 @@ func NewUsageCache() *UsageCache {
 }
 
 // WindowStats holds statistics for a time window.
+//
+// cost: account-scope cost (total_cost * account_rate_multiplier)
+// standard_cost: standard cost (total_cost, without any multiplier)
+// user_cost: user / API key scope cost (actual_cost, affected by the group multiplier)
 type WindowStats struct {
-	Requests int64   `json:"requests"`
-	Tokens   int64   `json:"tokens"`
-	Cost     float64 `json:"cost"`
+	Requests     int64   `json:"requests"`
+	Tokens       int64   `json:"tokens"`
+	Cost         float64 `json:"cost"`
+	StandardCost float64 `json:"standard_cost"`
+	UserCost     float64 `json:"user_cost"`
 }
 
 // UsageProgress tracks usage progress.
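The three cost views differ only in which multiplier is applied. A minimal sketch of the relationship, using the field names introduced in this change (the helper itself is hypothetical and not part of the diff):

```go
// windowCosts illustrates the three cost views documented on WindowStats.
// accountRateMultiplier is nil for rows written before this change.
func windowCosts(totalCost, actualCost float64, accountRateMultiplier *float64) (cost, standardCost, userCost float64) {
	m := 1.0 // old rows without the field are treated as 1.0
	if accountRateMultiplier != nil {
		m = *accountRateMultiplier
	}
	cost = totalCost * m  // account-scope cost
	standardCost = totalCost
	userCost = actualCost // already adjusted by the group multiplier
	return cost, standardCost, userCost
}
```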
@@ -266,7 +272,7 @@ func (s *AccountUsageService) getGeminiUsage(ctx context.Context, account *Accou
 	}
 
 	dayStart := geminiDailyWindowStart(now)
-	stats, err := s.usageLogRepo.GetModelStatsWithFilters(ctx, dayStart, now, 0, 0, account.ID)
+	stats, err := s.usageLogRepo.GetModelStatsWithFilters(ctx, dayStart, now, 0, 0, account.ID, 0, nil)
 	if err != nil {
 		return nil, fmt.Errorf("get gemini usage stats failed: %w", err)
 	}
@@ -288,7 +294,7 @@ func (s *AccountUsageService) getGeminiUsage(ctx context.Context, account *Accou
 	// Minute window (RPM) - fixed-window approximation: current minute [truncate(now), truncate(now)+1m)
 	minuteStart := now.Truncate(time.Minute)
 	minuteResetAt := minuteStart.Add(time.Minute)
-	minuteStats, err := s.usageLogRepo.GetModelStatsWithFilters(ctx, minuteStart, now, 0, 0, account.ID)
+	minuteStats, err := s.usageLogRepo.GetModelStatsWithFilters(ctx, minuteStart, now, 0, 0, account.ID, 0, nil)
 	if err != nil {
 		return nil, fmt.Errorf("get gemini minute usage stats failed: %w", err)
 	}
@@ -377,9 +383,11 @@ func (s *AccountUsageService) addWindowStats(ctx context.Context, account *Accou
 	}
 
 	windowStats = &WindowStats{
-		Requests: stats.Requests,
-		Tokens:   stats.Tokens,
-		Cost:     stats.Cost,
+		Requests:     stats.Requests,
+		Tokens:       stats.Tokens,
+		Cost:         stats.Cost,
+		StandardCost: stats.StandardCost,
+		UserCost:     stats.UserCost,
 	}
 
 	// Cache the window stats for 1 minute.
@@ -403,9 +411,11 @@ func (s *AccountUsageService) GetTodayStats(ctx context.Context, accountID int64
 	}
 
 	return &WindowStats{
-		Requests: stats.Requests,
-		Tokens:   stats.Tokens,
-		Cost:     stats.Cost,
+		Requests:     stats.Requests,
+		Tokens:       stats.Tokens,
+		Cost:         stats.Cost,
+		StandardCost: stats.StandardCost,
+		UserCost:     stats.UserCost,
 	}, nil
 }
@@ -54,7 +54,8 @@ type AdminService interface {
 	CreateProxy(ctx context.Context, input *CreateProxyInput) (*Proxy, error)
 	UpdateProxy(ctx context.Context, id int64, input *UpdateProxyInput) (*Proxy, error)
 	DeleteProxy(ctx context.Context, id int64) error
-	GetProxyAccounts(ctx context.Context, proxyID int64, page, pageSize int) ([]Account, int64, error)
+	BatchDeleteProxies(ctx context.Context, ids []int64) (*ProxyBatchDeleteResult, error)
+	GetProxyAccounts(ctx context.Context, proxyID int64) ([]ProxyAccountSummary, error)
 	CheckProxyExists(ctx context.Context, host string, port int, username, password string) (bool, error)
 	TestProxy(ctx context.Context, id int64) (*ProxyTestResult, error)
@@ -136,6 +137,7 @@ type CreateAccountInput struct {
 	ProxyID            *int64
 	Concurrency        int
 	Priority           int
+	RateMultiplier     *float64 // account billing rate multiplier (>= 0; 0 is allowed)
 	GroupIDs           []int64
 	ExpiresAt          *int64
 	AutoPauseOnExpired *bool
@@ -151,8 +153,9 @@ type UpdateAccountInput struct {
 	Credentials map[string]any
 	Extra       map[string]any
 	ProxyID     *int64
-	Concurrency *int // pointer distinguishes "not provided" from "set to 0"
-	Priority    *int // pointer distinguishes "not provided" from "set to 0"
+	Concurrency    *int     // pointer distinguishes "not provided" from "set to 0"
+	Priority       *int     // pointer distinguishes "not provided" from "set to 0"
+	RateMultiplier *float64 // account billing rate multiplier (>= 0; 0 is allowed)
 	Status      string
 	GroupIDs    *[]int64
 	ExpiresAt   *int64
@@ -162,16 +165,17 @@ type UpdateAccountInput struct {
 
 // BulkUpdateAccountsInput describes the payload for bulk updating accounts.
 type BulkUpdateAccountsInput struct {
-	AccountIDs  []int64
-	Name        string
-	ProxyID     *int64
-	Concurrency *int
-	Priority    *int
-	Status      string
-	Schedulable *bool
-	GroupIDs    *[]int64
-	Credentials map[string]any
-	Extra       map[string]any
+	AccountIDs     []int64
+	Name           string
+	ProxyID        *int64
+	Concurrency    *int
+	Priority       *int
+	RateMultiplier *float64 // account billing rate multiplier (>= 0; 0 is allowed)
+	Status         string
+	Schedulable    *bool
+	GroupIDs       *[]int64
+	Credentials    map[string]any
+	Extra          map[string]any
 	// SkipMixedChannelCheck skips the mixed channel risk check when binding groups.
 	// This should only be set when the caller has explicitly confirmed the risk.
 	SkipMixedChannelCheck bool
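Because nil pointers mean "do not change" (per the AccountBulkUpdate doc comment above), a bulk request can adjust the billing multiplier alone without touching any other column. A minimal sketch of such a call site, with hypothetical IDs and an assumed `svc AdminService` and `ctx`:

```go
// Hypothetical caller: set rate_multiplier to 0.5 on three accounts,
// leaving all other fields unchanged (nil pointers are skipped).
m := 0.5
input := &BulkUpdateAccountsInput{
	AccountIDs:     []int64{101, 102, 103},
	RateMultiplier: &m,
}
if _, err := svc.BulkUpdateAccounts(ctx, input); err != nil {
	log.Printf("bulk update failed: %v", err)
}
```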
@@ -220,6 +224,16 @@ type GenerateRedeemCodesInput struct {
 	ValidityDays int // subscription codes only: validity in days
 }
 
+type ProxyBatchDeleteResult struct {
+	DeletedIDs []int64                   `json:"deleted_ids"`
+	Skipped    []ProxyBatchDeleteSkipped `json:"skipped"`
+}
+
+type ProxyBatchDeleteSkipped struct {
+	ID     int64  `json:"id"`
+	Reason string `json:"reason"`
+}
+
 // ProxyTestResult represents the result of testing a proxy
 type ProxyTestResult struct {
 	Success bool `json:"success"`
@@ -254,6 +268,7 @@ type adminServiceImpl struct {
 	redeemCodeRepo       RedeemCodeRepository
 	billingCacheService  *BillingCacheService
 	proxyProber          ProxyExitInfoProber
+	proxyLatencyCache    ProxyLatencyCache
 	authCacheInvalidator APIKeyAuthCacheInvalidator
 }
@@ -267,6 +282,7 @@ func NewAdminService(
 	redeemCodeRepo RedeemCodeRepository,
 	billingCacheService *BillingCacheService,
 	proxyProber ProxyExitInfoProber,
+	proxyLatencyCache ProxyLatencyCache,
 	authCacheInvalidator APIKeyAuthCacheInvalidator,
 ) AdminService {
 	return &adminServiceImpl{

@@ -278,6 +294,7 @@ func NewAdminService(
 		redeemCodeRepo:       redeemCodeRepo,
 		billingCacheService:  billingCacheService,
 		proxyProber:          proxyProber,
+		proxyLatencyCache:    proxyLatencyCache,
 		authCacheInvalidator: authCacheInvalidator,
 	}
 }
@@ -817,6 +834,12 @@ func (s *adminServiceImpl) CreateAccount(ctx context.Context, input *CreateAccou
 	} else {
 		account.AutoPauseOnExpired = true
 	}
+	if input.RateMultiplier != nil {
+		if *input.RateMultiplier < 0 {
+			return nil, errors.New("rate_multiplier must be >= 0")
+		}
+		account.RateMultiplier = input.RateMultiplier
+	}
 	if err := s.accountRepo.Create(ctx, account); err != nil {
 		return nil, err
 	}
@@ -869,6 +892,12 @@ func (s *adminServiceImpl) UpdateAccount(ctx context.Context, id int64, input *U
 	if input.Priority != nil {
 		account.Priority = *input.Priority
 	}
+	if input.RateMultiplier != nil {
+		if *input.RateMultiplier < 0 {
+			return nil, errors.New("rate_multiplier must be >= 0")
+		}
+		account.RateMultiplier = input.RateMultiplier
+	}
 	if input.Status != "" {
 		account.Status = input.Status
 	}
@@ -942,6 +971,12 @@ func (s *adminServiceImpl) BulkUpdateAccounts(ctx context.Context, input *BulkUp
 		}
 	}
 
+	if input.RateMultiplier != nil {
+		if *input.RateMultiplier < 0 {
+			return nil, errors.New("rate_multiplier must be >= 0")
+		}
+	}
+
 	// Prepare bulk updates for columns and JSONB fields.
 	repoUpdates := AccountBulkUpdate{
 		Credentials: input.Credentials,
@@ -959,6 +994,9 @@ func (s *adminServiceImpl) BulkUpdateAccounts(ctx context.Context, input *BulkUp
 	if input.Priority != nil {
 		repoUpdates.Priority = input.Priority
 	}
+	if input.RateMultiplier != nil {
+		repoUpdates.RateMultiplier = input.RateMultiplier
+	}
 	if input.Status != "" {
 		repoUpdates.Status = &input.Status
 	}
@@ -1069,6 +1107,7 @@ func (s *adminServiceImpl) ListProxiesWithAccountCount(ctx context.Context, page
 	if err != nil {
 		return nil, 0, err
 	}
+	s.attachProxyLatency(ctx, proxies)
 	return proxies, result.Total, nil
 }
@@ -1077,7 +1116,12 @@ func (s *adminServiceImpl) GetAllProxies(ctx context.Context) ([]Proxy, error) {
 }
 
 func (s *adminServiceImpl) GetAllProxiesWithAccountCount(ctx context.Context) ([]ProxyWithAccountCount, error) {
-	return s.proxyRepo.ListActiveWithAccountCount(ctx)
+	proxies, err := s.proxyRepo.ListActiveWithAccountCount(ctx)
+	if err != nil {
+		return nil, err
+	}
+	s.attachProxyLatency(ctx, proxies)
+	return proxies, nil
 }
 
 func (s *adminServiceImpl) GetProxy(ctx context.Context, id int64) (*Proxy, error) {
@@ -1097,6 +1141,8 @@ func (s *adminServiceImpl) CreateProxy(ctx context.Context, input *CreateProxyIn
 	if err := s.proxyRepo.Create(ctx, proxy); err != nil {
 		return nil, err
 	}
+	// Probe latency asynchronously so creation isn't blocked by network timeout.
+	go s.probeProxyLatency(context.Background(), proxy)
 	return proxy, nil
 }
@@ -1135,12 +1181,53 @@ func (s *adminServiceImpl) UpdateProxy(ctx context.Context, id int64, input *Upd
 }
 
 func (s *adminServiceImpl) DeleteProxy(ctx context.Context, id int64) error {
+	count, err := s.proxyRepo.CountAccountsByProxyID(ctx, id)
+	if err != nil {
+		return err
+	}
+	if count > 0 {
+		return ErrProxyInUse
+	}
 	return s.proxyRepo.Delete(ctx, id)
 }
 
-func (s *adminServiceImpl) GetProxyAccounts(ctx context.Context, proxyID int64, page, pageSize int) ([]Account, int64, error) {
-	// Return mock data for now - would need a dedicated repository method
-	return []Account{}, 0, nil
+func (s *adminServiceImpl) BatchDeleteProxies(ctx context.Context, ids []int64) (*ProxyBatchDeleteResult, error) {
+	result := &ProxyBatchDeleteResult{}
+	if len(ids) == 0 {
+		return result, nil
+	}
+
+	for _, id := range ids {
+		count, err := s.proxyRepo.CountAccountsByProxyID(ctx, id)
+		if err != nil {
+			result.Skipped = append(result.Skipped, ProxyBatchDeleteSkipped{
+				ID:     id,
+				Reason: err.Error(),
+			})
+			continue
+		}
+		if count > 0 {
+			result.Skipped = append(result.Skipped, ProxyBatchDeleteSkipped{
+				ID:     id,
+				Reason: ErrProxyInUse.Error(),
+			})
+			continue
+		}
+		if err := s.proxyRepo.Delete(ctx, id); err != nil {
+			result.Skipped = append(result.Skipped, ProxyBatchDeleteSkipped{
+				ID:     id,
+				Reason: err.Error(),
+			})
+			continue
+		}
+		result.DeletedIDs = append(result.DeletedIDs, id)
+	}
+
+	return result, nil
+}
+
+func (s *adminServiceImpl) GetProxyAccounts(ctx context.Context, proxyID int64) ([]ProxyAccountSummary, error) {
+	return s.proxyRepo.ListAccountSummariesByProxyID(ctx, proxyID)
 }
 
 func (s *adminServiceImpl) CheckProxyExists(ctx context.Context, host string, port int, username, password string) (bool, error) {
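BatchDeleteProxies deliberately returns a partial-success result instead of failing the whole batch. A sketch of how a caller might consume it (handler names and `req.IDs` are assumptions):

```go
// Hypothetical handler fragment: delete proxies and surface what was skipped.
result, err := svc.BatchDeleteProxies(ctx, req.IDs)
if err != nil {
	return err
}
log.Printf("deleted %d proxies", len(result.DeletedIDs))
for _, skipped := range result.Skipped {
	// Proxies still bound to accounts are skipped with ErrProxyInUse as the reason.
	log.Printf("proxy %d skipped: %s", skipped.ID, skipped.Reason)
}
```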
@@ -1240,12 +1327,24 @@ func (s *adminServiceImpl) TestProxy(ctx context.Context, id int64) (*ProxyTestR
 	proxyURL := proxy.URL()
 	exitInfo, latencyMs, err := s.proxyProber.ProbeProxy(ctx, proxyURL)
 	if err != nil {
+		s.saveProxyLatency(ctx, id, &ProxyLatencyInfo{
+			Success:   false,
+			Message:   err.Error(),
+			UpdatedAt: time.Now(),
+		})
 		return &ProxyTestResult{
 			Success: false,
 			Message: err.Error(),
 		}, nil
 	}
 
+	latency := latencyMs
+	s.saveProxyLatency(ctx, id, &ProxyLatencyInfo{
+		Success:   true,
+		LatencyMs: &latency,
+		Message:   "Proxy is accessible",
+		UpdatedAt: time.Now(),
+	})
 	return &ProxyTestResult{
 		Success: true,
 		Message: "Proxy is accessible",
@@ -1257,6 +1356,29 @@ func (s *adminServiceImpl) TestProxy(ctx context.Context, id int64) (*ProxyTestR
 	}, nil
 }
 
+func (s *adminServiceImpl) probeProxyLatency(ctx context.Context, proxy *Proxy) {
+	if s.proxyProber == nil || proxy == nil {
+		return
+	}
+	_, latencyMs, err := s.proxyProber.ProbeProxy(ctx, proxy.URL())
+	if err != nil {
+		s.saveProxyLatency(ctx, proxy.ID, &ProxyLatencyInfo{
+			Success:   false,
+			Message:   err.Error(),
+			UpdatedAt: time.Now(),
+		})
+		return
+	}
+
+	latency := latencyMs
+	s.saveProxyLatency(ctx, proxy.ID, &ProxyLatencyInfo{
+		Success:   true,
+		LatencyMs: &latency,
+		Message:   "Proxy is accessible",
+		UpdatedAt: time.Now(),
+	})
+}
+
 // checkMixedChannelRisk checks whether the given groups mix channels (Antigravity + Anthropic).
 // If they do, it returns an error prompting the user to confirm.
 func (s *adminServiceImpl) checkMixedChannelRisk(ctx context.Context, currentAccountID int64, currentAccountPlatform string, groupIDs []int64) error {
@@ -1306,6 +1428,46 @@ func (s *adminServiceImpl) checkMixedChannelRisk(ctx context.Context, currentAcc
 	return nil
 }
 
+func (s *adminServiceImpl) attachProxyLatency(ctx context.Context, proxies []ProxyWithAccountCount) {
+	if s.proxyLatencyCache == nil || len(proxies) == 0 {
+		return
+	}
+
+	ids := make([]int64, 0, len(proxies))
+	for i := range proxies {
+		ids = append(ids, proxies[i].ID)
+	}
+
+	latencies, err := s.proxyLatencyCache.GetProxyLatencies(ctx, ids)
+	if err != nil {
+		log.Printf("Warning: load proxy latency cache failed: %v", err)
+		return
+	}
+
+	for i := range proxies {
+		info := latencies[proxies[i].ID]
+		if info == nil {
+			continue
+		}
+		if info.Success {
+			proxies[i].LatencyStatus = "success"
+			proxies[i].LatencyMs = info.LatencyMs
+		} else {
+			proxies[i].LatencyStatus = "failed"
+		}
+		proxies[i].LatencyMessage = info.Message
+	}
+}
+
+func (s *adminServiceImpl) saveProxyLatency(ctx context.Context, proxyID int64, info *ProxyLatencyInfo) {
+	if s.proxyLatencyCache == nil || info == nil {
+		return
+	}
+	if err := s.proxyLatencyCache.SetProxyLatency(ctx, proxyID, info); err != nil {
+		log.Printf("Warning: store proxy latency cache failed: %v", err)
+	}
+}
+
 // getAccountPlatform maps an account's platform string to the identifier used by the mixed-channel check.
 func getAccountPlatform(accountPlatform string) string {
 	switch strings.ToLower(strings.TrimSpace(accountPlatform)) {
@@ -12,9 +12,9 @@ import (
 
 type accountRepoStubForBulkUpdate struct {
 	accountRepoStub
-	bulkUpdateErr  error
-	bulkUpdateIDs  []int64
-	bindGroupErrByID map[int64]error
+	bulkUpdateErr    error
+	bulkUpdateIDs    []int64
+	bindGroupErrByID map[int64]error
 }
 
 func (s *accountRepoStubForBulkUpdate) BulkUpdate(_ context.Context, ids []int64, _ AccountBulkUpdate) (int64, error) {
@@ -153,8 +153,10 @@ func (s *groupRepoStub) DeleteAccountGroupsByGroupID(ctx context.Context, groupI
 }
 
 type proxyRepoStub struct {
-	deleteErr  error
-	deletedIDs []int64
+	deleteErr    error
+	countErr     error
+	accountCount int64
+	deletedIDs   []int64
 }
 
 func (s *proxyRepoStub) Create(ctx context.Context, proxy *Proxy) error {

@@ -199,7 +201,14 @@ func (s *proxyRepoStub) ExistsByHostPortAuth(ctx context.Context, host string, p
 }
 
 func (s *proxyRepoStub) CountAccountsByProxyID(ctx context.Context, proxyID int64) (int64, error) {
-	panic("unexpected CountAccountsByProxyID call")
+	if s.countErr != nil {
+		return 0, s.countErr
+	}
+	return s.accountCount, nil
+}
+
+func (s *proxyRepoStub) ListAccountSummariesByProxyID(ctx context.Context, proxyID int64) ([]ProxyAccountSummary, error) {
+	panic("unexpected ListAccountSummariesByProxyID call")
 }
 
 type redeemRepoStub struct {
@@ -409,6 +418,15 @@ func TestAdminService_DeleteProxy_Idempotent(t *testing.T) {
 	require.Equal(t, []int64{404}, repo.deletedIDs)
 }
 
+func TestAdminService_DeleteProxy_InUse(t *testing.T) {
+	repo := &proxyRepoStub{accountCount: 2}
+	svc := &adminServiceImpl{proxyRepo: repo}
+
+	err := svc.DeleteProxy(context.Background(), 77)
+	require.ErrorIs(t, err, ErrProxyInUse)
+	require.Empty(t, repo.deletedIDs)
+}
+
 func TestAdminService_DeleteProxy_Error(t *testing.T) {
 	deleteErr := errors.New("delete failed")
 	repo := &proxyRepoStub{deleteErr: deleteErr}
@@ -564,6 +564,10 @@ urlFallbackLoop:
 		}
 
 		upstreamReq, err := antigravity.NewAPIRequestWithURL(ctx, baseURL, action, accessToken, geminiBody)
+		// Capture upstream request body for ops retry of this attempt.
+		if c != nil {
+			c.Set(OpsUpstreamRequestBodyKey, string(geminiBody))
+		}
 		if err != nil {
 			return nil, err
 		}

@@ -574,6 +578,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "request_error",
 				Message:            safeErr,

@@ -615,6 +620,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry",

@@ -645,6 +651,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry",

@@ -697,6 +704,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "signature_error",

@@ -740,6 +748,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "signature_retry_request_error",
 				Message:            sanitizeUpstreamErrorMessage(retryErr.Error()),

@@ -770,6 +779,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: retryResp.StatusCode,
 				UpstreamRequestID:  retryResp.Header.Get("x-request-id"),
 				Kind:               kind,

@@ -817,6 +827,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "failover",

@@ -1371,6 +1382,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "request_error",
 				Message:            safeErr,

@@ -1412,6 +1424,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry",

@@ -1442,6 +1455,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry",

@@ -1543,6 +1557,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  requestID,
 				Kind:               "failover",

@@ -1559,6 +1574,7 @@ urlFallbackLoop:
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  requestID,
 				Kind:               "http_error",

@@ -2039,6 +2055,7 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, accou
 	appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 		Platform:           account.Platform,
 		AccountID:          account.ID,
+		AccountName:        account.Name,
 		UpstreamStatusCode: upstreamStatus,
 		UpstreamRequestID:  upstreamRequestID,
 		Kind:               "http_error",
@@ -124,16 +124,16 @@ func (s *DashboardService) GetDashboardStats(ctx context.Context) (*usagestats.D
 	return stats, nil
 }
 
-func (s *DashboardService) GetUsageTrendWithFilters(ctx context.Context, startTime, endTime time.Time, granularity string, userID, apiKeyID int64) ([]usagestats.TrendDataPoint, error) {
-	trend, err := s.usageRepo.GetUsageTrendWithFilters(ctx, startTime, endTime, granularity, userID, apiKeyID)
+func (s *DashboardService) GetUsageTrendWithFilters(ctx context.Context, startTime, endTime time.Time, granularity string, userID, apiKeyID, accountID, groupID int64, model string, stream *bool) ([]usagestats.TrendDataPoint, error) {
+	trend, err := s.usageRepo.GetUsageTrendWithFilters(ctx, startTime, endTime, granularity, userID, apiKeyID, accountID, groupID, model, stream)
 	if err != nil {
 		return nil, fmt.Errorf("get usage trend with filters: %w", err)
 	}
 	return trend, nil
 }
 
-func (s *DashboardService) GetModelStatsWithFilters(ctx context.Context, startTime, endTime time.Time, userID, apiKeyID int64) ([]usagestats.ModelStat, error) {
-	stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, startTime, endTime, userID, apiKeyID, 0)
+func (s *DashboardService) GetModelStatsWithFilters(ctx context.Context, startTime, endTime time.Time, userID, apiKeyID, accountID, groupID int64, stream *bool) ([]usagestats.ModelStat, error) {
+	stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, startTime, endTime, userID, apiKeyID, accountID, groupID, stream)
 	if err != nil {
 		return nil, fmt.Errorf("get model stats with filters: %w", err)
 	}
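In the call sites shown in this diff (for example getGeminiUsage above), zero IDs and a nil stream pointer appear to act as "no filter". A sketch of a dashboard query scoped to a single account under that assumption, with `svc`, `ctx`, and `accountID` assumed:

```go
// Hypothetical call: model stats for one account over the last 24 hours.
end := time.Now()
start := end.Add(-24 * time.Hour)
stats, err := svc.GetModelStatsWithFilters(ctx, start, end,
	0,         // userID: any user
	0,         // apiKeyID: any API key
	accountID, // accountID: restrict to this account
	0,         // groupID: any group
	nil,       // stream: streaming and non-streaming alike
)
if err != nil {
	return err
}
log.Printf("stats for %d models", len(stats))
```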
@@ -1466,6 +1466,9 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 	for attempt := 1; attempt <= maxRetryAttempts; attempt++ {
 		// Build the upstream request (rebuilt on every retry because the request body must be re-read).
 		upstreamReq, err := s.buildUpstreamRequest(ctx, c, account, body, token, tokenType, reqModel)
+		// Capture upstream request body for ops retry of this attempt.
+		c.Set(OpsUpstreamRequestBodyKey, string(body))
+
 		if err != nil {
 			return nil, err
 		}

@@ -1482,6 +1485,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "request_error",
 				Message:            safeErr,

@@ -1506,6 +1510,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "signature_error",

@@ -1557,6 +1562,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: retryResp.StatusCode,
 				UpstreamRequestID:  retryResp.Header.Get("x-request-id"),
 				Kind:               "signature_retry_thinking",

@@ -1585,6 +1591,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "signature_retry_tools_request_error",
 				Message:            sanitizeUpstreamErrorMessage(retryErr2.Error()),

@@ -1643,6 +1650,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry",

@@ -1691,6 +1699,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "retry_exhausted_failover",

@@ -1757,6 +1766,7 @@ func (s *GatewayService) Forward(ctx context.Context, c *gin.Context, account *A
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  resp.Header.Get("x-request-id"),
 				Kind:               "failover_on_400",

@@ -2634,30 +2644,32 @@ func (s *GatewayService) RecordUsage(ctx context.Context, input *RecordUsageInpu
 	if result.ImageSize != "" {
 		imageSize = &result.ImageSize
 	}
+	accountRateMultiplier := account.BillingRateMultiplier()
 	usageLog := &UsageLog{
-		UserID:              user.ID,
-		APIKeyID:            apiKey.ID,
-		AccountID:           account.ID,
-		RequestID:           result.RequestID,
-		Model:               result.Model,
-		InputTokens:         result.Usage.InputTokens,
-		OutputTokens:        result.Usage.OutputTokens,
-		CacheCreationTokens: result.Usage.CacheCreationInputTokens,
-		CacheReadTokens:     result.Usage.CacheReadInputTokens,
-		InputCost:           cost.InputCost,
-		OutputCost:          cost.OutputCost,
-		CacheCreationCost:   cost.CacheCreationCost,
-		CacheReadCost:       cost.CacheReadCost,
-		TotalCost:           cost.TotalCost,
-		ActualCost:          cost.ActualCost,
-		RateMultiplier:      multiplier,
-		BillingType:         billingType,
-		Stream:              result.Stream,
-		DurationMs:          &durationMs,
-		FirstTokenMs:        result.FirstTokenMs,
-		ImageCount:          result.ImageCount,
-		ImageSize:           imageSize,
-		CreatedAt:           time.Now(),
+		UserID:                user.ID,
+		APIKeyID:              apiKey.ID,
+		AccountID:             account.ID,
+		RequestID:             result.RequestID,
+		Model:                 result.Model,
+		InputTokens:           result.Usage.InputTokens,
+		OutputTokens:          result.Usage.OutputTokens,
+		CacheCreationTokens:   result.Usage.CacheCreationInputTokens,
+		CacheReadTokens:       result.Usage.CacheReadInputTokens,
+		InputCost:             cost.InputCost,
+		OutputCost:            cost.OutputCost,
+		CacheCreationCost:     cost.CacheCreationCost,
+		CacheReadCost:         cost.CacheReadCost,
+		TotalCost:             cost.TotalCost,
+		ActualCost:            cost.ActualCost,
+		RateMultiplier:        multiplier,
+		AccountRateMultiplier: &accountRateMultiplier,
+		BillingType:           billingType,
+		Stream:                result.Stream,
+		DurationMs:            &durationMs,
+		FirstTokenMs:          result.FirstTokenMs,
+		ImageCount:            result.ImageCount,
+		ImageSize:             imageSize,
+		CreatedAt:             time.Now(),
 	}
 
 	// Attach the UserAgent.
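Note that the account multiplier is snapshotted into each usage log at request time, so later changes to an account's multiplier do not rewrite history: with TotalCost = 0.02 and a captured AccountRateMultiplier of 0.5, the account-scope cost of that request stays 0.02 × 0.5 = 0.01. Rows written before this change carry a nil multiplier and read back as 1.0, matching the fallback in Account.BillingRateMultiplier.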
@@ -545,12 +545,19 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
 		}
 		requestIDHeader = idHeader
 
+		// Capture upstream request body for ops retry of this attempt.
+		if c != nil {
+			// In this code path `body` is already the JSON sent to upstream.
+			c.Set(OpsUpstreamRequestBodyKey, string(body))
+		}
+
 		resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
 		if err != nil {
 			safeErr := sanitizeUpstreamErrorMessage(err.Error())
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "request_error",
 				Message:            safeErr,

@@ -588,6 +595,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  upstreamReqID,
 				Kind:               "signature_error",

@@ -662,6 +670,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  upstreamReqID,
 				Kind:               "retry",

@@ -711,6 +720,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  upstreamReqID,
 				Kind:               "failover",

@@ -737,6 +747,7 @@ func (s *GeminiMessagesCompatService) Forward(ctx context.Context, c *gin.Contex
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  upstreamReqID,
 				Kind:               "failover",

@@ -972,12 +983,19 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
 		}
 		requestIDHeader = idHeader
 
+		// Capture upstream request body for ops retry of this attempt.
+		if c != nil {
+			// In this code path `body` is already the JSON sent to upstream.
+			c.Set(OpsUpstreamRequestBodyKey, string(body))
+		}
+
 		resp, err = s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
 		if err != nil {
 			safeErr := sanitizeUpstreamErrorMessage(err.Error())
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: 0,
 				Kind:               "request_error",
 				Message:            safeErr,

@@ -1036,6 +1054,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  upstreamReqID,
 				Kind:               "retry",

@@ -1120,6 +1139,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  requestID,
 				Kind:               "failover",

@@ -1143,6 +1163,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  requestID,
 				Kind:               "failover",

@@ -1168,6 +1189,7 @@ func (s *GeminiMessagesCompatService) ForwardNative(ctx context.Context, c *gin.
 			appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 				Platform:           account.Platform,
 				AccountID:          account.ID,
+				AccountName:        account.Name,
 				UpstreamStatusCode: resp.StatusCode,
 				UpstreamRequestID:  requestID,
 				Kind:               "http_error",

@@ -1300,6 +1322,7 @@ func (s *GeminiMessagesCompatService) writeGeminiMappedError(c *gin.Context, acc
 	appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 		Platform:           account.Platform,
 		AccountID:          account.ID,
+		AccountName:        account.Name,
 		UpstreamStatusCode: upstreamStatus,
 		UpstreamRequestID:  upstreamRequestID,
 		Kind:               "http_error",
@@ -664,6 +664,11 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
 		proxyURL = account.Proxy.URL()
 	}
 
+	// Capture upstream request body for ops retry of this attempt.
+	if c != nil {
+		c.Set(OpsUpstreamRequestBodyKey, string(body))
+	}
+
 	// Send request
 	resp, err := s.httpUpstream.Do(upstreamReq, proxyURL, account.ID, account.Concurrency)
 	if err != nil {

@@ -673,6 +678,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
 		appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 			Platform:           account.Platform,
 			AccountID:          account.ID,
+			AccountName:        account.Name,
 			UpstreamStatusCode: 0,
 			Kind:               "request_error",
 			Message:            safeErr,

@@ -707,6 +713,7 @@ func (s *OpenAIGatewayService) Forward(ctx context.Context, c *gin.Context, acco
 		appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 			Platform:           account.Platform,
 			AccountID:          account.ID,
+			AccountName:        account.Name,
 			UpstreamStatusCode: resp.StatusCode,
 			UpstreamRequestID:  resp.Header.Get("x-request-id"),
 			Kind:               "failover",

@@ -864,6 +871,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
 		appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 			Platform:           account.Platform,
 			AccountID:          account.ID,
+			AccountName:        account.Name,
 			UpstreamStatusCode: resp.StatusCode,
 			UpstreamRequestID:  resp.Header.Get("x-request-id"),
 			Kind:               "http_error",

@@ -894,6 +902,7 @@ func (s *OpenAIGatewayService) handleErrorResponse(ctx context.Context, resp *ht
 		appendOpsUpstreamError(c, OpsUpstreamErrorEvent{
 			Platform:           account.Platform,
 			AccountID:          account.ID,
+			AccountName:        account.Name,
 			UpstreamStatusCode: resp.StatusCode,
 			UpstreamRequestID:  resp.Header.Get("x-request-id"),
 			Kind:               kind,

@@ -1443,28 +1452,30 @@ func (s *OpenAIGatewayService) RecordUsage(ctx context.Context, input *OpenAIRec
 
 	// Create usage log
 	durationMs := int(result.Duration.Milliseconds())
+	accountRateMultiplier := account.BillingRateMultiplier()
 	usageLog := &UsageLog{
-		UserID:              user.ID,
-		APIKeyID:            apiKey.ID,
-		AccountID:           account.ID,
-		RequestID:           result.RequestID,
-		Model:               result.Model,
-		InputTokens:         actualInputTokens,
-		OutputTokens:        result.Usage.OutputTokens,
-		CacheCreationTokens: result.Usage.CacheCreationInputTokens,
-		CacheReadTokens:     result.Usage.CacheReadInputTokens,
-		InputCost:           cost.InputCost,
-		OutputCost:          cost.OutputCost,
-		CacheCreationCost:   cost.CacheCreationCost,
-		CacheReadCost:       cost.CacheReadCost,
-		TotalCost:           cost.TotalCost,
-		ActualCost:          cost.ActualCost,
-		RateMultiplier:      multiplier,
-		BillingType:         billingType,
-		Stream:              result.Stream,
-		DurationMs:          &durationMs,
-		FirstTokenMs:        result.FirstTokenMs,
-		CreatedAt:           time.Now(),
+		UserID:                user.ID,
+		APIKeyID:              apiKey.ID,
+		AccountID:             account.ID,
+		RequestID:             result.RequestID,
+		Model:                 result.Model,
+		InputTokens:           actualInputTokens,
+		OutputTokens:          result.Usage.OutputTokens,
+		CacheCreationTokens:   result.Usage.CacheCreationInputTokens,
+		CacheReadTokens:       result.Usage.CacheReadInputTokens,
+		InputCost:             cost.InputCost,
+		OutputCost:            cost.OutputCost,
+		CacheCreationCost:     cost.CacheCreationCost,
+		CacheReadCost:         cost.CacheReadCost,
+		TotalCost:             cost.TotalCost,
+		ActualCost:            cost.ActualCost,
+		RateMultiplier:        multiplier,
+		AccountRateMultiplier: &accountRateMultiplier,
+		BillingType:           billingType,
+		Stream:                result.Stream,
+		DurationMs:            &durationMs,
+		FirstTokenMs:          result.FirstTokenMs,
+		CreatedAt:             time.Now(),
 	}
 
 	// Attach the UserAgent.
@@ -206,7 +206,7 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
 			continue
 		}
 
-		scopePlatform, scopeGroupID := parseOpsAlertRuleScope(rule.Filters)
+		scopePlatform, scopeGroupID, scopeRegion := parseOpsAlertRuleScope(rule.Filters)
 
 		windowMinutes := rule.WindowMinutes
 		if windowMinutes <= 0 {

@@ -236,6 +236,17 @@ func (s *OpsAlertEvaluatorService) evaluateOnce(interval time.Duration) {
 			continue
 		}
 
+		// Scoped silencing: if a matching silence exists, skip creating a firing event.
+		if s.opsService != nil {
+			platform := strings.TrimSpace(scopePlatform)
+			region := scopeRegion
+			if platform != "" {
+				if ok, err := s.opsService.IsAlertSilenced(ctx, rule.ID, platform, scopeGroupID, region, now); err == nil && ok {
+					continue
+				}
+			}
+		}
+
 		latestEvent, err := s.opsRepo.GetLatestAlertEvent(ctx, rule.ID)
 		if err != nil {
 			log.Printf("[OpsAlertEvaluator] get latest event failed (rule=%d): %v", rule.ID, err)

@@ -359,9 +370,9 @@ func requiredSustainedBreaches(sustainedMinutes int, interval time.Duration) int
 	return required
 }
 
-func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64) {
+func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *int64, region *string) {
 	if filters == nil {
-		return "", nil
+		return "", nil, nil
 	}
 	if v, ok := filters["platform"]; ok {
 		if s, ok := v.(string); ok {

@@ -392,7 +403,15 @@ func parseOpsAlertRuleScope(filters map[string]any) (platform string, groupID *i
 			}
 		}
 	}
-	return platform, groupID
+	if v, ok := filters["region"]; ok {
+		if s, ok := v.(string); ok {
+			vv := strings.TrimSpace(s)
+			if vv != "" {
+				region = &vv
+			}
+		}
+	}
+	return platform, groupID, region
 }
 
 func (s *OpsAlertEvaluatorService) computeRuleMetric(

@@ -504,16 +523,6 @@ func (s *OpsAlertEvaluatorService) computeRuleMetric(
 			return 0, false
 		}
 		return overview.UpstreamErrorRate * 100, true
-	case "p95_latency_ms":
-		if overview.Duration.P95 == nil {
-			return 0, false
-		}
-		return float64(*overview.Duration.P95), true
-	case "p99_latency_ms":
-		if overview.Duration.P99 == nil {
-			return 0, false
-		}
-		return float64(*overview.Duration.P99), true
 	default:
 		return 0, false
 	}
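A sketch of the scope parsing on a concrete filters map. The values are hypothetical, and the group-ID branch is elided between the hunks above, so its key and type coercion are assumptions:

```go
// Hypothetical rule filters as they might arrive from decoded JSON.
filters := map[string]any{
	"platform": "gemini",
	"group_id": float64(42), // JSON numbers decode to float64; the group_id branch is not shown above
	"region":   " us-east ",
}
platform, groupID, region := parseOpsAlertRuleScope(filters)
// platform == "gemini"; *region == "us-east" after TrimSpace;
// groupID depends on the elided group_id parsing branch.
fmt.Println(platform, groupID, region)
```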
@@ -8,8 +8,9 @@ import "time"
 // with the existing ops dashboard frontend (backup style).
 
 const (
-	OpsAlertStatusFiring   = "firing"
-	OpsAlertStatusResolved = "resolved"
+	OpsAlertStatusFiring         = "firing"
+	OpsAlertStatusResolved       = "resolved"
+	OpsAlertStatusManualResolved = "manual_resolved"
 )
 
 type OpsAlertRule struct {

@@ -58,12 +59,32 @@ type OpsAlertEvent struct {
 	CreatedAt time.Time `json:"created_at"`
 }
 
+type OpsAlertSilence struct {
+	ID int64 `json:"id"`
+
+	RuleID   int64   `json:"rule_id"`
+	Platform string  `json:"platform"`
+	GroupID  *int64  `json:"group_id,omitempty"`
+	Region   *string `json:"region,omitempty"`
+
+	Until  time.Time `json:"until"`
+	Reason string    `json:"reason"`
+
+	CreatedBy *int64    `json:"created_by,omitempty"`
+	CreatedAt time.Time `json:"created_at"`
+}
+
 type OpsAlertEventFilter struct {
 	Limit int
 
 	// Cursor pagination (descending by fired_at, then id).
 	BeforeFiredAt *time.Time
 	BeforeID      *int64
 
 	// Optional filters.
-	Status   string
-	Severity string
+	Status    string
+	Severity  string
+	EmailSent *bool
 
 	StartTime *time.Time
 	EndTime   *time.Time
@@ -88,6 +88,29 @@ func (s *OpsService) ListAlertEvents(ctx context.Context, filter *OpsAlertEventF
 	return s.opsRepo.ListAlertEvents(ctx, filter)
 }
 
+func (s *OpsService) GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return nil, err
+	}
+	if s.opsRepo == nil {
+		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if eventID <= 0 {
+		return nil, infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
+	}
+	ev, err := s.opsRepo.GetAlertEventByID(ctx, eventID)
+	if err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found")
+		}
+		return nil, err
+	}
+	if ev == nil {
+		return nil, infraerrors.NotFound("OPS_ALERT_EVENT_NOT_FOUND", "alert event not found")
+	}
+	return ev, nil
+}
+
 func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
 	if err := s.RequireMonitoringEnabled(ctx); err != nil {
 		return nil, err

@@ -101,6 +124,49 @@ func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*Op
 	return s.opsRepo.GetActiveAlertEvent(ctx, ruleID)
 }
 
+func (s *OpsService) CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return nil, err
+	}
+	if s.opsRepo == nil {
+		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if input == nil {
+		return nil, infraerrors.BadRequest("INVALID_SILENCE", "invalid silence")
+	}
+	if input.RuleID <= 0 {
+		return nil, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
+	}
+	if strings.TrimSpace(input.Platform) == "" {
+		return nil, infraerrors.BadRequest("INVALID_PLATFORM", "invalid platform")
+	}
+	if input.Until.IsZero() {
+		return nil, infraerrors.BadRequest("INVALID_UNTIL", "invalid until")
+	}
+
+	created, err := s.opsRepo.CreateAlertSilence(ctx, input)
+	if err != nil {
+		return nil, err
+	}
+	return created, nil
+}
+
+func (s *OpsService) IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return false, err
+	}
+	if s.opsRepo == nil {
+		return false, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if ruleID <= 0 {
+		return false, infraerrors.BadRequest("INVALID_RULE_ID", "invalid rule id")
+	}
+	if strings.TrimSpace(platform) == "" {
+		return false, nil
+	}
+	return s.opsRepo.IsAlertSilenced(ctx, ruleID, platform, groupID, region, now)
+}
+
 func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
 	if err := s.RequireMonitoringEnabled(ctx); err != nil {
 		return nil, err

@@ -142,7 +208,11 @@ func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64,
 	if eventID <= 0 {
 		return infraerrors.BadRequest("INVALID_EVENT_ID", "invalid event id")
 	}
-	if strings.TrimSpace(status) == "" {
+	status = strings.TrimSpace(status)
+	if status == "" {
+		return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
+	}
+	if status != OpsAlertStatusResolved && status != OpsAlertStatusManualResolved {
 		return infraerrors.BadRequest("INVALID_STATUS", "invalid status")
 	}
 	return s.opsRepo.UpdateAlertEventStatus(ctx, eventID, status, resolvedAt)
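How the two pieces fit together: an operator silences a rule's scope, and the evaluator shown earlier then skips creating firing events for that scope until the silence expires. A minimal sketch with illustrative values (`opsSvc` and `ctx` assumed):

```go
// Hypothetical: silence rule 7 on platform "gemini" for the next two hours.
silence := &OpsAlertSilence{
	RuleID:   7,
	Platform: "gemini",
	Until:    time.Now().Add(2 * time.Hour),
	Reason:   "planned upstream maintenance",
}
if _, err := opsSvc.CreateAlertSilence(ctx, silence); err != nil {
	return err
}
// evaluateOnce (above) consults IsAlertSilenced(ctx, rule.ID, platform, groupID, region, now)
// before emitting a firing event, so this scope stays quiet until Until.
```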
@@ -32,49 +32,38 @@ func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview)
 }
 
 // computeBusinessHealth calculates business health score (0-100)
-// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
+// Components: Error Rate (50%) + TTFT (50%)
 func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
-	// SLA score: 99.5% → 100, 95% → 0 (linear)
-	slaScore := 100.0
-	slaPct := clampFloat64(overview.SLA*100, 0, 100)
-	if slaPct < 99.5 {
-		if slaPct >= 95 {
-			slaScore = (slaPct - 95) / 4.5 * 100
-		} else {
-			slaScore = 0
-		}
-	}
-
-	// Error rate score: 0.5% → 100, 5% → 0 (linear)
+	// Error rate score: 1% → 100, 10% → 0 (linear)
 	// Combines request errors and upstream errors
 	errorScore := 100.0
 	errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
 	upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
 	combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
-	if combinedErrorPct > 0.5 {
-		if combinedErrorPct <= 5 {
-			errorScore = (5 - combinedErrorPct) / 4.5 * 100
+	if combinedErrorPct > 1.0 {
+		if combinedErrorPct <= 10.0 {
+			errorScore = (10.0 - combinedErrorPct) / 9.0 * 100
 		} else {
 			errorScore = 0
 		}
 	}
 
-	// Latency score: 1s → 100, 10s → 0 (linear)
-	// Uses P99 of duration (TTFT is less critical for overall health)
-	latencyScore := 100.0
-	if overview.Duration.P99 != nil {
-		p99 := float64(*overview.Duration.P99)
+	// TTFT score: 1s → 100, 3s → 0 (linear)
+	// Time to first token is critical for user experience
+	ttftScore := 100.0
+	if overview.TTFT.P99 != nil {
+		p99 := float64(*overview.TTFT.P99)
 		if p99 > 1000 {
-			if p99 <= 10000 {
-				latencyScore = (10000 - p99) / 9000 * 100
+			if p99 <= 3000 {
+				ttftScore = (3000 - p99) / 2000 * 100
 			} else {
-				latencyScore = 0
+				ttftScore = 0
 			}
 		}
 	}
 
-	// Weighted combination
-	return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
+	// Weighted combination: 50% error rate + 50% TTFT
+	return errorScore*0.5 + ttftScore*0.5
 }
 
 // computeInfraHealth calculates infrastructure health score (0-100)
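The rescaled formula can be checked against the updated test expectations below: with ErrorRate = 0.05 (5%) and no TTFT sample, errorScore = (10 − 5) / 9 × 100 ≈ 55.6 while ttftScore stays 100, so the score is 0.5 × 55.6 + 0.5 × 100 ≈ 77.8, inside the test's 77–78 band. With a 2 s TTFT P99 and no errors, ttftScore = (3000 − 2000) / 2000 × 100 = 50, giving 0.5 × 100 + 0.5 × 50 = 75, the exact expected value.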
@@ -127,8 +127,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
 				MemoryUsagePercent: float64Ptr(75),
 			},
 		},
-		wantMin: 60,
-		wantMax: 85,
+		wantMin: 96,
+		wantMax: 97,
 	},
 	{
 		name: "DB failure",

@@ -203,8 +203,8 @@ func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
 				MemoryUsagePercent: float64Ptr(30),
 			},
 		},
-		wantMin: 25,
-		wantMax: 50,
+		wantMin: 84,
+		wantMax: 85,
 	},
 	{
 		name: "combined failures - business healthy + infra degraded",

@@ -277,30 +277,41 @@ func TestComputeBusinessHealth(t *testing.T) {
 			UpstreamErrorRate: 0,
 			Duration:          OpsPercentiles{P99: intPtr(500)},
 		},
-		wantMin: 50,
-		wantMax: 60,
+		wantMin: 100,
+		wantMax: 100,
 	},
 	{
-		name: "error rate boundary 0.5%",
+		name: "error rate boundary 1%",
 		overview: &OpsDashboardOverview{
-			SLA:               0.995,
-			ErrorRate:         0.005,
+			SLA:               0.99,
+			ErrorRate:         0.01,
 			UpstreamErrorRate: 0,
 			Duration:          OpsPercentiles{P99: intPtr(500)},
 		},
-		wantMin: 95,
+		wantMin: 100,
 		wantMax: 100,
 	},
 	{
-		name: "latency boundary 1000ms",
+		name: "error rate 5%",
 		overview: &OpsDashboardOverview{
-			SLA:               0.995,
+			SLA:               0.95,
+			ErrorRate:         0.05,
+			UpstreamErrorRate: 0,
+			Duration:          OpsPercentiles{P99: intPtr(500)},
+		},
+		wantMin: 77,
+		wantMax: 78,
+	},
+	{
+		name: "TTFT boundary 2s",
+		overview: &OpsDashboardOverview{
+			SLA:               0.99,
 			ErrorRate:         0,
 			UpstreamErrorRate: 0,
-			Duration:          OpsPercentiles{P99: intPtr(1000)},
+			TTFT:              OpsPercentiles{P99: intPtr(2000)},
 		},
-		wantMin: 95,
-		wantMax: 100,
+		wantMin: 75,
+		wantMax: 75,
 	},
 	{
 		name: "upstream error dominates",

@@ -310,7 +321,7 @@ func TestComputeBusinessHealth(t *testing.T) {
 			UpstreamErrorRate: 0.03,
 			Duration:          OpsPercentiles{P99: intPtr(500)},
 		},
-		wantMin: 75,
+		wantMin: 88,
 		wantMax: 90,
 	},
 }
@@ -6,24 +6,43 @@ type OpsErrorLog struct {
    ID        int64     `json:"id"`
    CreatedAt time.Time `json:"created_at"`

    Phase string `json:"phase"`
    Type  string `json:"type"`
    // Standardized classification
    // - phase: request|auth|routing|upstream|network|internal
    // - owner: client|provider|platform
    // - source: client_request|upstream_http|gateway
    Phase string `json:"phase"`
    Type  string `json:"type"`

    Owner  string `json:"error_owner"`
    Source string `json:"error_source"`

    Severity string `json:"severity"`

    StatusCode int    `json:"status_code"`
    Platform   string `json:"platform"`
    Model      string `json:"model"`

    LatencyMs   *int `json:"latency_ms"`
    IsRetryable bool `json:"is_retryable"`
    RetryCount  int  `json:"retry_count"`

    Resolved           bool       `json:"resolved"`
    ResolvedAt         *time.Time `json:"resolved_at"`
    ResolvedByUserID   *int64     `json:"resolved_by_user_id"`
    ResolvedByUserName string     `json:"resolved_by_user_name"`
    ResolvedRetryID    *int64     `json:"resolved_retry_id"`
    ResolvedStatusRaw  string     `json:"-"`

    ClientRequestID string `json:"client_request_id"`
    RequestID       string `json:"request_id"`
    Message         string `json:"message"`

    UserID    *int64 `json:"user_id"`
    APIKeyID  *int64 `json:"api_key_id"`
    AccountID *int64 `json:"account_id"`
    GroupID   *int64 `json:"group_id"`
    UserID      *int64 `json:"user_id"`
    UserEmail   string `json:"user_email"`
    APIKeyID    *int64 `json:"api_key_id"`
    AccountID   *int64 `json:"account_id"`
    AccountName string `json:"account_name"`
    GroupID     *int64 `json:"group_id"`
    GroupName   string `json:"group_name"`

    ClientIP    *string `json:"client_ip"`
    RequestPath string  `json:"request_path"`
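To make the wire impact of the new classification fields concrete, here is a minimal sketch (same-package code and standard "encoding/json"/"fmt" imports assumed; the field values are illustrative picks from the enumerations in the comments above):

    e := OpsErrorLog{
        Phase:      "upstream",
        Owner:      "provider",
        Source:     "upstream_http",
        Severity:   "error",
        StatusCode: 502,
    }
    b, _ := json.Marshal(e)
    // The output carries "error_owner":"provider" and "error_source":"upstream_http",
    // matching the struct tags above.
    fmt.Println(string(b))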
@@ -67,9 +86,24 @@ type OpsErrorLogFilter struct {
    GroupID   *int64
    AccountID *int64

    StatusCodes []int
    Phase       string
    Query       string
    StatusCodes      []int
    StatusCodesOther bool
    Phase            string
    Owner            string
    Source           string
    Resolved         *bool
    Query            string
    UserQuery        string // Search by user email

    // Optional correlation keys for exact matching.
    RequestID       string
    ClientRequestID string

    // View controls error categorization for list endpoints.
    // - errors: show actionable errors (exclude business-limited / 429 / 529)
    // - excluded: only show excluded errors
    // - all: show everything
    View string

    Page     int
    PageSize int
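The repository side is not part of this excerpt, but the View comment implies a mapping along these lines. A sketch only: the column name and the exact excluded status codes are assumptions derived from the "429 / 529" note above.

    // viewPredicate is a hypothetical helper translating filter.View
    // into a WHERE fragment for the error-log list query.
    func viewPredicate(view string) string {
        switch view {
        case "excluded":
            return "status_code IN (429, 529)" // only business-limited errors
        case "all":
            return "TRUE" // no categorization filter
        default: // "errors"
            return "status_code NOT IN (429, 529)" // actionable errors only
        }
    }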
@@ -90,12 +124,23 @@ type OpsRetryAttempt struct {
    SourceErrorID     int64  `json:"source_error_id"`
    Mode              string `json:"mode"`
    PinnedAccountID   *int64 `json:"pinned_account_id"`
    PinnedAccountName string `json:"pinned_account_name"`

    Status     string     `json:"status"`
    StartedAt  *time.Time `json:"started_at"`
    FinishedAt *time.Time `json:"finished_at"`
    DurationMs *int64     `json:"duration_ms"`

    // Persisted execution results (best-effort)
    Success           *bool   `json:"success"`
    HTTPStatusCode    *int    `json:"http_status_code"`
    UpstreamRequestID *string `json:"upstream_request_id"`
    UsedAccountID     *int64  `json:"used_account_id"`
    UsedAccountName   string  `json:"used_account_name"`
    ResponsePreview   *string `json:"response_preview"`
    ResponseTruncated *bool   `json:"response_truncated"`

    // Optional correlation
    ResultRequestID *string `json:"result_request_id"`
    ResultErrorID   *int64  `json:"result_error_id"`
@@ -14,6 +14,8 @@ type OpsRepository interface {
    InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
    UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
    GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
    ListRetryAttemptsByErrorID(ctx context.Context, sourceErrorID int64, limit int) ([]*OpsRetryAttempt, error)
    UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64, resolvedAt *time.Time) error

    // Lightweight window stats (for realtime WS / quick sampling).
    GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)

@@ -39,12 +41,17 @@ type OpsRepository interface {
    DeleteAlertRule(ctx context.Context, id int64) error

    ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
    GetAlertEventByID(ctx context.Context, eventID int64) (*OpsAlertEvent, error)
    GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
    GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
    CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
    UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
    UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error

    // Alert silences
    CreateAlertSilence(ctx context.Context, input *OpsAlertSilence) (*OpsAlertSilence, error)
    IsAlertSilenced(ctx context.Context, ruleID int64, platform string, groupID *int64, region *string, now time.Time) (bool, error)

    // Pre-aggregation (hourly/daily) used for long-window dashboard performance.
    UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
    UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error

@@ -91,7 +98,6 @@ type OpsInsertErrorLogInput struct {
    // It is set by OpsService.RecordError before persisting.
    UpstreamErrorsJSON *string

    DurationMs         *int
    TimeToFirstTokenMs *int64

    RequestBodyJSON *string // sanitized json string (not raw bytes)

@@ -124,7 +130,15 @@ type OpsUpdateRetryAttemptInput struct {
    FinishedAt time.Time
    DurationMs int64

    // Optional correlation
    // Persisted execution results (best-effort)
    Success           *bool
    HTTPStatusCode    *int
    UpstreamRequestID *string
    UsedAccountID     *int64
    ResponsePreview   *string
    ResponseTruncated *bool

    // Optional correlation (legacy fields kept)
    ResultRequestID *string
    ResultErrorID   *int64
@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool {
    return w.totalWritten > int64(w.limit)
}

const (
    OpsRetryModeUpstreamEvent = "upstream_event"
)

func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err

@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
        return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
    }

    errorLog, err := s.GetErrorLogByID(ctx, errorID)
    if err != nil {
        return nil, err
    }
    if errorLog == nil {
        return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
    }
    if strings.TrimSpace(errorLog.RequestBody) == "" {
        return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
    }

    var pinned *int64
    if mode == OpsRetryModeUpstream {
        if pinnedAccountID != nil && *pinnedAccountID > 0 {
            pinned = pinnedAccountID
        } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
            pinned = errorLog.AccountID
        } else {
            return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
        }
    }

    return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
}

// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors.
// idx is 0-based. It always pins the original event account_id.
func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if idx < 0 {
        return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
    }

    errorLog, err := s.GetErrorLogByID(ctx, errorID)
    if err != nil {
        return nil, err
    }
    if errorLog == nil {
        return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
    }

    events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors)
    if err != nil {
        return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
    }
    if idx >= len(events) {
        return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
    }
    ev := events[idx]
    if ev == nil {
        return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
    }
    if ev.AccountID <= 0 {
        return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
    }

    upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody)
    if upstreamBody == "" {
        return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
    }

    override := *errorLog
    override.RequestBody = upstreamBody
    pinned := ev.AccountID

    // Persist as upstream_event, execute as upstream pinned retry.
    return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override)
}

func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
    latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
    if err != nil && !errors.Is(err, sql.ErrNoRows) {
        return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)

@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
        }
    }

    errorLog, err := s.GetErrorLogByID(ctx, errorID)
    if err != nil {
        return nil, err
    }
    if strings.TrimSpace(errorLog.RequestBody) == "" {
    if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
        return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
    }

    var pinned *int64
    if mode == OpsRetryModeUpstream {
    if execMode == OpsRetryModeUpstream {
        if pinnedAccountID != nil && *pinnedAccountID > 0 {
            pinned = pinnedAccountID
        } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
            pinned = errorLog.AccountID
        } else {
            return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
            return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
        }
    }

@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
    execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
    defer cancel()

    execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
    execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)

    finishedAt := time.Now()
    result.FinishedAt = finishedAt

@@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
        msg := result.ErrorMessage
        updateErrMsg = &msg
    }
    // Keep legacy result_request_id empty; use upstream_request_id instead.
    var resultRequestID *string
    if strings.TrimSpace(result.UpstreamRequestID) != "" {
        v := result.UpstreamRequestID
        resultRequestID = &v
    }

    finalStatus := result.Status
    if strings.TrimSpace(finalStatus) == "" {
        finalStatus = opsRetryStatusFailed
    }

    success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded)
    httpStatus := result.HTTPStatusCode
    upstreamReqID := result.UpstreamRequestID
    usedAccountID := result.UsedAccountID
    preview := result.ResponsePreview
    truncated := result.ResponseTruncated

    if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
        ID:              attemptID,
        Status:          finalStatus,
        FinishedAt:      finishedAt,
        DurationMs:      result.DurationMs,
        ResultRequestID: resultRequestID,
        ErrorMessage:    updateErrMsg,
        ID:                attemptID,
        Status:            finalStatus,
        FinishedAt:        finishedAt,
        DurationMs:        result.DurationMs,
        Success:           &success,
        HTTPStatusCode:    &httpStatus,
        UpstreamRequestID: &upstreamReqID,
        UsedAccountID:     usedAccountID,
        ResponsePreview:   &preview,
        ResponseTruncated: &truncated,
        ResultRequestID:   resultRequestID,
        ErrorMessage:      updateErrMsg,
    }); err != nil {
        // Best-effort: retry itself already executed; do not fail the API response.
        log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
    } else if success {
        if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
            log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
        }
    }

    return result, nil
@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn
        out.Detail = ""
    }

    out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody)
    if out.UpstreamRequestBody != "" {
        // Reuse the same sanitization/trimming strategy as request body storage.
        // Keep it small so it is safe to persist in ops_error_logs JSON.
        sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024)
        if sanitized != "" {
            out.UpstreamRequestBody = sanitized
            if truncated {
                out.Kind = strings.TrimSpace(out.Kind)
                if out.Kind == "" {
                    out.Kind = "upstream"
                }
                out.Kind = out.Kind + ":request_body_truncated"
            }
        } else {
            out.UpstreamRequestBody = ""
        }
    }

    // Drop fully-empty events (can happen if only status code was known).
    if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
        continue

@@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter
    if s.opsRepo == nil {
        return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
    }
    return s.opsRepo.ListErrorLogs(ctx, filter)
    result, err := s.opsRepo.ListErrorLogs(ctx, filter)
    if err != nil {
        log.Printf("[Ops] GetErrorLogs failed: %v", err)
        return nil, err
    }

    return result, nil
}
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {

@@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo
    return detail, nil
}

func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return nil, err
    }
    if s.opsRepo == nil {
        return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if errorID <= 0 {
        return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
    }
    items, err := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit)
    if err != nil {
        if errors.Is(err, sql.ErrNoRows) {
            return []*OpsRetryAttempt{}, nil
        }
        return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(err)
    }
    return items, nil
}

func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error {
    if err := s.RequireMonitoringEnabled(ctx); err != nil {
        return err
    }
    if s.opsRepo == nil {
        return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
    }
    if errorID <= 0 {
        return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
    }
    // Best-effort ensure the error exists
    if _, err := s.opsRepo.GetErrorLogByID(ctx, errorID); err != nil {
        if errors.Is(err, sql.ErrNoRows) {
            return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
        }
        return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
    }
    return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil)
}

func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
    bytesLen = len(raw)
    if len(raw) == 0 {

@@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr
        }
    }

    // Last resort: store a minimal placeholder (still valid JSON).
    placeholder := map[string]any{
        "request_body_truncated": true,
    // Last resort: keep JSON shape but drop big fields.
    // This keeps downstream code that expects certain top-level keys from crashing.
    if root, ok := decoded.(map[string]any); ok {
        placeholder := shallowCopyMap(root)
        placeholder["request_body_truncated"] = true

        // Replace potentially huge arrays/strings, but keep the keys present.
        for _, k := range []string{"messages", "contents", "input", "prompt"} {
            if _, exists := placeholder[k]; exists {
                placeholder[k] = []any{}
            }
        }
        for _, k := range []string{"text"} {
            if _, exists := placeholder[k]; exists {
                placeholder[k] = ""
            }
        }

        encoded4, err4 := json.Marshal(placeholder)
        if err4 == nil {
            if len(encoded4) <= maxBytes {
                return string(encoded4), true, bytesLen
            }
        }
    }
    if model := extractString(decoded, "model"); model != "" {
        placeholder["model"] = model
    }
    encoded4, err4 := json.Marshal(placeholder)

    // Final fallback: minimal valid JSON.
    encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true})
    if err4 != nil {
        return "", true, bytesLen
    }
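shallowCopyMap is referenced above but defined elsewhere in the tree. A plausible definition is a one-level copy: keys are copied, values are shared, which is all the placeholder logic needs before it overwrites the big fields. A sketch:

    // shallowCopyMap copies only the top level of src; nested values
    // still alias the originals (an assumption, matching the usage above).
    func shallowCopyMap(src map[string]any) map[string]any {
        dst := make(map[string]any, len(src))
        for k, v := range src {
            dst[k] = v
        }
        return dst
    }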
@@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr
    }
    return raw, false
}

func extractString(v any, key string) string {
    root, ok := v.(map[string]any)
    if !ok {
        return ""
    }
    s, _ := root[key].(string)
    return strings.TrimSpace(s)
}
@@ -368,9 +368,11 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
        Aggregation: OpsAggregationSettings{
            AggregationEnabled: false,
        },
        IgnoreCountTokensErrors: false,
        AutoRefreshEnabled:      false,
        AutoRefreshIntervalSec:  30,
        IgnoreCountTokensErrors:   false,
        IgnoreContextCanceled:     true,  // Default to true - client disconnects are not errors
        IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue
        AutoRefreshEnabled:        false,
        AutoRefreshIntervalSec:    30,
    }
}
@@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"

func defaultOpsMetricThresholds() *OpsMetricThresholds {
    slaMin := 99.5
    latencyMax := 2000.0
    ttftMax := 500.0
    reqErrMax := 5.0
    upstreamErrMax := 5.0
    return &OpsMetricThresholds{
        SLAPercentMin:               &slaMin,
        LatencyP99MsMax:             &latencyMax,
        TTFTp99MsMax:                &ttftMax,
        RequestErrorRatePercentMax:  &reqErrMax,
        UpstreamErrorRatePercentMax: &upstreamErrMax,

@@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
    if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
        return nil, errors.New("sla_percent_min must be between 0 and 100")
    }
    if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
        return nil, errors.New("latency_p99_ms_max must be >= 0")
    }
    if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
        return nil, errors.New("ttft_p99_ms_max must be >= 0")
    }
@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {

type OpsMetricThresholds struct {
    SLAPercentMin               *float64 `json:"sla_percent_min,omitempty"`                 // turns red when SLA drops below this value
    LatencyP99MsMax             *float64 `json:"latency_p99_ms_max,omitempty"`              // turns red when latency P99 exceeds this value
    TTFTp99MsMax                *float64 `json:"ttft_p99_ms_max,omitempty"`                 // turns red when TTFT P99 exceeds this value
    RequestErrorRatePercentMax  *float64 `json:"request_error_rate_percent_max,omitempty"`  // turns red when the request error rate exceeds this value
    UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // turns red when the upstream error rate exceeds this value
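Because every threshold is a *float64, a nil value naturally disables its check, which is why the two validations above could be dropped. A minimal sketch of how a dashboard might evaluate one upper bound (hypothetical helper, not from this diff):

    // exceedsMax reports whether value breaches an upper bound;
    // a nil bound means the check is disabled.
    func exceedsMax(value float64, max *float64) bool {
        return max != nil && value > *max
    }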
@@ -79,11 +78,13 @@ type OpsAlertRuntimeSettings struct {

// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
    DataRetention           OpsDataRetentionSettings `json:"data_retention"`
    Aggregation             OpsAggregationSettings   `json:"aggregation"`
    IgnoreCountTokensErrors bool                     `json:"ignore_count_tokens_errors"`
    AutoRefreshEnabled      bool                     `json:"auto_refresh_enabled"`
    AutoRefreshIntervalSec  int                      `json:"auto_refresh_interval_seconds"`
    DataRetention             OpsDataRetentionSettings `json:"data_retention"`
    Aggregation               OpsAggregationSettings   `json:"aggregation"`
    IgnoreCountTokensErrors   bool                     `json:"ignore_count_tokens_errors"`
    IgnoreContextCanceled     bool                     `json:"ignore_context_canceled"`
    IgnoreNoAvailableAccounts bool                     `json:"ignore_no_available_accounts"`
    AutoRefreshEnabled        bool                     `json:"auto_refresh_enabled"`
    AutoRefreshIntervalSec    int                      `json:"auto_refresh_interval_seconds"`
}

type OpsDataRetentionSettings struct {
@@ -15,6 +15,11 @@ const (
    OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
    OpsUpstreamErrorDetailKey  = "ops_upstream_error_detail"
    OpsUpstreamErrorsKey       = "ops_upstream_errors"

    // Best-effort capture of the current upstream request body so ops can
    // retry the specific upstream attempt (not just the client request).
    // This value is sanitized+trimmed before being persisted.
    OpsUpstreamRequestBodyKey = "ops_upstream_request_body"
)

func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {

@@ -38,13 +43,21 @@ type OpsUpstreamErrorEvent struct {
    AtUnixMs int64 `json:"at_unix_ms,omitempty"`

    // Context
    Platform  string `json:"platform,omitempty"`
    AccountID int64  `json:"account_id,omitempty"`
    Platform    string `json:"platform,omitempty"`
    AccountID   int64  `json:"account_id,omitempty"`
    AccountName string `json:"account_name,omitempty"`

    // Outcome
    UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
    UpstreamRequestID  string `json:"upstream_request_id,omitempty"`

    // Best-effort upstream request capture (sanitized+trimmed).
    // Required for retrying a specific upstream attempt.
    UpstreamRequestBody string `json:"upstream_request_body,omitempty"`

    // Best-effort upstream response capture (sanitized+trimmed).
    UpstreamResponseBody string `json:"upstream_response_body,omitempty"`

    // Kind: http_error | request_error | retry_exhausted | failover
    Kind string `json:"kind,omitempty"`

@@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
    }
    ev.Platform = strings.TrimSpace(ev.Platform)
    ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
    ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody)
    ev.UpstreamResponseBody = strings.TrimSpace(ev.UpstreamResponseBody)
    ev.Kind = strings.TrimSpace(ev.Kind)
    ev.Message = strings.TrimSpace(ev.Message)
    ev.Detail = strings.TrimSpace(ev.Detail)

@@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
        ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
    }

    // If the caller didn't explicitly pass upstream request body but the gateway
    // stored it on the context, attach it so ops can retry this specific attempt.
    if ev.UpstreamRequestBody == "" {
        if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok {
            if s, ok := v.(string); ok {
                ev.UpstreamRequestBody = strings.TrimSpace(s)
            }
        }
    }

    var existing []*OpsUpstreamErrorEvent
    if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
        if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
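The fallback above expects the gateway to have stashed the outgoing body on the gin context under OpsUpstreamRequestBodyKey. A hypothetical call site (the function name and surrounding code are assumptions; only c.Set and the key are from this diff):

    func forwardToUpstream(c *gin.Context, body []byte) {
        // Capture the outgoing upstream request body so a later
        // appendOpsUpstreamError can attach it for ops retry support.
        c.Set(OpsUpstreamRequestBodyKey, string(body))
        // ... dispatch the request upstream ...
    }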
@@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
|
||||
s := string(raw)
|
||||
return &s
|
||||
}
|
||||
|
||||
func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return []*OpsUpstreamErrorEvent{}, nil
|
||||
}
|
||||
var out []*OpsUpstreamErrorEvent
|
||||
if err := json.Unmarshal([]byte(raw), &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
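marshalOpsUpstreamErrors and ParseOpsUpstreamErrors are inverses, which is what lets RetryUpstreamEvent recover per-attempt context from the persisted JSON. A small round-trip sketch (same-package code assumed; field values are illustrative):

    events := []*OpsUpstreamErrorEvent{{
        Platform:           "claude",
        AccountID:          7,
        UpstreamStatusCode: 529,
        Kind:               "http_error",
        Message:            "overloaded",
    }}
    if raw := marshalOpsUpstreamErrors(events); raw != nil {
        parsed, err := ParseOpsUpstreamErrors(*raw)
        // err == nil, len(parsed) == 1, parsed[0].AccountID == 7
        _, _ = parsed, err
    }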
@@ -31,5 +31,16 @@ func (p *Proxy) URL() string {

type ProxyWithAccountCount struct {
    Proxy
    AccountCount int64
    AccountCount   int64
    LatencyMs      *int64
    LatencyStatus  string
    LatencyMessage string
}

type ProxyAccountSummary struct {
    ID       int64
    Name     string
    Platform string
    Type     string
    Notes    *string
}

backend/internal/service/proxy_latency_cache.go (new file, 18 lines)
@@ -0,0 +1,18 @@
package service

import (
    "context"
    "time"
)

type ProxyLatencyInfo struct {
    Success   bool      `json:"success"`
    LatencyMs *int64    `json:"latency_ms,omitempty"`
    Message   string    `json:"message,omitempty"`
    UpdatedAt time.Time `json:"updated_at"`
}

type ProxyLatencyCache interface {
    GetProxyLatencies(ctx context.Context, proxyIDs []int64) (map[int64]*ProxyLatencyInfo, error)
    SetProxyLatency(ctx context.Context, proxyID int64, info *ProxyLatencyInfo) error
}
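For reference, the contract can be satisfied by a trivial in-memory implementation. This is a hypothetical sketch (the production cache presumably lives in Redis, like the scheduler cache mentioned elsewhere); it assumes "context" and "sync" imports:

    type memoryProxyLatencyCache struct {
        mu   sync.RWMutex
        data map[int64]*ProxyLatencyInfo
    }

    func (c *memoryProxyLatencyCache) GetProxyLatencies(ctx context.Context, proxyIDs []int64) (map[int64]*ProxyLatencyInfo, error) {
        c.mu.RLock()
        defer c.mu.RUnlock()
        out := make(map[int64]*ProxyLatencyInfo, len(proxyIDs))
        for _, id := range proxyIDs {
            if info, ok := c.data[id]; ok {
                out[id] = info // missing IDs are simply absent from the result
            }
        }
        return out, nil
    }

    func (c *memoryProxyLatencyCache) SetProxyLatency(ctx context.Context, proxyID int64, info *ProxyLatencyInfo) error {
        c.mu.Lock()
        defer c.mu.Unlock()
        if c.data == nil {
            c.data = make(map[int64]*ProxyLatencyInfo)
        }
        c.data[proxyID] = info
        return nil
    }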
@@ -10,6 +10,7 @@ import (

var (
    ErrProxyNotFound = infraerrors.NotFound("PROXY_NOT_FOUND", "proxy not found")
    ErrProxyInUse    = infraerrors.Conflict("PROXY_IN_USE", "proxy is in use by accounts")
)

type ProxyRepository interface {

@@ -26,6 +27,7 @@ type ProxyRepository interface {

    ExistsByHostPortAuth(ctx context.Context, host string, port int, username, password string) (bool, error)
    CountAccountsByProxyID(ctx context.Context, proxyID int64) (int64, error)
    ListAccountSummariesByProxyID(ctx context.Context, proxyID int64) ([]ProxyAccountSummary, error)
}

// CreateProxyRequest is the request payload for creating a proxy.
@@ -243,7 +243,7 @@ func (s *RateLimitService) PreCheckUsage(ctx context.Context, account *Account,
    start := geminiDailyWindowStart(now)
    totals, ok := s.getGeminiUsageTotals(account.ID, start, now)
    if !ok {
        stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, start, now, 0, 0, account.ID)
        stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, start, now, 0, 0, account.ID, 0, nil)
        if err != nil {
            return true, err
        }

@@ -290,7 +290,7 @@ func (s *RateLimitService) PreCheckUsage(ctx context.Context, account *Account,

    if limit > 0 {
        start := now.Truncate(time.Minute)
        stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, start, now, 0, 0, account.ID)
        stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, start, now, 0, 0, account.ID, 0, nil)
        if err != nil {
            return true, err
        }
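The two call sites above only grow two trailing arguments. Reading the zero values positionally against the extended signature (the parameter names are assumptions; the interface declaration itself is outside this excerpt), the calls are equivalent to:

    stats, err := s.usageRepo.GetModelStatsWithFilters(ctx, start, now,
        0,          // userID: no user filter
        0,          // apiKeyID: no key filter
        account.ID, // accountID: scope to this account
        0,          // groupID: no group filter (new parameter)
        nil,        // stream: include streaming and non-streaming (new parameter)
    )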
@@ -33,6 +33,8 @@ type UsageLog struct {
    TotalCost      float64
    ActualCost     float64
    RateMultiplier float64
    // AccountRateMultiplier is a snapshot of the account billing rate multiplier
    // (nil means legacy/historical data and is treated as 1.0).
    AccountRateMultiplier *float64

    BillingType int8
    Stream      bool
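One plausible way the snapshot is consumed downstream, following the nil-means-1.0 rule in the comment above (a sketch only; the real billing code is not part of this diff, and the negative-value guard is an assumption):

    func billedCost(l *UsageLog) float64 {
        m := 1.0
        if l.AccountRateMultiplier != nil && *l.AccountRateMultiplier >= 0 {
            m = *l.AccountRateMultiplier // 0 is allowed: the account bills at zero
        }
        return l.TotalCost * m
    }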