feat(openai): 极致优化 OAuth 链路并补齐性能守护
- 优化 /v1/responses 热路径,减少重复解析与不必要拷贝
- 优化并发与 token 竞争路径并补齐运行指标
- 补充 OpenAI/Ops 相关单元测试与回归用例
- 新增灰度阈值守护与压测脚本,支撑发布验收
This commit is contained in:
@@ -4,16 +4,74 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"math/rand/v2"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Timing and backoff parameters for OpenAI OAuth token refresh and for
// waiting on a refresh lock that another worker holds. Each name is declared
// exactly once; duplicate declarations in a const block do not compile.
const (
	// openAITokenRefreshSkew is the remaining-lifetime threshold at or below
	// which GetAccessToken treats a token as needing a refresh.
	openAITokenRefreshSkew = 3 * time.Minute

	// openAITokenCacheSkew is a second expiry margin for cached tokens.
	// NOTE(review): its use is not visible in this part of the file — confirm.
	openAITokenCacheSkew = 5 * time.Minute

	// openAILockWaitTime is the fixed sleep applied before re-reading the cache
	// when the refresh lock is held elsewhere.
	// NOTE(review): appears to be superseded by the polling parameters below;
	// confirm no remaining callers before removing it.
	openAILockWaitTime = 200 * time.Millisecond

	// openAILockInitialWait is the first (pre-jitter) wait of the polling loop.
	openAILockInitialWait = 20 * time.Millisecond

	// openAILockMaxWait caps the (pre-jitter) wait as it doubles per attempt.
	openAILockMaxWait = 120 * time.Millisecond

	// openAILockMaxAttempts bounds how many times the cache is polled.
	openAILockMaxAttempts = 5

	// openAILockJitterRatio is the +/- fraction applied to each wait.
	openAILockJitterRatio = 0.2

	// openAILockWarnThresholdMs is the accumulated wait, in milliseconds, at or
	// above which a warning is logged.
	openAILockWarnThresholdMs = 250
)
|
||||
|
||||
// OpenAITokenRuntimeMetrics is a point-in-time snapshot of the counters that
// track OpenAI token refreshes and refresh-lock contention.
type OpenAITokenRuntimeMetrics struct {
	// RefreshRequests counts entries into the refresh path of GetAccessToken.
	RefreshRequests int64

	// RefreshSuccess counts refresh calls that returned new token info.
	RefreshSuccess int64

	// RefreshFailure counts refreshes that failed or could not be attempted
	// because no OAuth service was configured.
	RefreshFailure int64

	// LockAcquireFailure counts errors returned while acquiring the refresh lock.
	LockAcquireFailure int64

	// LockContention counts times the refresh lock was held by another worker.
	LockContention int64

	// LockWaitSamples counts individual waits performed while polling the cache.
	LockWaitSamples int64

	// LockWaitTotalMs is the sum of those waits, in milliseconds.
	LockWaitTotalMs int64

	// LockWaitHit counts polls that ended with a token found in the cache.
	LockWaitHit int64

	// LockWaitMiss counts polls that used up every attempt without a token.
	LockWaitMiss int64

	// LastObservedUnixMs is the Unix time, in milliseconds, of the most recent
	// metric update.
	LastObservedUnixMs int64
}
|
||||
|
||||
// openAITokenRuntimeMetricsStore holds the live counters behind
// OpenAITokenRuntimeMetrics. Every field is an atomic.Int64, so each counter
// can be updated and read without an additional lock.
type openAITokenRuntimeMetricsStore struct {
	// Refresh outcome counters.
	refreshRequests atomic.Int64

	refreshSuccess atomic.Int64

	refreshFailure atomic.Int64

	// Refresh-lock acquisition counters.
	lockAcquireFailure atomic.Int64

	lockContention atomic.Int64

	// Counters for polling the cache while the lock is held elsewhere.
	lockWaitSamples atomic.Int64

	lockWaitTotalMs atomic.Int64

	lockWaitHit atomic.Int64

	lockWaitMiss atomic.Int64

	// Unix milliseconds of the most recent update (see touchNow).
	lastObservedUnixMs atomic.Int64
}
|
||||
|
||||
func (m *openAITokenRuntimeMetricsStore) snapshot() OpenAITokenRuntimeMetrics {
|
||||
if m == nil {
|
||||
return OpenAITokenRuntimeMetrics{}
|
||||
}
|
||||
return OpenAITokenRuntimeMetrics{
|
||||
RefreshRequests: m.refreshRequests.Load(),
|
||||
RefreshSuccess: m.refreshSuccess.Load(),
|
||||
RefreshFailure: m.refreshFailure.Load(),
|
||||
LockAcquireFailure: m.lockAcquireFailure.Load(),
|
||||
LockContention: m.lockContention.Load(),
|
||||
LockWaitSamples: m.lockWaitSamples.Load(),
|
||||
LockWaitTotalMs: m.lockWaitTotalMs.Load(),
|
||||
LockWaitHit: m.lockWaitHit.Load(),
|
||||
LockWaitMiss: m.lockWaitMiss.Load(),
|
||||
LastObservedUnixMs: m.lastObservedUnixMs.Load(),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *openAITokenRuntimeMetricsStore) touchNow() {
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
m.lastObservedUnixMs.Store(time.Now().UnixMilli())
|
||||
}
|
||||
|
||||
// OpenAITokenCache is the token cache interface for OpenAI; it is an alias that reuses the GeminiTokenCache interface definition.
|
||||
type OpenAITokenCache = GeminiTokenCache
|
||||
|
||||
@@ -22,6 +80,7 @@ type OpenAITokenProvider struct {
|
||||
accountRepo AccountRepository
|
||||
tokenCache OpenAITokenCache
|
||||
openAIOAuthService *OpenAIOAuthService
|
||||
metrics *openAITokenRuntimeMetricsStore
|
||||
}
|
||||
|
||||
func NewOpenAITokenProvider(
|
||||
@@ -33,11 +92,27 @@ func NewOpenAITokenProvider(
|
||||
accountRepo: accountRepo,
|
||||
tokenCache: tokenCache,
|
||||
openAIOAuthService: openAIOAuthService,
|
||||
metrics: &openAITokenRuntimeMetricsStore{},
|
||||
}
|
||||
}
|
||||
|
||||
func (p *OpenAITokenProvider) SnapshotRuntimeMetrics() OpenAITokenRuntimeMetrics {
|
||||
if p == nil {
|
||||
return OpenAITokenRuntimeMetrics{}
|
||||
}
|
||||
p.ensureMetrics()
|
||||
return p.metrics.snapshot()
|
||||
}
|
||||
|
||||
func (p *OpenAITokenProvider) ensureMetrics() {
|
||||
if p != nil && p.metrics == nil {
|
||||
p.metrics = &openAITokenRuntimeMetricsStore{}
|
||||
}
|
||||
}
|
||||
|
||||
// GetAccessToken 获取有效的 access_token
|
||||
func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Account) (string, error) {
|
||||
p.ensureMetrics()
|
||||
if account == nil {
|
||||
return "", errors.New("account is nil")
|
||||
}
|
||||
@@ -64,6 +139,8 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
needsRefresh := expiresAt == nil || time.Until(*expiresAt) <= openAITokenRefreshSkew
|
||||
refreshFailed := false
|
||||
if needsRefresh && p.tokenCache != nil {
|
||||
p.metrics.refreshRequests.Add(1)
|
||||
p.metrics.touchNow()
|
||||
locked, lockErr := p.tokenCache.AcquireRefreshLock(ctx, cacheKey, 30*time.Second)
|
||||
if lockErr == nil && locked {
|
||||
defer func() { _ = p.tokenCache.ReleaseRefreshLock(ctx, cacheKey) }()
|
||||
@@ -82,14 +159,17 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
if expiresAt == nil || time.Until(*expiresAt) <= openAITokenRefreshSkew {
|
||||
if p.openAIOAuthService == nil {
|
||||
slog.Warn("openai_oauth_service_not_configured", "account_id", account.ID)
|
||||
p.metrics.refreshFailure.Add(1)
|
||||
refreshFailed = true // 无法刷新,标记失败
|
||||
} else {
|
||||
tokenInfo, err := p.openAIOAuthService.RefreshAccountToken(ctx, account)
|
||||
if err != nil {
|
||||
// 刷新失败时记录警告,但不立即返回错误,尝试使用现有 token
|
||||
slog.Warn("openai_token_refresh_failed", "account_id", account.ID, "error", err)
|
||||
p.metrics.refreshFailure.Add(1)
|
||||
refreshFailed = true // 刷新失败,标记以使用短 TTL
|
||||
} else {
|
||||
p.metrics.refreshSuccess.Add(1)
|
||||
newCredentials := p.openAIOAuthService.BuildAccountCredentials(tokenInfo)
|
||||
for k, v := range account.Credentials {
|
||||
if _, exists := newCredentials[k]; !exists {
|
||||
@@ -106,6 +186,8 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
}
|
||||
} else if lockErr != nil {
|
||||
// Redis 错误导致无法获取锁,降级为无锁刷新(仅在 token 接近过期时)
|
||||
p.metrics.lockAcquireFailure.Add(1)
|
||||
p.metrics.touchNow()
|
||||
slog.Warn("openai_token_lock_failed_degraded_refresh", "account_id", account.ID, "error", lockErr)
|
||||
|
||||
// 检查 ctx 是否已取消
|
||||
@@ -126,13 +208,16 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
if expiresAt == nil || time.Until(*expiresAt) <= openAITokenRefreshSkew {
|
||||
if p.openAIOAuthService == nil {
|
||||
slog.Warn("openai_oauth_service_not_configured", "account_id", account.ID)
|
||||
p.metrics.refreshFailure.Add(1)
|
||||
refreshFailed = true
|
||||
} else {
|
||||
tokenInfo, err := p.openAIOAuthService.RefreshAccountToken(ctx, account)
|
||||
if err != nil {
|
||||
slog.Warn("openai_token_refresh_failed_degraded", "account_id", account.ID, "error", err)
|
||||
p.metrics.refreshFailure.Add(1)
|
||||
refreshFailed = true
|
||||
} else {
|
||||
p.metrics.refreshSuccess.Add(1)
|
||||
newCredentials := p.openAIOAuthService.BuildAccountCredentials(tokenInfo)
|
||||
for k, v := range account.Credentials {
|
||||
if _, exists := newCredentials[k]; !exists {
|
||||
@@ -148,9 +233,14 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 锁获取失败(被其他 worker 持有),等待 200ms 后重试读取缓存
|
||||
time.Sleep(openAILockWaitTime)
|
||||
if token, err := p.tokenCache.GetAccessToken(ctx, cacheKey); err == nil && strings.TrimSpace(token) != "" {
|
||||
// 锁被其他 worker 持有:使用短轮询+jitter,降低固定等待导致的尾延迟台阶。
|
||||
p.metrics.lockContention.Add(1)
|
||||
p.metrics.touchNow()
|
||||
token, waitErr := p.waitForTokenAfterLockRace(ctx, cacheKey)
|
||||
if waitErr != nil {
|
||||
return "", waitErr
|
||||
}
|
||||
if strings.TrimSpace(token) != "" {
|
||||
slog.Debug("openai_token_cache_hit_after_wait", "account_id", account.ID)
|
||||
return token, nil
|
||||
}
|
||||
@@ -198,3 +288,64 @@ func (p *OpenAITokenProvider) GetAccessToken(ctx context.Context, account *Accou
|
||||
|
||||
return accessToken, nil
|
||||
}
|
||||
|
||||
func (p *OpenAITokenProvider) waitForTokenAfterLockRace(ctx context.Context, cacheKey string) (string, error) {
|
||||
wait := openAILockInitialWait
|
||||
totalWaitMs := int64(0)
|
||||
for i := 0; i < openAILockMaxAttempts; i++ {
|
||||
actualWait := jitterLockWait(wait)
|
||||
timer := time.NewTimer(actualWait)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
if !timer.Stop() {
|
||||
select {
|
||||
case <-timer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
return "", ctx.Err()
|
||||
case <-timer.C:
|
||||
}
|
||||
|
||||
waitMs := actualWait.Milliseconds()
|
||||
if waitMs < 0 {
|
||||
waitMs = 0
|
||||
}
|
||||
totalWaitMs += waitMs
|
||||
p.metrics.lockWaitSamples.Add(1)
|
||||
p.metrics.lockWaitTotalMs.Add(waitMs)
|
||||
p.metrics.touchNow()
|
||||
|
||||
token, err := p.tokenCache.GetAccessToken(ctx, cacheKey)
|
||||
if err == nil && strings.TrimSpace(token) != "" {
|
||||
p.metrics.lockWaitHit.Add(1)
|
||||
if totalWaitMs >= openAILockWarnThresholdMs {
|
||||
slog.Warn("openai_token_lock_wait_high", "wait_ms", totalWaitMs, "attempts", i+1)
|
||||
}
|
||||
return token, nil
|
||||
}
|
||||
|
||||
if wait < openAILockMaxWait {
|
||||
wait *= 2
|
||||
if wait > openAILockMaxWait {
|
||||
wait = openAILockMaxWait
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
p.metrics.lockWaitMiss.Add(1)
|
||||
if totalWaitMs >= openAILockWarnThresholdMs {
|
||||
slog.Warn("openai_token_lock_wait_high", "wait_ms", totalWaitMs, "attempts", openAILockMaxAttempts)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func jitterLockWait(base time.Duration) time.Duration {
|
||||
if base <= 0 {
|
||||
return 0
|
||||
}
|
||||
minFactor := 1 - openAILockJitterRatio
|
||||
maxFactor := 1 + openAILockJitterRatio
|
||||
factor := minFactor + rand.Float64()*(maxFactor-minFactor)
|
||||
return time.Duration(float64(base) * factor)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user