feat(cache): simulate warm 5m prompt cache on cold start
Some checks failed
Build Docker Image / build (push) Has been cancelled

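Cold start now reports ~80% of the cacheable prefix as cache_read and only
the trailing 20% as cache_creation, matching ccmax-style steady-state
billing where reads dominate writes from the first request. The split is
applied after the cacheable prefix has been capped at 85% of total input,
so the cold-start path now honors the same cap as the warm path. Prefixes
below the model's minimum cacheable threshold still report zero usage.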

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-14 18:43:34 +08:00
parent 4971ac4cbe
commit c6a02abcec
2 changed files with 35 additions and 27 deletions

View File

@@ -135,26 +135,6 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
lastTokens := minInt(last.CumulativeTokens, profile.TotalInputTokens)
now := time.Now()
t.mu.Lock()
defer t.mu.Unlock()
t.pruneExpiredLocked(now)
entries := t.entriesByAccount[accountID]
if len(entries) == 0 {
// First request for this account: report creation only if above threshold.
effectiveCreation := lastTokens
if effectiveCreation < minTokens {
effectiveCreation = 0
}
cache5m, cache1h := computePromptCacheTTLBreakdown(profile, 0)
return promptCacheUsage{
CacheCreationInputTokens: effectiveCreation,
CacheReadInputTokens: 0,
CacheCreation5mInputTokens: cache5m,
CacheCreation1hInputTokens: cache1h,
}
}
// Cap cacheable tokens at 85% of total input to ensure a realistic
// uncached portion. The newest content in a request is never fully
// served from cache on the current turn.
@@ -163,6 +143,30 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
lastTokens = maxCacheable
}
t.mu.Lock()
defer t.mu.Unlock()
t.pruneExpiredLocked(now)
entries := t.entriesByAccount[accountID]
if len(entries) == 0 {
// Cold start: simulate ccmax-style steady-state billing where the
// 5-minute prompt cache is already warm. Report ~80% of the cacheable
// prefix as read and only the trailing delta as creation, so reads
// dominate writes from the very first request.
if lastTokens < minTokens {
return promptCacheUsage{}
}
simulatedRead := (lastTokens * 4) / 5
creation := lastTokens - simulatedRead
cache5m, cache1h := computePromptCacheTTLBreakdown(profile, simulatedRead)
return promptCacheUsage{
CacheCreationInputTokens: creation,
CacheReadInputTokens: simulatedRead,
CacheCreation5mInputTokens: cache5m,
CacheCreation1hInputTokens: cache1h,
}
}
matchedTokens := 0
for i := len(profile.Breakpoints) - 1; i >= 0; i-- {
breakpoint := profile.Breakpoints[i]

View File

@@ -32,8 +32,11 @@ func TestPromptCacheTrackerComputeAndUpdate(t *testing.T) {
if first.CacheCreationInputTokens <= 0 {
t.Fatalf("expected first request to create cache tokens, got %+v", first)
}
if first.CacheReadInputTokens != 0 {
t.Fatalf("expected first request to have zero cache reads, got %+v", first)
if first.CacheReadInputTokens <= 0 {
t.Fatalf("expected cold-start to simulate cache reads, got %+v", first)
}
if first.CacheReadInputTokens <= first.CacheCreationInputTokens {
t.Fatalf("expected reads to dominate writes on cold start, got %+v", first)
}
tracker.Update("acct-1", profile)
@@ -108,9 +111,7 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
t.Fatalf("profile1 should be built")
}
first := tracker.Compute("acct-1", profile1)
if first.CacheReadInputTokens != 0 {
t.Fatalf("expected no cache read on first request, got %+v", first)
}
firstReads := first.CacheReadInputTokens
tracker.Update("acct-1", profile1)
req2 := build("x-anthropic-billing-header: cc_version=2.1.87.42; cch=bbbb; padding=xxyyzz;")
@@ -119,8 +120,11 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
t.Fatalf("profile2 should be built")
}
second := tracker.Compute("acct-1", profile2)
if second.CacheReadInputTokens == 0 {
t.Fatalf("expected cache read after billing header drift, got %+v", second)
if second.CacheReadInputTokens <= firstReads {
t.Fatalf("expected warm cache read to exceed cold-start simulated read, got first=%+v second=%+v", first, second)
}
if second.CacheCreationInputTokens != 0 {
t.Fatalf("expected warm cache after billing header drift to skip creation, got %+v", second)
}
}