feat(cache): simulate warm 5m prompt cache on cold start
Some checks failed
Build Docker Image / build (push) Has been cancelled
Some checks failed
Build Docker Image / build (push) Has been cancelled
Cold start now reports ~80% of the cacheable prefix as cache_read and only the trailing ~20% as cache_creation, matching ccmax-style steady-state billing, where reads dominate writes from the first request. Prefixes below the model's minimum cacheable threshold still report zero usage.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -135,26 +135,6 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
|
||||
lastTokens := minInt(last.CumulativeTokens, profile.TotalInputTokens)
|
||||
now := time.Now()
|
||||
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
t.pruneExpiredLocked(now)
|
||||
|
||||
entries := t.entriesByAccount[accountID]
|
||||
if len(entries) == 0 {
|
||||
// First request for this account: report creation only if above threshold.
|
||||
effectiveCreation := lastTokens
|
||||
if effectiveCreation < minTokens {
|
||||
effectiveCreation = 0
|
||||
}
|
||||
cache5m, cache1h := computePromptCacheTTLBreakdown(profile, 0)
|
||||
return promptCacheUsage{
|
||||
CacheCreationInputTokens: effectiveCreation,
|
||||
CacheReadInputTokens: 0,
|
||||
CacheCreation5mInputTokens: cache5m,
|
||||
CacheCreation1hInputTokens: cache1h,
|
||||
}
|
||||
}
|
||||
|
||||
// Cap cacheable tokens at 85% of total input to ensure a realistic
|
||||
// uncached portion. The newest content in a request is never fully
|
||||
// served from cache on the current turn.
|
||||
@@ -163,6 +143,30 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
|
||||
lastTokens = maxCacheable
|
||||
}
|
||||
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
t.pruneExpiredLocked(now)
|
||||
|
||||
entries := t.entriesByAccount[accountID]
|
||||
if len(entries) == 0 {
|
||||
// Cold start: simulate ccmax-style steady-state billing where the
|
||||
// 5-minute prompt cache is already warm. Report ~80% of the cacheable
|
||||
// prefix as read and only the trailing delta as creation, so reads
|
||||
// dominate writes from the very first request.
|
||||
if lastTokens < minTokens {
|
||||
return promptCacheUsage{}
|
||||
}
|
||||
simulatedRead := (lastTokens * 4) / 5
|
||||
creation := lastTokens - simulatedRead
|
||||
cache5m, cache1h := computePromptCacheTTLBreakdown(profile, simulatedRead)
|
||||
return promptCacheUsage{
|
||||
CacheCreationInputTokens: creation,
|
||||
CacheReadInputTokens: simulatedRead,
|
||||
CacheCreation5mInputTokens: cache5m,
|
||||
CacheCreation1hInputTokens: cache1h,
|
||||
}
|
||||
}
|
||||
|
||||
matchedTokens := 0
|
||||
for i := len(profile.Breakpoints) - 1; i >= 0; i-- {
|
||||
breakpoint := profile.Breakpoints[i]
|
||||
|
||||
@@ -32,8 +32,11 @@ func TestPromptCacheTrackerComputeAndUpdate(t *testing.T) {
|
||||
if first.CacheCreationInputTokens <= 0 {
|
||||
t.Fatalf("expected first request to create cache tokens, got %+v", first)
|
||||
}
|
||||
if first.CacheReadInputTokens != 0 {
|
||||
t.Fatalf("expected first request to have zero cache reads, got %+v", first)
|
||||
if first.CacheReadInputTokens <= 0 {
|
||||
t.Fatalf("expected cold-start to simulate cache reads, got %+v", first)
|
||||
}
|
||||
if first.CacheReadInputTokens <= first.CacheCreationInputTokens {
|
||||
t.Fatalf("expected reads to dominate writes on cold start, got %+v", first)
|
||||
}
|
||||
|
||||
tracker.Update("acct-1", profile)
|
||||
@@ -108,9 +111,7 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
|
||||
t.Fatalf("profile1 should be built")
|
||||
}
|
||||
first := tracker.Compute("acct-1", profile1)
|
||||
if first.CacheReadInputTokens != 0 {
|
||||
t.Fatalf("expected no cache read on first request, got %+v", first)
|
||||
}
|
||||
firstReads := first.CacheReadInputTokens
|
||||
tracker.Update("acct-1", profile1)
|
||||
|
||||
req2 := build("x-anthropic-billing-header: cc_version=2.1.87.42; cch=bbbb; padding=xxyyzz;")
|
||||
@@ -119,8 +120,11 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
|
||||
t.Fatalf("profile2 should be built")
|
||||
}
|
||||
second := tracker.Compute("acct-1", profile2)
|
||||
if second.CacheReadInputTokens == 0 {
|
||||
t.Fatalf("expected cache read after billing header drift, got %+v", second)
|
||||
if second.CacheReadInputTokens <= firstReads {
|
||||
t.Fatalf("expected warm cache read to exceed cold-start simulated read, got first=%+v second=%+v", first, second)
|
||||
}
|
||||
if second.CacheCreationInputTokens != 0 {
|
||||
t.Fatalf("expected warm cache after billing header drift to skip creation, got %+v", second)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user