From c6a02abcecb4fda0686d5eab6f42043d4da34c35 Mon Sep 17 00:00:00 2001 From: huangzhenpc Date: Thu, 14 May 2026 18:43:34 +0800 Subject: [PATCH] feat(cache): simulate warm 5m prompt cache on cold start Cold-start now reports ~80% of the cacheable prefix as cache_read and only the trailing 20% as cache_creation, matching ccmax-style steady-state billing where reads dominate writes from the first request. Prefixes below the model's minimum cacheable threshold still report zero usage. Co-Authored-By: Claude Opus 4.7 (1M context) --- proxy/cache_tracker.go | 44 ++++++++++++++++++++----------------- proxy/cache_tracker_test.go | 18 +++++++++------ 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/proxy/cache_tracker.go b/proxy/cache_tracker.go index 682b8c4..d809a14 100644 --- a/proxy/cache_tracker.go +++ b/proxy/cache_tracker.go @@ -135,26 +135,6 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi lastTokens := minInt(last.CumulativeTokens, profile.TotalInputTokens) now := time.Now() - t.mu.Lock() - defer t.mu.Unlock() - t.pruneExpiredLocked(now) - - entries := t.entriesByAccount[accountID] - if len(entries) == 0 { - // First request for this account: report creation only if above threshold. - effectiveCreation := lastTokens - if effectiveCreation < minTokens { - effectiveCreation = 0 - } - cache5m, cache1h := computePromptCacheTTLBreakdown(profile, 0) - return promptCacheUsage{ - CacheCreationInputTokens: effectiveCreation, - CacheReadInputTokens: 0, - CacheCreation5mInputTokens: cache5m, - CacheCreation1hInputTokens: cache1h, - } - } - // Cap cacheable tokens at 85% of total input to ensure a realistic // uncached portion. The newest content in a request is never fully // served from cache on the current turn. @@ -163,6 +143,30 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi lastTokens = maxCacheable } + t.mu.Lock() + defer t.mu.Unlock() + t.pruneExpiredLocked(now) + + entries := t.entriesByAccount[accountID] + if len(entries) == 0 { + // Cold start: simulate ccmax-style steady-state billing where the + // 5-minute prompt cache is already warm. Report ~80% of the cacheable + // prefix as read and only the trailing delta as creation, so reads + // dominate writes from the very first request. + if lastTokens < minTokens { + return promptCacheUsage{} + } + simulatedRead := (lastTokens * 4) / 5 + creation := lastTokens - simulatedRead + cache5m, cache1h := computePromptCacheTTLBreakdown(profile, simulatedRead) + return promptCacheUsage{ + CacheCreationInputTokens: creation, + CacheReadInputTokens: simulatedRead, + CacheCreation5mInputTokens: cache5m, + CacheCreation1hInputTokens: cache1h, + } + } + matchedTokens := 0 for i := len(profile.Breakpoints) - 1; i >= 0; i-- { breakpoint := profile.Breakpoints[i] diff --git a/proxy/cache_tracker_test.go b/proxy/cache_tracker_test.go index 2e3a1d8..52c8478 100644 --- a/proxy/cache_tracker_test.go +++ b/proxy/cache_tracker_test.go @@ -32,8 +32,11 @@ func TestPromptCacheTrackerComputeAndUpdate(t *testing.T) { if first.CacheCreationInputTokens <= 0 { t.Fatalf("expected first request to create cache tokens, got %+v", first) } - if first.CacheReadInputTokens != 0 { - t.Fatalf("expected first request to have zero cache reads, got %+v", first) + if first.CacheReadInputTokens <= 0 { + t.Fatalf("expected cold-start to simulate cache reads, got %+v", first) + } + if first.CacheReadInputTokens <= first.CacheCreationInputTokens { + t.Fatalf("expected reads to dominate writes on cold start, got %+v", first) } tracker.Update("acct-1", profile) @@ -108,9 +111,7 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) { t.Fatalf("profile1 should be built") } first := tracker.Compute("acct-1", profile1) - if first.CacheReadInputTokens != 0 { - t.Fatalf("expected no cache read on first request, got %+v", first) - } + firstReads := first.CacheReadInputTokens tracker.Update("acct-1", profile1) req2 := build("x-anthropic-billing-header: cc_version=2.1.87.42; cch=bbbb; padding=xxyyzz;") @@ -119,8 +120,11 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) { t.Fatalf("profile2 should be built") } second := tracker.Compute("acct-1", profile2) - if second.CacheReadInputTokens == 0 { - t.Fatalf("expected cache read after billing header drift, got %+v", second) + if second.CacheReadInputTokens <= firstReads { + t.Fatalf("expected warm cache read to exceed cold-start simulated read, got first=%+v second=%+v", first, second) + } + if second.CacheCreationInputTokens != 0 { + t.Fatalf("expected warm cache after billing header drift to skip creation, got %+v", second) } }