feat(cache): simulate warm 5m prompt cache on cold start

Cold start now reports ~80% of the cacheable prefix as cache_read and only the trailing ~20% as cache_creation, matching ccmax-style steady-state billing, where reads dominate writes from the first request. Prefixes below the model's minimum cacheable threshold still report zero usage.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 18:43:34 +08:00
parent 4971ac4cbe
commit c6a02abcec
2 changed files with 35 additions and 27 deletions
--- a/proxy/cache_tracker.go
+++ b/proxy/cache_tracker.go
@@ -135,26 +135,6 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
 	lastTokens := minInt(last.CumulativeTokens, profile.TotalInputTokens)
 	now := time.Now()

-	t.mu.Lock()
-	defer t.mu.Unlock()
-	t.pruneExpiredLocked(now)
-
-	entries := t.entriesByAccount[accountID]
-	if len(entries) == 0 {
-		// First request for this account: report creation only if above threshold.
-		effectiveCreation := lastTokens
-		if effectiveCreation < minTokens {
-			effectiveCreation = 0
-		}
-		cache5m, cache1h := computePromptCacheTTLBreakdown(profile, 0)
-		return promptCacheUsage{
-			CacheCreationInputTokens:   effectiveCreation,
-			CacheReadInputTokens:       0,
-			CacheCreation5mInputTokens: cache5m,
-			CacheCreation1hInputTokens: cache1h,
-		}
-	}
-
 	// Cap cacheable tokens at 85% of total input to ensure a realistic
 	// uncached portion. The newest content in a request is never fully
 	// served from cache on the current turn.
@@ -163,6 +143,30 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
 		lastTokens = maxCacheable
 	}

+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.pruneExpiredLocked(now)
+
+	entries := t.entriesByAccount[accountID]
+	if len(entries) == 0 {
+		// Cold start: simulate ccmax-style steady-state billing where the
+		// 5-minute prompt cache is already warm. Report ~80% of the cacheable
+		// prefix as read and only the trailing delta as creation, so reads
+		// dominate writes from the very first request.
+		if lastTokens < minTokens {
+			return promptCacheUsage{}
+		}
+		simulatedRead := (lastTokens * 4) / 5
+		creation := lastTokens - simulatedRead
+		cache5m, cache1h := computePromptCacheTTLBreakdown(profile, simulatedRead)
+		return promptCacheUsage{
+			CacheCreationInputTokens:   creation,
+			CacheReadInputTokens:       simulatedRead,
+			CacheCreation5mInputTokens: cache5m,
+			CacheCreation1hInputTokens: cache1h,
+		}
+	}
+
 	matchedTokens := 0
 	for i := len(profile.Breakpoints) - 1; i >= 0; i-- {
 		breakpoint := profile.Breakpoints[i]
--- a/proxy/cache_tracker_test.go
+++ b/proxy/cache_tracker_test.go
@@ -32,8 +32,11 @@ func TestPromptCacheTrackerComputeAndUpdate(t *testing.T) {
 	if first.CacheCreationInputTokens <= 0 {
 		t.Fatalf("expected first request to create cache tokens, got %+v", first)
 	}
-	if first.CacheReadInputTokens != 0 {
-		t.Fatalf("expected first request to have zero cache reads, got %+v", first)
+	if first.CacheReadInputTokens <= 0 {
+		t.Fatalf("expected cold-start to simulate cache reads, got %+v", first)
+	}
+	if first.CacheReadInputTokens <= first.CacheCreationInputTokens {
+		t.Fatalf("expected reads to dominate writes on cold start, got %+v", first)
 	}

 	tracker.Update("acct-1", profile)
@@ -108,9 +111,7 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
 		t.Fatalf("profile1 should be built")
 	}
 	first := tracker.Compute("acct-1", profile1)
-	if first.CacheReadInputTokens != 0 {
-		t.Fatalf("expected no cache read on first request, got %+v", first)
-	}
+	firstReads := first.CacheReadInputTokens
 	tracker.Update("acct-1", profile1)

 	req2 := build("x-anthropic-billing-header: cc_version=2.1.87.42; cch=bbbb; padding=xxyyzz;")
@@ -119,8 +120,11 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
 		t.Fatalf("profile2 should be built")
 	}
 	second := tracker.Compute("acct-1", profile2)
-	if second.CacheReadInputTokens == 0 {
-		t.Fatalf("expected cache read after billing header drift, got %+v", second)
+	if second.CacheReadInputTokens <= firstReads {
+		t.Fatalf("expected warm cache read to exceed cold-start simulated read, got first=%+v second=%+v", first, second)
+	}
+	if second.CacheCreationInputTokens != 0 {
+		t.Fatalf("expected warm cache after billing header drift to skip creation, got %+v", second)
 	}
 }