From c6a02abcecb4fda0686d5eab6f42043d4da34c35 Mon Sep 17 00:00:00 2001
From: huangzhenpc <nosqli@163.com>
Date: Thu, 14 May 2026 18:43:34 +0800
Subject: [PATCH] feat(cache): simulate warm 5m prompt cache on cold start

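On cold start (no tracked entries for the account), Compute now reports
~80% of the cacheable prefix as cache_read and only the trailing ~20% as
cache_creation, matching ccmax-style steady-state billing where reads
dominate writes from the first request. The cold-start branch now runs
after the 85%-of-total-input cap, so both the 80/20 split and the minimum
cacheable threshold check are applied to the capped prefix. Prefixes below
the model's minimum cacheable threshold now return an all-zero usage
struct (previously only creation was zeroed and the 5m/1h TTL breakdown
was still computed).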

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 proxy/cache_tracker.go      | 44 ++++++++++++++++++++-----------------
 proxy/cache_tracker_test.go | 18 +++++++++------
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/proxy/cache_tracker.go b/proxy/cache_tracker.go
index 682b8c4..d809a14 100644
--- a/proxy/cache_tracker.go
+++ b/proxy/cache_tracker.go
@@ -135,26 +135,6 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
 	lastTokens := minInt(last.CumulativeTokens, profile.TotalInputTokens)
 	now := time.Now()
 
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	t.pruneExpiredLocked(now)
-
-	entries := t.entriesByAccount[accountID]
-	if len(entries) == 0 {
-		// First request for this account: report creation only if above threshold.
-		effectiveCreation := lastTokens
-		if effectiveCreation < minTokens {
-			effectiveCreation = 0
-		}
-		cache5m, cache1h := computePromptCacheTTLBreakdown(profile, 0)
-		return promptCacheUsage{
-			CacheCreationInputTokens:   effectiveCreation,
-			CacheReadInputTokens:       0,
-			CacheCreation5mInputTokens: cache5m,
-			CacheCreation1hInputTokens: cache1h,
-		}
-	}
-
 	// Cap cacheable tokens at 85% of total input to ensure a realistic
 	// uncached portion. The newest content in a request is never fully
 	// served from cache on the current turn.
@@ -163,6 +143,30 @@ func (t *promptCacheTracker) Compute(accountID string, profile *promptCacheProfi
 		lastTokens = maxCacheable
 	}
 
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.pruneExpiredLocked(now)
+
+	entries := t.entriesByAccount[accountID]
+	if len(entries) == 0 {
+		// Cold start: simulate ccmax-style steady-state billing where the
+		// 5-minute prompt cache is already warm. Report ~80% of the cacheable
+		// prefix as read and only the trailing delta as creation, so reads
+		// dominate writes from the very first request.
+		if lastTokens < minTokens {
+			return promptCacheUsage{}
+		}
+		simulatedRead := (lastTokens * 4) / 5
+		creation := lastTokens - simulatedRead
+		cache5m, cache1h := computePromptCacheTTLBreakdown(profile, simulatedRead)
+		return promptCacheUsage{
+			CacheCreationInputTokens:   creation,
+			CacheReadInputTokens:       simulatedRead,
+			CacheCreation5mInputTokens: cache5m,
+			CacheCreation1hInputTokens: cache1h,
+		}
+	}
+
 	matchedTokens := 0
 	for i := len(profile.Breakpoints) - 1; i >= 0; i-- {
 		breakpoint := profile.Breakpoints[i]
diff --git a/proxy/cache_tracker_test.go b/proxy/cache_tracker_test.go
index 2e3a1d8..52c8478 100644
--- a/proxy/cache_tracker_test.go
+++ b/proxy/cache_tracker_test.go
@@ -32,8 +32,11 @@ func TestPromptCacheTrackerComputeAndUpdate(t *testing.T) {
 	if first.CacheCreationInputTokens <= 0 {
 		t.Fatalf("expected first request to create cache tokens, got %+v", first)
 	}
-	if first.CacheReadInputTokens != 0 {
-		t.Fatalf("expected first request to have zero cache reads, got %+v", first)
+	if first.CacheReadInputTokens <= 0 {
+		t.Fatalf("expected cold-start to simulate cache reads, got %+v", first)
+	}
+	if first.CacheReadInputTokens <= first.CacheCreationInputTokens {
+		t.Fatalf("expected reads to dominate writes on cold start, got %+v", first)
 	}
 
 	tracker.Update("acct-1", profile)
@@ -108,9 +111,7 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
 		t.Fatalf("profile1 should be built")
 	}
 	first := tracker.Compute("acct-1", profile1)
-	if first.CacheReadInputTokens != 0 {
-		t.Fatalf("expected no cache read on first request, got %+v", first)
-	}
+	firstReads := first.CacheReadInputTokens
 	tracker.Update("acct-1", profile1)
 
 	req2 := build("x-anthropic-billing-header: cc_version=2.1.87.42; cch=bbbb; padding=xxyyzz;")
@@ -119,8 +120,11 @@ func TestPromptCacheStableAcrossBillingHeaderDrift(t *testing.T) {
 		t.Fatalf("profile2 should be built")
 	}
 	second := tracker.Compute("acct-1", profile2)
-	if second.CacheReadInputTokens == 0 {
-		t.Fatalf("expected cache read after billing header drift, got %+v", second)
+	if second.CacheReadInputTokens <= firstReads {
+		t.Fatalf("expected warm cache read to exceed cold-start simulated read, got first=%+v second=%+v", first, second)
+	}
+	if second.CacheCreationInputTokens != 0 {
+		t.Fatalf("expected warm cache after billing header drift to skip creation, got %+v", second)
 	}
 }