Merge pull request #3357 from wenyifancc/cache_llama_cpp

feat: Add support for counting cache-hit tokens in llama.cpp
Seefs
2026-03-22 00:39:49 +08:00
committed by GitHub


@@ -627,6 +627,12 @@ func applyUsagePostProcessing(info *relaycommon.RelayInfo, usage *dto.Usage, res
				usage.PromptTokensDetails.CachedTokens = usage.PromptCacheHitTokens
			}
		}
	case constant.ChannelTypeOpenAI:
		if usage.PromptTokensDetails.CachedTokens == 0 {
			if cachedTokens, ok := extractLlamaCachedTokensFromBody(responseBody); ok {
				usage.PromptTokensDetails.CachedTokens = cachedTokens
			}
		}
	}
}
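llama.cpp's OpenAI-compatible server reports prompt-cache reuse under a non-standard timings.cache_n field rather than usage.prompt_tokens_details.cached_tokens, which is why the new branch only falls back to scanning the response body when CachedTokens is still zero. A rough sketch of the kind of body the branch would hand to extractLlamaCachedTokensFromBody is below; only timings.cache_n is established by this change, the surrounding fields and values are illustrative:

// Illustrative llama.cpp chat-completion body; only timings.cache_n is
// taken from this change, the other fields and values are assumptions.
const exampleLlamaBody = `{
	"usage": {"prompt_tokens": 256, "completion_tokens": 64},
	"timings": {"prompt_n": 256, "cache_n": 128}
}`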
@@ -689,3 +695,25 @@ func extractMoonshotCachedTokensFromBody(body []byte) (int, bool) {
	return 0, false
}

// extractLlamaCachedTokensFromBody extracts cache_n from llama.cpp's non-standard timings location
func extractLlamaCachedTokensFromBody(body []byte) (int, bool) {
	if len(body) == 0 {
		return 0, false
	}
	var payload struct {
		Timings struct {
			CachedTokens *int `json:"cache_n"`
		} `json:"timings"`
	}
	if err := common.Unmarshal(body, &payload); err != nil {
		return 0, false
	}
	if payload.Timings.CachedTokens == nil {
		return 0, false
	}
	return *payload.Timings.CachedTokens, true
}
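For a quick check of the helper's behavior, here is a minimal, self-contained sketch. It substitutes encoding/json for the project's common.Unmarshal wrapper and uses invented sample bodies, so it approximates the merged code rather than copying it:

package main

import (
	"encoding/json"
	"fmt"
)

// extractLlamaCachedTokens mirrors the merged helper, with encoding/json
// standing in for the project's common.Unmarshal wrapper.
func extractLlamaCachedTokens(body []byte) (int, bool) {
	if len(body) == 0 {
		return 0, false
	}
	var payload struct {
		Timings struct {
			CachedTokens *int `json:"cache_n"`
		} `json:"timings"`
	}
	if err := json.Unmarshal(body, &payload); err != nil {
		return 0, false
	}
	if payload.Timings.CachedTokens == nil {
		return 0, false
	}
	return *payload.Timings.CachedTokens, true
}

func main() {
	// Body with timings.cache_n present: the helper reports the cached count.
	withCache := []byte(`{"timings": {"prompt_n": 256, "cache_n": 128}}`)
	n, ok := extractLlamaCachedTokens(withCache)
	fmt.Println(n, ok) // 128 true

	// Body without the field: (0, false), so the caller leaves CachedTokens untouched.
	withoutCache := []byte(`{"usage": {"prompt_tokens": 256}}`)
	n, ok = extractLlamaCachedTokens(withoutCache)
	fmt.Println(n, ok) // 0 false
}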