Commit 9894125

mdonajmagic authored and committed
Fix: Prevent cache token overwrite by last chunk in streaming usage
1 parent b82af5b, commit 9894125

2 files changed: +97, -2 lines changed

litellm/litellm_core_utils/streaming_chunk_builder_utils.py
Lines changed: 8 additions & 2 deletions

@@ -348,11 +348,17 @@ def calculate_usage(
                 and usage_chunk_dict["completion_tokens"] > 0
             ):
                 completion_tokens = usage_chunk_dict["completion_tokens"]
-            if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+            if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
+                usage_chunk_dict["cache_creation_input_tokens"] > 0
+                or cache_creation_input_tokens is None
+            ):
                 cache_creation_input_tokens = usage_chunk_dict[
                     "cache_creation_input_tokens"
                 ]
-            if usage_chunk_dict["cache_read_input_tokens"] is not None:
+            if usage_chunk_dict["cache_read_input_tokens"] is not None and (
+                usage_chunk_dict["cache_read_input_tokens"] > 0
+                or cache_read_input_tokens is None
+            ):
                 cache_read_input_tokens = usage_chunk_dict[
                     "cache_read_input_tokens"
                 ]
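
The effect of the guard is that a later usage chunk can only overwrite the cache-token counters with a positive value, or fill them in when nothing has been recorded yet; a trailing chunk that reports 0 no longer wipes out the counts from an earlier chunk. A minimal standalone sketch of that folding logic, outside litellm (the aggregate_cache_tokens helper and the plain-dict chunks are illustrative assumptions, not the library's actual API):

from typing import Dict, List, Optional, Tuple

def aggregate_cache_tokens(
    usage_chunks: List[Dict[str, Optional[int]]],
) -> Tuple[Optional[int], Optional[int]]:
    # Fold per-chunk usage into final cache token counts, mirroring the
    # guarded updates added to calculate_usage above.
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    for usage_chunk_dict in usage_chunks:
        # Accept a new value only if it is positive, or if nothing has been
        # recorded yet; a trailing zero no longer overwrites earlier counts.
        if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
            usage_chunk_dict["cache_creation_input_tokens"] > 0
            or cache_creation_input_tokens is None
        ):
            cache_creation_input_tokens = usage_chunk_dict["cache_creation_input_tokens"]
        if usage_chunk_dict["cache_read_input_tokens"] is not None and (
            usage_chunk_dict["cache_read_input_tokens"] > 0
            or cache_read_input_tokens is None
        ):
            cache_read_input_tokens = usage_chunk_dict["cache_read_input_tokens"]
    return cache_creation_input_tokens, cache_read_input_tokens

# The scenario from the commit: the first chunk carries the cache counts,
# the final chunk reports zeros. Without the "> 0 or ... is None" guard the
# result would be (0, 0); with it, the earlier values are retained.
chunks = [
    {"cache_creation_input_tokens": 4, "cache_read_input_tokens": 11775},
    {"cache_creation_input_tokens": 0, "cache_read_input_tokens": 0},
]
assert aggregate_cache_tokens(chunks) == (4, 11775)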

tests/litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py
Lines changed: 89 additions & 0 deletions

@@ -16,6 +16,8 @@
     Function,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
+    PromptTokensDetails,
 )


@@ -153,3 +155,90 @@ def test_get_combined_tool_content():
             type="function",
         ),
     ]
+
+
+def test_cache_read_input_tokens_retained():
+    chunk1 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513206,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason=None,
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content="",
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=5,
+            prompt_tokens=11779,
+            total_tokens=11784,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=11775
+            ),
+            cache_creation_input_tokens=4,
+            cache_read_input_tokens=11775,
+        ),
+    )
+
+    chunk2 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513207,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason="stop",
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content=None,
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=214,
+            prompt_tokens=0,
+            total_tokens=214,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=0
+            ),
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    # Use dictionaries directly instead of ModelResponseStream
+    chunks = [chunk1, chunk2]
+    processor = ChunkProcessor(chunks=chunks)
+
+    usage = processor.calculate_usage(
+        chunks=chunks,
+        model="claude-3-7-sonnet",
+        completion_output="",
+    )
+
+    assert usage.cache_creation_input_tokens == 4
+    assert usage.cache_read_input_tokens == 11775
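
Assuming a development checkout of litellm with its test dependencies installed, the new case can be run on its own with pytest tests/litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py::test_cache_read_input_tokens_retained. Against the pre-fix code the first assertion would fail, since the final chunk's zero cache counts overwrite the values reported by the first chunk.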

0 commit comments
