Commit 9894125

mdonajmagic authored and committed
Fix: Prevent cache token overwrite by last chunk in streaming usage
1 parent b82af5b, commit 9894125

2 files changed: +97, -2 lines changed

litellm/litellm_core_utils/streaming_chunk_builder_utils.py
Lines changed: 8 additions & 2 deletions

@@ -348,11 +348,17 @@ def calculate_usage(
                 and usage_chunk_dict["completion_tokens"] > 0
             ):
                 completion_tokens = usage_chunk_dict["completion_tokens"]
-            if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+            if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
+                usage_chunk_dict["cache_creation_input_tokens"] > 0
+                or cache_creation_input_tokens is None
+            ):
                 cache_creation_input_tokens = usage_chunk_dict[
                     "cache_creation_input_tokens"
                 ]
-            if usage_chunk_dict["cache_read_input_tokens"] is not None:
+            if usage_chunk_dict["cache_read_input_tokens"] is not None and (
+                usage_chunk_dict["cache_read_input_tokens"] > 0
+                or cache_read_input_tokens is None
+            ):
                 cache_read_input_tokens = usage_chunk_dict[
                     "cache_read_input_tokens"
                 ]
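
The effect of the guard is that a later usage chunk can only overwrite the cache-token counters with a positive value, or fill them in when nothing has been recorded yet; a trailing chunk that reports 0 no longer wipes out the counts from an earlier chunk. A minimal standalone sketch of that folding logic, outside litellm (the aggregate_cache_tokens helper and the plain-dict chunks are illustrative assumptions, not the library's actual API):

from typing import Dict, List, Optional, Tuple

def aggregate_cache_tokens(
    usage_chunks: List[Dict[str, Optional[int]]],
) -> Tuple[Optional[int], Optional[int]]:
    # Fold per-chunk usage into final cache token counts, mirroring the
    # guarded updates added to calculate_usage above.
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    for usage_chunk_dict in usage_chunks:
        # Accept a new value only if it is positive, or if nothing has been
        # recorded yet; a trailing zero no longer overwrites earlier counts.
        if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
            usage_chunk_dict["cache_creation_input_tokens"] > 0
            or cache_creation_input_tokens is None
        ):
            cache_creation_input_tokens = usage_chunk_dict["cache_creation_input_tokens"]
        if usage_chunk_dict["cache_read_input_tokens"] is not None and (
            usage_chunk_dict["cache_read_input_tokens"] > 0
            or cache_read_input_tokens is None
        ):
            cache_read_input_tokens = usage_chunk_dict["cache_read_input_tokens"]
    return cache_creation_input_tokens, cache_read_input_tokens

# The scenario from the commit: the first chunk carries the cache counts,
# the final chunk reports zeros. Without the "> 0 or ... is None" guard the
# result would be (0, 0); with it, the earlier values are retained.
chunks = [
    {"cache_creation_input_tokens": 4, "cache_read_input_tokens": 11775},
    {"cache_creation_input_tokens": 0, "cache_read_input_tokens": 0},
]
assert aggregate_cache_tokens(chunks) == (4, 11775)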

tests/litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py
Lines changed: 89 additions & 0 deletions

@@ -16,6 +16,8 @@
     Function,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
+    PromptTokensDetails,
 )


@@ -153,3 +155,90 @@ def test_get_combined_tool_content():
             type="function",
         ),
     ]
+
+
+def test_cache_read_input_tokens_retained():
+    chunk1 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513206,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason=None,
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content="",
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=5,
+            prompt_tokens=11779,
+            total_tokens=11784,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=11775
+            ),
+            cache_creation_input_tokens=4,
+            cache_read_input_tokens=11775,
+        ),
+    )
+
+    chunk2 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513207,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason="stop",
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content=None,
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=214,
+            prompt_tokens=0,
+            total_tokens=214,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=0
+            ),
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    # Use dictionaries directly instead of ModelResponseStream
+    chunks = [chunk1, chunk2]
+    processor = ChunkProcessor(chunks=chunks)
+
+    usage = processor.calculate_usage(
+        chunks=chunks,
+        model="claude-3-7-sonnet",
+        completion_output="",
+    )
+
+    assert usage.cache_creation_input_tokens == 4
+    assert usage.cache_read_input_tokens == 11775
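
Assuming a development checkout of litellm with its test dependencies installed, the new case can be run on its own with pytest tests/litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py::test_cache_read_input_tokens_retained. Against the pre-fix code the first assertion would fail, since the final chunk's zero cache counts overwrite the values reported by the first chunk.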

0 commit comments
