
Commit 54ffb1f

krrishdholakia authored and stefan-- committed
Anthropic - pass file URLs as Document content type + Gemini - cache token tracking on streaming calls (BerriAI#11387)

* fix(anthropic/): fix regression when passing file URLs to the 'file_id' parameter; add a test and ensure Anthropic file URLs are correctly sent as 'document' blocks
* fix(vertex_and_google_ai_studio.py): use the same usage calculation function as non-streaming. Closes BerriAI#10667
* test(test_vertex_and_google_ai_studio_gemini.py): update test
1 parent 685b600 commit 54ffb1f
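
For context, a minimal user-facing sketch of the Anthropic change: an OpenAI-style message whose file part carries a public URL in 'file_id' should now be forwarded to Anthropic as a URL-sourced 'document' block. This is not part of the commit; the model name is an assumption (any PDF-capable Anthropic model should exercise the same path), and the URL is the one used in the new test.

# Sketch only: litellm.completion call passing a file URL via file_id.
import litellm

response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20240620",  # assumed PDF-capable model
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's this file about?"},
                {
                    "type": "file",
                    "file": {
                        # A plain URL here previously regressed into a
                        # container_upload block; it is now sent as a document.
                        "file_id": "https://upload.wikimedia.org/wikipedia/commons/2/20/Re_example.pdf"
                    },
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)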

File tree

7 files changed: +161 −44 lines changed

litellm/litellm_core_utils/prompt_templates/factory.py
litellm/llms/gemini/realtime/transformation.py
litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
litellm/types/llms/anthropic.py
tests/llm_translation/base_llm_unit_tests.py
tests/llm_translation/test_anthropic_completion.py
tests/test_litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py

litellm/litellm_core_utils/prompt_templates/factory.py

Lines changed: 24 additions & 1 deletion
@@ -1396,6 +1396,21 @@ def select_anthropic_content_block_type_for_file(
     return "container_upload"
 
 
+def anthropic_infer_file_id_content_type(
+    file_id: str,
+) -> Literal["document_url", "container_upload"]:
+    """
+    Use when 'format' not provided.
+
+    - URL's - assume are document_url
+    - Else - assume is container_upload
+    """
+    if file_id.startswith("http") or file_id.startswith("https"):
+        return "document_url"
+    else:
+        return "container_upload"
+
+
 def anthropic_process_openai_file_message(
     message: ChatCompletionFileObject,
 ) -> Union[
@@ -1425,7 +1440,7 @@ def anthropic_process_openai_file_message(
     content_block_type = (
         select_anthropic_content_block_type_for_file(format)
         if format
-        else "container_upload"
+        else anthropic_infer_file_id_content_type(file_id)
     )
     return_block_param: Optional[
         Union[
@@ -1442,6 +1457,14 @@ def anthropic_process_openai_file_message(
                 file_id=file_id,
             ),
         )
+    elif content_block_type == "document_url":
+        return_block_param = AnthropicMessagesDocumentParam(
+            type="document",
+            source=AnthropicContentParamSourceUrl(
+                type="url",
+                url=file_id,
+            ),
+        )
     elif content_block_type == "image":
         return_block_param = AnthropicMessagesImageParam(
             type="image",

litellm/llms/gemini/realtime/transformation.py

Lines changed: 1 addition & 1 deletion
@@ -658,7 +658,7 @@ def transform_response_done_event(
             modality.lower() for modality in cast(List[str], gemini_modalities)
         ]
         if "usageMetadata" in message:
-            _chat_completion_usage = VertexGeminiConfig()._calculate_usage(
+            _chat_completion_usage = VertexGeminiConfig._calculate_usage(
                 completion_response=message,
             )
         else:

litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py

Lines changed: 26 additions & 40 deletions
@@ -902,7 +902,8 @@ def _handle_content_policy_violation(
 
         return model_response
 
-    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+    @staticmethod
+    def is_candidate_token_count_inclusive(usage_metadata: UsageMetadata) -> bool:
         """
         Check if the candidate token count is inclusive of the thinking token count
 
@@ -919,13 +920,16 @@ def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> b
         else:
             return False
 
+    @staticmethod
     def _calculate_usage(
-        self,
         completion_response: Union[
             GenerateContentResponseBody, BidiGenerateContentServerMessage
         ],
     ) -> Usage:
-        if "usageMetadata" not in completion_response:
+        if (
+            completion_response is not None
+            and "usageMetadata" not in completion_response
+        ):
             raise ValueError(
                 f"usageMetadata not found in completion_response. Got={completion_response}"
             )
@@ -936,33 +940,30 @@ def _calculate_usage(
         reasoning_tokens: Optional[int] = None
         response_tokens: Optional[int] = None
         response_tokens_details: Optional[CompletionTokensDetailsWrapper] = None
-        if "cachedContentTokenCount" in completion_response["usageMetadata"]:
-            cached_tokens = completion_response["usageMetadata"][
-                "cachedContentTokenCount"
-            ]
+        usage_metadata = completion_response["usageMetadata"]
+        if "cachedContentTokenCount" in usage_metadata:
+            cached_tokens = usage_metadata["cachedContentTokenCount"]
 
         ## GEMINI LIVE API ONLY PARAMS ##
-        if "responseTokenCount" in completion_response["usageMetadata"]:
-            response_tokens = completion_response["usageMetadata"]["responseTokenCount"]
-        if "responseTokensDetails" in completion_response["usageMetadata"]:
+        if "responseTokenCount" in usage_metadata:
+            response_tokens = usage_metadata["responseTokenCount"]
+        if "responseTokensDetails" in usage_metadata:
             response_tokens_details = CompletionTokensDetailsWrapper()
-            for detail in completion_response["usageMetadata"]["responseTokensDetails"]:
+            for detail in usage_metadata["responseTokensDetails"]:
                 if detail["modality"] == "TEXT":
                     response_tokens_details.text_tokens = detail["tokenCount"]
                 elif detail["modality"] == "AUDIO":
                     response_tokens_details.audio_tokens = detail["tokenCount"]
         #########################################################
 
-        if "promptTokensDetails" in completion_response["usageMetadata"]:
-            for detail in completion_response["usageMetadata"]["promptTokensDetails"]:
+        if "promptTokensDetails" in usage_metadata:
+            for detail in usage_metadata["promptTokensDetails"]:
                 if detail["modality"] == "AUDIO":
                     audio_tokens = detail["tokenCount"]
                 elif detail["modality"] == "TEXT":
                     text_tokens = detail["tokenCount"]
-        if "thoughtsTokenCount" in completion_response["usageMetadata"]:
-            reasoning_tokens = completion_response["usageMetadata"][
-                "thoughtsTokenCount"
-            ]
+        if "thoughtsTokenCount" in usage_metadata:
+            reasoning_tokens = usage_metadata["thoughtsTokenCount"]
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
@@ -973,19 +974,15 @@ def _calculate_usage(
             "candidatesTokenCount", 0
         )
         if (
-            not self.is_candidate_token_count_inclusive(
-                completion_response["usageMetadata"]
-            )
+            not VertexGeminiConfig.is_candidate_token_count_inclusive(usage_metadata)
             and reasoning_tokens
         ):
             completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
-            prompt_tokens=completion_response["usageMetadata"].get(
-                "promptTokenCount", 0
-            ),
+            prompt_tokens=usage_metadata.get("promptTokenCount", 0),
             completion_tokens=completion_tokens,
-            total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
+            total_tokens=usage_metadata.get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
             completion_tokens_details=response_tokens_details,
@@ -1169,7 +1166,9 @@ def transform_response(
             _candidates, model_response, logging_obj.optional_params
         )
 
-        usage = self._calculate_usage(completion_response=completion_response)
+        usage = VertexGeminiConfig._calculate_usage(
+            completion_response=completion_response
+        )
         setattr(model_response, "usage", usage)
 
         ## ADD METADATA TO RESPONSE ##
@@ -1806,21 +1805,8 @@ def chunk_parser(self, chunk: dict) -> "ModelResponseStream":
             ## GEMINI SETS FINISHREASON ON EVERY CHUNK!
 
             if "usageMetadata" in processed_chunk:
-                usage = Usage(
-                    prompt_tokens=processed_chunk["usageMetadata"].get(
-                        "promptTokenCount", 0
-                    ),
-                    completion_tokens=processed_chunk["usageMetadata"].get(
-                        "candidatesTokenCount", 0
-                    ),
-                    total_tokens=processed_chunk["usageMetadata"].get(
-                        "totalTokenCount", 0
-                    ),
-                    completion_tokens_details=CompletionTokensDetailsWrapper(
-                        reasoning_tokens=processed_chunk["usageMetadata"].get(
-                            "thoughtsTokenCount", 0
-                        )
-                    ),
+                usage = VertexGeminiConfig._calculate_usage(
+                    completion_response=processed_chunk,
                 )
 
             args: Dict[str, Any] = {
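
Because streaming chunks now go through the same _calculate_usage path as full responses, cached-content and thinking tokens are tracked for streamed Gemini calls too. Below is a standalone, hedged re-implementation of that arithmetic on a raw usageMetadata dict; the function name and return shape are illustrative, not litellm APIs, and the example cached count is made up.

from typing import Optional


def summarize_gemini_usage(usage_metadata: dict) -> dict:
    """Illustrative re-implementation of the shared usage math."""
    cached: Optional[int] = usage_metadata.get("cachedContentTokenCount")
    reasoning: Optional[int] = usage_metadata.get("thoughtsTokenCount")
    completion = usage_metadata.get("candidatesTokenCount", 0)
    # The real helper only adds thinking tokens when
    # is_candidate_token_count_inclusive() reports they are not already
    # counted; this sketch assumes they are not inclusive.
    if reasoning:
        completion += reasoning
    return {
        "prompt_tokens": usage_metadata.get("promptTokenCount", 0),
        "completion_tokens": completion,
        "total_tokens": usage_metadata.get("totalTokenCount", 0),
        "cached_tokens": cached,
        "reasoning_tokens": reasoning,
    }


# Example: a streaming chunk's usageMetadata reporting a prompt-cache hit.
print(summarize_gemini_usage({
    "promptTokenCount": 57,
    "cachedContentTokenCount": 32,  # illustrative cached-token count
    "candidatesTokenCount": 10,
    "totalTokenCount": 67,
}))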

litellm/types/llms/anthropic.py

Lines changed: 10 additions & 1 deletion
@@ -114,6 +114,11 @@ class AnthropicContentParamSource(TypedDict):
     data: str
 
 
+class AnthropicContentParamSourceUrl(TypedDict):
+    type: Literal["url"]
+    url: str
+
+
 class AnthropicContentParamSourceFileId(TypedDict):
     type: Literal["file"]
     file_id: str
@@ -140,7 +145,11 @@ class CitationsObject(TypedDict):
 class AnthropicMessagesDocumentParam(TypedDict, total=False):
     type: Required[Literal["document"]]
     source: Required[
-        Union[AnthropicContentParamSource, AnthropicContentParamSourceFileId]
+        Union[
+            AnthropicContentParamSource,
+            AnthropicContentParamSourceFileId,
+            AnthropicContentParamSourceUrl,
+        ]
     ]
     cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
     title: str
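
As a quick usage sketch, the new TypedDict can be constructed like any other TypedDict; the import path is assumed from this file's location (litellm/types/llms/anthropic.py) and the URL is illustrative.

# Sketch: building a URL-sourced document block with the types added above.
from litellm.types.llms.anthropic import (
    AnthropicContentParamSourceUrl,
    AnthropicMessagesDocumentParam,
)

doc_block: AnthropicMessagesDocumentParam = {
    "type": "document",
    "source": AnthropicContentParamSourceUrl(
        type="url",
        url="https://example.com/report.pdf",  # illustrative URL
    ),
}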

tests/llm_translation/base_llm_unit_tests.py

Lines changed: 34 additions & 0 deletions
@@ -248,6 +248,40 @@ async def test_pdf_handling(self, pdf_messages, sync_mode):
         )
 
         assert response is not None
+
+    @pytest.mark.asyncio
+    async def test_async_pdf_handling_with_file_id(self):
+        from litellm.utils import supports_pdf_input
+        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+        litellm.model_cost = litellm.get_model_cost_map(url="")
+
+        litellm._turn_on_debug()
+
+
+        image_content = [
+            {"type": "text", "text": "What's this file about?"},
+            {
+                "type": "file",
+                "file": {
+                    "file_id": "https://upload.wikimedia.org/wikipedia/commons/2/20/Re_example.pdf"
+                },
+            },
+        ]
+
+        image_messages = [{"role": "user", "content": image_content}]
+
+        base_completion_call_args = self.get_base_completion_call_args()
+
+        if not supports_pdf_input(base_completion_call_args["model"], None):
+            pytest.skip("Model does not support image input")
+
+        response = await self.async_completion_function(
+            **base_completion_call_args,
+            messages=image_messages,
+        )
+
+        assert response is not None
+
 
     def test_file_data_unit_test(self, pdf_messages):
         from litellm.utils import supports_pdf_input, return_raw_request

tests/llm_translation/test_anthropic_completion.py

Lines changed: 1 addition & 0 deletions
@@ -1272,3 +1272,4 @@ def test_anthropic_text_editor():
         print(e)
 
     assert response is not None
+

tests/test_litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py

Lines changed: 65 additions & 1 deletion
@@ -309,7 +309,10 @@ def test_vertex_ai_candidate_token_count_inclusive(
     Test that the candidate token count is inclusive of the thinking token count
     """
     v = VertexGeminiConfig()
-    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+    assert (
+        VertexGeminiConfig.is_candidate_token_count_inclusive(usage_metadata)
+        is inclusive
+    )
 
     usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
     assert usage.prompt_tokens == expected_usage.prompt_tokens
@@ -490,3 +493,64 @@ def test_vertex_ai_map_tool_with_anyof():
         "anyOf": [{"type": "string", "nullable": True, "title": "Base Branch"}]
     }, f"Expected only anyOf field and its contents to be kept, but got {tools[0]['function_declarations'][0]['parameters']['properties']['base_branch']}"
 
+
+def test_vertex_ai_streaming_usage_calculation():
+    """
+    Ensure streaming usage calculation uses same function as non-streaming usage calculation
+    """
+    from unittest.mock import patch
+
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        ModelResponseIterator,
+        VertexGeminiConfig,
+    )
+
+    v = VertexGeminiConfig()
+    usage_metadata = {
+        "promptTokenCount": 57,
+        "candidatesTokenCount": 10,
+        "totalTokenCount": 67,
+    }
+
+    # Test streaming chunk parsing
+    with patch.object(VertexGeminiConfig, "_calculate_usage") as mock_calculate_usage:
+        # Create a streaming chunk
+        chunk = {
+            "candidates": [{"content": {"parts": [{"text": "Hello"}]}}],
+            "usageMetadata": usage_metadata,
+        }
+
+        # Create iterator and parse chunk
+        iterator = ModelResponseIterator(
+            streaming_response=[], sync_stream=True, logging_obj=MagicMock()
+        )
+        iterator.chunk_parser(chunk)
+
+        # Verify _calculate_usage was called with correct parameters
+        mock_calculate_usage.assert_called_once_with(completion_response=chunk)
+
+    # Test non-streaming response parsing
+    with patch.object(VertexGeminiConfig, "_calculate_usage") as mock_calculate_usage:
+        # Create a completion response
+        completion_response = {
+            "candidates": [{"content": {"parts": [{"text": "Hello"}]}}],
+            "usageMetadata": usage_metadata,
+        }
+
+        # Parse completion response
+        v.transform_response(
+            model="gemini-pro",
+            raw_response=MagicMock(json=lambda: completion_response),
+            model_response=ModelResponse(),
+            logging_obj=MagicMock(),
+            request_data={},
+            messages=[],
+            optional_params={},
+            litellm_params={},
+            encoding=None,
+        )
+
+        # Verify _calculate_usage was called with correct parameters
+        mock_calculate_usage.assert_called_once_with(
+            completion_response=completion_response,
+        )
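
End to end, the Gemini change means the usage attached to a streamed response is computed by the same function as in the non-streaming path, so cached-content token details come through on streams as well. A hedged sketch of how that might be observed from the public API follows; the model name and the use of stream_options are assumptions, not part of this commit.

# Sketch only: inspect cached prompt tokens on a streamed Gemini call.
import litellm

stream = litellm.completion(
    model="gemini/gemini-1.5-pro",  # assumed model
    messages=[{"role": "user", "content": "Summarize the cached context."}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    usage = getattr(chunk, "usage", None)
    if usage is not None and usage.prompt_tokens_details is not None:
        print("cached prompt tokens:", usage.prompt_tokens_details.cached_tokens)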
