From a48acd95f860c4fc85853e20668eabffff07cae7 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Wed, 7 May 2025 18:52:04 +0200
Subject: [PATCH 1/6] fix(embeddings): use non default tokenizer when passing
 list of lists of tokens (int)

---
 litellm/proxy/proxy_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 18e1d8d98a75..b215afc9e314 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3821,7 +3821,7 @@ async def embeddings( # noqa: PLR0915
                         input_list = []
                         for i in data["input"]:
                             input_list.append(
-                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
+                                litellm.decode(model=m["model_name"], tokens=i)
                             )
                         data["input"] = input_list
                         break

From 18f57d6cc1f3e3b0b432abfda7e1e31bf5db2e26 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Wed, 7 May 2025 18:52:37 +0200
Subject: [PATCH 2/6] feat(embeddings): allow for passthrough of list of lists
 of tokens to hosted_vllm models

---
 litellm/proxy/proxy_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index b215afc9e314..5c22bc4ea088 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3814,6 +3814,7 @@ async def embeddings( # noqa: PLR0915
                     if m["model_name"] == data["model"] and (
                         m["litellm_params"]["model"] in litellm.open_ai_embedding_models
                         or m["litellm_params"]["model"].startswith("azure/")
+                        or m["litellm_params"]["model"].startswith("hosted_vllm/")
                     ):
                         pass
                     else:

From a11ea61a4088a629b13c862caebbb45e38a044d3 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 15:13:37 +0200
Subject: [PATCH 3/6] Revert "fix(embeddings): use non default tokenizer when
 passing list of lists of tokens (int)"

This reverts commit a48acd95f860c4fc85853e20668eabffff07cae7.

---
 litellm/proxy/proxy_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 5c22bc4ea088..6cd3e845a652 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3822,7 +3822,7 @@ async def embeddings( # noqa: PLR0915
                         input_list = []
                         for i in data["input"]:
                             input_list.append(
-                                litellm.decode(model=m["model_name"], tokens=i)
+                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
                             )
                         data["input"] = input_list
                         break

From 9ae916646fc20319a6e48514fb7b8e3f8c9f8d9d Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 17:13:06 +0200
Subject: [PATCH 4/6] refactor(embeddings): use a list to verify whether the
 provider accepts a list of tokens as input

---
 litellm/constants.py          |  6 ++++++
 litellm/proxy/proxy_server.py |  9 ++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/litellm/constants.py b/litellm/constants.py
index fa944c0dfaa1..8e3d529c16d3 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -164,6 +164,12 @@
     "meta_llama",
 ]
 
+LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS = [
+    "openai",
+    "azure",
+    "hosted_vllm"
+]
+
 
 OPENAI_CHAT_COMPLETION_PARAMS = [
     "functions",
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 6cd3e845a652..3b8710982084 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -28,6 +28,7 @@
 from litellm.constants import (
     DEFAULT_MAX_RECURSE_DEPTH,
     DEFAULT_SLACK_ALERTING_THRESHOLD,
+    LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
 )
 from litellm.types.utils import (
     ModelResponse,
@@ -3808,13 +3809,15 @@ async def embeddings( # noqa: PLR0915
             and isinstance(data["input"][0], list)
             and isinstance(data["input"][0][0], int)
         ):  # check if array of tokens passed in
-            # check if non-openai/azure model called - e.g. for langchain integration
+            # check if the provider accepts a list of tokens as input - e.g. for langchain integration
             if llm_model_list is not None and data["model"] in router_model_names:
                 for m in llm_model_list:
                     if m["model_name"] == data["model"] and (
                         m["litellm_params"]["model"] in litellm.open_ai_embedding_models
-                        or m["litellm_params"]["model"].startswith("azure/")
-                        or m["litellm_params"]["model"].startswith("hosted_vllm/")
+                        or any(
+                            m["litellm_params"]["model"].startswith(provider)
+                            for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
+                        )
                     ):
                         pass
                     else:

From 94955a7c2e93e61958bd48e7c0b15105bf84c869 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 17:16:57 +0200
Subject: [PATCH 5/6] fix(embeddings): verify the model name before checking
 whether the provider accepts arrays of tokens as input

When a list of token arrays is passed as input, verify the provider of the
model by going through the list of models (`llm_model_list`). First, check
the model name, then look up the provider and verify whether it accepts
arrays of tokens as input. If it does, pass the input through unchanged;
otherwise, decode it.
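
Roughly, the per-deployment check now works like this (a simplified sketch
of the same logic, not the literal change; the name `accepts_token_arrays`
is only illustrative, the actual code is in the diff below):

    for m in llm_model_list:
        if m["model_name"] != data["model"]:
            continue  # not the requested deployment, keep looking
        accepts_token_arrays = (
            m["litellm_params"]["model"] in litellm.open_ai_embedding_models
            or any(
                m["litellm_params"]["model"].startswith(provider)
                for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
            )
        )
        if not accepts_token_arrays:
            # decode token arrays back to strings before forwarding the request
            data["input"] = [
                litellm.decode(model="gpt-3.5-turbo", tokens=i) for i in data["input"]
            ]
        break
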
Previously, the provider and the model name were verified at the same time,
which resulted in decoding even when the model currently being checked
(while looping over `llm_model_list`) was not the target one.
---
 litellm/proxy/proxy_server.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 3b8710982084..5eda8c5abb02 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3812,23 +3812,23 @@ async def embeddings( # noqa: PLR0915
             # check if the provider accepts a list of tokens as input - e.g. for langchain integration
             if llm_model_list is not None and data["model"] in router_model_names:
                 for m in llm_model_list:
-                    if m["model_name"] == data["model"] and (
-                        m["litellm_params"]["model"] in litellm.open_ai_embedding_models
-                        or any(
-                            m["litellm_params"]["model"].startswith(provider)
-                            for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
-                        )
-                    ):
-                        pass
-                    else:
-                        # non-openai/azure embedding model called with token input
-                        input_list = []
-                        for i in data["input"]:
-                            input_list.append(
-                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
-                            )
-                        data["input"] = input_list
-                        break
+                    if m["model_name"] == data["model"]:
+                        if (m["litellm_params"]["model"] in litellm.open_ai_embedding_models
+                            or any(
+                                m["litellm_params"]["model"].startswith(provider)
+                                for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
+                            )
+                        ):
+                            pass
+                        else:
+                            # non-openai/azure embedding model called with token input
+                            input_list = []
+                            for i in data["input"]:
+                                input_list.append(
+                                    litellm.decode(model="gpt-3.5-turbo", tokens=i)
+                                )
+                            data["input"] = input_list
+                        break
 
         ### CALL HOOKS ### - modify incoming data / reject request before calling the model
         data = await proxy_logging_obj.pre_call_hook(

From cedacd18bae48338fbc1aa42de74a0df50abdf8c Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Tue, 13 May 2025 10:45:31 +0200
Subject: [PATCH 6/6] test(embedding): add unit test ensuring decode is
 bypassed for providers that accept arrays of tokens as input

Ref: https://github.com/BerriAI/litellm/issues/10113
---
 .../test_configs/test_config_no_auth.yaml |  6 ++
 tests/litellm/proxy/test_proxy_server.py  | 76 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 tests/litellm/proxy/test_configs/test_config_no_auth.yaml

diff --git a/tests/litellm/proxy/test_configs/test_config_no_auth.yaml b/tests/litellm/proxy/test_configs/test_config_no_auth.yaml
new file mode 100644
index 000000000000..1b6b9ad198e0
--- /dev/null
+++ b/tests/litellm/proxy/test_configs/test_config_no_auth.yaml
@@ -0,0 +1,6 @@
+model_list:
+- litellm_params:
+    model: hosted_vllm/embed_model
+  model_info:
+    description: this is a test embedding hosted_vllm model
+  model_name: vllm_embed_model
diff --git a/tests/litellm/proxy/test_proxy_server.py b/tests/litellm/proxy/test_proxy_server.py
index 919a00d67039..ba787bc1e1e2 100644
--- a/tests/litellm/proxy/test_proxy_server.py
+++ b/tests/litellm/proxy/test_proxy_server.py
@@ -1,9 +1,11 @@
+import asyncio
 import importlib
 import json
 import os
 import socket
 import subprocess
 import sys
+from unittest import mock
 from unittest.mock import AsyncMock, MagicMock, mock_open, patch
 
 import click
@@ -18,6 +20,51 @@
 )  # Adds the parent directory to the system-path
 
 import litellm
+from litellm.proxy.proxy_server import app, initialize
+
+example_embedding_result = {
+    "object": "list",
+    "data": [
+        {
+            "object": "embedding",
+            "index": 0,
+            "embedding": [
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+            ],
+        }
+    ],
+    "model": "text-embedding-3-small",
+    "usage": {"prompt_tokens": 5, "total_tokens": 5},
+}
+
+def mock_patch_aembedding():
+    return mock.patch(
+        "litellm.proxy.proxy_server.llm_router.aembedding",
+        return_value=example_embedding_result,
+    )
+
+@pytest.fixture(scope="function")
+def client_no_auth():
+    # Assuming litellm.proxy.proxy_server is an object
+    from litellm.proxy.proxy_server import cleanup_router_config_variables
+
+    cleanup_router_config_variables()
+    filepath = os.path.dirname(os.path.abspath(__file__))
+    config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml"
+    # initialize can get run in parallel; it sets specific variables for the FastAPI app, and since it runs in parallel, different tests could otherwise use the wrong variables
+    asyncio.run(initialize(config=config_fp, debug=True))
+    return TestClient(app)
 
 
 @pytest.mark.asyncio
@@ -189,3 +236,32 @@ def test_team_info_masking():
     print("Got exception: {}".format(exc_info.value))
     assert "secret-test-key" not in str(exc_info.value)
     assert "public-test-key" not in str(exc_info.value)
+
+
+@mock_patch_aembedding()
+def test_embedding_input_array_of_tokens(mock_aembedding, client_no_auth):
+    """
+    Test that decoding of token-array input is bypassed for supported providers
+
+    Ref: https://github.com/BerriAI/litellm/issues/10113
+    """
+    try:
+        test_data = {
+            "model": "vllm_embed_model",
+            "input": [[2046, 13269, 158208]],
+        }
+
+        response = client_no_auth.post("/v1/embeddings", json=test_data)
+
+        mock_aembedding.assert_called_once_with(
+            model="vllm_embed_model",
+            input=[[2046, 13269, 158208]],
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
+        assert response.status_code == 200
+        result = response.json()
+        print(len(result["data"][0]["embedding"]))
+        assert len(result["data"][0]["embedding"]) > 10  # mocked result has 12 values; real embeddings are usually 1536-dimensional
+    except Exception as e:
+        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")