vllm-project · mgoin · Apr 22, 2025 · Apr 18, 2025 · Apr 18, 2025 · Apr 18, 2025
diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+import base64
+from time import time
+
+import openai
+import pytest
+import requests
+
+from vllm import envs
+
+from ...utils import RemoteOpenAIServer
+
+if not envs.VLLM_USE_V1:
+    pytest.skip(
+        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
+        allow_module_level=True,
+    )
+
+
+def encode_base64_content_from_url(content_url: str) -> str:
+    """Encode a content retrieved from a remote url to base64 format."""
+
+    with requests.get(content_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+
+@pytest.mark.asyncio
+async def test_encoder_compilation(monkeypatch):
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+    # model_name = "bczhou/TinyLLaVA-1.5B"
+    server_args = [
+        "--max-model-len", "4096", "--max-num-seqs", "16",
+        "--max-num-batched-tokens", "512"
+    ]
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    # with tempfile.TemporaryDirectory() as temp_dir:
+    #     monkeypatch.setenv("VLLM_XLA_CACHE_PATH", temp_dir)
+    # # Server will pre-compile on first startup.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client: openai.AsyncOpenAI = remote_server.get_async_client()
+        image_base64 = encode_base64_content_from_url(image_url)
+        req_body = dict(messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{image_base64}"
+                    },
+                },
+            ],
+        }],
+                        model=model_name,
+                        max_completion_tokens=48)
+        s = time()
+        chat_completion_from_base64 = await client.chat.completions.create(
+            **req_body)
+        run1 = time() - s
+        print("RUN1", run1)
+        s = time()
+        chat_completion_from_base64 = await client.chat.completions.create(
+            **req_body)
+        run2 = time() - s
+        print("RUN2", run2)
+        result = chat_completion_from_base64.choices[0].message.content
+        assert result
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py
@@ -42,7 +42,7 @@ def test_sampler_different(model_name: str):
         sampling_params = SamplingParams(temperature=0.3, seed=42)
         output2 = llm.generate(prompts, sampling_params)
 
-    # Batch-case with TopK
+    # Batch-case with TopK/P
     for B in [4, 16]:
         p = prompts * B
         sampling_params = [
@@ -51,9 +51,10 @@ def test_sampler_different(model_name: str):
                 min_p=0.8,
                 max_tokens=64,
                 # Vary number of ks
-                top_k=random.randint(4, 12)) for _ in range(B)
+                top_k=random.randint(4, 12),
+                top_p=random.random()) for _ in range(B)
         ]
-        # Make sure first two reqs have the same K
+        # Make sure first two reqs have the same K/P
         sampling_params[0] = sampling_params[1]
         output = llm.generate(p, sampling_params)
         assert output[0].outputs[0].text == output[1].outputs[0].text
@@ -11,7 +11,7 @@
     min_p=0.0,
     # strictly disabled for now
     top_k=0,
-    # top_p=0.0,
+    top_p=1.0,
     # frequency_penalties=0.0,
     # presence_penalties=0.0,
     # repetition_penalties=0.0,
@@ -26,11 +26,9 @@ class TPUSupportedSamplingMetadata:
     temperature: torch.Tensor = None
 
     min_p: torch.Tensor = None
-    # Still too slow on forward_native!
     top_k: torch.Tensor = None
     top_p: torch.Tensor = None
 
-    # Greedy sampling flag for compiling single xla graph.
     all_greedy: bool = True
 
     # unsupported, you need to return an extra tensor of static size BxV
@@ -103,17 +101,17 @@ def fill_slice(cpu_tensor: torch.Tensor, fill_val) -> torch.Tensor:
                    DEFAULT_SAMPLING_PARAMS["min_p"])
         fill_slice(input_batch.top_k_cpu_tensor,
                    DEFAULT_SAMPLING_PARAMS["top_k"])
-        # TODO Temporarily disabled until sampling options are enabled
-        # fill_slice(input_batch.top_p_cpu_tensor,
-        #            DEFAULT_SAMPLING_PARAMS["top_p"])
+        fill_slice(input_batch.top_p_cpu_tensor,
+                   DEFAULT_SAMPLING_PARAMS["top_p"])
 
         # Slice persistent device tensors to a fixed pre-compiled padded shape.
         return cls(
             temperature=input_batch.temperature_cpu_tensor[:padded_num_reqs].
             to(xla_device),
             all_greedy=input_batch.all_greedy,
             # TODO enable more and avoid returning None values
-            top_p=None,  # input_batch.top_p[:padded_num_reqs],
+            top_p=input_batch.top_p_cpu_tensor[:padded_num_reqs].to(
+                xla_device),
             top_k=input_batch.top_k_cpu_tensor[:padded_num_reqs].to(
                 xla_device),
             min_p=input_batch.min_p_cpu_tensor[:padded_num_reqs].to(