Commit 5380cd7

model(vlm): pixtral (#5084)
1 parent b2e95f6 commit 5380cd7

File tree: 16 files changed (+1125, −39 lines)


docs/supported_models/vision_language_models.md

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ python3 -m sglang.launch_server \
 | **Janus-Pro** (1B, 7B) | `deepseek-ai/Janus-Pro-7B` | `janus-pro` | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |
 | **MiniCPM-V / MiniCPM-o** | `openbmb/MiniCPM-V-2_6` | `minicpmv` | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices. |
 | **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks. |
+| **Pixtral** (12B, 124B) | `mistral-community/pixtral-12b` | `mistral` | Pixtral is a vision-language model from Mistral AI that can process both text and images. |
 | **LLaVA** (v1.5 & v1.6) | *e.g.* `liuhaotian/llava-v1.5-13b` | `vicuna_v1.1` | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts. |
 | **LLaVA-NeXT** (8B, 72B) | `lmms-lab/llava-next-72b` | `chatml-llava` | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks. |
 | **LLaVA-OneVision** | `lmms-lab/llava-onevision-qwen2-7b-ov` | `chatml-llava` | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format. |
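The table above maps Pixtral to the `mistral` chat template. For orientation, here is a minimal sketch, not part of this commit, of querying a Pixtral server through SGLang's OpenAI-compatible endpoint; it assumes the server was launched as in the pixtral_server.py example below and that the /v1 chat route accepts vision-style messages for this model.

# Sketch only: query a running Pixtral server via the OpenAI-compatible API.
# Assumed launch command (from the pixtral_server.py docstring below):
#   python -m sglang.launch_server --model-path mistral-community/pixtral-12b --port=30000
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="mistral-community/pixtral-12b",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://picsum.photos/id/237/400/300"},
                },
            ],
        }
    ],
    max_tokens=64,
)
print(response.choices[0].message.content)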

examples/runtime/README.md

Lines changed: 3 additions & 2 deletions
@@ -33,9 +33,10 @@ The `hidden_states` folder contains examples on how to extract hidden states usi
 * `hidden_states_engine.py`: An example how to extract hidden states using the Engine API.
 * `hidden_states_server.py`: An example how to extract hidden states using the Server API.

-## LLaVA-NeXT
+## Multimodal
+
+SGLang supports multimodal inputs for various model architectures. The `multimodal` folder contains examples showing how to use urls, files or encoded data to make requests to multimodal models. Examples include querying the [Llava-OneVision](multimodal/llava_onevision_server.py) model (image, multi-image, video), Llava-backed [Qwen-Llava](multimodal/qwen_llava_server.py) and [Llama3-Llava](multimodal/llama3_llava_server.py) models (image, multi-image), and Mistral AI's [Pixtral](multimodal/pixtral_server.py) (image, multi-image).

-SGLang support LLaVA-OneVision with single-image, multi-image and video are supported. The folder `llava_onevision` shows how to do this.

 ## Token In, Token Out
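To make the `image_data` field mentioned in the new README paragraph concrete, here is a small sketch, not part of the commit, of the three ways the multimodal examples can pass an image: a URL, a local file path, or base64-encoded bytes. The endpoint and prompt format follow pixtral_server.py below; the local path is hypothetical.

# Sketch only: three interchangeable forms of `image_data` for /generate.
import base64

import requests

URL = "http://127.0.0.1:30000/generate"
LOCAL_IMAGE = "/path/to/image.jpg"  # hypothetical path


def b64_of(path: str) -> str:
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


for image_data in (
    "https://picsum.photos/id/237/400/300",  # URL
    LOCAL_IMAGE,                             # local file path
    b64_of(LOCAL_IMAGE),                     # base64-encoded data
):
    resp = requests.post(
        URL,
        json={
            "text": "<s>[INST]Describe this image.\n[IMG][/INST]",
            "image_data": image_data,
            "sampling_params": {"max_new_tokens": 32},
        },
    )
    print(resp.json().get("text"))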

examples/runtime/llava_onevision/http_llama3_llava_test.py renamed to examples/runtime/multimodal/llama3_llava_server.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 # Endpoint Service CLI:
 python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000

-python3 http_llama3_llava_test.py
+python3 llama3_llava_server.py

 Output:
 "Friends posing for a fun photo with a life-sized teddy bear, creating a playful and memorable moment."

examples/runtime/llava_onevision/http_llava_onevision_test.py renamed to examples/runtime/multimodal/llava_onevision_server.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8

-python3 http_llava_onevision_test.py
+python3 llava_onevision_server.py
 """

 import base64
examples/runtime/multimodal/pixtral_server.py

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
"""
Usage:
# Run a Pixtral model with SGLang:
# HuggingFace:
python -m sglang.launch_server --model-path mistral-community/pixtral-12b --port=30000
# ModelScope:
python -m sglang.launch_server --model-path AI-ModelScope/pixtral-12b --port=30000

# Then test it with:
python pixtral_server.py

This script tests Pixtral model with both single and multiple images.
"""

import argparse
import asyncio
import json

import aiohttp
import requests

IMAGE_TOKEN_SEP = "\n[IMG]"
ROUTE = "/generate"


async def send_request(url, data, delay=0):
    await asyncio.sleep(delay)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data) as resp:
            output = await resp.json()
    return output


async def test_concurrent(args):
    url = f"{args.host}:{args.port}{ROUTE}"

    # Single image test
    if args.single_image:
        prompt = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
        image_url = "https://picsum.photos/id/237/400/300"
        modality = ["image"]
    # Multiple images test
    else:
        image_urls = [
            "https://picsum.photos/id/237/400/300",
            "https://picsum.photos/id/27/500/500",
        ]
        prompt = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(image_urls)}[/INST]"
        image_url = image_urls
        modality = ["multi-images"]

    response = await send_request(
        url,
        {
            "text": prompt,
            "image_data": image_url,
            "sampling_params": {
                "max_new_tokens": 100,
                "temperature": 0.7,
                "top_p": 0.9,
            },
            "modalities": modality,
        },
    )

    print(f"Response: {response}")
    if "text" in response:
        print("\nOutput text:", response["text"])


def test_streaming(args):
    url = f"{args.host}:{args.port}/generate"

    # Single image test
    if args.single_image:
        prompt = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
        image_data = "https://picsum.photos/id/237/400/300"
        modality = ["image"]
    # Multiple images test
    else:
        image_urls = [
            "https://picsum.photos/id/237/400/300",
            "https://picsum.photos/id/27/500/500",
        ]
        prompt = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(image_urls)}[/INST]"
        image_data = image_urls
        modality = ["multi-images"]

    pload = {
        "text": prompt,
        "image_data": image_data,
        "sampling_params": {"max_new_tokens": 100, "temperature": 0.7, "top_p": 0.9},
        "modalities": modality,
        "stream": True,
    }

    response = requests.post(url, json=pload, stream=True)

    print("Streaming response:")
    prev = 0
    for chunk in response.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            output = data["text"].strip()
            print(output[prev:], end="", flush=True)
            prev = len(output)
    print("\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=30000)
    parser.add_argument(
        "--single-image",
        action="store_true",
        help="Test with single image instead of multiple images",
    )
    parser.add_argument("--no-stream", action="store_true", help="Don't test streaming")
    args = parser.parse_args()

    asyncio.run(test_concurrent(args))
    if not args.no_stream:
        test_streaming(args)

examples/runtime/llava_onevision/http_qwen_llava_test.py renamed to examples/runtime/multimodal/qwen_llava_server.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 # Endpoint Service CLI:
 python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8

-python3 http_qwen_llava_test.py
+python3 qwen_llava_server.py

 Output:
 "Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."

python/sglang/lang/chat_template.py

Lines changed: 22 additions & 1 deletion
@@ -194,6 +194,21 @@ def get_chat_template_by_model_path(model_path):
     )
 )

+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",

@@ -509,13 +524,19 @@ def match_vicuna(model_path: str):
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
     if re.search(
-        r"llama-2.*chat|(mistral|mixtral).*instruct|codellama.*instruct",
+        r"llama-2.*chat|codellama.*instruct",
         model_path,
         re.IGNORECASE,
     ):
         return "llama-2-chat"


+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
+
+
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
     if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):

python/sglang/srt/configs/model_config.py

Lines changed: 1 addition & 0 deletions
@@ -545,6 +545,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         "Llama4ForConditionalGeneration",
         "LlavaMistralForCausalLM",
         "LlavaQwenForCausalLM",
+        "LlavaForConditionalGeneration",
         "LlavaVidForCausalLM",
         "MiniCPMO",
         "MiniCPMV",

python/sglang/srt/conversation.py

Lines changed: 21 additions & 1 deletion
@@ -634,6 +634,20 @@ def generate_chat_conv(
     )
 )

+# reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="mistral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
 # reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
 register_conv_template(
     Conversation(

@@ -880,13 +894,19 @@ def match_vicuna(model_path: str):
 @register_conv_template_matching_function
 def match_llama2_chat(model_path: str):
     if re.search(
-        r"llama-2.*chat|(mistral|mixtral).*instruct|codellama.*instruct",
+        r"llama-2.*chat|codellama.*instruct",
         model_path,
         re.IGNORECASE,
     ):
         return "llama-2"


+@register_conv_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
+
+
 @register_conv_template_matching_function
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):

python/sglang/srt/managers/multimodal_processors/llava.py

Lines changed: 46 additions & 0 deletions
@@ -1,14 +1,20 @@
 import asyncio
+import importlib
 from typing import List, Optional, Union

 import numpy as np
+from transformers.models.auto.processing_auto import (
+    PROCESSOR_MAPPING_NAMES as HF_MAPPING_NAMES,
+)

+import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
 )
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
+    LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
     LlavaMistralForCausalLM,
     LlavaQwenForCausalLM,

@@ -133,6 +139,7 @@ async def process_mm_data_async(
                     img_data, aspect_ratio, grid_pinpoints
                 )
             )
+
         res = await asyncio.gather(*res)
         for pixel_v, image_h, image_s in res:
             pixel_values.append(pixel_v)

@@ -165,3 +172,42 @@ async def process_mm_data_async(
             )
         ],
     }
+
+
+class LlavaMultimodalProcessor(BaseMultimodalProcessor):
+    """
+    This is a wrapper class used to identify the multimodal processor for Llava architecture models.
+    """
+
+    models = [LlavaForConditionalGeneration]
+
+    def _get_sgl_processor_cls(self, model_type: str):
+        if hf_name := HF_MAPPING_NAMES.get(model_type):
+            sgl_mm_processor_set = sgl_mm_processor_utils.PROCESSOR_MAPPING.values()
+            sgl_processor_cls = list(
+                filter(lambda p: p.__name__ == hf_name, sgl_mm_processor_set)
+            )
+            if sgl_processor_cls:
+                return sgl_processor_cls[0]
+        raise ValueError(
+            f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
+        )
+
+    def __init__(self, hf_config, server_args, _processor):
+        assert hasattr(hf_config, "vision_config")
+        assert hasattr(hf_config, "text_config")
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.hf_config = hf_config
+
+        if vision_type := getattr(self.vision_config, "model_type"):
+            self.inner = self._get_sgl_processor_cls(vision_type)(
+                hf_config, server_args, _processor
+            )
+        else:
+            raise ValueError(
+                f"Required `vision_config.model_type` is not found in hf_config: `{hf_config}`"
+            )
+
+    async def process_mm_data_async(self, *args, **kwargs):
+        return await self.inner.process_mm_data_async(*args, **kwargs)
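The wrapper above resolves the vision tower's `model_type` to a Hugging Face processor class name and then picks the sglang processor registered under the same name. A self-contained sketch of that lookup pattern follows; the mapping contents and the stub class are illustrative, not sglang's real registry.

# Illustrative sketch of the model_type -> processor-class dispatch used by
# LlavaMultimodalProcessor; the registry and mapping below are made up.
from typing import Dict, Set, Type


class PixtralProcessor:
    """Stub standing in for an sglang-side multimodal processor."""


SGL_PROCESSORS: Set[Type] = {PixtralProcessor}                 # toy sglang registry
HF_MAPPING: Dict[str, str] = {"pixtral": "PixtralProcessor"}   # toy HF mapping


def resolve(vision_model_type: str) -> Type:
    # model_type -> HF processor name -> sglang class with the same __name__
    hf_name = HF_MAPPING.get(vision_model_type)
    matches = [cls for cls in SGL_PROCESSORS if cls.__name__ == hf_name]
    if matches:
        return matches[0]
    raise ValueError(f"No multimodal processor registered for `{vision_model_type}`")


print(resolve("pixtral"))  # -> <class '__main__.PixtralProcessor'>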
