
[Misc] refactor examples series #16708


Merged
12 changes: 8 additions & 4 deletions examples/offline_inference/llm_engine_example.py
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
return LLMEngine.from_engine_args(engine_args)


def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()


def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):


if __name__ == '__main__':
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
args = parse_args()
main(args)
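
Every file in this series gets the same treatment: module-level argument parsing and client/engine setup move into `parse_args()` and `main()` helpers that only run under `if __name__ == "__main__"`. A minimal, self-contained sketch of that shape (plain `argparse` stands in here for vLLM's `FlexibleArgumentParser`, and the `--chat-type` flag is borrowed from the multimodal example below purely as an illustration):

```python
# Sketch of the refactor pattern applied across these examples.
# Uses only the standard library; the real examples build vLLM engines
# or OpenAI clients inside main() instead of printing.
import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Example refactored into parse_args()/main() helpers")
    parser.add_argument("--chat-type", default="single-image")
    return parser.parse_args()


def main(args: argparse.Namespace) -> None:
    # All setup and I/O happens here, not at import time.
    print(f"Running with chat_type={args.chat_type}")


if __name__ == "__main__":
    main(parse_args())
```

One effect of keeping module import free of side effects is that the examples can be imported (for docs builds or tests) without starting a server or touching a GPU.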
4 changes: 0 additions & 4 deletions examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -23,10 +23,6 @@
from openai import OpenAI


def create_openai_client(api_key, base_url):
return OpenAI(api_key=api_key, base_url=base_url)


def format_history_to_openai(history):
history_openai_format = [{
"role": "system",
@@ -303,12 +303,7 @@ def run_audio() -> None:
}


def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()


if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ def main(args) -> None:
default="single-image",
choices=list(example_function_map.keys()),
help='Conversation type with multimodal data.')
args = parser.parse_args()
return parser.parse_args()


def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()


if __name__ == "__main__":
args = parse_args()
main(args)
@@ -1,14 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you can start the vLLM server
To run this example, you can start the vLLM server
without any specific flags:

```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
```

This example demonstrates how to generate chat completions
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""

@@ -18,15 +18,6 @@
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

tools = [
{
"type": "function",
@@ -116,21 +107,36 @@
},
]

chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)

for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)

for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)

chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")

print(chat_completion.choices[0].message.tool_calls)

chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")

print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()
63 changes: 36 additions & 27 deletions examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.

To run this example, you need to start the vLLM server with the reasoning
parser:
To run this example, you need to start the vLLM server
with the reasoning parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
models = client.models.list()
model = models.data[0].id

reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)

print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content

# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)

reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role":
"user",
"content":
"How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)

print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content

print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)


if __name__ == "__main__":
main()
@@ -3,7 +3,7 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.

To run this example, you need to start the vLLM server with the reasoning
To run this example, you need to start the vLLM server with the reasoning
parser:

```bash
@@ -29,41 +29,49 @@
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]

models = client.models.list()
model = models.data[0].id

messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False

for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content

if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# ruff: noqa: E501
# For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False

for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content

if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print("Embedding output:", response_json["data"][0]["embedding"])


if __name__ == '__main__':
def parse_args():
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.")
@@ -107,8 +107,10 @@ def dse_qwen2_vl(inp: dict):
choices=["vlm2vec", "dse_qwen2_vl"],
required=True,
help="Which model to call.")
args = parser.parse_args()
return parser.parse_args()


def main(args):
if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ def dse_qwen2_vl(inp: dict):
"type": "text",
"content": "What is the weather like today?",
})


if __name__ == '__main__':
args = parse_args()
main(args)