
Commit aa8eced

reidliu41 authored and lk-chen committed
[Misc] refactor examples series (vllm-project#16708)
Signed-off-by: reidliu41 <[email protected]>
Co-authored-by: reidliu41 <[email protected]>
1 parent 456d541 commit aa8eced

11 files changed (+228, -161 lines)
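Most of the files touched by this commit follow the same refactoring pattern: module-level argument parsing and client/engine setup move into `parse_args()` and `main()` helpers, and the script body shrinks to an `if __name__ == "__main__":` guard. A minimal sketch of the resulting shape (illustrative only; the description string and the `...` body are placeholders, and the import paths are the ones already used by `llm_engine_example.py`):

```python
from vllm import EngineArgs
from vllm.utils import FlexibleArgumentParser


def parse_args():
    # Build the CLI parser in one place instead of at module import time.
    parser = FlexibleArgumentParser(description='Example script')  # placeholder description
    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()


def main(args):
    # All of the example's work happens here.
    ...


if __name__ == '__main__':
    args = parse_args()
    main(args)
```

One practical effect of this shape is that the example modules can be imported (for docs builds or tests) without parsing CLI arguments or opening client connections as a side effect.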

examples/offline_inference/llm_engine_example.py

Lines changed: 8 additions & 4 deletions
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
     return LLMEngine.from_engine_args(engine_args)
 
 
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description='Demo on using the LLMEngine class directly')
+    parser = EngineArgs.add_cli_args(parser)
+    return parser.parse_args()
+
+
 def main(args: argparse.Namespace):
     """Main function that sets up and runs the prompt processing."""
     engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
-        description='Demo on using the LLMEngine class directly')
-    parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()
     main(args)

examples/online_serving/gradio_openai_chatbot_webserver.py

Lines changed: 0 additions & 4 deletions
@@ -23,10 +23,6 @@
 from openai import OpenAI
 
 
-def create_openai_client(api_key, base_url):
-    return OpenAI(api_key=api_key, base_url=base_url)
-
-
 def format_history_to_openai(history):
     history_openai_format = [{
         "role": "system",

examples/online_serving/openai_chat_completion_client_for_multimodal.py

Lines changed: 11 additions & 7 deletions
@@ -303,12 +303,7 @@ def run_audio() -> None:
     }
 
 
-def main(args) -> None:
-    chat_type = args.chat_type
-    example_function_map[chat_type]()
-
-
-if __name__ == "__main__":
+def parse_args():
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online serving with '
        'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ def main(args) -> None:
                        default="single-image",
                        choices=list(example_function_map.keys()),
                        help='Conversation type with multimodal data.')
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args) -> None:
+    chat_type = args.chat_type
+    example_function_map[chat_type]()
+
+
+if __name__ == "__main__":
+    args = parse_args()
     main(args)

examples/online_serving/openai_chat_completion_client_with_tools_required.py

Lines changed: 32 additions & 26 deletions
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-To run this example, you can start the vLLM server 
+To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
 VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
     --guided-decoding-backend outlines
 ```
 
-This example demonstrates how to generate chat completions 
+This example demonstrates how to generate chat completions
 using the OpenAI Python client library.
 """
 
@@ -18,15 +18,6 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
 tools = [
     {
         "type": "function",
@@ -116,21 +107,36 @@
     },
 ]
 
-chat_completion = client.chat.completions.create(
-    messages=messages,
-    model=model,
-    tools=tools,
-    tool_choice="required",
-    stream=True  # Enable streaming response
-)
 
-for chunk in chat_completion:
-    if chunk.choices and chunk.choices[0].delta.tool_calls:
-        print(chunk.choices[0].delta.tool_calls)
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    chat_completion = client.chat.completions.create(
+        messages=messages,
+        model=model,
+        tools=tools,
+        tool_choice="required",
+        stream=True  # Enable streaming response
+    )
+
+    for chunk in chat_completion:
+        if chunk.choices and chunk.choices[0].delta.tool_calls:
+            print(chunk.choices[0].delta.tool_calls)
+
+    chat_completion = client.chat.completions.create(messages=messages,
+                                                     model=model,
+                                                     tools=tools,
+                                                     tool_choice="required")
+
+    print(chat_completion.choices[0].message.tool_calls)
 
-chat_completion = client.chat.completions.create(messages=messages,
-                                                  model=model,
-                                                  tools=tools,
-                                                  tool_choice="required")
 
-print(chat_completion.choices[0].message.tool_calls)
+if __name__ == "__main__":
+    main()

examples/online_serving/openai_chat_completion_with_reasoning.py

Lines changed: 36 additions & 27 deletions
@@ -3,8 +3,8 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning
-parser:
+To run this example, you need to start the vLLM server
+with the reasoning parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
 
-models = client.models.list()
-model = models.data[0].id
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-# Round 1
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-response = client.chat.completions.create(model=model, messages=messages)
+    models = client.models.list()
+    model = models.data[0].id
 
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
+    # Round 1
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    # ruff: noqa: E501
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    response = client.chat.completions.create(model=model, messages=messages)
 
-print("reasoning_content for Round 1:", reasoning_content)
-print("content for Round 1:", content)
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
 
-# Round 2
-messages.append({"role": "assistant", "content": content})
-messages.append({
-    "role": "user",
-    "content": "How many Rs are there in the word 'strawberry'?",
-})
-response = client.chat.completions.create(model=model, messages=messages)
+    print("reasoning_content for Round 1:", reasoning_content)
+    print("content for Round 1:", content)
 
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
+    # Round 2
+    messages.append({"role": "assistant", "content": content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "How many Rs are there in the word 'strawberry'?",
+    })
+    response = client.chat.completions.create(model=model, messages=messages)
 
-print("reasoning_content for Round 2:", reasoning_content)
-print("content for Round 2:", content)
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
+
+    print("reasoning_content for Round 2:", reasoning_content)
+    print("content for Round 2:", content)
+
+
+if __name__ == "__main__":
+    main()

examples/online_serving/openai_chat_completion_with_reasoning_streaming.py

Lines changed: 45 additions & 37 deletions
@@ -3,7 +3,7 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning 
+To run this example, you need to start the vLLM server with the reasoning
 parser:
 
 ```bash
@@ -29,41 +29,49 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
 
-models = client.models.list()
-model = models.data[0].id
 
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-stream = client.chat.completions.create(model=model,
-                                        messages=messages,
-                                        stream=True)
-
-print("client: Start streaming chat completions...")
-printed_reasoning_content = False
-printed_content = False
-
-for chunk in stream:
-    reasoning_content = None
-    content = None
-    # Check the content is reasoning_content or content
-    if hasattr(chunk.choices[0].delta, "reasoning_content"):
-        reasoning_content = chunk.choices[0].delta.reasoning_content
-    elif hasattr(chunk.choices[0].delta, "content"):
-        content = chunk.choices[0].delta.content
-
-    if reasoning_content is not None:
-        if not printed_reasoning_content:
-            printed_reasoning_content = True
-            print("reasoning_content:", end="", flush=True)
-        print(reasoning_content, end="", flush=True)
-    elif content is not None:
-        if not printed_content:
-            printed_content = True
-            print("\ncontent:", end="", flush=True)
-        # Extract and print the content
-        print(content, end="", flush=True)
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    # ruff: noqa: E501
+    # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    stream = client.chat.completions.create(model=model,
+                                            messages=messages,
+                                            stream=True)
+
+    print("client: Start streaming chat completions...")
+    printed_reasoning_content = False
+    printed_content = False
+
+    for chunk in stream:
+        reasoning_content = None
+        content = None
+        # Check the content is reasoning_content or content
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
+            reasoning_content = chunk.choices[0].delta.reasoning_content
+        elif hasattr(chunk.choices[0].delta, "content"):
+            content = chunk.choices[0].delta.content
+
+        if reasoning_content is not None:
+            if not printed_reasoning_content:
+                printed_reasoning_content = True
+                print("reasoning_content:", end="", flush=True)
+            print(reasoning_content, end="", flush=True)
+        elif content is not None:
+            if not printed_content:
+                printed_content = True
+                print("\ncontent:", end="", flush=True)
+            # Extract and print the content
+            print(content, end="", flush=True)
+
+
+if __name__ == "__main__":
+    main()

examples/online_serving/openai_chat_embedding_client_for_multimodal.py

Lines changed: 9 additions & 2 deletions
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
     print("Embedding output:", response_json["data"][0]["embedding"])
 
 
-if __name__ == '__main__':
+def parse_args():
     parser = argparse.ArgumentParser(
         "Script to call a specified VLM through the API. Make sure to serve "
         "the model with --task embed before running this.")
@@ -107,8 +107,10 @@ def dse_qwen2_vl(inp: dict):
                         choices=["vlm2vec", "dse_qwen2_vl"],
                         required=True,
                         help="Which model to call.")
-    args = parser.parse_args()
+    return parser.parse_args()
+
 
+def main(args):
     if args.model == "vlm2vec":
         vlm2vec()
     elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ def dse_qwen2_vl(inp: dict):
             "type": "text",
            "content": "What is the weather like today?",
         })
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
