Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions samples/disaggregation/vllm/1p1d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down Expand Up @@ -122,8 +126,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down
59 changes: 44 additions & 15 deletions samples/disaggregation/vllm/disagg_proxy_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import os
import random
import time
import uuid
from contextlib import asynccontextmanager

import httpx
Expand Down Expand Up @@ -99,20 +100,46 @@ def parse_args():
return args


async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, req_data: dict):
req_data = req_data.copy()
req_data["max_tokens"] = 1
if "max_completion_tokens" in req_data:
req_data["max_completion_tokens"] = 1

headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
response = await client.post(endpoint, json=req_data, headers=headers)
async def send_request_to_service(client: "httpx.AsyncClient", endpoint: str, req_id: str, req_data: dict):
    """Send the prefill pass of a request and hand its KV-transfer info to the decoder.

    A shallow copy of ``req_data`` is rewritten into a non-streaming,
    single-token request carrying the nixl-specific ``kv_transfer_params``
    for prefillers, and POSTed to ``endpoint`` on ``client``.

    Side effect: when the response JSON contains a non-empty
    ``kv_transfer_params`` object, it is stored on the caller's ``req_data``
    (in place) with ``remote_host`` set to ``client.base_url.host``, so the
    same dict can then be forwarded to a decode client. Every other key of
    ``req_data`` is left untouched.

    Args:
        client: HTTP client for the prefill instance.
        endpoint: Path to POST to, e.g. ``"/completions"``.
        req_id: Value sent in the ``X-Request-Id`` header.
        req_data: Original request payload; only ``kv_transfer_params`` may
            be written to it.

    Returns:
        The response object returned by ``client.post``.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an error status.
    """
    # Work on a copy so the prefill-only overrides below never leak into the
    # payload that is later sent to the decoder. Only top-level keys are
    # reassigned, so a shallow copy is sufficient.
    prefill_data = req_data.copy()

    # nixl-specific kv_transfer_params for prefillers
    prefill_data["kv_transfer_params"] = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "remote_engine_id": None,
        "remote_block_ids": None,
        "remote_host": None,
        "remote_port": None,
    }

    # disable streaming for prefillers
    prefill_data["stream"] = False
    prefill_data.pop("stream_options", None)

    # The prefiller only needs to produce a single token.
    prefill_data["max_tokens"] = 1
    if "max_completion_tokens" in prefill_data:
        prefill_data["max_completion_tokens"] = 1

    # NOTE(review): if OPENAI_API_KEY is unset this sends the literal header
    # value "Bearer None"; kept as-is to match stream_service_response.
    headers = {
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        "X-Request-Id": req_id,
    }
    response = await client.post(endpoint, json=prefill_data, headers=headers)
    response.raise_for_status()

    # Extract the nixl-specific kv_transfer_params returned by the prefiller
    # and attach them to req_data for the decode client.
    kv_transfer_params = response.json().get("kv_transfer_params", {})
    if kv_transfer_params:
        req_data["kv_transfer_params"] = kv_transfer_params
        req_data["kv_transfer_params"]["remote_host"] = client.base_url.host
    return response
Comment on lines +128 to 135
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This function modifies the req_data dictionary in-place, which is a side effect that can make the code harder to reason about. A cleaner approach is to return the kv_transfer_params and let the caller update req_data. This makes the data flow explicit. Since the response object is not used by the callers, the function's return value can be changed to facilitate this refactoring.

    # extract nixl-specific kv_transfer_params returned from prefillers
    response_json = response.json()
    kv_transfer_params = response_json.get('kv_transfer_params', {})
    if kv_transfer_params:
        kv_transfer_params["remote_host"] = client.base_url.host
    return kv_transfer_params



async def stream_service_response(client: httpx.AsyncClient, endpoint: str, req_data: dict):
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
async def stream_service_response(client: httpx.AsyncClient, endpoint: str, req_id: str, req_data: dict):
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": req_id
}
async with client.stream("POST", endpoint, json=req_data, headers=headers) as response:
response.raise_for_status()
async for chunk in response.aiter_bytes():
Expand Down Expand Up @@ -141,15 +168,16 @@ async def handle_completions(request: Request):
st = time.time()

try:
req_id = str(uuid.uuid4())
req_data = await request.json()
prefill_client, decode_client = select_random_clients()

await send_request_to_service(prefill_client, "/completions", req_data)
await send_request_to_service(prefill_client, "/completions", req_id, req_data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To accompany the suggested change in send_request_to_service that removes side effects, this call should be updated to receive the kv_transfer_params and update req_data explicitly.

        kv_transfer_params = await send_request_to_service(prefill_client, "/completions", req_id, req_data)
        if kv_transfer_params:
            req_data["kv_transfer_params"] = kv_transfer_params

et = time.time()
stats_calculator.add(et - st)

async def generate_stream():
async for chunk in stream_service_response(decode_client, "/completions", req_data):
async for chunk in stream_service_response(decode_client, "/completions", req_id, req_data):
yield chunk

return StreamingResponse(generate_stream(), media_type="text/event-stream")
Expand All @@ -169,15 +197,16 @@ async def handle_chat_completions(request: Request):
st = time.time()

try:
req_id = str(uuid.uuid4())
req_data = await request.json()
prefill_client, decode_client = select_random_clients()

await send_request_to_service(prefill_client, "/chat/completions", req_data)
await send_request_to_service(prefill_client, "/chat/completions", req_id, req_data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To accompany the suggested change in send_request_to_service that removes side effects, this call should be updated to receive the kv_transfer_params and update req_data explicitly.

        kv_transfer_params = await send_request_to_service(prefill_client, "/chat/completions", req_id, req_data)
        if kv_transfer_params:
            req_data["kv_transfer_params"] = kv_transfer_params

et = time.time()
stats_calculator.add(et - st)

async def generate_stream():
async for chunk in stream_service_response(decode_client, "/chat/completions", req_data):
async for chunk in stream_service_response(decode_client, "/chat/completions", req_id, req_data):
yield chunk

return StreamingResponse(generate_stream(), media_type="text/event-stream")
Expand All @@ -195,4 +224,4 @@ async def generate_stream():
global_args = parse_args()

import uvicorn
uvicorn.run(app, host=global_args.host, port=global_args.port)
uvicorn.run(app, host=global_args.host, port=global_args.port)
8 changes: 8 additions & 0 deletions samples/disaggregation/vllm/pool.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down Expand Up @@ -122,8 +126,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down
10 changes: 8 additions & 2 deletions samples/disaggregation/vllm/replica.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down Expand Up @@ -122,10 +126,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: UCX_TLS
value: cuda_ipc,cuda_copy,tcp
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down
8 changes: 8 additions & 0 deletions samples/quickstart/pd-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@ spec:
--served-model-name deepseek-r1-distill-llama-8b \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down Expand Up @@ -91,8 +95,12 @@ spec:
--served-model-name deepseek-r1-distill-llama-8b \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down
8 changes: 8 additions & 0 deletions test/regression/v0.4.0/vllm/1p1d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down Expand Up @@ -114,8 +118,12 @@ spec:
--served-model-name qwen3-8B \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: "1047"
- name: VLLM_SERVER_DEV_MODE
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
value: "0.0.0.0"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5558"
- name: VLLM_WORKER_MULTIPROC_METHOD
Expand Down