27 changes: 27 additions & 0 deletions docs/source/getting_started/quickstart.rst
@@ -46,6 +46,14 @@ Ensure that:
1. The `Service` name matches the `model.aibrix.ai/name` label value in the `Deployment`.
2. The `--served-model-name` argument value in the `Deployment` command is also consistent with the `Service` name and `model.aibrix.ai/name` label.
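
For reference, a minimal sketch of how these three values line up (``my-model`` and the container name are illustrative, not taken from the sample manifests; the Deployment is abbreviated):

.. code-block:: yaml

    apiVersion: v1
    kind: Service
    metadata:
      name: my-model                        # Service name
    spec:
      selector:
        model.aibrix.ai/name: my-model
      ports:
        - port: 8000
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: my-model
    spec:
      selector:
        matchLabels:
          model.aibrix.ai/name: my-model
      template:
        metadata:
          labels:
            model.aibrix.ai/name: my-model  # matches the Service name
        spec:
          containers:
            - name: vllm-openai
              image: vllm/vllm-openai:v0.9.1
              args: ["--served-model-name", "my-model"]   # consistent with both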

Deploy a Model with Prefill-Decode (PD) Disaggregation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Save the manifest below as `pd-model.yaml` and apply it with `kubectl apply -f pd-model.yaml`.

.. literalinclude:: ../../../samples/quickstart/pd-model.yaml
    :language: yaml
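
Once the manifest is applied, one way to check that the prefill and decode pods are up is to query by the labels defined in the manifest (the resource and label values below assume the sample is used unchanged):

.. code-block:: bash

    kubectl get stormservice vllm-1p1d
    kubectl get pods -l model.aibrix.ai/name=deepseek-r1-distill-llama-8b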


Invoke the model endpoint using gateway API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -92,6 +100,25 @@ Depending on where you deployed the AIBrix, you can use either of the following
]
}'

.. note::

    To test PD disaggregation, set the ``routing-strategy`` header to ``pd``. For example:
Contributor:

medium

The phrasing "add the routing-strategy header to pd" is slightly ambiguous. It could be misinterpreted by users. For improved clarity, I suggest rephrasing to make it explicit that pd is the value for the routing-strategy header.

    To test PD disaggregation, set the ``routing-strategy`` header to ``pd``. For example:


.. code-block:: bash

    curl -v http://${ENDPOINT}/v1/chat/completions \
        -H "routing-strategy: pd" \
        -H "Content-Type: application/json" \
        -d '{
            "model": "deepseek-r1-distill-llama-8b",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "help me write a random generator in python"}
            ],
            "temperature": 0.7
        }'


.. code-block:: python

    from openai import OpenAI
124 changes: 124 additions & 0 deletions samples/quickstart/pd-model.yaml
@@ -0,0 +1,124 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  name: vllm-1p1d
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      app: vllm-1p1d
  template:
    metadata:
      labels:
        app: vllm-1p1d
    spec:
      roles:
        - name: prefill
          replicas: 1
          stateful: true
          template:
            metadata:
              labels:
                model.aibrix.ai/name: deepseek-r1-distill-llama-8b
                model.aibrix.ai/port: "8000"
                model.aibrix.ai/engine: vllm
            spec:
              containers:
                - name: prefill
                  image: vllm/vllm-openai:v0.9.1
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m vllm.entrypoints.openai.api_server \
                        --host "0.0.0.0" \
                        --port "8000" \
                        --uvicorn-log-level warning \
                        --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
                        --served-model-name deepseek-r1-distill-llama-8b \
                        --kv-transfer-config '{"kv_connector":"PyNcclConnector","kv_role":"kv_both"}'
Collaborator:

why do you use PyNcclConnector?

Collaborator (Author):

I try to follow this PD example from vllm https://github.com/vllm-project/vllm/blob/main/examples/online_serving/disaggregated_prefill.sh, Do you suggest that we use mooncake with RDMA instead?

Collaborator:

there's no mooncake in vllm.. why do not you follow our own guidance? https://github.com/vllm-project/aibrix/tree/main/samples/disaggregation/vllm

Collaborator (Author):

I thought we want to use official vllm image in Quickstart section so that people can try very quickly. Sure, we can use our images and nixl connector then.

Collaborator (@Jeffwan, Aug 4, 2025):

if you are not sure, please ask in advance. you can give reference on the image build. I do not understand why that's a concern on your side

Collaborator (Author):

@Jeffwan Updated. Feel free to check it one more time.

Collaborator:

great

                  env:
                    - name: UCX_TLS
                      value: cuda_ipc,cuda_copy,tcp
                    - name: VLLM_SERVER_DEV_MODE
                      value: "1"
                    - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                      value: "5558"
                    - name: VLLM_WORKER_MULTIPROC_METHOD
                      value: spawn
                    - name: VLLM_ENABLE_V1_MULTIPROCESSING
                      value: "0"
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                    requests:
                      nvidia.com/gpu: 1
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
        - name: decode
          replicas: 1
          stateful: true
          template:
            metadata:
              labels:
                model.aibrix.ai/name: deepseek-r1-distill-llama-8b
                model.aibrix.ai/port: "8000"
                model.aibrix.ai/engine: vllm
            spec:
              containers:
                - name: decode
                  image: vllm/vllm-openai:v0.9.1
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m vllm.entrypoints.openai.api_server \
                        --host "0.0.0.0" \
                        --port "8000" \
                        --uvicorn-log-level warning \
                        --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
                        --served-model-name deepseek-r1-distill-llama-8b \
                        --kv-transfer-config '{"kv_connector":"PyNcclConnector","kv_role":"kv_both"}'
                  env:
                    - name: UCX_TLS
                      value: cuda_ipc,cuda_copy,tcp
                    - name: VLLM_SERVER_DEV_MODE
                      value: "1"
                    - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                      value: "5558"
                    - name: VLLM_WORKER_MULTIPROC_METHOD
                      value: spawn
                    - name: VLLM_ENABLE_V1_MULTIPROCESSING
                      value: "0"
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                    requests:
                      nvidia.com/gpu: 1
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
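
The review thread above points to the AIBrix disaggregation sample (https://github.com/vllm-project/aibrix/tree/main/samples/disaggregation/vllm) and vLLM's NIXL connector as an alternative to PyNcclConnector. Under that assumption, the --kv-transfer-config flag in both the prefill and decode roles would change to something like:

    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'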