vllm-project
diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml
Lines changed: 8 additions & 8 deletions
diff --git a/pkg/cache/README.md b/pkg/cache/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/pkg/cache/model_gpu_profile.go b/pkg/cache/model_gpu_profile.go
Lines changed: 1 addition & 1 deletion
diff --git a/pkg/controller/stormservice/sync.go b/pkg/controller/stormservice/sync.go
Lines changed: 7 additions & 3 deletions
diff --git a/pkg/controller/stormservice/sync_test.go b/pkg/controller/stormservice/sync_test.go
Lines changed: 37 additions & 0 deletions
diff --git a/pkg/controller/stormservice/utils.go b/pkg/controller/stormservice/utils.go
Lines changed: 2 additions & 1 deletion
diff --git a/pkg/plugins/gateway/algorithms/pd_disaggregation.go b/pkg/plugins/gateway/algorithms/pd_disaggregation.go
Lines changed: 113 additions & 20 deletions
@@ -56,37 +56,37 @@ webhooks:
     service:
       name: webhook-service
       namespace: system
-      path: /validate-model-aibrix-ai-v1alpha1-modeladapter
+      path: /validate-orchestration-aibrix-ai-v1alpha1-kvcache
   failurePolicy: Fail
-  name: vmodeladapter.kb.io
+  name: vkvcache-v1alpha1.kb.io
   rules:
   - apiGroups:
-    - model.aibrix.ai
+    - orchestration.aibrix.ai
     apiVersions:
     - v1alpha1
     operations:
     - CREATE
     - UPDATE
     resources:
-    - modeladapters
+    - kvcaches
   sideEffects: None
 - admissionReviewVersions:
   - v1
   clientConfig:
     service:
       name: webhook-service
       namespace: system
-      path: /validate-orchestration-aibrix-ai-v1alpha1-kvcache
+      path: /validate-model-aibrix-ai-v1alpha1-modeladapter
   failurePolicy: Fail
-  name: vkvcache-v1alpha1.kb.io
+  name: vmodeladapter.kb.io
   rules:
   - apiGroups:
-    - orchestration.aibrix.ai
+    - model.aibrix.ai
     apiVersions:
     - v1alpha1
     operations:
     - CREATE
     - UPDATE
     resources:
-    - kvcaches
+    - modeladapters
   sideEffects: None
@@ -94,7 +94,7 @@ Kubernetes informers for watching:
 
 **Performance:**
 - `AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS`: Metric refresh interval
-- `AIBRIX_Model_GPU_PROFILE_CACHING_FLAG`: Enable GPU profile caching
+- `AIBRIX_MODEL_GPU_PROFILE_CACHING_FLAG`: Enable GPU profile caching
 
 ## Usage Example
 
 
@@ -47,7 +47,7 @@ const defaultModelGPUProfileRefreshInterval = 10 * time.Second
 var enableModelGPUProfileCaching = getModelGPUProfileCachingFlag()
 
 func getModelGPUProfileCachingFlag() bool {
-	value := utils.LoadEnv("AIBRIX_Model_GPU_PROFILE_CACHING_FLAG", "true")
+	value := utils.LoadEnv("AIBRIX_MODEL_GPU_PROFILE_CACHING_FLAG", "true")
 	boolVal, err := strconv.ParseBool(value)
 	if err != nil || !boolVal {
 		return false
 
@@ -76,11 +76,15 @@ func (r *StormServiceReconciler) syncHeadlessService(ctx context.Context, servic
 			Name:      service.Name,
 			Namespace: service.Namespace,
 			Labels:    service.Labels,
+			OwnerReferences: []metav1.OwnerReference{
+				*metav1.NewControllerRef(service, orchestrationv1alpha1.SchemeGroupVersion.WithKind(orchestrationv1alpha1.StormServiceKind)),
+			},
 		},
 		Spec: corev1.ServiceSpec{
-			Type:      corev1.ServiceTypeClusterIP,
-			ClusterIP: corev1.ClusterIPNone,
-			Selector:  map[string]string{constants.StormServiceNameLabelKey: service.Name},
+			Type:                     corev1.ServiceTypeClusterIP,
+			ClusterIP:                corev1.ClusterIPNone,
+			Selector:                 map[string]string{constants.StormServiceNameLabelKey: service.Name},
+			PublishNotReadyAddresses: true,
 		},
 	}
 
 
@@ -173,6 +173,28 @@ func TestSyncHeadlessService(t *testing.T) {
 			},
 			wantError: false,
 		},
+		{
+			name: "service already exists with PublishNotReadyAddresses false",
+			stormService: &orchestrationv1alpha1.StormService{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-storm",
+					Namespace: "default",
+				},
+			},
+			existingService: &corev1.Service{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-storm",
+					Namespace: "default",
+				},
+				Spec: corev1.ServiceSpec{
+					Type:                     corev1.ServiceTypeClusterIP,
+					ClusterIP:                corev1.ClusterIPNone,
+					Selector:                 map[string]string{constants.StormServiceNameLabelKey: "test-storm"},
+					PublishNotReadyAddresses: false, // should be updated to true
+				},
+			},
+			wantError: false,
+		},
 	}
 
 	for _, tt := range tests {
@@ -216,6 +238,17 @@ func TestSyncHeadlessService(t *testing.T) {
 				t.Errorf("Expected ClusterIP to be None, got %s", service.Spec.ClusterIP)
 			}
 
+			if tt.existingService == nil {
+				if len(service.OwnerReferences) == 0 {
+					t.Error("Expected service to have an owner reference")
+				} else {
+					ownerRef := service.OwnerReferences[0] // the controller ref must identify the owning StormService, not the Service itself
+					if ownerRef.Kind != orchestrationv1alpha1.StormServiceKind || ownerRef.UID != tt.stormService.UID {
+						t.Errorf("Expected owner reference to be %s %s, got %s %s", orchestrationv1alpha1.StormServiceKind, tt.stormService.UID, ownerRef.Kind, ownerRef.UID)
+					}
+				}
+			}
+
 			expectedSelector := map[string]string{constants.StormServiceNameLabelKey: tt.stormService.Name}
 			if !reflect.DeepEqual(service.Spec.Selector, expectedSelector) {
 				t.Errorf("Expected selector %v, got %v", expectedSelector, service.Spec.Selector)
@@ -224,6 +257,10 @@ func TestSyncHeadlessService(t *testing.T) {
 			if service.Spec.Type != corev1.ServiceTypeClusterIP {
 				t.Errorf("Expected service type ClusterIP, got %v", service.Spec.Type)
 			}
+
+			if service.Spec.PublishNotReadyAddresses != true {
+				t.Errorf("Expected PublishNotReadyAddresses to be true, got %v", service.Spec.PublishNotReadyAddresses)
+			}
 		})
 	}
 }
@@ -237,5 +237,6 @@ func sortRoleSetByRevision(roleSets []*orchestrationv1alpha1.RoleSet, updatedRev
 func isServiceEqual(a, b *corev1.Service) bool {
 	return a.Spec.Type == b.Spec.Type &&
 		apiequality.Semantic.DeepEqual(a.Spec.Selector, b.Spec.Selector) &&
-		a.Spec.ClusterIP == b.Spec.ClusterIP
+		a.Spec.ClusterIP == b.Spec.ClusterIP &&
+		a.Spec.PublishNotReadyAddresses == b.Spec.PublishNotReadyAddresses
 }
@@ -43,6 +43,7 @@ const (
 	SGLangBootstrapPort           int64                  = 8998
 	SGLangBootstrapPortIdentifier string                 = "model.aibrix.ai/sglang-bootstrap-port"
 	LLMEngineIdentifier           string                 = constants.ModelLabelEngine
+	PDRoleSetIdentifier           string                 = "roleset-name"
 	PDRoleIdentifier              string                 = "role-name"
 	RoleReplicaIndex              string                 = "stormservice.orchestration.aibrix.ai/role-replica-index"
 	PodGroupIndex                 string                 = "stormservice.orchestration.aibrix.ai/pod-group-index"
@@ -90,12 +91,18 @@ func (r pdRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (
 		return "", err
 	}
 
-	if err = r.doPrefillRequest(ctx, prefillPods, getLLMEngine(prefillPods[0], LLMEngineIdentifier, VLLMEngine)); err != nil {
+	prefillPod, err := r.doPrefillRequest(ctx, prefillPods, getLLMEngine(prefillPods[0], LLMEngineIdentifier, VLLMEngine))
+	if err != nil {
 		klog.ErrorS(err, "prefill request failed", "request_id", ctx.RequestID)
 		return "", err
 	}
 
-	decodePod := r.selectDecodePod(decodePods)
+	decodePod := r.selectDecodePod(prefillPod, decodePods)
+	if decodePod == nil {
+		return "", fmt.Errorf("decode pod not found")
+	}
+
+	klog.InfoS("P/D", "prefill_pod", prefillPod.Name, "decode_pod", decodePod.Name)
 
 	ctx.SetTargetPod(decodePod)
 	return ctx.TargetAddress(), nil
@@ -148,15 +155,30 @@ func (r *pdRouter) evaluatePrefixCache(ctx *types.RoutingContext, prefillPods []
 	return prefillPod, prefixHashes, err
 }
 
-func (r *pdRouter) selectDecodePod(decodePods []*v1.Pod) *v1.Pod {
-	decodePod, _ := utils.SelectRandomPod(decodePods, rand.Intn)
+func (r *pdRouter) selectDecodePod(prefillPod *v1.Pod, decodePods []*v1.Pod) *v1.Pod {
+	prefillRoleSet, ok := prefillPod.Labels[PDRoleSetIdentifier]
+	if !ok {
+		return nil
+	}
+
+	filteredDecodePods := []*v1.Pod{}
+	for _, pod := range decodePods {
+		if podRoleSet, exists := pod.Labels[PDRoleSetIdentifier]; exists && podRoleSet == prefillRoleSet {
+			filteredDecodePods = append(filteredDecodePods, pod)
+		}
+	}
+	if len(filteredDecodePods) == 0 {
+		return nil
+	}
+
+	decodePod, _ := utils.SelectRandomPod(filteredDecodePods, rand.Intn)
 	return decodePod
 }
 
-func (r *pdRouter) doPrefillRequest(routingCtx *types.RoutingContext, prefillPods []*v1.Pod, llmEngine string) error {
+func (r *pdRouter) doPrefillRequest(routingCtx *types.RoutingContext, prefillPods []*v1.Pod, llmEngine string) (*v1.Pod, error) {
 	prefillPod, prefixHashes, err := r.evaluatePrefixCache(routingCtx, prefillPods)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer func() {
 		if len(prefixHashes) > 0 {
@@ -167,7 +189,7 @@ func (r *pdRouter) doPrefillRequest(routingCtx *types.RoutingContext, prefillPod
 	// Prepare prefill request payload
 	payload, err := r.preparePrefillPayload(routingCtx, prefillPod, llmEngine)
 	if err != nil {
-		return fmt.Errorf("failed to prepare prefill payload: %w", err)
+		return nil, fmt.Errorf("failed to prepare prefill payload: %w", err)
 	}
 
 	// Execute HTTP request
@@ -183,20 +205,32 @@ func (r *pdRouter) doPrefillRequest(routingCtx *types.RoutingContext, prefillPod
 
 	if llmEngine == SGLangEngine {
 		go func() {
-			if err := r.executeHTTPRequest(apiURL, routingCtx, payload); err != nil {
+			if _, err := r.executeHTTPRequest(apiURL, routingCtx, payload); err != nil {
 				klog.ErrorS(err, "prefill request for sglang failed", "request_id", routingCtx.RequestID)
 				return
 			}
 			klog.InfoS("prefill_request_complete", "request_id", routingCtx.RequestID)
 		}()
+	} else if llmEngine == VLLMEngine {
+		responseData, err := r.executeHTTPRequest(apiURL, routingCtx, payload)
+		if err != nil {
+			return nil, fmt.Errorf("failed to execute prefill request: %w", err)
+		}
+
+		// Update routing context with KV transfer params from prefill response for vLLM
+		if err := r.updateRoutingContextWithKVTransferParams(routingCtx, responseData, prefillPod); err != nil {
+			return nil, fmt.Errorf("failed to update routing context with KV transfer params: %w", err)
+		}
+
+		klog.InfoS("prefill_request_complete", "request_id", routingCtx.RequestID, "prefill_pod_ip", prefillPod.Status.PodIP)
 	} else {
-		if err := r.executeHTTPRequest(apiURL, routingCtx, payload); err != nil {
-			return fmt.Errorf("failed to execute prefill request: %w", err)
+		if _, err := r.executeHTTPRequest(apiURL, routingCtx, payload); err != nil {
+			return nil, fmt.Errorf("failed to execute prefill request: %w", err)
 		}
-		klog.InfoS("prefill_request_complete", "request_id", routingCtx.RequestID)
+		klog.InfoS("prefill_request_complete", "request_id", routingCtx.RequestID, "prefill_pod_ip", prefillPod.Status.PodIP)
 	}
 
-	return nil
+	return prefillPod, nil
 }
 
 func (r *pdRouter) preparePrefillPayload(routingCtx *types.RoutingContext, pod *v1.Pod, llmEngine string) ([]byte, error) {
@@ -221,6 +255,18 @@ func (r *pdRouter) preparePrefillPayload(routingCtx *types.RoutingContext, pod *
 		routingCtx.ReqBody = bodyCopy
 	}
 
+	// Add nixl-specific kv_transfer_params for vLLM prefill requests only
+	if llmEngine == VLLMEngine {
+		completionRequest["kv_transfer_params"] = map[string]any{
+			"do_remote_decode":  true,
+			"do_remote_prefill": false,
+			"remote_engine_id":  nil,
+			"remote_block_ids":  nil,
+			"remote_host":       nil,
+			"remote_port":       nil,
+		}
+	}
+
 	// Set prefill-specific parameters
 	completionRequest["max_tokens"] = 1
 	completionRequest["max_completion_tokens"] = 1
@@ -230,36 +276,83 @@ func (r *pdRouter) preparePrefillPayload(routingCtx *types.RoutingContext, pod *
 	return json.Marshal(completionRequest)
 }
 
-func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingContext, payload []byte) error {
+func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingContext, payload []byte) (map[string]any, error) {
 	// Create request with context
 	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
 	if err != nil {
-		return fmt.Errorf("failed to create http prefill request: %w", err)
+		return nil, fmt.Errorf("failed to create http prefill request: %w", err)
 	}
 
 	// Set headers
 	for key, value := range routingCtx.ReqHeaders {
 		req.Header.Set(key, value)
 	}
 	req.Header.Set("content-type", "application/json")
-	req.Header.Set("content-length", strconv.Itoa(len(payload)))
+	req.Header.Set("X-Request-Id", routingCtx.RequestID)
 
-	// Execute with timeout
-	client := &http.Client{Timeout: time.Duration(prefillRequestTimeout) * time.Second}
+	client := &http.Client{
+		Timeout: time.Duration(prefillRequestTimeout) * time.Second,
+	}
 	resp, err := client.Do(req)
 	if err != nil {
-		return fmt.Errorf("failed to execute http prefill request: %w", err)
+		return nil, fmt.Errorf("failed to execute http prefill request: %w", err)
 	}
 	defer func() {
 		_ = resp.Body.Close()
 	}()
 
+	// Read response body
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read prefill response body: %w", err)
+	}
+
 	// Check response status
 	if resp.StatusCode != http.StatusOK {
-		body, _ := io.ReadAll(resp.Body)
-		return fmt.Errorf("http prefill request failed with status %d: %s", resp.StatusCode, string(body))
+		return nil, fmt.Errorf("http prefill request failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	// Parse response JSON
+	var responseData map[string]any
+	if err := json.Unmarshal(body, &responseData); err != nil {
+		return nil, fmt.Errorf("failed to unmarshal prefill response: %w", err)
+	}
+
+	return responseData, nil
+}
+
+func (r *pdRouter) updateRoutingContextWithKVTransferParams(routingCtx *types.RoutingContext, responseData map[string]any, prefillPod *v1.Pod) error {
+	// Extract kv_transfer_params from prefill response
+	kvTransferParams, exists := responseData["kv_transfer_params"]
+	if !exists {
+		klog.InfoS("no kv_transfer_params in prefill response", "request_id", routingCtx.RequestID)
+		return nil
 	}
 
+	// Parse the original request body
+	var originalRequest map[string]any
+	if err := json.Unmarshal(routingCtx.ReqBody, &originalRequest); err != nil {
+		return fmt.Errorf("failed to unmarshal original request body: %w", err)
+	}
+
+	// Update request body with KV transfer params from prefill response
+	originalRequest["kv_transfer_params"] = kvTransferParams
+
+	// Add prefill host information following the Python pattern
+	if kvTransferParamsMap, ok := kvTransferParams.(map[string]any); ok {
+		kvTransferParamsMap["remote_host"] = prefillPod.Status.PodIP
+	}
+
+	// Marshal the updated request body
+	updatedReqBody, err := json.Marshal(originalRequest)
+	if err != nil {
+		return fmt.Errorf("failed to marshal updated request body: %w", err)
+	}
+
+	// Update routing context with new request body
+	routingCtx.ReqBody = updatedReqBody
+
+	klog.InfoS("updated routing context with kv_transfer_params", "request_id", routingCtx.RequestID, "prefill_host", prefillPod.Status.PodIP)
 	return nil
 }
Original file line number	Diff line number	Diff line change
`@@ -237,5 +237,6 @@ func sortRoleSetByRevision(roleSets []*orchestrationv1alpha1.RoleSet, updatedRev`
`237`	`237`	`func isServiceEqual(a, b *corev1.Service) bool {`
`238`	`238`	`return a.Spec.Type == b.Spec.Type &&`
`239`	`239`	`apiequality.Semantic.DeepEqual(a.Spec.Selector, b.Spec.Selector) &&`
`240`		`- a.Spec.ClusterIP == b.Spec.ClusterIP`
	`240`	`+ a.Spec.ClusterIP == b.Spec.ClusterIP &&`
	`241`	`+ a.Spec.PublishNotReadyAddresses == b.Spec.PublishNotReadyAddresses`
`241`	`242`	`}`