
Commit 958768a

[Misc] [gpu_optimizer] add namespace info for log (vllm-project#1149)
1 parent 76e26a8 commit 958768a

File tree

3 files changed: +25 -16 lines changed

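Across all three files, the change is that deployments are now referred to by a namespace-qualified "namespace/name" identifier in log and error messages. A minimal sketch of the format change, using hypothetical names and plain print in place of the project's logger:

namespace = "default"          # hypothetical example values
deployment_name = "llama2-7b"
model_name = "llama2-7b"

# Old format: deployment name only
print(f'Deployment "{deployment_name}" added to the model monitor for "{model_name}"')

# New format after this commit: namespace-qualified as namespace/deployment
print(f'Deployment "{namespace}/{deployment_name}" added to the model monitor for "{model_name}"')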

python/aibrix/aibrix/gpu_optimizer/app.py

Lines changed: 15 additions & 12 deletions
@@ -81,9 +81,12 @@ def new_deployment(deployment):
             min_replicas = int(label)
 
     return DeploymentStates(
-        deployment.metadata.name,
-        deployment.spec.replicas if deployment.spec.replicas is not None else 0,
-        min_replicas,
+        name=deployment.metadata.name,
+        namespace=deployment.metadata.namespace,
+        replicas=deployment.spec.replicas
+        if deployment.spec.replicas is not None
+        else 0,
+        min_replicas=min_replicas,
     )

@@ -101,7 +104,7 @@ def start_serving_thread(watch_ver, deployment, watch_event: bool) -> bool:
             watch_ver, deployment_name, namespace, lambda: new_deployment(deployment)
         )
         logger.info(
-            f'Deployment "{deployment_name}" found in watch version {watch_ver}, added to the model monitor for "{model_name}"'
+            f'Deployment "{namespace}/{deployment_name}" found in watch version {watch_ver}, added to the model monitor for "{model_name}"'
         )
         return False

@@ -120,11 +123,11 @@ def start_serving_thread(watch_ver, deployment, watch_event: bool) -> bool:
     model_monitors[model_name] = model_monitor
     if watch_event:
         logger.info(
-            f'New model monitor started for "{model_name}". Deployment "{deployment_name}" added.'
+            f'New model monitor started for "{model_name}". Deployment "{namespace}/{deployment_name}" added.'
         )
     else:
         logger.info(
-            f'Model monitor started for existed "{model_name}". Deployment "{deployment_name}" added.'
+            f'Model monitor started for existed "{model_name}". Deployment "{namespace}/{deployment_name}" added.'
         )
     return True

@@ -137,15 +140,15 @@ def update_deployment(watch_ver, deployment):
     namespace = deployment.metadata.namespace
     if model_monitor is None:
         logger.warning(
-            f'Updating "{deployment_name}" in the model monitor, but "{model_name}" has not monitored.'
+            f'Updating "{namespace}/{deployment_name}" in the model monitor, but "{model_name}" has not monitored.'
         )
         return
 
     if model_monitor.add_deployment(
         watch_ver, deployment_name, namespace, lambda: new_deployment(deployment)
     ):
         logger.info(
-            f'Updated "{deployment_name}" in the model monitor for "{model_name}".'
+            f'Updated "{namespace}/{deployment_name}" in the model monitor for "{model_name}".'
         )

@@ -157,20 +160,20 @@ def remove_deployment(deployment):
     namespace = deployment.metadata.namespace
     if model_monitor is None:
         logger.warning(
-            f'Removing "{deployment_name}" from the model monitor, but "{model_name}" has not monitored.'
+            f'Removing "{namespace}/{deployment_name}" from the model monitor, but "{model_name}" has not monitored.'
         )
         return
 
     if model_monitor.remove_deployment(deployment_name, namespace) == 0:
         model_monitor.stop()
         del model_monitors[model_name]
         logger.info(
-            f'Removing "{deployment_name}" from the model monitor, no deployment left in "{model_name}", stopping the model monitor.'
+            f'Removing "{namespace}/{deployment_name}" from the model monitor, no deployment left in "{model_name}", stopping the model monitor.'
         )
         return
 
     logger.info(
-        f'Removing "{deployment_name}" from the model monitor for "{model_name}".'
+        f'Removing "{namespace}/{deployment_name}" from the model monitor for "{model_name}".'
     )

@@ -316,7 +319,7 @@ def main(signal, timeout):
                 start_serving_thread(watch_version, deployment, False)
             except Exception as e:
                 logger.warning(
-                    f"Error on handle existing deployment {deployment.metadata.name}: {e}"
+                    f"Error on handle existing deployment {deployment.metadata.namespace}/{deployment.metadata.name}: {e}"
                 )
     except client.rest.ApiException as ae:
         logger.error(
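For context, here is a minimal, self-contained sketch of the construction pattern app.py now uses. The DeploymentStates class and the deployment object below are simplified stand-ins with hypothetical names and values, not the real aibrix or Kubernetes client classes:

from types import SimpleNamespace


class DeploymentStates:
    """Simplified stand-in mirroring the updated constructor signature."""

    def __init__(self, name: str, namespace: str = "", replicas: int = 1, min_replicas: int = 0):
        self.name = name
        self.namespace = namespace
        self.replicas = replicas
        self.min_replicas = min_replicas


# Stand-in for a Kubernetes deployment object, carrying only the fields used here.
deployment = SimpleNamespace(
    metadata=SimpleNamespace(name="llama2-7b", namespace="default"),
    spec=SimpleNamespace(replicas=None),
)

states = DeploymentStates(
    name=deployment.metadata.name,
    namespace=deployment.metadata.namespace,
    replicas=deployment.spec.replicas if deployment.spec.replicas is not None else 0,
    min_replicas=1,
)
print(f'"{states.namespace}/{states.name}": replicas={states.replicas}')
# -> "default/llama2-7b": replicas=0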

python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py

Lines changed: 6 additions & 3 deletions
@@ -44,8 +44,11 @@
 class DeploymentStates:
     """States of a deployment with resource version."""
 
-    def __init__(self, name: str, replicas: int = 1, min_replicas: int = 0):
+    def __init__(
+        self, name: str, namespace: str = "", replicas: int = 1, min_replicas: int = 0
+    ):
         self.name = name
+        self.namespace = namespace
 
         # _replicas stores optimized value
         self._replicas = replicas
@@ -226,7 +229,7 @@ def read_deployment_num_replicas(self, deployment_name: str, namespace: str) ->
         key = self._deployment_entry_point(deployment_name, namespace)
         if key not in self.deployments:
             raise Exception(
-                f"Deployment {namespace}:{deployment_name} of model {self.model_name} is not monitored"
+                f"Deployment {namespace}/{deployment_name} of model {self.model_name} is not monitored"
             )
         return self.deployments[key].replicas
@@ -240,7 +243,7 @@ def update_deployment_num_replicas(
         key = self._deployment_entry_point(deployment_name, namespace)
         if key not in self.deployments:
             raise Exception(
-                f"Deployment {namespace}:{deployment_name} of model {self.model_name} is not monitored"
+                f"Deployment {namespace}/{deployment_name} of model {self.model_name} is not monitored"
             )
 
         if overriding:
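Because namespace is inserted as the second positional parameter of DeploymentStates.__init__, argument binding changes for any caller that still passes arguments positionally. A small sketch of the difference; mirror_init is a hypothetical helper that mirrors the new parameter order, not part of the codebase:

def mirror_init(name, namespace="", replicas=1, min_replicas=0):
    # Mirrors the parameter order of the updated DeploymentStates.__init__ (illustrative only).
    return {"name": name, "namespace": namespace, "replicas": replicas}


# Keyword arguments bind unambiguously under the new parameter order:
print(mirror_init(name="a100", replicas=0))  # {'name': 'a100', 'namespace': '', 'replicas': 0}

# A positional call in the old style would now bind 0 to `namespace` and silently
# leave `replicas` at its default of 1:
print(mirror_init("a100", 0))                # {'name': 'a100', 'namespace': 0, 'replicas': 1}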

python/aibrix/aibrix/gpu_optimizer/load_monitor/visualizer.py

Lines changed: 4 additions & 1 deletion
@@ -122,7 +122,10 @@ def get_debug_model_montior(
     if profile_reader is not None:
         for _, profile in enumerate(profile_reader.read()):
             debug_monitor.add_deployment(
-                "0", profile.gpu, None, DeploymentStates(profile.gpu, 0)
+                "0",
+                profile.gpu,
+                None,
+                DeploymentStates(name=profile.gpu, replicas=0),
             )
 
     return debug_monitor
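The switch to keyword arguments in this hunk follows from the signature change above: a positional DeploymentStates(profile.gpu, 0) would now place 0 into namespace (see the binding sketch earlier), while the keyword form keeps namespace at its empty-string default for these profile-backed debug entries. A minimal stand-in illustration; the class and the gpu value are hypothetical, not the real aibrix objects:

class DeploymentStates:
    # Stand-in with the updated parameter order; only the fields used here.
    def __init__(self, name, namespace="", replicas=1, min_replicas=0):
        self.name = name
        self.namespace = namespace
        self.replicas = replicas


gpu = "a100"                                     # hypothetical profile.gpu value
states = DeploymentStates(name=gpu, replicas=0)  # keyword form, as in the new call above
print(repr(states.namespace), states.replicas)   # -> '' 0  (namespace stays at its default)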
