From 9794bb7b2d9418912b2e96f57076f2b2f1460e33 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 24 Apr 2025 13:16:12 +0200
Subject: [PATCH 1/2] Use Transformers helper `get_text_config()` instead of
 checking for `text_config`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/kernels/benchmark_moe.py       |  5 ++---
 tests/models/test_initialization.py       |  5 +----
 vllm/config.py                            |  6 ++----
 vllm/transformers_utils/config.py         | 20 +++++++++++---------
 vllm/v1/worker/lora_model_runner_mixin.py | 11 +++--------
 vllm/worker/cpu_model_runner.py           | 11 +++--------
 vllm/worker/hpu_model_runner.py           | 14 +++++---------
 vllm/worker/model_runner.py               | 14 +++++---------
 8 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index afe0b53077a..912470fada8 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -553,9 +553,8 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
-        if not hasattr(config, "hidden_size"):
-            # Support for llama4
-            config = config.text_config
+        # Support for llama4
+        config = config.get_text_config()
         # Default: Mixtral.
         E = config.num_local_experts
         topk = config.num_experts_per_tok
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index cd2b8f00d52..446c4efbf6a 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -24,10 +24,7 @@ def test_can_initialize(model_arch):
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
 
-        if hasattr(hf_config, "text_config"):
-            text_config: PretrainedConfig = hf_config.text_config
-        else:
-            text_config = hf_config
+        text_config = hf_config.get_text_config()
 
         text_config.update({
             "num_layers": 1,
diff --git a/vllm/config.py b/vllm/config.py
index 741ce04d5df..c64ba696e33 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2859,12 +2859,10 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
 
-    # Fallbacks for multi-modal models if the root config
+    # Fallback for multi-modal models if the root config
     # does not define torch_dtype
-    if config_dtype is None and hasattr(config, "text_config"):
-        config_dtype = getattr(config.text_config, "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4e2a31ce672..883ceb56fce 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -757,19 +757,21 @@ def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
     No op for pure text models.
     """
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    elif hasattr(config, "thinker_config"):
+    # This block should be unnecessary after https://github.com/huggingface/transformers/pull/37517
+    if hasattr(config, "thinker_config"):
         # TODO(suyang.fy): Refactor code.
         #  For Qwen2.5-Omni, change hf_text_config to
         #  thinker_config.text_config.
         return config.thinker_config.text_config
-    else:
-        return config
+
+    text_config = config.get_text_config()
+
+    # The code operates under the assumption that text_config should have
+    # `num_attention_heads` (among others). Assert here to fail early
+    # if transformers config doesn't align with this assumption.
+    assert hasattr(text_config, "num_attention_heads")
+
+    return text_config
 
 
 def try_get_generation_config(
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index a8a19e0e620..ed451b13d8e 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -35,13 +35,8 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig,
             logger.warning("Regarding multimodal models, vLLM currently "
                            "only supports adding LoRA to language model.")
 
-        # It's necessary to distinguish between the max_position_embeddings
-        # of VLMs and LLMs.
-        if hasattr(model.config, "max_position_embeddings"):
-            max_pos_embeddings = model.config.max_position_embeddings
-        else:
-            max_pos_embeddings = (
-                model.config.text_config.max_position_embeddings)
+        # Use get_text_config() in case of multimodal models
+        text_config = model_config.hf_config.get_text_config()
 
         # Add LoRA Manager to the Model Runner
         self.lora_manager = LRUCacheWorkerLoRAManager(
@@ -52,7 +47,7 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig,
             device,
             model.embedding_modules,
             model.embedding_padding_modules,
-            max_position_embeddings=max_pos_embeddings,
+            max_position_embeddings=text_config.max_position_embeddings,
         )
         return self.lora_manager.create_lora_manager(model)
 
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 29fbfbf0d37..87b7f02ab6d 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -508,13 +508,8 @@ def load_model(self) -> None:
                 logger.warning("Regarding multimodal models, vLLM currently "
                                "only supports adding LoRA to language model.")
 
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -524,7 +519,7 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 1bcef841b06..2a495634367 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -724,14 +724,9 @@ def load_model(self) -> None:
                 "Bias support in LoRA is not enabled in HPU yet."
             assert not self.lora_config.fully_sharded_loras, \
                 "Fully sharded LoRAs is not enabled in HPU yet."
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -741,7 +736,8 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 3b09c92ae15..66b12d5be1a 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1130,14 +1130,9 @@ def load_model(self) -> None:
                 logger.warning(
                     "Regarding multimodal models, vLLM currently "
                     "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -1147,7 +1142,8 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
         time_after_load = time.perf_counter()

From cf9d974b875e8d491fb27249e125332316953f2f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 25 Apr 2025 10:32:39 +0200
Subject: [PATCH 2/2] Only assert that text_config has num_attention_heads if
 it's not the original config

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/transformers_utils/config.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 883ceb56fce..358bfb88182 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -766,10 +766,11 @@ def get_hf_text_config(config: PretrainedConfig):
 
     text_config = config.get_text_config()
 
-    # The code operates under the assumption that text_config should have
-    # `num_attention_heads` (among others). Assert here to fail early
-    # if transformers config doesn't align with this assumption.
-    assert hasattr(text_config, "num_attention_heads")
+    if text_config is not config:
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(text_config, "num_attention_heads")
 
     return text_config
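
Usage note (illustrative only, not part of either patch): a minimal sketch of the
helper behaviour this series relies on, assuming a transformers release that ships
`PretrainedConfig.get_text_config()`; LlamaConfig and LlavaConfig are used here
purely as example text-only and multimodal configs.

    from transformers import LlamaConfig, LlavaConfig

    # Text-only model: get_text_config() returns the config object itself,
    # which is why PATCH 2/2 can skip the assert via `text_config is not config`.
    llama = LlamaConfig()
    assert llama.get_text_config() is llama

    # Multimodal model: the helper returns the nested text sub-config, so the
    # old `hasattr(config, "text_config")` branches are no longer needed.
    llava = LlavaConfig()
    text_config = llava.get_text_config()
    assert hasattr(text_config, "num_attention_heads")
    print(text_config.max_position_embeddings)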