From 9794bb7b2d9418912b2e96f57076f2b2f1460e33 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 24 Apr 2025 13:16:12 +0200
Subject: [PATCH 1/2] Use Transformers helper `get_text_config()` instead of
 checking for `text_config`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/kernels/benchmark_moe.py       |  5 ++---
 tests/models/test_initialization.py       |  5 +----
 vllm/config.py                            |  6 ++----
 vllm/transformers_utils/config.py         | 20 +++++++++++---------
 vllm/v1/worker/lora_model_runner_mixin.py | 11 +++--------
 vllm/worker/cpu_model_runner.py           | 11 +++--------
 vllm/worker/hpu_model_runner.py           | 14 +++++---------
 vllm/worker/model_runner.py               | 14 +++++---------
 8 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index afe0b53077a..912470fada8 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -553,9 +553,8 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
-        if not hasattr(config, "hidden_size"):
-            # Support for llama4
-            config = config.text_config
+        # Support for llama4
+        config = config.get_text_config()
         # Default: Mixtral.
         E = config.num_local_experts
         topk = config.num_experts_per_tok
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index cd2b8f00d52..446c4efbf6a 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -24,10 +24,7 @@ def test_can_initialize(model_arch):
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
 
-        if hasattr(hf_config, "text_config"):
-            text_config: PretrainedConfig = hf_config.text_config
-        else:
-            text_config = hf_config
+        text_config = hf_config.get_text_config()
 
         text_config.update({
             "num_layers": 1,
diff --git a/vllm/config.py b/vllm/config.py
index 741ce04d5df..c64ba696e33 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2859,12 +2859,10 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
 
-    # Fallbacks for multi-modal models if the root config
+    # Fallback for multi-modal models if the root config
     # does not define torch_dtype
-    if config_dtype is None and hasattr(config, "text_config"):
-        config_dtype = getattr(config.text_config, "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4e2a31ce672..883ceb56fce 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -757,19 +757,21 @@ def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
     No op for pure text models.
     """
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    elif hasattr(config, "thinker_config"):
+    # This block should be unnecessary after https://github.com/huggingface/transformers/pull/37517
+    if hasattr(config, "thinker_config"):
         # TODO(suyang.fy): Refactor code.
         #  For Qwen2.5-Omni, change hf_text_config to
         #  thinker_config.text_config.
         return config.thinker_config.text_config
-    else:
-        return config
+
+    text_config = config.get_text_config()
+
+    # The code operates under the assumption that text_config should have
+    # `num_attention_heads` (among others). Assert here to fail early
+    # if transformers config doesn't align with this assumption.
+    assert hasattr(text_config, "num_attention_heads")
+
+    return text_config
 
 
 def try_get_generation_config(
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index a8a19e0e620..ed451b13d8e 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -35,13 +35,8 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig,
             logger.warning("Regarding multimodal models, vLLM currently "
                            "only supports adding LoRA to language model.")
 
-        # It's necessary to distinguish between the max_position_embeddings
-        # of VLMs and LLMs.
-        if hasattr(model.config, "max_position_embeddings"):
-            max_pos_embeddings = model.config.max_position_embeddings
-        else:
-            max_pos_embeddings = (
-                model.config.text_config.max_position_embeddings)
+        # Use get_text_config() in case of multimodal models
+        text_config = model_config.hf_config.get_text_config()
 
         # Add LoRA Manager to the Model Runner
         self.lora_manager = LRUCacheWorkerLoRAManager(
@@ -52,7 +47,7 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig,
             device,
             model.embedding_modules,
             model.embedding_padding_modules,
-            max_position_embeddings=max_pos_embeddings,
+            max_position_embeddings=text_config.max_position_embeddings,
         )
         return self.lora_manager.create_lora_manager(model)
 
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 29fbfbf0d37..87b7f02ab6d 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -508,13 +508,8 @@ def load_model(self) -> None:
                 logger.warning("Regarding multimodal models, vLLM currently "
                                "only supports adding LoRA to language model.")
 
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -524,7 +519,7 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 1bcef841b06..2a495634367 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -724,14 +724,9 @@ def load_model(self) -> None:
                 "Bias support in LoRA is not enabled in HPU yet."
             assert not self.lora_config.fully_sharded_loras, \
                 "Fully sharded LoRAs is not enabled in HPU yet."
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -741,7 +736,8 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 3b09c92ae15..66b12d5be1a 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1130,14 +1130,9 @@ def load_model(self) -> None:
                 logger.warning(
                     "Regarding multimodal models, vLLM currently "
                     "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -1147,7 +1142,8 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
         time_after_load = time.perf_counter()

From cf9d974b875e8d491fb27249e125332316953f2f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 25 Apr 2025 10:32:39 +0200
Subject: [PATCH 2/2] Only assert that text_config has num_attention_heads if
 it's not the original config

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/transformers_utils/config.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 883ceb56fce..358bfb88182 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -766,10 +766,11 @@ def get_hf_text_config(config: PretrainedConfig):
 
     text_config = config.get_text_config()
 
-    # The code operates under the assumption that text_config should have
-    # `num_attention_heads` (among others). Assert here to fail early
-    # if transformers config doesn't align with this assumption.
-    assert hasattr(text_config, "num_attention_heads")
+    if text_config is not config:
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(text_config, "num_attention_heads")
 
     return text_config
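
Usage note (illustrative only, not part of either patch): a minimal sketch of the
helper behaviour this series relies on, assuming a transformers release that ships
`PretrainedConfig.get_text_config()`; LlamaConfig and LlavaConfig are used here
purely as example text-only and multimodal configs.

    from transformers import LlamaConfig, LlavaConfig

    # Text-only model: get_text_config() returns the config object itself,
    # which is why PATCH 2/2 can skip the assert via `text_config is not config`.
    llama = LlamaConfig()
    assert llama.get_text_config() is llama

    # Multimodal model: the helper returns the nested text sub-config, so the
    # old `hasattr(config, "text_config")` branches are no longer needed.
    llava = LlavaConfig()
    text_config = llava.get_text_config()
    assert hasattr(text_config, "num_attention_heads")
    print(text_config.max_position_embeddings)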