diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 579133ec0c3..c0940638598 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
         }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
diff --git a/vllm/config.py b/vllm/config.py
index fcbf962ac68..13c38fe4a77 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -752,9 +752,8 @@ def _verify_quantization(self) -> None:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
@@ -764,6 +763,9 @@

         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method

             # Detect which checkpoint is it
             for name in QUANTIZATION_METHODS:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index cb9a48d7746..7b0032572ec 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -72,7 +72,7 @@ def get_min_capability(cls) -> int:
         return 70

     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"

     def get_quant_method(
         self,
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ba8f49ca915..f6be3b0e814 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]

     @classmethod
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index fcac5155637..d5923557a21 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"

-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"