From 1c4a5d36489c8729126a910be56d34357d4d6191 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 29 Nov 2024 10:58:04 +0800 Subject: [PATCH] [platform] Add supported_quantization in platform. Signed-off-by: wangxiyuan --- vllm/config.py | 28 +--------------------------- vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 13 +++++++++++++ vllm/platforms/neuron.py | 2 ++ vllm/platforms/openvino.py | 1 + vllm/platforms/rocm.py | 15 +++++++++++++++ vllm/platforms/tpu.py | 2 ++ vllm/platforms/xpu.py | 1 + 10 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index cd24e9ffdf598..b1e5b412fec8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -393,17 +393,11 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = QUANTIZATION_METHODS - rocm_supported_quantization = [ - "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf" - ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", "compressed-tensors", "experts_int8" ] - tpu_supported_quantization = ["tpu_int8"] - neuron_supported_quantization = ["neuron_quant"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -438,32 +432,12 @@ def _verify_quantization(self) -> None: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}.") - if current_platform.is_rocm( - ) and self.quantization not in rocm_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in ROCm.") - if current_platform.is_tpu( - ) and self.quantization not in tpu_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in TPU Backend.") + current_platform.verify_quantization(self.quantization) if self.quantization not in optimized_quantization_methods: logger.warning( "%s quantization is not fully " "optimized yet. The speed can be slower than " "non-quantized models.", self.quantization) - if (self.quantization == "awq" and current_platform.is_rocm() - and not envs.VLLM_USE_TRITON_AWQ): - logger.warning( - "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" - " is not set, enabling VLLM_USE_TRITON_AWQ.") - envs.VLLM_USE_TRITON_AWQ = True - if current_platform.is_neuron( - ) and self.quantization not in neuron_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in Neuron Backend.") def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 3e22c87f61fac..b5333fbd6f502 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -19,6 +19,7 @@ class CpuPlatform(Platform): _enum = PlatformEnum.CPU + device_name: str = "cpu" device_type: str = "cpu" dispatch_key: str = "CPU" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 5e9ce551f2332..846a1869da228 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -72,6 +72,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: class CudaPlatformBase(Platform): _enum = PlatformEnum.CUDA + device_name: str = "cuda" device_type: str = "cuda" dispatch_key: str = "CUDA" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 3071136e43b85..10aaa6d54962c 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -12,6 +12,7 @@ class HpuPlatform(Platform): _enum = PlatformEnum.HPU + device_name: str = "hpu" device_type: str = "hpu" dispatch_key: str = "HPU" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3328665029039..eac2b413f9271 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -56,11 +56,13 @@ def to_int(self) -> int: class Platform: _enum: PlatformEnum + device_name: str device_type: str # available dispatch keys: # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa # use "CPU" as a fallback for platforms not registered in PyTorch dispatch_key: str = "CPU" + supported_quantization: list[str] = [] def is_cuda(self) -> bool: return self._enum == PlatformEnum.CUDA @@ -171,6 +173,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ pass + @classmethod + def verify_quantization(cls, quant: str) -> None: + """ + Verify whether the quantization is supported by the current platform. + """ + if cls.supported_quantization and \ + quant not in cls.supported_quantization: + raise ValueError( + f"{quant} quantization is currently not supported in " + f"{cls.device_name}.") + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 4c4d778ed3dd4..87655ea198303 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -10,7 +10,9 @@ class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON + device_name: str = "neuron" device_type: str = "neuron" + supported_quantization: list[str] = ["neuron_quant"] @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index ea5ec7b40b95c..29b61e955d9ab 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -23,6 +23,7 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO + device_name: str = "openvino" device_type: str = "openvino" dispatch_key: str = "CPU" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d2f44c3e423e3..3c14fbc179f69 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -4,6 +4,7 @@ import torch +import vllm.envs as envs from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -35,8 +36,13 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM + device_name: str = "rocm" device_type: str = "cuda" dispatch_key: str = "CUDA" + supported_quantization: list[str] = [ + "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", + "fbgemm_fp8", "gguf" + ] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: @@ -79,3 +85,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "vllm.spec_decode.spec_decode_worker.create_spec_worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + + @classmethod + def verify_quantization(cls, quant: str) -> None: + super().verify_quantization(quant) + if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ: + logger.warning( + "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" + " is not set, enabling VLLM_USE_TRITON_AWQ.") + envs.VLLM_USE_TRITON_AWQ = True diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 137af57023ea9..b138f7e1c54c5 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -16,8 +16,10 @@ class TpuPlatform(Platform): _enum = PlatformEnum.TPU + device_name: str = "tpu" device_type: str = "tpu" dispatch_key: str = "XLA" + supported_quantization: list[str] = ["tpu_int8"] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 69388a8e0f27c..9665786f4c499 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -16,6 +16,7 @@ class XPUPlatform(Platform): _enum = PlatformEnum.XPU + device_name: str = "xpu" device_type: str = "xpu" dispatch_key: str = "XPU"