From 6bd04b03870730158df2bb4cf5abbdbe381700a4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 25 Jun 2024 10:11:04 +0000 Subject: [PATCH 1/5] Introduce `ImageAssets` --- tests/conftest.py | 102 +++++++++++++++++------------ tests/models/test_llava.py | 16 +++-- tests/models/test_llava_next.py | 20 +++--- tests/models/test_phi3v.py | 18 ++--- tests/multimodal/test_processor.py | 24 +++---- 5 files changed, 103 insertions(+), 77 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c5..8afcb75b4c906 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,12 @@ import contextlib import gc import os -from typing import Any, Dict, List, Optional, Tuple, TypeVar +from collections import UserList +from dataclasses import dataclass +from functools import cached_property +from pathlib import Path +from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict, + TypeVar) import pytest import torch @@ -28,21 +33,56 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -# Multi modal related -# You can use `.buildkite/download-images.sh` to download the assets -PIXEL_VALUES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"] -] -IMAGE_FEATURES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"] -] -IMAGE_FILES = [ - os.path.join(_TEST_DIR, "images", filename) - for filename in ["stop_sign.jpg", "cherry_blossom.jpg"] -] -assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES) +_IMAGE_DIR = Path(_TEST_DIR) / "images" +"""You can use `.buildkite/download-images.sh` to download the assets.""" + + +@dataclass(frozen=True) +class ImageAsset: + name: Literal["stop_sign", "cherry_blossom"] + + @cached_property + def pixel_values(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt") + + @cached_property + def image_features(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt") + + @cached_property + def pil_image(self) -> Image.Image: + return Image.open(_IMAGE_DIR / f"{self.name}.jpg") + + def for_hf(self) -> Image.Image: + return self.pil_image + + def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData: + image_input_type = vision_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if image_input_type == ImageInputType.IMAGE_FEATURES: + return ImageFeatureData(self.image_features) + if image_input_type == ImageInputType.PIXEL_VALUES: + return ImagePixelData(self.pil_image) + + raise NotImplementedError + + +class _ImageAssetPrompts(TypedDict): + stop_sign: str + cherry_blossom: str + + +class ImageAssets(UserList[ImageAsset]): + + def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: + """Convenience method to define the prompt for each test image.""" + return [prompts["stop_sign"], prompts["cherry_blossom"]] + + +IMAGE_ASSETS = ImageAssets( + [ImageAsset("stop_sign"), + ImageAsset("cherry_blossom")]) def _read_prompts(filename: str) -> List[str]: @@ -81,31 +121,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): cleanup() -@pytest.fixture(scope="session") -def hf_images() -> List[Image.Image]: - return [Image.open(filename) for filename in IMAGE_FILES] - - -@pytest.fixture() -def vllm_images(request) -> List[MultiModalData]: - vision_language_config = request.getfixturevalue("model_and_config")[1] - if vision_language_config.image_input_type == ( - VisionLanguageConfig.ImageInputType.IMAGE_FEATURES): - return [ - ImageFeatureData(torch.load(filename)) - for filename in IMAGE_FEATURES_FILES - ] - else: - return [ - ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES - ] - - -@pytest.fixture() -def vllm_image_tensors(request) -> List[torch.Tensor]: - return [torch.load(filename) for filename in PIXEL_VALUES_FILES] - - @pytest.fixture def example_prompts() -> List[str]: prompts = [] @@ -122,6 +137,11 @@ def example_long_prompts() -> List[str]: return prompts +@pytest.fixture(scope="session") +def image_assets() -> ImageAssets: + return IMAGE_ASSETS + + _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index b41c69f72b052..3d057cdfd823e 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -5,17 +5,17 @@ from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "\nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": "\nUSER: What is the season?\nASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +}) def iter_llava_configs(model_name: str): @@ -69,8 +69,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -81,6 +81,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 0eca5cb5330c8..c1206e166f2dc 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -5,7 +5,7 @@ from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm @@ -15,12 +15,12 @@ "questions.") # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - f"{_PREFACE} \nUSER: What's the content of the image? ASSISTANT:", - f"{_PREFACE} \nUSER: What is the season? ASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + f"{_PREFACE} \nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": + f"{_PREFACE} \nUSER: What is the season?\nASSISTANT:", +}) def iter_llava_next_configs(model_name: str): @@ -78,8 +78,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -90,6 +90,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index a29d50df4c4e5..e01dcca1aafd6 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -6,17 +6,17 @@ from vllm.config import VisionLanguageConfig from vllm.utils import is_cpu -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 - "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + "cherry_blossom": + "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", # noqa: E501 +}) def iter_phi3v_configs(model_name: str): @@ -82,8 +82,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -94,6 +94,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] # use eager mode for hf runner, since phi3_v didn't work with flash_attn hf_model_kwargs = {"_attn_implementation": "eager"} diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 51c352361702a..9ac48dfab6784 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_clip_image_processor(hf_images, dtype): +def test_clip_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype): reason="Inconsistent image processor being used due to lack " "of support for dynamic image token replacement") @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_llava_next_image_processor(hf_images, dtype): +def test_llava_next_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-v1.6-34b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -107,7 +107,7 @@ def test_llava_next_image_processor(hf_images, dtype): @pytest.mark.xfail( reason="Example image pixels were not processed using HuggingFace") @pytest.mark.parametrize("dtype", ["float"]) -def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): +def test_image_pixel_types(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): image_processor_revision=None, ) - for image, tensor in zip(hf_images, vllm_image_tensors): + for asset in image_assets: image_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) tensor_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(tensor), + ImagePixelData(asset.pixel_values), model_config=model_config, vlm_config=vlm_config, ) From ed31ff1823c761fc04c94b44770d44d766ebbf6b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 25 Jun 2024 10:11:56 +0000 Subject: [PATCH 2/5] Fix naming --- tests/models/test_llava.py | 10 +++++----- tests/models/test_llava_next.py | 10 +++++----- tests/models/test_phi3v.py | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 3d057cdfd823e..ac1d2ece62b26 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -49,20 +49,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". """ - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index c1206e166f2dc..d36e503871ca9 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -56,20 +56,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". """ - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str @pytest.mark.xfail( diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index e01dcca1aafd6..03c1304668366 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -50,22 +50,22 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". """ - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id if input_id != image_token_id else 0 - for idx, input_id in enumerate(input_ids) + hf_output_ids = [ + token_id if token_id != image_token_id else 0 + for idx, token_id in enumerate(output_ids) ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") \ .replace("", " ").replace("<|user|>", "") \ .replace("<|end|>\n<|assistant|>", " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str target_dtype = "half" From 93e35941c846e231fa2ed50a6b1032ba79726d9e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 25 Jun 2024 10:15:03 +0000 Subject: [PATCH 3/5] Add note --- tests/conftest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8afcb75b4c906..96d3bc3dce778 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -76,7 +76,13 @@ class _ImageAssetPrompts(TypedDict): class ImageAssets(UserList[ImageAsset]): def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: - """Convenience method to define the prompt for each test image.""" + """ + Convenience method to define the prompt for each test image. + + Note: + The order of the returned list should match that of + :const:`IMAGE_ASSETS`. + """ return [prompts["stop_sign"], prompts["cherry_blossom"]] From e4581ab8b14500216bda0445cd50c6b93d2c5967 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 25 Jun 2024 10:19:53 +0000 Subject: [PATCH 4/5] Use singleton --- tests/conftest.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 96d3bc3dce778..468fb519f19bf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,22 +73,25 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -class ImageAssets(UserList[ImageAsset]): +class _ImageAssets(UserList[ImageAsset]): + + def __init__(self) -> None: + super().__init__( + [ImageAsset("stop_sign"), + ImageAsset("cherry_blossom")]) def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: """ Convenience method to define the prompt for each test image. - Note: - The order of the returned list should match that of - :const:`IMAGE_ASSETS`. + The order of the returned prompts matches the order of the + assets when iterating through this object. """ return [prompts["stop_sign"], prompts["cherry_blossom"]] -IMAGE_ASSETS = ImageAssets( - [ImageAsset("stop_sign"), - ImageAsset("cherry_blossom")]) +IMAGE_ASSETS = _ImageAssets() +"""Singleton instance of :class:`_ImageAssets`.""" def _read_prompts(filename: str) -> List[str]: @@ -144,7 +147,7 @@ def example_long_prompts() -> List[str]: @pytest.fixture(scope="session") -def image_assets() -> ImageAssets: +def image_assets() -> _ImageAssets: return IMAGE_ASSETS From e8be9cf8dca98eb0c447855256a71f2ecca32750 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 25 Jun 2024 10:21:47 +0000 Subject: [PATCH 5/5] Move `_read_prompts` above image --- tests/conftest.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 468fb519f19bf..9d00c76766943 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,12 @@ """You can use `.buildkite/download-images.sh` to download the assets.""" +def _read_prompts(filename: str) -> List[str]: + with open(filename, "r") as f: + prompts = f.readlines() + return prompts + + @dataclass(frozen=True) class ImageAsset: name: Literal["stop_sign", "cherry_blossom"] @@ -94,12 +100,6 @@ def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: """Singleton instance of :class:`_ImageAssets`.""" -def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: - prompts = f.readlines() - return prompts - - def cleanup(): destroy_model_parallel() destroy_distributed_environment()