diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py
new file mode 100644
index 00000000..9df69354
--- /dev/null
+++ b/test_cuda/test_support_vlms.py
@@ -0,0 +1,263 @@
+import os
+import sys
+import shutil
+import unittest
+
+sys.path.insert(0, '..')
+
+from auto_round import AutoRoundConfig  # must import for auto-round format
+import requests
+from PIL import Image
+
+
+class TestSupportVLMS(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved")
+        cls.python_path = sys.executable
+        cls.device = 6
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.save_dir, ignore_errors=True)
+
+    def test_qwen2(self):
+        model_path = "/models/Qwen2-VL-2B-Instruct/"
+        # test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail")
+
+        # test infer
+        quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128-auto_round")
+
+        from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            quantized_model_path,
+            torch_dtype="auto",
+            device_map=f"cuda:{self.device}",
+        )
+        processor = AutoProcessor.from_pretrained(quantized_model_path)
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image_url,
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+
+        # Preparation for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+        # strip the prompt tokens so only the generated continuation is decoded
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print(output_text[0])
+
+    def test_phi3(self):
+        model_path = "/models/Phi-3.5-vision-instruct/"
+        # test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail")
+
+        # test infer
+        from transformers import AutoModelForCausalLM, AutoProcessor
+        quantized_model_path = os.path.join(self.save_dir, "Phi-3.5-vision-instruct-w4g128-auto_round")
+        # trust_remote_code models need their modeling .py files next to the quantized weights
+        res = os.system(f"cp {model_path}*.py {quantized_model_path}")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path,
+            device_map=f"cuda:{self.device}",
+            trust_remote_code=True,
+            torch_dtype="auto"
+        )
+        processor = AutoProcessor.from_pretrained(
+            quantized_model_path,
+            trust_remote_code=True,
+            num_crops=4
+        )
+
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        content = "Describe this image."
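+        # Phi-3.5-vision references each image via an <|image_N|> placeholder embedded in the prompt text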
+        messages = [
+            {"role": "user",
+             "content": "<|image_1|>\n" + content},
+        ]
+
+        prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
+
+        generation_args = {
+            "max_new_tokens": 1000,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+
+        generate_ids = model.generate(
+            **inputs,
+            eos_token_id=processor.tokenizer.eos_token_id,
+            **generation_args
+        )
+
+        # remove input tokens
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(
+            generate_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+
+        print(response)
+
+    def test_llava(self):
+        model_path = "/models/llava-v1.5-7b/"
+        # test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="llava-v1.5-7b tuning fail")
+
+        # test infer
+        from llava.model.builder import load_pretrained_model
+        from llava.train.train import preprocess, preprocess_multimodal
+
+        class DataArgs:
+            is_multimodal = True
+            mm_use_im_start_end = False
+
+        quantized_model_path = os.path.join(self.save_dir, "llava-v1.5-7b-w4g128-auto_round")
+        tokenizer, model, image_processor, _ = load_pretrained_model(
+            quantized_model_path,
+            model_base=None,
+            model_name=quantized_model_path,
+            torch_dtype="auto",
+            device_map=f"cuda:{self.device}",
+        )
+        image_url = "http://images.cocodataset.org/train2017/000000116003.jpg"
+        messages = [{"from": "human", "value": "What is the tennis player doing in the image?\n"}]
+
+        # Preparation for inference
+        image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+        image_input = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].to(model.device)
+        input_data = preprocess_multimodal([messages], DataArgs())
+        inputs = preprocess(input_data, tokenizer, has_image=(image_input is not None))
+
+        output = model.generate(
+            inputs['input_ids'].to(model.device),
+            images=image_input.unsqueeze(0).half(),
+            max_new_tokens=50
+        )
+        print(tokenizer.batch_decode(output))
+
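+    # Llama-3.2-Vision: the processor builds the prompt from a chat template with an image slot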
+    def test_llama(self):
+        model_path = "/models/Llama-3.2-11B-Vision-Instruct/"
+        # test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="llama-3.2 tuning fail")
+
+        # test infer
+        from transformers import MllamaForConditionalGeneration, AutoProcessor
+        quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128-auto_round")
+        model = MllamaForConditionalGeneration.from_pretrained(
+            quantized_model_path,
+            torch_dtype="auto",
+            device_map=f"cuda:{self.device}",
+        )
+        processor = AutoProcessor.from_pretrained(quantized_model_path)
+        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+        messages = [
+            {"role": "user", "content": [
+                {"type": "image"},
+                {"type": "text", "text": "Please write a haiku for this one, it would be: "}
+            ]}
+        ]
+
+        # Preparation for inference
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(
+            image,
+            input_text,
+            add_special_tokens=False,
+            return_tensors="pt"
+        ).to(model.device)
+
+        output = model.generate(**inputs, max_new_tokens=50)
+        print(processor.decode(output[0]))
+
+    def test_cogvlm(self):
+        model_path = "/models/cogvlm2-llama3-chat-19B/"
+        # test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="cogvlm2 tuning fail")
+
+        # test infer
+        DEVICE = f"cuda:{self.device}"
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        quantized_model_path = os.path.join(self.save_dir, "cogvlm2-llama3-chat-19B-w4g128-auto_round")
+        # copy the model's own remote-code files next to the quantized weights
+        res = os.system(f"cp {model_path}*.py {quantized_model_path}")
+        tokenizer = AutoTokenizer.from_pretrained(
+            quantized_model_path,
+            trust_remote_code=True
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path,
+            torch_dtype="auto",
+            trust_remote_code=True,
+            device_map=DEVICE,
+        ).eval()
+
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        content = "Describe this image."
+
+        text_only_template = ("A chat between a curious user and an artificial intelligence assistant. "
+                              "The assistant gives helpful, detailed, and polite answers to the user's questions. "
+                              "USER: {} ASSISTANT:")
+        query = text_only_template.format(content)
+
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        input_by_model = model.build_conversation_input_ids(
+            tokenizer,
+            query=query,
+            images=[image],
+            template_version='chat'
+        )
+        inputs = {
+            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
+            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
+            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
+            'images': [[input_by_model['images'][0].to(DEVICE).to(model.dtype)]] if image is not None else None,
+        }
+        gen_kwargs = {
+            "max_new_tokens": 2048,
+            "pad_token_id": 128002,
+        }
+
+        outputs = model.generate(**inputs, **gen_kwargs)
+        # remove input tokens
+        outputs = outputs[:, inputs['input_ids'].shape[1]:]
+        response = tokenizer.decode(outputs[0])
+        response = response.split("<|end_of_text|>")[0]
+        print(response)
+
+
+if __name__ == "__main__":
+    unittest.main()