From c6d8bf6eb73f5cc773a3c4d28d35e832ea5ec7fc Mon Sep 17 00:00:00 2001 From: WeiweiZhang1 Date: Tue, 27 Aug 2024 11:24:10 +0800 Subject: [PATCH] refine docs, add accuracy data, add receip and eval scripts (#226) * refine docs, add accuracy data, add receip and eval scripts Signed-off-by: Zhang, Weiwei1 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update supported model list Signed-off-by: Zhang, Weiwei1 * add generation results, update supported model list Signed-off-by: Zhang, Weiwei1 * fixtypos Signed-off-by: Zhang, Weiwei1 * fix typo Signed-off-by: Zhang, Weiwei1 * follow comments Signed-off-by: Zhang, Weiwei1 * resort model list Signed-off-by: Zhang, Weiwei1 * fixtypo Signed-off-by: Zhang, Weiwei1 * fixtypo2 Signed-off-by: Zhang, Weiwei1 * refine table Signed-off-by: Zhang, Weiwei1 --------- Signed-off-by: Zhang, Weiwei1 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 14 +- examples/multimodal-modeling/Llava/README.md | 12 +- .../Phi-3-vision/README.md | 71 +- .../Phi-3-vision/eval_042/evaluation.py | 17 +- .../multimodal-modeling/Phi-3-vision/main.py | 1 + .../Phi-3-vision/run_autoround.sh | 3 + .../Phi-3-vision/run_autoround_on_gaudi.sh | 10 - .../Phi-3-vision/run_eval.sh | 55 +- .../multimodal-modeling/Qwen-VL/README.md | 66 +- .../Qwen-VL/eval_042/__init__.py | 0 .../Qwen-VL/eval_042/evaluation.py | 626 ++++++++++++++++++ examples/multimodal-modeling/Qwen-VL/main.py | 41 +- .../mm_evaluation/evaluate_multiple_choice.py | 13 +- .../Qwen-VL/mm_evaluation/evaluate_vqa.py | 5 +- .../Qwen-VL/mm_evaluation/main.py | 101 +++ .../Qwen-VL/mm_evaluation/vqa.py | 6 +- .../Qwen-VL/mm_evaluation/vqa_eval.py | 4 +- .../Qwen-VL/run_autoround.sh | 2 + .../Qwen-VL/run_autoround_on_gaudi.sh | 12 - .../multimodal-modeling/Qwen-VL/run_eval.sh | 19 + 20 files changed, 930 insertions(+), 148 deletions(-) delete mode 100644 examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh create mode 100644 examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py create mode 100644 examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py create mode 100644 examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py delete mode 100644 examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh create mode 100644 examples/multimodal-modeling/Qwen-VL/run_eval.sh diff --git a/README.md b/README.md index 2698046d..3932744d 100644 --- a/README.md +++ b/README.md @@ -188,22 +188,24 @@ Please note that an asterisk (*) indicates third-party quantized models, which m Model | Supported | |--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| meta-llama/Meta-Llama-3.1-70B-Instruct | [recipe](https://huggingface.co/Intel/Meta-Llama-3.1-70B-Instruct-int4-inc) | +| meta-llama/Meta-Llama-3.1-70B-Instruct | [recipe](https://huggingface.co/Intel/Meta-Llama-3.1-70B-Instruct-int4-inc) | | meta-llama/Meta-Llama-3.1-8B-Instruct | [model-kaitchup-autogptq-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-asym), [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-sym), 
[recipe](https://huggingface.co/Intel/Meta-Llama-3.1-8B-Instruct-int4-inc) | | meta-llama/Meta-Llama-3.1-8B | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-autoround-gptq-4bit-sym) | +| Qwen/Qwen-VL | [accuracy](./examples/multimodal-modeling/Qwen-VL/README.md), [recipe](./examples/multimodal-modeling/Qwen-VL/run_autoround.sh) | Qwen/Qwen2-7B | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-7B-int4-inc) | | Qwen/Qwen2-57B-A14B-Instruct | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc) | -| microsoft/Phi-3.5-mini-instruct | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Phi-3.5-Mini-instruct-AutoRound-4bit) | -| TinyLlama-1.1B-intermediate | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse) | +| 01-ai/Yi-1.5-9B | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-4bit-gptq-autoround) | +| 01-ai/Yi-1.5-9B-Chat | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-Chat-4bit-gptq-autoround) | | Intel/neural-chat-7b-v3-3 | [model-autogptq-int4](https://huggingface.co/Intel/neural-chat-7b-v3-3-int4-inc) | | Intel/neural-chat-7b-v3-1 | [model-autogptq-int4](https://huggingface.co/Intel/neural-chat-7b-v3-1-int4-inc) | +| TinyLlama-1.1B-intermediate | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse) | | mistralai/Mistral-7B-v0.1 | [model-autogptq-lmhead-int4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead), [model-autogptq-int4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc) | -| microsoft/phi-2 | [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) | | google/gemma-2b | [model-autogptq-int4](https://huggingface.co/Intel/gemma-2b-int4-inc) | | tiiuae/falcon-7b | [model-autogptq-int4-G64](https://huggingface.co/Intel/falcon-7b-int4-inc) | -| 01-ai/Yi-1.5-9B | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-4bit-gptq-autoround) | -| 01-ai/Yi-1.5-9B-Chat | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-Chat-4bit-gptq-autoround) | | sapienzanlp/modello-italia-9b | [model-fbaldassarri-autogptq-int4*](https://huggingface.co/fbaldassarri/modello-italia-9b-autoround-w4g128-cpu) | +| microsoft/phi-2 | [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) | +| microsoft/Phi-3.5-mini-instruct | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Phi-3.5-Mini-instruct-AutoRound-4bit) | +| microsoft/Phi-3-vision-128k-instruct | [recipe](./examples/multimodal-modeling/Phi-3-vision/run_autoround.sh) | mistralai/Mistral-7B-Instruct-v0.2 | [accuracy](./docs/Mistral-7B-Instruct-v0.2-acc.md), [recipe](./examples/language-modeling/scripts/Mistral-7B-Instruct-v0.2.sh), [example](./examples/language-modeling/) | | mistralai/Mixtral-8x7B-Instruct-v0.1 | [accuracy](./docs/Mixtral-8x7B-Instruct-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-Instruct-v0.1.sh), [example](./examples/language-modeling/) | | mistralai/Mixtral-8x7B-v0.1 | [accuracy](./docs/Mixtral-8x7B-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh), [example](./examples/language-modeling/) | diff --git a/examples/multimodal-modeling/Llava/README.md b/examples/multimodal-modeling/Llava/README.md index c6c398d0..14f5a926 100644 --- a/examples/multimodal-modeling/Llava/README.md +++ 
b/examples/multimodal-modeling/Llava/README.md @@ -6,6 +6,8 @@ This document presents step-by-step instructions for auto-round. In this example, we introduce an straight-forward way to execute quantization on some popular multimodal models such as LLaVA. +Please note that LLAVA quantization is currently an **experimental feature** and does not yet support inference on various devices after export. + ## Install If you are not using Linux, do NOT proceed, see instructions for [macOS](https://github.com/haotian-liu/LLaVA/blob/main/docs/macOS.md) and [Windows](https://github.com/haotian-liu/LLaVA/blob/main/docs/Windows.md). @@ -62,11 +64,11 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh ``` ## 4. Results -Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. When the vision components are not involved in quantization, it is able to achieve accuracy loss within 1%. The results for LLava-7b are as follows: +Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. When the vision components are not involved in quantization, it is able to achieve accuracy loss within 1%. The results for fake quantized LLava-7b are as follows: | Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop | | :----: | :----: | :----: | :----: | :----: | :----: | | liuhaotian/llava-v1.5-7b | - | FP16 | - | 58.21 | - | @@ -96,9 +98,3 @@ If you find SignRound useful for your research, please cite our paper: ``` - - - - - - diff --git a/examples/multimodal-modeling/Phi-3-vision/README.md b/examples/multimodal-modeling/Phi-3-vision/README.md index 3105c051..b0557bae 100644 --- a/examples/multimodal-modeling/Phi-3-vision/README.md +++ b/examples/multimodal-modeling/Phi-3-vision/README.md @@ -16,6 +16,8 @@ COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip t ## 2. Run Examples +PyTorch 1.8 or higher version is needed + Enter into the examples folder and install lm-eval to run the evaluation ```bash pip install -r requirements.txt @@ -47,13 +49,75 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh ``` -## 3. Environment +## 3. Run Inference + +```python +from PIL import Image +import requests +import io +from transformers import AutoModelForCausalLM +from transformers import AutoProcessor +from auto_round.auto_quantizer import AutoHfQuantizer +quantized_model_path = "./tmp_autoround" +model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention + +processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True) + +messages = [ \ + {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, \ + {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. 
It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, \ + {"role": "user", "content": "Provide insightful questions to spark discussion."}] + +url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png" +# image = Image.open(requests.get(url, stream=True).raw) +image = Image.open(io.BytesIO(requests.get(url, stream=True).content)) + +prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0") + +generation_args = { + "max_new_tokens": 50, + "temperature": 0.0, + "do_sample": False, +} + +generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) + +# remove input tokens +generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +print(response) +# 1. How does the level of agreement on each statement reflect the overall preparedness of respondents for meetings? +# 2. What are the most and least agreed-upon statements, and why might that be the case? +# 3. +``` + -PyTorch 1.8 or higher version is needed ## Reference @@ -72,3 +136,4 @@ If you find SignRound useful for your research, please cite our paper: + diff --git a/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py b/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py index e9e63fef..ac9f7636 100644 --- a/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py +++ b/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py @@ -576,6 +576,10 @@ def evaluate( parser.add_argument( "--eval_bs", default=1, ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) parser.add_argument( "--trust_remote_code", action='store_true', help="Whether to enable trust_remote_code" @@ -600,17 +604,20 @@ def evaluate( model_args += f",autogptq=True,gptq_use_triton=True" if args.trust_remote_code: model_args += f",trust_remote_code=True" - + model_args += ",dtype=bfloat16" test_tasks = args.tasks if isinstance(test_tasks, str): test_tasks = test_tasks.split(',') model_name = args.model_name.rstrip('/') from lm_eval.utils import make_table - result = simple_evaluate(model="hf", - model_args=model_args, - tasks=test_tasks, - batch_size=args.eval_bs) + with torch.cuda.amp.autocast(): + result = simple_evaluate(model="hf", + model_args=model_args, + tasks=test_tasks, + device=args.device, + batch_size=args.eval_bs) print(make_table(result)) print("cost time: ", time.time() - s) + diff --git a/examples/multimodal-modeling/Phi-3-vision/main.py b/examples/multimodal-modeling/Phi-3-vision/main.py index ad9114e6..9696b31d 100644 --- a/examples/multimodal-modeling/Phi-3-vision/main.py +++ b/examples/multimodal-modeling/Phi-3-vision/main.py @@ -464,3 +464,4 @@ def create_data_loader(dataset, batch_size=1, data_collator=None): from lm_eval.utils import make_table print(make_table(res)) + diff --git a/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh b/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh index 3583704b..64977fcd 100644 --- a/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh +++ b/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh @@ -6,6 +6,9 @@ CUDA_VISIBLE_DEVICES=$device \ python3 main.py \ --model_name=$model_name \ --deployment_device 'auto_round' \ +--nsamples 512 \ +--model_dtype bf16 \ --image_folder /PATH/TO/coco/images/train2017 \ --question_file /PATH/TO/llava_v1_5_mix665k.json \ --output_dir "./tmp_autoround" + diff --git a/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh b/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh deleted file mode 100644 index 90764afb..00000000 --- a/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -x -model_name=microsoft/Phi-3-vision-128k-instruct - -python3 main.py \ - --model_name $model_name \ - --group_size 128 \ - --bits 4 \ - --deployment_device "fake" \ - --output_dir "./tmp_autoround" \ No newline at end of file diff --git a/examples/multimodal-modeling/Phi-3-vision/run_eval.sh b/examples/multimodal-modeling/Phi-3-vision/run_eval.sh index da14a021..40097cf3 100644 --- a/examples/multimodal-modeling/Phi-3-vision/run_eval.sh +++ b/examples/multimodal-modeling/Phi-3-vision/run_eval.sh @@ -1,48 +1,11 @@ -export https_proxy=http://proxy.ims.intel.com:911 -export http_proxy=http://proxy.ims.intel.com:911 -export HF_HOME=/home/weiweiz1/.cache/ +#!/bin/bash +set -x +device=0 -# Mistral-7B-Instruct-v0.2 -# device=3 -# Baichuan2-7B-Chat Phi-3-mini-4k-instruct -# Llama-2-7b-chat-hf -# lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu, -# ceval-valid,cmmlu -# dir=/data5/zww/test_faster/ -# dir=/models -# for model in Phi-3-mini-4k-instruct Meta-Llama-3-8B-Instruct -# do -# echo ${model}/default -# CUDA_VISIBLE_DEVICES=$device \ -# python3 eval_042/evaluation.py --model_name ${dir}${model}_default/$model-autoround-w4g128-gpu \ -# --trust_remote_code \ -# --eval_bs 16 --tasks gsm8k,ceval-valid,cmmlu \ -# 2>&1| tee -a /data4/zww/test_faster/rounding_${model}_rtn.txt -# echo ${model}/rtn -# done& - -device=2 -dir=/data4/zww/tmp/ -# 
dir=/data5/models/ -for model in Phi-3-vision-128k-instruct -do - echo ${model} - CUDA_VISIBLE_DEVICES=$device \ - python3 eval_042/evaluation.py --model_name ${dir}/$model-autoround-w4g128-round \ - --trust_remote_code \ - --eval_bs 16 --tasks lambada_openai \ - 2>&1| tee -a /data4/zww/test_faster/rounding_${model}.txt - echo ${model} -done -# dir=/data5/zww/test_faster/ -# for model in Phi-3-mini-4k-instruct Mistral-7B-Instruct-v0.2 -# do -# echo ${model}/rtn -# CUDA_VISIBLE_DEVICES=$device \ -# python3 eval_042/evaluation.py --model_name ${dir}${model}_rtn/$model-autoround-w4g128-gpu \ -# --trust_remote_code \ -# --eval_bs 16 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu,gsm8k \ -# 2>&1| tee -a /data4/zww/test_faster/rounding_${model}_rtn.txt -# echo ${model}/rtn -# done +model_path='./tmp_autoround' +model=Phi-3-vision-128k-instruct +CUDA_VISIBLE_DEVICES=$device python3 eval_042/evaluation.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 16 diff --git a/examples/multimodal-modeling/Qwen-VL/README.md b/examples/multimodal-modeling/Qwen-VL/README.md index 3ea128e7..4980eb28 100644 --- a/examples/multimodal-modeling/Qwen-VL/README.md +++ b/examples/multimodal-modeling/Qwen-VL/README.md @@ -100,17 +100,68 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh +``` + +## 3. run inference + +```python + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.generation import GenerationConfig + import torch + from transformers import set_seed + set_seed(1234) + from auto_round.auto_quantizer import AutoHfQuantizer + quantized_model_path = "./tmp_autoround" + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True) + # use bf16 + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, bf16=True).eval() + # use fp16 + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, fp16=True).eval() + # use cpu only + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", trust_remote_code=True).eval() + # use cuda device + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda", trust_remote_code=True).eval() + query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \ + {'text': 'Generate the caption in English with grounding:'}, \ + ]) + inputs = tokenizer(query, return_tensors='pt') + inputs = inputs.to(model.device) + with torch.cuda.amp.autocast(): + pred = model.generate(**inputs) + response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False) + print(response) + # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpegGenerate the caption in English with grounding: Woman(451,379),(731,806) and her dog(219,424),(576,896) playing on the beach<|endoftext|> + image = tokenizer.draw_bbox_on_latest_picture(response) + if image: + image.save('2.jpg') + else: + print("no box") + ``` ## 4. Results -Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. 
It is able to achieve accuracy loss within 1% Whether or not the visual component is quantified. The results for Qwen-VL are as follows: -| Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop | -| :----: | :----: | :----: | :----: | :----: | :----: | -| Qwen/Qwen-VL | - | FP16 | - | 63.94 | - | -| Qwen/Qwen-VL | W4G128 | FP16 | with vision | 63.68 | -0.41% | -| Qwen/Qwen-VL | W4G128 | FP16 | w/o vision | 63.73 | -0.33% | +Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Qwen-VL are as follows: +| Metric | bf16 | INT4 | +|:----------------|:--------|:--------| +| avg | 0.5628 | 0.5589 | +| paper-avg | 0.5603 | 0.5611 | +| mmlu | 0.4828 | 0.4639 | +| lambada_openai | 0.6782 | 0.6664 | +| hellaswag | 0.5593 | 0.5487 | +| winogrande | 0.6827 | 0.6875 | +| piqa | 0.7786 | 0.7748 | +| truthfulqa_mc1 | 0.2876 | 0.2901 | +| openbookqa | 0.2880 | 0.2940 | +| boolq | 0.7012 | 0.7318 | +| arc_easy | 0.7201 | 0.7327 | +| arc_challenge | 0.4249 | 0.4206 | +| cmmlu | 0.4798 | 0.4618 | +| ceval | 0.4814 | 0.4569 | +| textVQA | 0.6402 | 0.6379 | +| scienceVQA | 0.6748 | 0.6574 | + ## 5. Environment @@ -136,3 +187,4 @@ If you find SignRound useful for your research, please cite our paper: + diff --git a/examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py b/examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py b/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py new file mode 100644 index 00000000..4b886352 --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py @@ -0,0 +1,626 @@ +import itertools +import logging +import random +import time +from collections import defaultdict +from typing import TYPE_CHECKING, List, Optional, Union + +import numpy as np +import torch + +import lm_eval.api.metrics +import lm_eval.api.registry +import lm_eval.models +from lm_eval.caching.cache import delete_cache +from lm_eval.evaluator_utils import ( + consolidate_results, + get_sample_size, + get_task_list, + prepare_print_tasks, + print_writeout, + run_task_tests, +) +from lm_eval.logging_utils import add_env_info, get_git_commit_hash +from lm_eval.tasks import TaskManager, get_task_dict +from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string + +if TYPE_CHECKING: + from lm_eval.api.model import LM + from lm_eval.tasks import Task + + +@positional_deprecated +def simple_evaluate( + model, + model_args: Optional[Union[str, dict]] = None, + tasks: Optional[List[Union[str, dict, object]]] = None, + num_fewshot: Optional[int] = None, + batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + use_cache: Optional[str] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + delete_requests_cache: bool = False, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + check_integrity: bool = False, + write_out: bool = False, + log_samples: bool = True, + gen_kwargs: Optional[str] = None, + task_manager: Optional[TaskManager] = None, + verbosity: str = "INFO", + predict_only: bool = False, + random_seed: int = 0, + numpy_random_seed: int = 1234, + torch_random_seed: 
int = 1234, + user_model = None, ##user model does not support tensor parallelism +): + """Instantiate and evaluate a model on a list of tasks. + + :param model: Union[str, LM] + Name of model or LM object, see lm_eval.models.get_model + :param model_args: Optional[str, dict] + String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object. + Ignored if `model` argument is a LM object. + :param tasks: list[Union[str, dict, Task]] + List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. + :param num_fewshot: int + Number of examples in few-shot context + :param batch_size: int or str, optional + Batch size for model + :param max_batch_size: int, optional + Maximal batch size to try with automatic batch size detection + :param device: str, optional + PyTorch device (e.g. "cpu" or "cuda:0") for running models + :param use_cache: str, optional + A path to a sqlite db file for caching model responses. `None` if not caching. + :param cache_requests: bool, optional + Speed up evaluation by caching the building of dataset requests. `None` if not caching. + :param rewrite_requests_cache: bool, optional + Rewrites all of the request cache if set to `True`. `None` if not desired. + :param delete_requests_cache: bool, optional + Deletes all of the request cache if set to `True`. `None` if not desired. + :param limit: int or float, optional + Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. + :param bootstrap_iters: + Number of iterations for bootstrap statistics + :param check_integrity: bool + Whether to run the relevant part of the test suite for the tasks + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param gen_kwargs: str + String arguments for model generation + Ignored for all tasks with loglikelihood output_type + :param predict_only: bool + If true only model outputs will be generated and returned. Metrics will not be evaluated + :param random_seed: int + Random seed for python's random module. If set to None, the seed will not be set. + :param numpy_random_seed: int + Random seed for numpy. If set to None, the seed will not be set. + :param torch_random_seed: int + Random seed for torch. If set to None, the seed will not be set. + + :return + Dictionary of results + """ + from auto_round.auto_quantizer import AutoHfQuantizer + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + start_date = time.time() + + if delete_requests_cache: + eval_logger.info("Deleting requests cache...") + delete_cache() + + seed_message = [] + if random_seed is not None: + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412 + seed_message.append(f"Setting random seed to {random_seed}") + random.seed(random_seed) + + if numpy_random_seed is not None: + seed_message.append(f"Setting numpy seed to {numpy_random_seed}") + np.random.seed(numpy_random_seed) + + if torch_random_seed is not None: + seed_message.append(f"Setting torch manual seed to {torch_random_seed}") + torch.manual_seed(torch_random_seed) + + if seed_message: + eval_logger.info(" | ".join(seed_message)) + + if tasks is None: + tasks = [] + if len(tasks) == 0: + raise ValueError( + "No tasks specified, or no tasks found. 
Please verify the task names." + ) + + if gen_kwargs is not None: + gen_kwargs = simple_parse_args_string(gen_kwargs) + eval_logger.warning( + "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. " + "Ensure 'do_sample=True' for non-greedy decoding!" + ) + if gen_kwargs == "": + gen_kwargs = None + + if isinstance(model, str): + if model_args is None: + model_args = "" + + if isinstance(model_args, dict): + lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + + else: + lm = lm_eval.api.registry.get_model(model).create_from_arg_string( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + else: + if not isinstance(model, lm_eval.api.model.LM): + raise TypeError + lm = model + + if use_cache is not None: + eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}") + lm = lm_eval.api.model.CachingLM( + lm, + use_cache + # each rank receives a different cache db. + # necessary to avoid multiple writes to cache at once + + "_rank" + + str(lm.rank) + + ".db", + ) + if user_model is not None: + lm._model = user_model + + if task_manager is None: + task_manager = TaskManager(verbosity) + + task_dict = get_task_dict(tasks, task_manager) + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if isinstance(task_obj, tuple): + _, task_obj = task_obj + if task_obj is None: + continue + + if task_obj.get_config("output_type") == "generate_until": + if gen_kwargs is not None: + task_obj.set_config( + key="generation_kwargs", value=gen_kwargs, update=True + ) + + if predict_only: + log_samples = True + eval_logger.info( + f"Processing {task_name} in output-only mode. Metrics will not be calculated!" + ) + # we have to change the class properties post-hoc. This is pretty hacky. + task_obj.override_metric(metric_name="bypass") + + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that + if num_fewshot is not None: + if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: + eval_logger.info( + f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." 
+ ) + else: + eval_logger.warning( + f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" + ) + task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: + task_obj.set_config(key="num_fewshot", value=0) + + if check_integrity: + run_task_tests(task_list=tasks) + + results = evaluate( + lm=lm, + task_dict=task_dict, + limit=limit, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + bootstrap_iters=bootstrap_iters, + write_out=write_out, + log_samples=log_samples, + verbosity=verbosity, + ) + + if lm.rank == 0: + if isinstance(model, str): + model_name = model + elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"): + model_name = model.config._name_or_path + else: + model_name = type(model).__name__ + + # add info about the model and few shot config + results["config"] = { + "model": model_name, + "model_args": model_args, + "batch_size": batch_size, + "batch_sizes": ( + list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] + ), + "device": device, + "use_cache": use_cache, + "limit": limit, + "bootstrap_iters": bootstrap_iters, + "gen_kwargs": gen_kwargs, + } + results["git_hash"] = get_git_commit_hash() + results["date"] = start_date + add_env_info(results) # additional environment info to results + return results + else: + return None + + +@positional_deprecated +def evaluate( + lm: "LM", + task_dict, + limit: Optional[int] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + bootstrap_iters: Optional[int] = 100000, + write_out: bool = False, + log_samples: bool = True, + verbosity: str = "INFO", +): + """Instantiate and evaluate a model on a list of tasks. + + :param lm: obj + Language Model + :param task_dict: dict[str, Task] + Dictionary of tasks. Tasks will be taken to have name type(task).config.task . + :param limit: int, optional + Limit the number of examples per task (only use this for testing) + :param bootstrap_iters: + Number of iterations for bootstrap statistics + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :return + Dictionary of results + """ + + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + + # tracks all Instances/requests a model must generate output on. + requests = defaultdict(list) + # stores the amount to pad out reqs per req. 
type so that + # number of fwd passes per distributed rank is equal + padding_requests = defaultdict(int) + + # get lists of group hierarchy and each type of request + task_hierarchy, eval_tasks = get_task_list(task_dict) + if not log_samples: + if not all( + "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() + for task_output in eval_tasks + ): + raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + for task_output in eval_tasks: + task: Task = task_output.task + limit = get_sample_size(task, limit) + task.build_all_requests( + limit=limit, + rank=lm.rank, + world_size=lm.world_size, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + ) + eval_logger.debug( + f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" + ) + + if write_out: + print_writeout(task) + # aggregate Instances by LM method requested to get output. + for instance in task.instances: + reqtype = instance.request_type + requests[reqtype].append(instance) + + if lm.world_size > 1: + instances_rnk = torch.tensor(len(task._instances), device=lm.device) + gathered_item = ( + lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() + ) + # "multiple_choice" task types dispatch (several) "loglikelihood" request types + reqtype = ( + "loglikelihood" + if task.OUTPUT_TYPE == "multiple_choice" + else task.OUTPUT_TYPE + ) + # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) + numpad = max(gathered_item) - gathered_item[lm.rank] + # todo: may not account for padding in cases like SquadV2 which has multiple req types + padding_requests[reqtype] += numpad + + ### Run LM on inputs, get all outputs ### + # execute each type of request + for reqtype, reqs in requests.items(): + eval_logger.info(f"Running {reqtype} requests") + # create `K` copies of each request `req` based off `K = req.repeats` + cloned_reqs = [] + for req in reqs: + cloned_reqs.extend([req] * req.repeats) + + if (lm.world_size > 1) and (padding_requests[reqtype] > 0): + for _ in range(padding_requests[reqtype]): + cloned_reqs.extend([req] * req.repeats) + + # run requests through model + resps = getattr(lm, reqtype)(cloned_reqs) + + # put responses from model into a list of length K for each request. + for x, req in zip(resps, cloned_reqs): + req.resps.append(x) + + if lm.world_size > 1: + lm.accelerator.wait_for_everyone() + + RANK = lm.rank + WORLD_SIZE = lm.world_size + ### Postprocess outputs ### + # TODO: del model here, maybe (idea: allow user to specify device of e.g. 
reward model separately) + for task_output in eval_tasks: + task = task_output.task + task.apply_filters() + + ### Collect values of metrics on all datapoints ### + # # unpack results and sort back in order and return control to Task + # TODO: make it possible to use a different metric per filter + # Pre-process task.instances to group by doc_id + instances_by_doc_id = defaultdict(list) + for instance in task.instances: + instances_by_doc_id[instance.doc_id].append(instance) + # Sort instances within each group + for instances in instances_by_doc_id.values(): + instances.sort(key=lambda x: x.idx) + # iterate over different filters used + for filter_key in task.instances[0].filtered_resps.keys(): + doc_iterator = task.doc_iterator( + rank=RANK, limit=limit, world_size=WORLD_SIZE + ) + for doc_id, doc in doc_iterator: + requests = instances_by_doc_id[doc_id] + metrics = task.process_results( + doc, [req.filtered_resps[filter_key] for req in requests] + ) + if log_samples: + target = task.doc_to_target(doc) + example = { + "doc_id": doc_id, + "doc": doc, + "target": target, + "arguments": [req.args for req in requests], + "resps": [req.resps for req in requests], + "filtered_resps": [ + req.filtered_resps[filter_key] for req in requests + ], + } + example.update(metrics) + task_output.logged_samples.append(example) + for metric, value in metrics.items(): + task_output.sample_metrics[(metric, filter_key)].append(value) + + if WORLD_SIZE > 1: + # if multigpu, then gather data across all ranks to rank 0 + # first gather logged samples across all ranks + for task_output in eval_tasks: + if log_samples: + # for task_name, task_samples in list(samples.items()): + full_samples = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.logged_samples, + object_gather_list=full_samples, + dst=0, + ) + + if RANK == 0: + task_output.logged_samples = list( + itertools.chain.from_iterable(full_samples) + ) + + # then collect metrics across all ranks + for metrics in task_output.sample_metrics: + metric_list = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.sample_metrics[metrics], + object_gather_list=metric_list, + dst=0, + ) + if RANK == 0: + task_output.sample_metrics[metrics] = list( + itertools.chain.from_iterable(metric_list) + ) + + if RANK == 0: + ### Aggregate results over all datapoints ### + # aggregate results ; run bootstrap CIs + for task_output in eval_tasks: + task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) + results, samples, configs, versions, num_fewshot = consolidate_results( + eval_tasks + ) + + ### Calculate group metrics ### + if bool(results): + for group, task_list in reversed(task_hierarchy.items()): + if len(task_list) == 0: + # task_hierarchy entries are either + # `group_name: [subtask1, subtask2, ...]` + # or `task_name: []`. + # we only want to operate on groups here. + continue + metric_list = list( + { + key + for task in task_list + for key in results[task].keys() + if "_stderr" not in key and key not in ["alias", "samples"] + } + ) + for metric in metric_list: + stderr = "_stderr,".join(metric.split(",")) + + # gather metrics, sizes, and stderrs from subtasks + metrics = [ + results[task][metric] + for task in task_list + if metric in results[task] + ] # TODO: copy? 
+ stderrs = [ + results[task][stderr] + for task in task_list + if stderr in results[task] + ] + sizes = [ + results[task]["samples"] + for task in task_list + if metric in results[task] + ] + + # compute group's pooled metric and stderr + results[group][ + metric + ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + # TODO: calculate grouped metric using aggregation fn + if "N/A" in stderrs: + results[group][stderr] = "N/A" + else: + results[group][ + stderr + ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility + # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: + # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) + + results[group]["samples"] = sum(sizes) + + results_agg = defaultdict(dict) + groups_agg = defaultdict(dict) + all_tasks_list = list(task_hierarchy.keys()) + while True: + add_tasks_list = list(k for k in results_agg.keys()) + left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list))) + if len(left_tasks_list) == 0: + break + + _task_hierarchy = { + k: v for k, v in task_hierarchy.items() if k in left_tasks_list + } + _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results) + + results_agg = {**results_agg, **_results_agg} + groups_agg = {**groups_agg, **_groups_agg} + + for group_name, task_list in task_hierarchy.items(): + if task_list: + num_fewshot[group_name] = num_fewshot[ + task_list[0] + ] # TODO: validate this + + results_dict = { + "results": dict(results_agg.items()), + **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), + "group_subtasks": dict(reversed(task_hierarchy.items())), + "configs": dict(sorted(configs.items())), + "versions": dict(sorted(versions.items())), + "n-shot": dict(sorted(num_fewshot.items())), + } + if log_samples: + results_dict["samples"] = dict(samples) + + return results_dict + + else: + return None + + +if __name__ == "__main__": + + import sys + + sys.path.insert(0, '../../../') + import time + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", default="Qwen/Qwen-VL" + ) + parser.add_argument( + "--eval_bs", default=1, + ) + parser.add_argument( + "--trust_remote_code", action='store_true', + help="Whether to enable trust_remote_code" + ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) + parser.add_argument("--tasks", + default="lambada_openai,hellaswag,winogrande,piqa,mmlu,truthfulqa_mc1," \ + "openbookqa,boolq,rte,arc_easy,arc_challenge", + help="lm-eval tasks for lm_eval version 0.4.2") + + args = parser.parse_args() + s = time.time() + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code) + + if hasattr(config, "quantization_config"): + quantization_config = config.quantization_config + if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]: + from auto_round.auto_quantizer import AutoHfQuantizer + elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq": + if args.device == "hpu": + from auto_round.auto_quantizer import AutoHfQuantizer + + + test_tasks = args.tasks + if isinstance(test_tasks, str): + test_tasks = test_tasks.split(',') + model_name = args.model_name.rstrip('/') + from lm_eval.utils import make_table + + model_args = f"pretrained={args.model_name}" + if args.trust_remote_code: + model_args += f",trust_remote_code=True" + with torch.cuda.amp.autocast(): + result = simple_evaluate(model="hf", + model_args=model_args, + tasks=test_tasks, + device=args.device, + batch_size=args.eval_bs) + print(make_table(result)) + + print("cost time: ", time.time() - s) + + diff --git a/examples/multimodal-modeling/Qwen-VL/main.py b/examples/multimodal-modeling/Qwen-VL/main.py index 28544271..0c5f35bf 100644 --- a/examples/multimodal-modeling/Qwen-VL/main.py +++ b/examples/multimodal-modeling/Qwen-VL/main.py @@ -234,9 +234,6 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 parser.add_argument("--seed", default=42, type=int, help="seed") - parser.add_argument("--eval_fp16_baseline", action='store_true', - help="whether to eval FP16 baseline") - parser.add_argument("--adam", action='store_true', help="adam") @@ -354,12 +351,12 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 if args.model_dtype != None: if args.model_dtype == "float16" or args.model_dtype == "fp16": torch_dtype = torch.float16 - if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16": + if args.model_dtype == "bfloat16" or args.model_dtype == "bf16": torch_dtype = torch.bfloat16 - dtype_abd = convert_dtype_torch2str(torch_dtype) - if dtype_abd == "bf16": + dtype_str = convert_dtype_torch2str(torch_dtype) + if dtype_str == "bf16": model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval() - elif dtype_abd == "fp16": + elif dtype_str == "fp16": model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval() else: model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code).eval() @@ -373,36 +370,9 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 AutoAdamRound) from auto_round.utils import get_multimodal_block_names - # model = model.eval() + model = model.eval() seqlen = args.seqlen - if args.eval_fp16_baseline: - model = model.half() - model = model.to(torch_device) - datasets=args.eval_dataset.split(',') - for dataset in datasets: - if 'vqa' in dataset: - from mm_evaluation.evaluate_vqa import textVQA_evaluation - evaluator = textVQA_evaluation( - model, - dataset_name=dataset, - # 
dataset_path=args.eval_path, - tokenizer=tokenizer, - batch_size=args.eval_bs, - device=str(torch_device) - ) - elif 'scienceqa' in dataset: - from mm_evaluation.evaluate_multiple_choice import scienceQA_evaluation - evaluator = scienceQA_evaluation( - model, - dataset_name=dataset, - # dataset_path=args.eval_path, - tokenizer=tokenizer, - batch_size=args.eval_bs, - device=str(torch_device) - ) - exit() - round = AutoRound if args.adam: round = AutoAdamRound @@ -525,3 +495,4 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 ) + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py index a0d285a3..11c89440 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py @@ -6,7 +6,7 @@ import torch from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer,AutoConfig +from transformers import AutoModelForCausalLM, AutoTokenizer multiple_choices = ['A', 'B', 'C', 'D', 'E'] @@ -102,7 +102,7 @@ def __len__(self): return len(self._local_indices) -def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", dataset_path=None, tokenizer=None, +def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=None, batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"): # torch.distributed.init_process_group( # backend='nccl', @@ -115,7 +115,7 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval() model = model.to(torch.device(device)) - tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code, use_fast=False) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, use_fast=False) else: assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer." 
model = model_name @@ -192,9 +192,6 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da parser.add_argument( "--model_name", default="Qwen/Qwen-VL" ) - parser.add_argument( - "--base_model", default="Qwen/Qwen-VL" - ) parser.add_argument( "--dataset_name", default="scienceqa_test_img" ) @@ -209,7 +206,6 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da s = time.time() evaluator = scienceQA_evaluation( args.model_name, - base_model=args.base_model, dataset_name=args.dataset_name, # dataset_path=args.eval_path, batch_size=args.eval_bs, @@ -217,5 +213,4 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da ) print("cost time: ", time.time() - s) - - + \ No newline at end of file diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py index 82f35011..9173be76 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py @@ -10,8 +10,8 @@ import torch from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from .vqa import VQA -from .vqa_eval import VQAEval +from vqa import VQA +from vqa_eval import VQAEval # This code is much refer to https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/evaluate_vqa.py @@ -461,3 +461,4 @@ def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", data ) print("cost time: ", time.time() - s) + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py new file mode 100644 index 00000000..08f7170a --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py @@ -0,0 +1,101 @@ + +if __name__ == "__main__": + + import sys + + sys.path.insert(0, '../../../') + import time + import torch + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", default="/models/opt-125m/" + ) + parser.add_argument( + "--eval_bs", default=4, type=int, + ) + parser.add_argument( + "--trust_remote_code", action='store_true', + help="Whether to enable trust_remote_code" + ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) + parser.add_argument( + "--base_model", default="Qwen/Qwen-VL" + ) + parser.add_argument( + "--model_dtype", default=None, type=str, + help="force to convert the dtype, some backends supports fp16 dtype better" + ) + parser.add_argument( + "--tasks", + default="textvqa_val,scienceqa_test_img", + help="lm-eval tasks for lm_eval version 0.4.2" + ) + + args = parser.parse_args() + s = time.time() + from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + from auto_round.utils import convert_dtype_torch2str + + config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code) + + if hasattr(config, "quantization_config"): + quantization_config = config.quantization_config + if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]: + from auto_round.auto_quantizer import AutoHfQuantizer + elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq": + if args.device == "hpu": + from auto_round.auto_quantizer import AutoHfQuantizer + model_name = args.model_name + torch_dtype = torch.float + if args.model_dtype != None: + if args.model_dtype == "float16" or args.model_dtype == "fp16": + torch_dtype = torch.float16 + if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16": + torch_dtype = torch.bfloat16 + dtype_str = convert_dtype_torch2str(torch_dtype) + if dtype_str == "bf16": + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device, bf16=True).eval() + elif dtype_str == "fp16": + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device, fp16=True).eval() + else: + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device).eval() + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, padding_side="right", use_fast=False) + tokenizer.pad_token_id = tokenizer.eod_id + test_tasks = args.tasks + if isinstance(test_tasks, str): + test_tasks = test_tasks.split(',') + + for dataset in test_tasks: + if 'vqa' in dataset: + from evaluate_vqa import textVQA_evaluation + with torch.cuda.amp.autocast(): + evaluator = textVQA_evaluation( + model, + dataset_name=dataset, + # dataset_path=args.eval_path, + tokenizer=tokenizer, + batch_size=args.eval_bs, + trust_remote_code=args.trust_remote_code, + device=str(args.device) + ) + elif 'scienceqa' in dataset: + from evaluate_multiple_choice import scienceQA_evaluation + with torch.cuda.amp.autocast(): + evaluator = scienceQA_evaluation( + model, + dataset_name=dataset, + # dataset_path=args.eval_path, + tokenizer=tokenizer, + batch_size=args.eval_bs, + trust_remote_code=args.trust_remote_code, + device=str(args.device) + ) + + print("cost time: ", time.time() - s) + + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py index 17a4e56f..d3b17d00 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py @@ -162,8 +162,8 @@ def showQA(self, anns): for ann in anns: quesId = ann['question_id'] print('Question: %s' % (self.qqa[quesId]['question'])) - for and in ann['answers']: - print('Answer %d: %s' % (and['answer_id'], and['answer'])) + for ann in ann['answers']: + print('Answer %d: %s' % 
(ann['answer_id'], ann['answer'])) def loadRes(self, resFile, quesFile): """Load result file and return a result object. @@ -203,4 +203,4 @@ def loadRes(self, resFile, quesFile): res.dataset['annotations'] = anns res.createIndex() - return res \ No newline at end of file + return res diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py index 834654da..218719e3 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py @@ -216,7 +216,7 @@ def evaluate(self, quesIds=None): resAns = self.processPunctuation(resAns) resAns = self.processDigitArticle(resAns) gtAcc = [] - gtAnswers = [and['answer'] for and in gts[quesId]['answers']] + gtAnswers = [ann['answer'] for ann in gts[quesId]['answers']] if len(set(gtAnswers)) > 1: for ansDic in gts[quesId]['answers']: ansDic['answer'] = self.processPunctuation( @@ -327,4 +327,4 @@ def updateProgress(self, progress): '#' * block + '-' * (barLength - block), int(progress * 100), status) sys.stdout.write(text) - sys.stdout.flush() \ No newline at end of file + sys.stdout.flush() diff --git a/examples/multimodal-modeling/Qwen-VL/run_autoround.sh b/examples/multimodal-modeling/Qwen-VL/run_autoround.sh index d59ad510..6a90a0b0 100644 --- a/examples/multimodal-modeling/Qwen-VL/run_autoround.sh +++ b/examples/multimodal-modeling/Qwen-VL/run_autoround.sh @@ -12,8 +12,10 @@ python3 main.py \ --iters 200 \ --seqlen 512 \ --disable_quanted_input \ +--model_dtype bf16 \ --deployment_device 'auto_round' \ --image_folder /path/to/coco/images/train2017/ \ --question_file /path/to/Qwen-VL_mix665k.json \ --output_dir "./tmp_autoround" + diff --git a/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh b/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh deleted file mode 100644 index 72fb31d2..00000000 --- a/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -x -model_name=Qwen/Qwen-VL - -python3 main.py \ - --model_name $model_name \ - --group_size 128 \ - --bits 4 \ - --deployment_device "fake" \ - --output_dir "./tmp_autoround" - - diff --git a/examples/multimodal-modeling/Qwen-VL/run_eval.sh b/examples/multimodal-modeling/Qwen-VL/run_eval.sh new file mode 100644 index 00000000..089a45b9 --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/run_eval.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -x +device=0 + +model_path='./tmp_autoround' +model=Qwen-VL + +CUDA_VISIBLE_DEVICES=$device python3 eval_042/evaluation.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 16 + +CUDA_VISIBLE_DEVICES=$device python3 mm_evaluation/main.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 4 + + +
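
For reference, the evaluation flow added above can also be driven directly from Python instead of through `run_eval.sh`. The sketch below is not part of the patch: it is a minimal example of calling the `simple_evaluate` wrapper this PR adds under `eval_042/evaluation.py`, assuming auto-round and lm-eval 0.4.2 are installed, the snippet is run from the `examples/multimodal-modeling/Qwen-VL` folder (so the `eval_042` package added here is importable), and the quantized model was exported to `./tmp_autoround/Qwen-VL` — that path and the task subset are illustrative assumptions mirroring `run_eval.sh` and the README table.

```python
# Minimal sketch (not part of this patch): invoke the lm-eval 0.4.2 wrapper added in
# eval_042/evaluation.py directly, instead of going through run_eval.sh.
# Assumptions: run from examples/multimodal-modeling/Qwen-VL; quantized model exported
# to ./tmp_autoround/Qwen-VL (hypothetical path, mirrors run_eval.sh).
import time

# Imported for its side effect of enabling auto-round checkpoints in transformers,
# as done in the README inference examples of this PR.
from auto_round.auto_quantizer import AutoHfQuantizer
from lm_eval.utils import make_table

from eval_042.evaluation import simple_evaluate  # wrapper introduced by this PR

quantized_model_path = "./tmp_autoround/Qwen-VL"

start = time.time()
result = simple_evaluate(
    model="hf",
    model_args=f"pretrained={quantized_model_path},trust_remote_code=True",
    tasks=["lambada_openai", "piqa"],  # any subset of the text tasks in the README table
    device="cuda:0",
    batch_size=16,
)
print(make_table(result))
print("cost time: ", time.time() - start)
```

Note that the script's own `__main__` additionally wraps the call in `torch.cuda.amp.autocast()`, and the multimodal tasks (textVQA, scienceVQA) still go through `mm_evaluation/main.py` as in `run_eval.sh`; this sketch covers only the text-only lm-eval tasks.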