From c6d8bf6eb73f5cc773a3c4d28d35e832ea5ec7fc Mon Sep 17 00:00:00 2001 From: WeiweiZhang1 Date: Tue, 27 Aug 2024 11:24:10 +0800 Subject: [PATCH] refine docs, add accuracy data, add receip and eval scripts (#226) * refine docs, add accuracy data, add receip and eval scripts Signed-off-by: Zhang, Weiwei1 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update supported model list Signed-off-by: Zhang, Weiwei1 * add generation results, update supported model list Signed-off-by: Zhang, Weiwei1 * fixtypos Signed-off-by: Zhang, Weiwei1 * fix typo Signed-off-by: Zhang, Weiwei1 * follow comments Signed-off-by: Zhang, Weiwei1 * resort model list Signed-off-by: Zhang, Weiwei1 * fixtypo Signed-off-by: Zhang, Weiwei1 * fixtypo2 Signed-off-by: Zhang, Weiwei1 * refine table Signed-off-by: Zhang, Weiwei1 --------- Signed-off-by: Zhang, Weiwei1 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 14 +- examples/multimodal-modeling/Llava/README.md | 12 +- .../Phi-3-vision/README.md | 71 +- .../Phi-3-vision/eval_042/evaluation.py | 17 +- .../multimodal-modeling/Phi-3-vision/main.py | 1 + .../Phi-3-vision/run_autoround.sh | 3 + .../Phi-3-vision/run_autoround_on_gaudi.sh | 10 - .../Phi-3-vision/run_eval.sh | 55 +- .../multimodal-modeling/Qwen-VL/README.md | 66 +- .../Qwen-VL/eval_042/__init__.py | 0 .../Qwen-VL/eval_042/evaluation.py | 626 ++++++++++++++++++ examples/multimodal-modeling/Qwen-VL/main.py | 41 +- .../mm_evaluation/evaluate_multiple_choice.py | 13 +- .../Qwen-VL/mm_evaluation/evaluate_vqa.py | 5 +- .../Qwen-VL/mm_evaluation/main.py | 101 +++ .../Qwen-VL/mm_evaluation/vqa.py | 6 +- .../Qwen-VL/mm_evaluation/vqa_eval.py | 4 +- .../Qwen-VL/run_autoround.sh | 2 + .../Qwen-VL/run_autoround_on_gaudi.sh | 12 - .../multimodal-modeling/Qwen-VL/run_eval.sh | 19 + 20 files changed, 930 insertions(+), 148 deletions(-) delete mode 100644 examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh create mode 100644 examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py create mode 100644 examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py create mode 100644 examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py delete mode 100644 examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh create mode 100644 examples/multimodal-modeling/Qwen-VL/run_eval.sh diff --git a/README.md b/README.md index 2698046d..3932744d 100644 --- a/README.md +++ b/README.md @@ -188,22 +188,24 @@ Please note that an asterisk (*) indicates third-party quantized models, which m Model | Supported | |--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| meta-llama/Meta-Llama-3.1-70B-Instruct | [recipe](https://huggingface.co/Intel/Meta-Llama-3.1-70B-Instruct-int4-inc) | +| meta-llama/Meta-Llama-3.1-70B-Instruct | [recipe](https://huggingface.co/Intel/Meta-Llama-3.1-70B-Instruct-int4-inc) | | meta-llama/Meta-Llama-3.1-8B-Instruct | [model-kaitchup-autogptq-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-asym), [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-sym), 
[recipe](https://huggingface.co/Intel/Meta-Llama-3.1-8B-Instruct-int4-inc) | | meta-llama/Meta-Llama-3.1-8B | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-autoround-gptq-4bit-sym) | +| Qwen/Qwen-VL | [accuracy](./examples/multimodal-modeling/Qwen-VL/README.md), [recipe](./examples/multimodal-modeling/Qwen-VL/run_autoround.sh) | Qwen/Qwen2-7B | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-7B-int4-inc) | | Qwen/Qwen2-57B-A14B-Instruct | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc) | -| microsoft/Phi-3.5-mini-instruct | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Phi-3.5-Mini-instruct-AutoRound-4bit) | -| TinyLlama-1.1B-intermediate | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse) | +| 01-ai/Yi-1.5-9B | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-4bit-gptq-autoround) | +| 01-ai/Yi-1.5-9B-Chat | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-Chat-4bit-gptq-autoround) | | Intel/neural-chat-7b-v3-3 | [model-autogptq-int4](https://huggingface.co/Intel/neural-chat-7b-v3-3-int4-inc) | | Intel/neural-chat-7b-v3-1 | [model-autogptq-int4](https://huggingface.co/Intel/neural-chat-7b-v3-1-int4-inc) | +| TinyLlama-1.1B-intermediate | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse) | | mistralai/Mistral-7B-v0.1 | [model-autogptq-lmhead-int4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead), [model-autogptq-int4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc) | -| microsoft/phi-2 | [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) | | google/gemma-2b | [model-autogptq-int4](https://huggingface.co/Intel/gemma-2b-int4-inc) | | tiiuae/falcon-7b | [model-autogptq-int4-G64](https://huggingface.co/Intel/falcon-7b-int4-inc) | -| 01-ai/Yi-1.5-9B | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-4bit-gptq-autoround) | -| 01-ai/Yi-1.5-9B-Chat | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-Chat-4bit-gptq-autoround) | | sapienzanlp/modello-italia-9b | [model-fbaldassarri-autogptq-int4*](https://huggingface.co/fbaldassarri/modello-italia-9b-autoround-w4g128-cpu) | +| microsoft/phi-2 | [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) | +| microsoft/Phi-3.5-mini-instruct | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Phi-3.5-Mini-instruct-AutoRound-4bit) | +| microsoft/Phi-3-vision-128k-instruct | [recipe](./examples/multimodal-modeling/Phi-3-vision/run_autoround.sh) | mistralai/Mistral-7B-Instruct-v0.2 | [accuracy](./docs/Mistral-7B-Instruct-v0.2-acc.md), [recipe](./examples/language-modeling/scripts/Mistral-7B-Instruct-v0.2.sh), [example](./examples/language-modeling/) | | mistralai/Mixtral-8x7B-Instruct-v0.1 | [accuracy](./docs/Mixtral-8x7B-Instruct-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-Instruct-v0.1.sh), [example](./examples/language-modeling/) | | mistralai/Mixtral-8x7B-v0.1 | [accuracy](./docs/Mixtral-8x7B-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh), [example](./examples/language-modeling/) | diff --git a/examples/multimodal-modeling/Llava/README.md b/examples/multimodal-modeling/Llava/README.md index c6c398d0..14f5a926 100644 --- a/examples/multimodal-modeling/Llava/README.md +++ 
b/examples/multimodal-modeling/Llava/README.md @@ -6,6 +6,8 @@ This document presents step-by-step instructions for auto-round. In this example, we introduce an straight-forward way to execute quantization on some popular multimodal models such as LLaVA. +Please note that LLAVA quantization is currently an **experimental feature** and does not yet support inference on various devices after export. + ## Install If you are not using Linux, do NOT proceed, see instructions for [macOS](https://github.com/haotian-liu/LLaVA/blob/main/docs/macOS.md) and [Windows](https://github.com/haotian-liu/LLaVA/blob/main/docs/Windows.md). @@ -62,11 +64,11 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh ``` ## 4. Results -Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. When the vision components are not involved in quantization, it is able to achieve accuracy loss within 1%. The results for LLava-7b are as follows: +Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. When the vision components are not involved in quantization, it is able to achieve accuracy loss within 1%. The results for fake quantized LLava-7b are as follows: | Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop | | :----: | :----: | :----: | :----: | :----: | :----: | | liuhaotian/llava-v1.5-7b | - | FP16 | - | 58.21 | - | @@ -96,9 +98,3 @@ If you find SignRound useful for your research, please cite our paper: ``` - - - - - - diff --git a/examples/multimodal-modeling/Phi-3-vision/README.md b/examples/multimodal-modeling/Phi-3-vision/README.md index 3105c051..b0557bae 100644 --- a/examples/multimodal-modeling/Phi-3-vision/README.md +++ b/examples/multimodal-modeling/Phi-3-vision/README.md @@ -16,6 +16,8 @@ COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip t ## 2. Run Examples +PyTorch 1.8 or higher version is needed + Enter into the examples folder and install lm-eval to run the evaluation ```bash pip install -r requirements.txt @@ -47,13 +49,75 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh ``` -## 3. Environment +## 3. Run Inference + +```python +from PIL import Image +import requests +import io +from transformers import AutoModelForCausalLM +from transformers import AutoProcessor +from auto_round.auto_quantizer import AutoHfQuantizer +quantized_model_path = "./tmp_autoround" +model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention + +processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True) + +messages = [ \ + {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, \ + {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. 
It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, \ + {"role": "user", "content": "Provide insightful questions to spark discussion."}] + +url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png" +# image = Image.open(requests.get(url, stream=True).raw) +image = Image.open(io.BytesIO(requests.get(url, stream=True).content)) + +prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0") + +generation_args = { + "max_new_tokens": 50, + "temperature": 0.0, + "do_sample": False, +} + +generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) + +# remove input tokens +generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +print(response) +# 1. How does the level of agreement on each statement reflect the overall preparedness of respondents for meetings? +# 2. What are the most and least agreed-upon statements, and why might that be the case? +# 3. +``` + -PyTorch 1.8 or higher version is needed ## Reference @@ -72,3 +136,4 @@ If you find SignRound useful for your research, please cite our paper: + diff --git a/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py b/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py index e9e63fef..ac9f7636 100644 --- a/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py +++ b/examples/multimodal-modeling/Phi-3-vision/eval_042/evaluation.py @@ -576,6 +576,10 @@ def evaluate( parser.add_argument( "--eval_bs", default=1, ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) parser.add_argument( "--trust_remote_code", action='store_true', help="Whether to enable trust_remote_code" @@ -600,17 +604,20 @@ def evaluate( model_args += f",autogptq=True,gptq_use_triton=True" if args.trust_remote_code: model_args += f",trust_remote_code=True" - + model_args += ",dtype=bfloat16" test_tasks = args.tasks if isinstance(test_tasks, str): test_tasks = test_tasks.split(',') model_name = args.model_name.rstrip('/') from lm_eval.utils import make_table - result = simple_evaluate(model="hf", - model_args=model_args, - tasks=test_tasks, - batch_size=args.eval_bs) + with torch.cuda.amp.autocast(): + result = simple_evaluate(model="hf", + model_args=model_args, + tasks=test_tasks, + device=args.device, + batch_size=args.eval_bs) print(make_table(result)) print("cost time: ", time.time() - s) + diff --git a/examples/multimodal-modeling/Phi-3-vision/main.py b/examples/multimodal-modeling/Phi-3-vision/main.py index ad9114e6..9696b31d 100644 --- a/examples/multimodal-modeling/Phi-3-vision/main.py +++ b/examples/multimodal-modeling/Phi-3-vision/main.py @@ -464,3 +464,4 @@ def create_data_loader(dataset, batch_size=1, data_collator=None): from lm_eval.utils import make_table print(make_table(res)) + diff --git a/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh b/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh index 3583704b..64977fcd 100644 --- a/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh +++ b/examples/multimodal-modeling/Phi-3-vision/run_autoround.sh @@ -6,6 +6,9 @@ CUDA_VISIBLE_DEVICES=$device \ python3 main.py \ --model_name=$model_name \ --deployment_device 'auto_round' \ +--nsamples 512 \ +--model_dtype bf16 \ --image_folder /PATH/TO/coco/images/train2017 \ --question_file /PATH/TO/llava_v1_5_mix665k.json \ --output_dir "./tmp_autoround" + diff --git a/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh b/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh deleted file mode 100644 index 90764afb..00000000 --- a/examples/multimodal-modeling/Phi-3-vision/run_autoround_on_gaudi.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -x -model_name=microsoft/Phi-3-vision-128k-instruct - -python3 main.py \ - --model_name $model_name \ - --group_size 128 \ - --bits 4 \ - --deployment_device "fake" \ - --output_dir "./tmp_autoround" \ No newline at end of file diff --git a/examples/multimodal-modeling/Phi-3-vision/run_eval.sh b/examples/multimodal-modeling/Phi-3-vision/run_eval.sh index da14a021..40097cf3 100644 --- a/examples/multimodal-modeling/Phi-3-vision/run_eval.sh +++ b/examples/multimodal-modeling/Phi-3-vision/run_eval.sh @@ -1,48 +1,11 @@ -export https_proxy=http://proxy.ims.intel.com:911 -export http_proxy=http://proxy.ims.intel.com:911 -export HF_HOME=/home/weiweiz1/.cache/ +#!/bin/bash +set -x +device=0 -# Mistral-7B-Instruct-v0.2 -# device=3 -# Baichuan2-7B-Chat Phi-3-mini-4k-instruct -# Llama-2-7b-chat-hf -# lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu, -# ceval-valid,cmmlu -# dir=/data5/zww/test_faster/ -# dir=/models -# for model in Phi-3-mini-4k-instruct Meta-Llama-3-8B-Instruct -# do -# echo ${model}/default -# CUDA_VISIBLE_DEVICES=$device \ -# python3 eval_042/evaluation.py --model_name ${dir}${model}_default/$model-autoround-w4g128-gpu \ -# --trust_remote_code \ -# --eval_bs 16 --tasks gsm8k,ceval-valid,cmmlu \ -# 2>&1| tee -a /data4/zww/test_faster/rounding_${model}_rtn.txt -# echo ${model}/rtn -# done& - -device=2 -dir=/data4/zww/tmp/ -# 
dir=/data5/models/ -for model in Phi-3-vision-128k-instruct -do - echo ${model} - CUDA_VISIBLE_DEVICES=$device \ - python3 eval_042/evaluation.py --model_name ${dir}/$model-autoround-w4g128-round \ - --trust_remote_code \ - --eval_bs 16 --tasks lambada_openai \ - 2>&1| tee -a /data4/zww/test_faster/rounding_${model}.txt - echo ${model} -done -# dir=/data5/zww/test_faster/ -# for model in Phi-3-mini-4k-instruct Mistral-7B-Instruct-v0.2 -# do -# echo ${model}/rtn -# CUDA_VISIBLE_DEVICES=$device \ -# python3 eval_042/evaluation.py --model_name ${dir}${model}_rtn/$model-autoround-w4g128-gpu \ -# --trust_remote_code \ -# --eval_bs 16 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu,gsm8k \ -# 2>&1| tee -a /data4/zww/test_faster/rounding_${model}_rtn.txt -# echo ${model}/rtn -# done +model_path='./tmp_autoround' +model=Phi-3-vision-128k-instruct +CUDA_VISIBLE_DEVICES=$device python3 eval_042/evaluation.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 16 diff --git a/examples/multimodal-modeling/Qwen-VL/README.md b/examples/multimodal-modeling/Qwen-VL/README.md index 3ea128e7..4980eb28 100644 --- a/examples/multimodal-modeling/Qwen-VL/README.md +++ b/examples/multimodal-modeling/Qwen-VL/README.md @@ -100,17 +100,68 @@ Include the flag `--adam`. Note that AdamW is less effective than sign gradient - **Running on Intel Gaudi2** ```bash -bash run_autoround_on_gaudi.sh +bash run_autoround.sh +``` + +## 3. run inference + +```python + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.generation import GenerationConfig + import torch + from transformers import set_seed + set_seed(1234) + from auto_round.auto_quantizer import AutoHfQuantizer + quantized_model_path = "./tmp_autoround" + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True) + # use bf16 + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, bf16=True).eval() + # use fp16 + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, fp16=True).eval() + # use cpu only + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", trust_remote_code=True).eval() + # use cuda device + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda", trust_remote_code=True).eval() + query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \ + {'text': 'Generate the caption in English with grounding:'}, \ + ]) + inputs = tokenizer(query, return_tensors='pt') + inputs = inputs.to(model.device) + with torch.cuda.amp.autocast(): + pred = model.generate(**inputs) + response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False) + print(response) + # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpegGenerate the caption in English with grounding: Woman(451,379),(731,806) and her dog(219,424),(576,896) playing on the beach<|endoftext|> + image = tokenizer.draw_bbox_on_latest_picture(response) + if image: + image.save('2.jpg') + else: + print("no box") + ``` ## 4. Results -Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. 
It is able to achieve accuracy loss within 1% Whether or not the visual component is quantified. The results for Qwen-VL are as follows: -| Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop | -| :----: | :----: | :----: | :----: | :----: | :----: | -| Qwen/Qwen-VL | - | FP16 | - | 63.94 | - | -| Qwen/Qwen-VL | W4G128 | FP16 | with vision | 63.68 | -0.41% | -| Qwen/Qwen-VL | W4G128 | FP16 | w/o vision | 63.73 | -0.33% | +Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Qwen-VL are as follows: +| Metric | bf16 | INT4 | +|:----------------|:--------|:--------| +| avg | 0.5628 | 0.5589 | +| paper-avg | 0.5603 | 0.5611 | +| mmlu | 0.4828 | 0.4639 | +| lambada_openai | 0.6782 | 0.6664 | +| hellaswag | 0.5593 | 0.5487 | +| winogrande | 0.6827 | 0.6875 | +| piqa | 0.7786 | 0.7748 | +| truthfulqa_mc1 | 0.2876 | 0.2901 | +| openbookqa | 0.2880 | 0.2940 | +| boolq | 0.7012 | 0.7318 | +| arc_easy | 0.7201 | 0.7327 | +| arc_challenge | 0.4249 | 0.4206 | +| cmmlu | 0.4798 | 0.4618 | +| ceval | 0.4814 | 0.4569 | +| textVQA | 0.6402 | 0.6379 | +| scienceVQA | 0.6748 | 0.6574 | + ## 5. Environment @@ -136,3 +187,4 @@ If you find SignRound useful for your research, please cite our paper: + diff --git a/examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py b/examples/multimodal-modeling/Qwen-VL/eval_042/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py b/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py new file mode 100644 index 00000000..4b886352 --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/eval_042/evaluation.py @@ -0,0 +1,626 @@ +import itertools +import logging +import random +import time +from collections import defaultdict +from typing import TYPE_CHECKING, List, Optional, Union + +import numpy as np +import torch + +import lm_eval.api.metrics +import lm_eval.api.registry +import lm_eval.models +from lm_eval.caching.cache import delete_cache +from lm_eval.evaluator_utils import ( + consolidate_results, + get_sample_size, + get_task_list, + prepare_print_tasks, + print_writeout, + run_task_tests, +) +from lm_eval.logging_utils import add_env_info, get_git_commit_hash +from lm_eval.tasks import TaskManager, get_task_dict +from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string + +if TYPE_CHECKING: + from lm_eval.api.model import LM + from lm_eval.tasks import Task + + +@positional_deprecated +def simple_evaluate( + model, + model_args: Optional[Union[str, dict]] = None, + tasks: Optional[List[Union[str, dict, object]]] = None, + num_fewshot: Optional[int] = None, + batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + use_cache: Optional[str] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + delete_requests_cache: bool = False, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + check_integrity: bool = False, + write_out: bool = False, + log_samples: bool = True, + gen_kwargs: Optional[str] = None, + task_manager: Optional[TaskManager] = None, + verbosity: str = "INFO", + predict_only: bool = False, + random_seed: int = 0, + numpy_random_seed: int = 1234, + torch_random_seed: 
int = 1234, + user_model = None, ##user model does not support tensor parallelism +): + """Instantiate and evaluate a model on a list of tasks. + + :param model: Union[str, LM] + Name of model or LM object, see lm_eval.models.get_model + :param model_args: Optional[str, dict] + String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object. + Ignored if `model` argument is a LM object. + :param tasks: list[Union[str, dict, Task]] + List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. + :param num_fewshot: int + Number of examples in few-shot context + :param batch_size: int or str, optional + Batch size for model + :param max_batch_size: int, optional + Maximal batch size to try with automatic batch size detection + :param device: str, optional + PyTorch device (e.g. "cpu" or "cuda:0") for running models + :param use_cache: str, optional + A path to a sqlite db file for caching model responses. `None` if not caching. + :param cache_requests: bool, optional + Speed up evaluation by caching the building of dataset requests. `None` if not caching. + :param rewrite_requests_cache: bool, optional + Rewrites all of the request cache if set to `True`. `None` if not desired. + :param delete_requests_cache: bool, optional + Deletes all of the request cache if set to `True`. `None` if not desired. + :param limit: int or float, optional + Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. + :param bootstrap_iters: + Number of iterations for bootstrap statistics + :param check_integrity: bool + Whether to run the relevant part of the test suite for the tasks + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param gen_kwargs: str + String arguments for model generation + Ignored for all tasks with loglikelihood output_type + :param predict_only: bool + If true only model outputs will be generated and returned. Metrics will not be evaluated + :param random_seed: int + Random seed for python's random module. If set to None, the seed will not be set. + :param numpy_random_seed: int + Random seed for numpy. If set to None, the seed will not be set. + :param torch_random_seed: int + Random seed for torch. If set to None, the seed will not be set. + + :return + Dictionary of results + """ + from auto_round.auto_quantizer import AutoHfQuantizer + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + start_date = time.time() + + if delete_requests_cache: + eval_logger.info("Deleting requests cache...") + delete_cache() + + seed_message = [] + if random_seed is not None: + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412 + seed_message.append(f"Setting random seed to {random_seed}") + random.seed(random_seed) + + if numpy_random_seed is not None: + seed_message.append(f"Setting numpy seed to {numpy_random_seed}") + np.random.seed(numpy_random_seed) + + if torch_random_seed is not None: + seed_message.append(f"Setting torch manual seed to {torch_random_seed}") + torch.manual_seed(torch_random_seed) + + if seed_message: + eval_logger.info(" | ".join(seed_message)) + + if tasks is None: + tasks = [] + if len(tasks) == 0: + raise ValueError( + "No tasks specified, or no tasks found. 
Please verify the task names." + ) + + if gen_kwargs is not None: + gen_kwargs = simple_parse_args_string(gen_kwargs) + eval_logger.warning( + "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. " + "Ensure 'do_sample=True' for non-greedy decoding!" + ) + if gen_kwargs == "": + gen_kwargs = None + + if isinstance(model, str): + if model_args is None: + model_args = "" + + if isinstance(model_args, dict): + lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + + else: + lm = lm_eval.api.registry.get_model(model).create_from_arg_string( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + else: + if not isinstance(model, lm_eval.api.model.LM): + raise TypeError + lm = model + + if use_cache is not None: + eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}") + lm = lm_eval.api.model.CachingLM( + lm, + use_cache + # each rank receives a different cache db. + # necessary to avoid multiple writes to cache at once + + "_rank" + + str(lm.rank) + + ".db", + ) + if user_model is not None: + lm._model = user_model + + if task_manager is None: + task_manager = TaskManager(verbosity) + + task_dict = get_task_dict(tasks, task_manager) + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if isinstance(task_obj, tuple): + _, task_obj = task_obj + if task_obj is None: + continue + + if task_obj.get_config("output_type") == "generate_until": + if gen_kwargs is not None: + task_obj.set_config( + key="generation_kwargs", value=gen_kwargs, update=True + ) + + if predict_only: + log_samples = True + eval_logger.info( + f"Processing {task_name} in output-only mode. Metrics will not be calculated!" + ) + # we have to change the class properties post-hoc. This is pretty hacky. + task_obj.override_metric(metric_name="bypass") + + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that + if num_fewshot is not None: + if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: + eval_logger.info( + f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." 
+ ) + else: + eval_logger.warning( + f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" + ) + task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: + task_obj.set_config(key="num_fewshot", value=0) + + if check_integrity: + run_task_tests(task_list=tasks) + + results = evaluate( + lm=lm, + task_dict=task_dict, + limit=limit, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + bootstrap_iters=bootstrap_iters, + write_out=write_out, + log_samples=log_samples, + verbosity=verbosity, + ) + + if lm.rank == 0: + if isinstance(model, str): + model_name = model + elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"): + model_name = model.config._name_or_path + else: + model_name = type(model).__name__ + + # add info about the model and few shot config + results["config"] = { + "model": model_name, + "model_args": model_args, + "batch_size": batch_size, + "batch_sizes": ( + list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] + ), + "device": device, + "use_cache": use_cache, + "limit": limit, + "bootstrap_iters": bootstrap_iters, + "gen_kwargs": gen_kwargs, + } + results["git_hash"] = get_git_commit_hash() + results["date"] = start_date + add_env_info(results) # additional environment info to results + return results + else: + return None + + +@positional_deprecated +def evaluate( + lm: "LM", + task_dict, + limit: Optional[int] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + bootstrap_iters: Optional[int] = 100000, + write_out: bool = False, + log_samples: bool = True, + verbosity: str = "INFO", +): + """Instantiate and evaluate a model on a list of tasks. + + :param lm: obj + Language Model + :param task_dict: dict[str, Task] + Dictionary of tasks. Tasks will be taken to have name type(task).config.task . + :param limit: int, optional + Limit the number of examples per task (only use this for testing) + :param bootstrap_iters: + Number of iterations for bootstrap statistics + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :return + Dictionary of results + """ + + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + + # tracks all Instances/requests a model must generate output on. + requests = defaultdict(list) + # stores the amount to pad out reqs per req. 
type so that + # number of fwd passes per distributed rank is equal + padding_requests = defaultdict(int) + + # get lists of group hierarchy and each type of request + task_hierarchy, eval_tasks = get_task_list(task_dict) + if not log_samples: + if not all( + "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() + for task_output in eval_tasks + ): + raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + for task_output in eval_tasks: + task: Task = task_output.task + limit = get_sample_size(task, limit) + task.build_all_requests( + limit=limit, + rank=lm.rank, + world_size=lm.world_size, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + ) + eval_logger.debug( + f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" + ) + + if write_out: + print_writeout(task) + # aggregate Instances by LM method requested to get output. + for instance in task.instances: + reqtype = instance.request_type + requests[reqtype].append(instance) + + if lm.world_size > 1: + instances_rnk = torch.tensor(len(task._instances), device=lm.device) + gathered_item = ( + lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() + ) + # "multiple_choice" task types dispatch (several) "loglikelihood" request types + reqtype = ( + "loglikelihood" + if task.OUTPUT_TYPE == "multiple_choice" + else task.OUTPUT_TYPE + ) + # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) + numpad = max(gathered_item) - gathered_item[lm.rank] + # todo: may not account for padding in cases like SquadV2 which has multiple req types + padding_requests[reqtype] += numpad + + ### Run LM on inputs, get all outputs ### + # execute each type of request + for reqtype, reqs in requests.items(): + eval_logger.info(f"Running {reqtype} requests") + # create `K` copies of each request `req` based off `K = req.repeats` + cloned_reqs = [] + for req in reqs: + cloned_reqs.extend([req] * req.repeats) + + if (lm.world_size > 1) and (padding_requests[reqtype] > 0): + for _ in range(padding_requests[reqtype]): + cloned_reqs.extend([req] * req.repeats) + + # run requests through model + resps = getattr(lm, reqtype)(cloned_reqs) + + # put responses from model into a list of length K for each request. + for x, req in zip(resps, cloned_reqs): + req.resps.append(x) + + if lm.world_size > 1: + lm.accelerator.wait_for_everyone() + + RANK = lm.rank + WORLD_SIZE = lm.world_size + ### Postprocess outputs ### + # TODO: del model here, maybe (idea: allow user to specify device of e.g. 
reward model separately) + for task_output in eval_tasks: + task = task_output.task + task.apply_filters() + + ### Collect values of metrics on all datapoints ### + # # unpack results and sort back in order and return control to Task + # TODO: make it possible to use a different metric per filter + # Pre-process task.instances to group by doc_id + instances_by_doc_id = defaultdict(list) + for instance in task.instances: + instances_by_doc_id[instance.doc_id].append(instance) + # Sort instances within each group + for instances in instances_by_doc_id.values(): + instances.sort(key=lambda x: x.idx) + # iterate over different filters used + for filter_key in task.instances[0].filtered_resps.keys(): + doc_iterator = task.doc_iterator( + rank=RANK, limit=limit, world_size=WORLD_SIZE + ) + for doc_id, doc in doc_iterator: + requests = instances_by_doc_id[doc_id] + metrics = task.process_results( + doc, [req.filtered_resps[filter_key] for req in requests] + ) + if log_samples: + target = task.doc_to_target(doc) + example = { + "doc_id": doc_id, + "doc": doc, + "target": target, + "arguments": [req.args for req in requests], + "resps": [req.resps for req in requests], + "filtered_resps": [ + req.filtered_resps[filter_key] for req in requests + ], + } + example.update(metrics) + task_output.logged_samples.append(example) + for metric, value in metrics.items(): + task_output.sample_metrics[(metric, filter_key)].append(value) + + if WORLD_SIZE > 1: + # if multigpu, then gather data across all ranks to rank 0 + # first gather logged samples across all ranks + for task_output in eval_tasks: + if log_samples: + # for task_name, task_samples in list(samples.items()): + full_samples = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.logged_samples, + object_gather_list=full_samples, + dst=0, + ) + + if RANK == 0: + task_output.logged_samples = list( + itertools.chain.from_iterable(full_samples) + ) + + # then collect metrics across all ranks + for metrics in task_output.sample_metrics: + metric_list = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.sample_metrics[metrics], + object_gather_list=metric_list, + dst=0, + ) + if RANK == 0: + task_output.sample_metrics[metrics] = list( + itertools.chain.from_iterable(metric_list) + ) + + if RANK == 0: + ### Aggregate results over all datapoints ### + # aggregate results ; run bootstrap CIs + for task_output in eval_tasks: + task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) + results, samples, configs, versions, num_fewshot = consolidate_results( + eval_tasks + ) + + ### Calculate group metrics ### + if bool(results): + for group, task_list in reversed(task_hierarchy.items()): + if len(task_list) == 0: + # task_hierarchy entries are either + # `group_name: [subtask1, subtask2, ...]` + # or `task_name: []`. + # we only want to operate on groups here. + continue + metric_list = list( + { + key + for task in task_list + for key in results[task].keys() + if "_stderr" not in key and key not in ["alias", "samples"] + } + ) + for metric in metric_list: + stderr = "_stderr,".join(metric.split(",")) + + # gather metrics, sizes, and stderrs from subtasks + metrics = [ + results[task][metric] + for task in task_list + if metric in results[task] + ] # TODO: copy? 
+ stderrs = [ + results[task][stderr] + for task in task_list + if stderr in results[task] + ] + sizes = [ + results[task]["samples"] + for task in task_list + if metric in results[task] + ] + + # compute group's pooled metric and stderr + results[group][ + metric + ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + # TODO: calculate grouped metric using aggregation fn + if "N/A" in stderrs: + results[group][stderr] = "N/A" + else: + results[group][ + stderr + ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility + # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: + # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) + + results[group]["samples"] = sum(sizes) + + results_agg = defaultdict(dict) + groups_agg = defaultdict(dict) + all_tasks_list = list(task_hierarchy.keys()) + while True: + add_tasks_list = list(k for k in results_agg.keys()) + left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list))) + if len(left_tasks_list) == 0: + break + + _task_hierarchy = { + k: v for k, v in task_hierarchy.items() if k in left_tasks_list + } + _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results) + + results_agg = {**results_agg, **_results_agg} + groups_agg = {**groups_agg, **_groups_agg} + + for group_name, task_list in task_hierarchy.items(): + if task_list: + num_fewshot[group_name] = num_fewshot[ + task_list[0] + ] # TODO: validate this + + results_dict = { + "results": dict(results_agg.items()), + **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), + "group_subtasks": dict(reversed(task_hierarchy.items())), + "configs": dict(sorted(configs.items())), + "versions": dict(sorted(versions.items())), + "n-shot": dict(sorted(num_fewshot.items())), + } + if log_samples: + results_dict["samples"] = dict(samples) + + return results_dict + + else: + return None + + +if __name__ == "__main__": + + import sys + + sys.path.insert(0, '../../../') + import time + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", default="Qwen/Qwen-VL" + ) + parser.add_argument( + "--eval_bs", default=1, + ) + parser.add_argument( + "--trust_remote_code", action='store_true', + help="Whether to enable trust_remote_code" + ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) + parser.add_argument("--tasks", + default="lambada_openai,hellaswag,winogrande,piqa,mmlu,truthfulqa_mc1," \ + "openbookqa,boolq,rte,arc_easy,arc_challenge", + help="lm-eval tasks for lm_eval version 0.4.2") + + args = parser.parse_args() + s = time.time() + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code) + + if hasattr(config, "quantization_config"): + quantization_config = config.quantization_config + if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]: + from auto_round.auto_quantizer import AutoHfQuantizer + elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq": + if args.device == "hpu": + from auto_round.auto_quantizer import AutoHfQuantizer + + + test_tasks = args.tasks + if isinstance(test_tasks, str): + test_tasks = test_tasks.split(',') + model_name = args.model_name.rstrip('/') + from lm_eval.utils import make_table + + model_args = f"pretrained={args.model_name}" + if args.trust_remote_code: + model_args += f",trust_remote_code=True" + with torch.cuda.amp.autocast(): + result = simple_evaluate(model="hf", + model_args=model_args, + tasks=test_tasks, + device=args.device, + batch_size=args.eval_bs) + print(make_table(result)) + + print("cost time: ", time.time() - s) + + diff --git a/examples/multimodal-modeling/Qwen-VL/main.py b/examples/multimodal-modeling/Qwen-VL/main.py index 28544271..0c5f35bf 100644 --- a/examples/multimodal-modeling/Qwen-VL/main.py +++ b/examples/multimodal-modeling/Qwen-VL/main.py @@ -234,9 +234,6 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 parser.add_argument("--seed", default=42, type=int, help="seed") - parser.add_argument("--eval_fp16_baseline", action='store_true', - help="whether to eval FP16 baseline") - parser.add_argument("--adam", action='store_true', help="adam") @@ -354,12 +351,12 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 if args.model_dtype != None: if args.model_dtype == "float16" or args.model_dtype == "fp16": torch_dtype = torch.float16 - if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16": + if args.model_dtype == "bfloat16" or args.model_dtype == "bf16": torch_dtype = torch.bfloat16 - dtype_abd = convert_dtype_torch2str(torch_dtype) - if dtype_abd == "bf16": + dtype_str = convert_dtype_torch2str(torch_dtype) + if dtype_str == "bf16": model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval() - elif dtype_abd == "fp16": + elif dtype_str == "fp16": model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval() else: model = AutoModelForCausalLM.from_pretrained(args.model_name, config=config, trust_remote_code=not args.disable_trust_remote_code).eval() @@ -373,36 +370,9 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 AutoAdamRound) from auto_round.utils import get_multimodal_block_names - # model = model.eval() + model = model.eval() seqlen = args.seqlen - if args.eval_fp16_baseline: - model = model.half() - model = model.to(torch_device) - datasets=args.eval_dataset.split(',') - for dataset in datasets: - if 'vqa' in dataset: - from mm_evaluation.evaluate_vqa import textVQA_evaluation - evaluator = textVQA_evaluation( - model, - dataset_name=dataset, - # 
dataset_path=args.eval_path, - tokenizer=tokenizer, - batch_size=args.eval_bs, - device=str(torch_device) - ) - elif 'scienceqa' in dataset: - from mm_evaluation.evaluate_multiple_choice import scienceQA_evaluation - evaluator = scienceQA_evaluation( - model, - dataset_name=dataset, - # dataset_path=args.eval_path, - tokenizer=tokenizer, - batch_size=args.eval_bs, - device=str(torch_device) - ) - exit() - round = AutoRound if args.adam: round = AutoAdamRound @@ -525,3 +495,4 @@ def get_train_dataloader(train_dataset, model, data_collator, train_batch_size=1 ) + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py index a0d285a3..11c89440 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py @@ -6,7 +6,7 @@ import torch from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer,AutoConfig +from transformers import AutoModelForCausalLM, AutoTokenizer multiple_choices = ['A', 'B', 'C', 'D', 'E'] @@ -102,7 +102,7 @@ def __len__(self): return len(self._local_indices) -def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", dataset_path=None, tokenizer=None, +def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=None, batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"): # torch.distributed.init_process_group( # backend='nccl', @@ -115,7 +115,7 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval() model = model.to(torch.device(device)) - tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code, use_fast=False) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, use_fast=False) else: assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer." 
model = model_name @@ -192,9 +192,6 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da parser.add_argument( "--model_name", default="Qwen/Qwen-VL" ) - parser.add_argument( - "--base_model", default="Qwen/Qwen-VL" - ) parser.add_argument( "--dataset_name", default="scienceqa_test_img" ) @@ -209,7 +206,6 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da s = time.time() evaluator = scienceQA_evaluation( args.model_name, - base_model=args.base_model, dataset_name=args.dataset_name, # dataset_path=args.eval_path, batch_size=args.eval_bs, @@ -217,5 +213,4 @@ def scienceQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", da ) print("cost time: ", time.time() - s) - - + \ No newline at end of file diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py index 82f35011..9173be76 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/evaluate_vqa.py @@ -10,8 +10,8 @@ import torch from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from .vqa import VQA -from .vqa_eval import VQAEval +from vqa import VQA +from vqa_eval import VQAEval # This code is much refer to https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/evaluate_vqa.py @@ -461,3 +461,4 @@ def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", data ) print("cost time: ", time.time() - s) + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py new file mode 100644 index 00000000..08f7170a --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/main.py @@ -0,0 +1,101 @@ + +if __name__ == "__main__": + + import sys + + sys.path.insert(0, '../../../') + import time + import torch + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", default="/models/opt-125m/" + ) + parser.add_argument( + "--eval_bs", default=4, type=int, + ) + parser.add_argument( + "--trust_remote_code", action='store_true', + help="Whether to enable trust_remote_code" + ) + parser.add_argument( + "--device", default="cuda:0", + help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation." 
+ ) + parser.add_argument( + "--base_model", default="Qwen/Qwen-VL" + ) + parser.add_argument( + "--model_dtype", default=None, type=str, + help="force to convert the dtype, some backends supports fp16 dtype better" + ) + parser.add_argument( + "--tasks", + default="textvqa_val,scienceqa_test_img", + help="lm-eval tasks for lm_eval version 0.4.2" + ) + + args = parser.parse_args() + s = time.time() + from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + from auto_round.utils import convert_dtype_torch2str + + config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code) + + if hasattr(config, "quantization_config"): + quantization_config = config.quantization_config + if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]: + from auto_round.auto_quantizer import AutoHfQuantizer + elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq": + if args.device == "hpu": + from auto_round.auto_quantizer import AutoHfQuantizer + model_name = args.model_name + torch_dtype = torch.float + if args.model_dtype != None: + if args.model_dtype == "float16" or args.model_dtype == "fp16": + torch_dtype = torch.float16 + if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16": + torch_dtype = torch.bfloat16 + dtype_str = convert_dtype_torch2str(torch_dtype) + if dtype_str == "bf16": + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device, bf16=True).eval() + elif dtype_str == "fp16": + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device, fp16=True).eval() + else: + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device).eval() + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, padding_side="right", use_fast=False) + tokenizer.pad_token_id = tokenizer.eod_id + test_tasks = args.tasks + if isinstance(test_tasks, str): + test_tasks = test_tasks.split(',') + + for dataset in test_tasks: + if 'vqa' in dataset: + from evaluate_vqa import textVQA_evaluation + with torch.cuda.amp.autocast(): + evaluator = textVQA_evaluation( + model, + dataset_name=dataset, + # dataset_path=args.eval_path, + tokenizer=tokenizer, + batch_size=args.eval_bs, + trust_remote_code=args.trust_remote_code, + device=str(args.device) + ) + elif 'scienceqa' in dataset: + from evaluate_multiple_choice import scienceQA_evaluation + with torch.cuda.amp.autocast(): + evaluator = scienceQA_evaluation( + model, + dataset_name=dataset, + # dataset_path=args.eval_path, + tokenizer=tokenizer, + batch_size=args.eval_bs, + trust_remote_code=args.trust_remote_code, + device=str(args.device) + ) + + print("cost time: ", time.time() - s) + + diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py index 17a4e56f..d3b17d00 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa.py @@ -162,8 +162,8 @@ def showQA(self, anns): for ann in anns: quesId = ann['question_id'] print('Question: %s' % (self.qqa[quesId]['question'])) - for and in ann['answers']: - print('Answer %d: %s' % (and['answer_id'], and['answer'])) + for ann in ann['answers']: + print('Answer %d: %s' % 
(ann['answer_id'], ann['answer'])) def loadRes(self, resFile, quesFile): """Load result file and return a result object. @@ -203,4 +203,4 @@ def loadRes(self, resFile, quesFile): res.dataset['annotations'] = anns res.createIndex() - return res \ No newline at end of file + return res diff --git a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py index 834654da..218719e3 100644 --- a/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py +++ b/examples/multimodal-modeling/Qwen-VL/mm_evaluation/vqa_eval.py @@ -216,7 +216,7 @@ def evaluate(self, quesIds=None): resAns = self.processPunctuation(resAns) resAns = self.processDigitArticle(resAns) gtAcc = [] - gtAnswers = [and['answer'] for and in gts[quesId]['answers']] + gtAnswers = [ann['answer'] for ann in gts[quesId]['answers']] if len(set(gtAnswers)) > 1: for ansDic in gts[quesId]['answers']: ansDic['answer'] = self.processPunctuation( @@ -327,4 +327,4 @@ def updateProgress(self, progress): '#' * block + '-' * (barLength - block), int(progress * 100), status) sys.stdout.write(text) - sys.stdout.flush() \ No newline at end of file + sys.stdout.flush() diff --git a/examples/multimodal-modeling/Qwen-VL/run_autoround.sh b/examples/multimodal-modeling/Qwen-VL/run_autoround.sh index d59ad510..6a90a0b0 100644 --- a/examples/multimodal-modeling/Qwen-VL/run_autoround.sh +++ b/examples/multimodal-modeling/Qwen-VL/run_autoround.sh @@ -12,8 +12,10 @@ python3 main.py \ --iters 200 \ --seqlen 512 \ --disable_quanted_input \ +--model_dtype bf16 \ --deployment_device 'auto_round' \ --image_folder /path/to/coco/images/train2017/ \ --question_file /path/to/Qwen-VL_mix665k.json \ --output_dir "./tmp_autoround" + diff --git a/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh b/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh deleted file mode 100644 index 72fb31d2..00000000 --- a/examples/multimodal-modeling/Qwen-VL/run_autoround_on_gaudi.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -x -model_name=Qwen/Qwen-VL - -python3 main.py \ - --model_name $model_name \ - --group_size 128 \ - --bits 4 \ - --deployment_device "fake" \ - --output_dir "./tmp_autoround" - - diff --git a/examples/multimodal-modeling/Qwen-VL/run_eval.sh b/examples/multimodal-modeling/Qwen-VL/run_eval.sh new file mode 100644 index 00000000..089a45b9 --- /dev/null +++ b/examples/multimodal-modeling/Qwen-VL/run_eval.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -x +device=0 + +model_path='./tmp_autoround' +model=Qwen-VL + +CUDA_VISIBLE_DEVICES=$device python3 eval_042/evaluation.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 16 + +CUDA_VISIBLE_DEVICES=$device python3 mm_evaluation/main.py \ +--model_name ${model_path}/${model} \ +--trust_remote_code \ +--eval_bs 4 + + +
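
For reference, the evaluation flow added above can also be driven directly from Python instead of through `run_eval.sh`. The sketch below is not part of the patch: it is a minimal example of calling the `simple_evaluate` wrapper this PR adds under `eval_042/evaluation.py`, assuming auto-round and lm-eval 0.4.2 are installed, the snippet is run from the `examples/multimodal-modeling/Qwen-VL` folder (so the `eval_042` package added here is importable), and the quantized model was exported to `./tmp_autoround/Qwen-VL` — that path and the task subset are illustrative assumptions mirroring `run_eval.sh` and the README table.

```python
# Minimal sketch (not part of this patch): invoke the lm-eval 0.4.2 wrapper added in
# eval_042/evaluation.py directly, instead of going through run_eval.sh.
# Assumptions: run from examples/multimodal-modeling/Qwen-VL; quantized model exported
# to ./tmp_autoround/Qwen-VL (hypothetical path, mirrors run_eval.sh).
import time

# Imported for its side effect of enabling auto-round checkpoints in transformers,
# as done in the README inference examples of this PR.
from auto_round.auto_quantizer import AutoHfQuantizer
from lm_eval.utils import make_table

from eval_042.evaluation import simple_evaluate  # wrapper introduced by this PR

quantized_model_path = "./tmp_autoround/Qwen-VL"

start = time.time()
result = simple_evaluate(
    model="hf",
    model_args=f"pretrained={quantized_model_path},trust_remote_code=True",
    tasks=["lambada_openai", "piqa"],  # any subset of the text tasks in the README table
    device="cuda:0",
    batch_size=16,
)
print(make_table(result))
print("cost time: ", time.time() - start)
```

Note that the script's own `__main__` additionally wraps the call in `torch.cuda.amp.autocast()`, and the multimodal tasks (textVQA, scienceVQA) still go through `mm_evaluation/main.py` as in `run_eval.sh`; this sketch covers only the text-only lm-eval tasks.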