diff --git a/.github/code_spell_ignore.txt b/.github/code_spell_ignore.txt index c7e0b759..3221d7c7 100644 --- a/.github/code_spell_ignore.txt +++ b/.github/code_spell_ignore.txt @@ -1,3 +1,9 @@ rouge Rouge -ROUGE \ No newline at end of file +ROUGE +svae +doesnt +thered +theyre +whos +youre diff --git a/evals/evaluation/llava/__init__.py b/evals/evaluation/llava/__init__.py new file mode 100644 index 00000000..916f3a44 --- /dev/null +++ b/evals/evaluation/llava/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/evals/evaluation/llava/eval_gpt_mmvet.py b/evals/evaluation/llava/eval_gpt_mmvet.py new file mode 100644 index 00000000..a0de1213 --- /dev/null +++ b/evals/evaluation/llava/eval_gpt_mmvet.py @@ -0,0 +1,285 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os +import time +from collections import Counter + +import numpy as np +import openai +import pandas as pd +from tqdm import tqdm + +parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.") +parser.add_argument("--mmvet_path") +parser.add_argument("--ckpt_name") +parser.add_argument("--result_path") +args = parser.parse_args() + + +openai.api_key = "" +gpt_model = "gpt-4-0613" + + +prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score. + +Question | Ground truth | Prediction | Correctness +--- | --- | --- | --- +What is x in the equation? | -1 -5 | x = 3 | 0.0 +What is x in the equation? | -1 -5 | x = -1 | 0.5 +What is x in the equation? | -1 -5 | x = -5 | 0.5 +What is x in the equation? | -1 -5 | x = -5 or 5 | 0.5 +What is x in the equation? | -1 -5 | x = -1 or x = -5 | 1.0 +Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4 +Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. 
The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0 +""" + +# load metadata +# Download mm-vet.zip and `unzip mm-vet.zip` and change the path below +mmvet_path = args.mmvet_path +use_sub_set = False +decimal_places = 1 # number of decimal places to round to + +if use_sub_set: + bard_set_file = os.path.join(mmvet_path, "bard_set.json") + with open(bard_set_file, "r") as f: + sub_set = json.load(f) + sub_set_name = "bardset" + sub_set_name = sub_set_name + "_" +else: + sub_set = None + sub_set_name = "" + +mmvet_metadata = os.path.join(mmvet_path, "mm-vet.json") +with open(mmvet_metadata, "r") as f: + data = json.load(f) + +counter = Counter() +cap_set_list = [] +cap_set_counter = [] +len_data = 0 +for id, value in data.items(): + if sub_set is not None and id not in sub_set: + continue + question = value["question"] + answer = value["answer"] + cap = value["capability"] + cap = set(cap) + counter.update(cap) + if cap not in cap_set_list: + cap_set_list.append(cap) + cap_set_counter.append(1) + else: + cap_set_counter[cap_set_list.index(cap)] += 1 + + len_data += 1 + +sorted_list = counter.most_common() +columns = [k for k, v in sorted_list] +columns.append("total") +columns.append("std") +columns.append("runs") +df = pd.DataFrame(columns=columns) + +cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter)) +new_cap_set_list = [] +new_cap_set_counter = [] +for index in cap_set_sorted_indices: + new_cap_set_list.append(cap_set_list[index]) + new_cap_set_counter.append(cap_set_counter[index]) + +cap_set_list = new_cap_set_list +cap_set_counter = new_cap_set_counter +cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list] + +columns2 = cap_set_names +columns2.append("total") +columns2.append("std") +columns2.append("runs") +df2 = pd.DataFrame(columns=columns2) + + +###### change your model name ###### +model = args.ckpt_name +result_path = args.result_path +num_run = 1 # we set it as 5 in the paper +model_results_file = os.path.join(result_path, f"{model}.json") + +# grade results for each sample to svae +grade_file = f"{model}_{gpt_model}-grade-{num_run}runs.json" +grade_file = os.path.join(result_path, grade_file) + +# score results regarding capabilities/capability integration to save +cap_score_file = f"{model}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs.csv" +cap_score_file = os.path.join(result_path, cap_score_file) +cap_int_score_file = f"{model}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs.csv" +cap_int_score_file = os.path.join(result_path, cap_int_score_file) + +with open(model_results_file) as f: + results = json.load(f) +if os.path.exists(grade_file): + with open(grade_file, "r") as f: + grade_results = json.load(f) +else: + grade_results = {} + + +def need_more_runs(): + need_more_runs = False + if len(grade_results) > 0: + for k, v in grade_results.items(): + if len(v["score"]) < num_run: + need_more_runs = True + break + return need_more_runs or len(grade_results) < len_data + + +while need_more_runs(): + for j in range(num_run): + print(f"eval run {j}") + for id, line in tqdm(data.items()): + if sub_set is not None and id not in sub_set: + continue + if id in grade_results and len(grade_results[id]["score"]) >= (j + 1): + continue + + model_pred = results[id] + + question = ( + prompt + + "\n" + + " | ".join( + [ + line["question"], + line["answer"].replace("", " ").replace("", " "), + model_pred, + "", + ] + ) + ) + messages = [ + {"role": "user", 
"content": question}, + ] + + if id not in grade_results: + sample_grade = {"model": [], "content": [], "score": []} + else: + sample_grade = grade_results[id] + + grade_sample_run_complete = False + temperature = 0.0 + + while not grade_sample_run_complete: + try: + response = openai.ChatCompletion.create( + model=gpt_model, max_tokens=3, temperature=temperature, messages=messages + ) + # print(response['model']) + content = response["choices"][0]["message"]["content"] + flag = True + try_time = 1 + while flag: + try: + content = content.split(" ")[0].strip() + score = float(content) + if score > 1.0 or score < 0.0: + assert False + flag = False + except: + question = ( + prompt + + "\n" + + " | ".join( + [ + line["question"], + line["answer"].replace("", " ").replace("", " "), + model_pred, + "", + ] + ) + + "\nPredict the correctness of the answer (digit): " + ) + messages = [ + {"role": "user", "content": question}, + ] + response = openai.ChatCompletion.create( + model=gpt_model, max_tokens=3, temperature=temperature, messages=messages + ) + # print(response) + content = response["choices"][0]["message"]["content"] + try_time += 1 + temperature += 0.5 + print(f"{id} try {try_time} times") + print(content) + if try_time > 5: + score = 0.0 + flag = False + grade_sample_run_complete = True + except Exception as e: + print(e) + # gpt4 may have token rate limit + print("sleep 1s") + time.sleep(1) + + if len(sample_grade["model"]) >= j + 1: + sample_grade["model"][j] = response["model"] + sample_grade["content"][j] = content + sample_grade["score"][j] = score + else: + sample_grade["model"].append(response["model"]) + sample_grade["content"].append(content) + sample_grade["score"].append(score) + grade_results[id] = sample_grade + + with open(grade_file, "w") as f: + json.dump(grade_results, f, indent=4) + +assert not need_more_runs() +cap_socres = {k: [0.0] * num_run for k in columns[:-2]} +counter["total"] = len_data + +cap_socres2 = {k: [0.0] * num_run for k in columns2[:-2]} +counter2 = {columns2[i]: cap_set_counter[i] for i in range(len(cap_set_counter))} +counter2["total"] = len_data + +for k, v in grade_results.items(): + if sub_set is not None and k not in sub_set: + continue + for i in range(num_run): + score = v["score"][i] + caps = set(data[k]["capability"]) + for c in caps: + cap_socres[c][i] += score + + cap_socres["total"][i] += score + + index = cap_set_list.index(caps) + cap_socres2[cap_set_names[index]][i] += score + cap_socres2["total"][i] += score + +for k, v in cap_socres.items(): + cap_socres[k] = np.array(v) / counter[k] * 100 + +std = round(cap_socres["total"].std(), decimal_places) +total_copy = cap_socres["total"].copy() +runs = str(list(np.round(total_copy, decimal_places))) + +for k, v in cap_socres.items(): + cap_socres[k] = round(v.mean(), decimal_places) + +cap_socres["std"] = std +cap_socres["runs"] = runs +df.loc[model] = cap_socres + +for k, v in cap_socres2.items(): + cap_socres2[k] = round(np.mean(np.array(v) / counter2[k] * 100), decimal_places) +cap_socres2["std"] = std +cap_socres2["runs"] = runs +df2.loc[model] = cap_socres2 + +df.to_csv(cap_score_file) +df2.to_csv(cap_int_score_file) +print(df) +print(df2) diff --git a/evals/evaluation/llava/eval_gpt_review_bench.py b/evals/evaluation/llava/eval_gpt_review_bench.py new file mode 100644 index 00000000..11db87d3 --- /dev/null +++ b/evals/evaluation/llava/eval_gpt_review_bench.py @@ -0,0 +1,129 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import 
json +import os +import time + +import openai + +NUM_SECONDS_TO_SLEEP = 0.5 + + +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model="gpt-4-0314", + messages=[ + { + "role": "system", + "content": "You are a helpful and precise assistant for checking the quality of the answer.", + }, + { + "role": "user", + "content": content, + }, + ], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + return response["choices"][0]["message"]["content"] + + +def parse_score(review): + try: + score_pair = review.split("\n")[0] + score_pair = score_pair.replace(",", " ") + sp = score_pair.split(" ") + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print("error", review) + return [-1, -1] + except Exception as e: + print(e) + print("error", review) + return [-1, -1] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.") + parser.add_argument("-q", "--question") + parser.add_argument("-c", "--context") + parser.add_argument("-a", "--answer-list", nargs="+", default=[]) + parser.add_argument("-r", "--rule") + parser.add_argument("-o", "--output") + parser.add_argument("--max-tokens", type=int, default=1024, help="maximum number of tokens produced in the output") + args = parser.parse_args() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), "r")) + + if os.path.isfile(os.path.expanduser(args.output)): + cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] + else: + cur_reviews = [] + + review_file = open(f"{args.output}", "a") + + context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] + image_to_context = {context["image"]: context for context in context_list} + + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + quest = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + inst = image_to_context[quest["image"]] + + if isinstance(inst["caption"], list): + cap_str = "\n".join(inst["caption"]) + else: + cap_str = inst["caption"] + + category = "llava_bench_" + json.loads(ques_js)["category"] + if category in rule_dict: + rule = rule_dict[category] + else: + assert False, f"Visual QA category not found in rule file: {category}." 
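+            # Assemble the GPT-4 review prompt: the image captions serve as [Context], followed by
+            # the question, both assistants' answers, and the rule's [System] instruction. GPT-4
+            # replies with a pair of scores on its first line, which parse_score() extracts below.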
+ prompt = rule["prompt"] + role = rule["role"] + content = ( + f"[Context]\n{cap_str}\n\n" + f'[Question]\n{quest["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f"[System]\n{prompt}\n\n" + ) + cur_js = { + "id": idx + 1, + "question_id": quest["question_id"], + "answer1_id": ans1.get("answer_id", ans1["question_id"]), + "answer2_id": ans2.get("answer_id", ans2["answer_id"]), + "category": category, + } + if idx >= len(cur_reviews): + review = get_eval(content, args.max_tokens) + scores = parse_score(review) + cur_js["content"] = review + cur_js["tuple"] = scores + review_file.write(json.dumps(cur_js) + "\n") + review_file.flush() + else: + print(f"Skipping {idx} as we already have it.") + idx += 1 + print(idx) + review_file.close() diff --git a/evals/evaluation/llava/eval_pope.py b/evals/evaluation/llava/eval_pope.py new file mode 100644 index 00000000..a8104dc7 --- /dev/null +++ b/evals/evaluation/llava/eval_pope.py @@ -0,0 +1,87 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import argparse +import json +import os + + +def eval_pope(answers, label_file): + label_list = [json.loads(q)["label"] for q in open(label_file, "r")] + + for answer in answers: + text = answer["text"] + + # Only keep the first sentence + if text.find(".") != -1: + text = text.split(".")[0] + + text = text.replace(",", "") + words = text.split(" ") + if "No" in words or "not" in words or "no" in words: + answer["text"] = "no" + else: + answer["text"] = "yes" + + for i in range(len(label_list)): + if label_list[i] == "no": + label_list[i] = 0 + else: + label_list[i] = 1 + + pred_list = [] + for answer in answers: + if answer["text"] == "no": + pred_list.append(0) + else: + pred_list.append(1) + + pos = 1 + neg = 0 + yes_ratio = pred_list.count(1) / len(pred_list) + + TP, TN, FP, FN = 0, 0, 0, 0 + for pred, label in zip(pred_list, label_list): + if pred == pos and label == pos: + TP += 1 + elif pred == pos and label == neg: + FP += 1 + elif pred == neg and label == neg: + TN += 1 + elif pred == neg and label == pos: + FN += 1 + + print("TP\tFP\tTN\tFN\t") + print("{}\t{}\t{}\t{}".format(TP, FP, TN, FN)) + + precision = float(TP) / float(TP + FP) + recall = float(TP) / float(TP + FN) + f1 = 2 * precision * recall / (precision + recall) + acc = (TP + TN) / (TP + TN + FP + FN) + print("Accuracy: {}".format(acc)) + print("Precision: {}".format(precision)) + print("Recall: {}".format(recall)) + print("F1 score: {}".format(f1)) + print("Yes ratio: {}".format(yes_ratio)) + print("%.3f, %.3f, %.3f, %.3f, %.3f" % (f1, acc, precision, recall, yes_ratio)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-dir", type=str) + parser.add_argument("--question-file", type=str) + parser.add_argument("--result-file", type=str) + args = parser.parse_args() + + questions = [json.loads(line) for line in open(args.question_file)] + questions = {question["question_id"]: question for question in questions} + answers = [json.loads(q) for q in open(args.result_file)] + for file in os.listdir(args.annotation_dir): + assert file.startswith("coco_pope_") + assert file.endswith(".json") + category = file[10:-5] + cur_answers = [x for x in answers if questions[x["question_id"]]["category"] == category] + print("Category: {}, # samples: {}".format(category, len(cur_answers))) + eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) + 
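+        # eval_pope() normalizes each free-form answer to a binary yes/no and reports
+        # TP/FP/TN/FN, accuracy, precision, recall, F1, and the yes-ratio for this POPE category.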
print("====================================") diff --git a/evals/evaluation/llava/eval_science_qa.py b/evals/evaluation/llava/eval_science_qa.py new file mode 100644 index 00000000..5a299f42 --- /dev/null +++ b/evals/evaluation/llava/eval_science_qa.py @@ -0,0 +1,117 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os +import random +import re + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base-dir", type=str) + parser.add_argument("--result-file", type=str) + parser.add_argument("--output-file", type=str) + parser.add_argument("--output-result", type=str) + parser.add_argument("--split", type=str, default="test") + parser.add_argument("--options", type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result["question_id"] + caption = result["text"] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """Get the index (e.g. 2) from the prediction (e.g. 'C')""" + if prediction in options[: len(choices)]: + return options.index(prediction) + else: + return -1 + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + predictions = [json.loads(line) for line in open(args.result_file)] + predictions = {pred["question_id"]: pred for pred in predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + results = {"correct": [], "incorrect": []} + sqa_results = {} + sqa_results["acc"] = None + sqa_results["correct"] = None + sqa_results["count"] = None + sqa_results["results"] = {} + sqa_results["outputs"] = {} + + for prob_id, prob in split_problems.items(): + if prob_id not in predictions: + pred = {"text": "FAILED", "prompt": "Unknown"} + pred_text = "FAILED" + else: + pred = predictions[prob_id] + pred_text = pred["text"] + + if pred_text in args.options: + answer = pred_text + elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": + answer = pred_text[0] + else: + pattern = re.compile(r"The answer is ([A-Z]).") + res = pattern.findall(pred_text) + if len(res) == 1: + answer = res[0] # 'A', 'B', ... 
+ else: + answer = "FAILED" + + pred_idx = get_pred_idx(answer, prob["choices"], args.options) + + analysis = { + "question_id": prob_id, + "parsed_ans": answer, + "ground_truth": args.options[prob["answer"]], + "question": pred["prompt"], + "pred": pred_text, + "is_multimodal": "" in pred["prompt"], + } + + sqa_results["results"][prob_id] = get_pred_idx(answer, prob["choices"], args.options) + sqa_results["outputs"][prob_id] = pred_text + + if pred_idx == prob["answer"]: + results["correct"].append(analysis) + else: + results["incorrect"].append(analysis) + + correct = len(results["correct"]) + total = len(results["correct"]) + len(results["incorrect"]) + + ###### IMG ###### + multimodal_correct = len([x for x in results["correct"] if x["is_multimodal"]]) + multimodal_incorrect = len([x for x in results["incorrect"] if x["is_multimodal"]]) + multimodal_total = multimodal_correct + multimodal_incorrect + ###### IMG ###### + + print( + f"Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%" + ) + + sqa_results["acc"] = correct / total * 100 + sqa_results["correct"] = correct + sqa_results["count"] = total + + with open(args.output_file, "w") as f: + json.dump(results, f, indent=2) + with open(args.output_result, "w") as f: + json.dump(sqa_results, f, indent=2) diff --git a/evals/evaluation/llava/eval_textvqa.py b/evals/evaluation/llava/eval_textvqa.py new file mode 100644 index 00000000..318922be --- /dev/null +++ b/evals/evaluation/llava/eval_textvqa.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os +import re + +from .m4c_evaluator import TextVQAAccuracyEvaluator + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str) + parser.add_argument("--result-file", type=str) + parser.add_argument("--result-dir", type=str) + return parser.parse_args() + + +def prompt_processor(prompt): + if prompt.startswith("OCR tokens: "): + pattern = r"Question: (.*?) 
Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif "Reference OCR token: " in prompt and len(prompt.split("\n")) == 3: + if prompt.startswith("Reference OCR token:"): + question = prompt.split("\n")[1] + else: + question = prompt.split("\n")[0] + elif len(prompt.split("\n")) == 2: + question = prompt.split("\n")[0] + else: + assert False + + return question.lower() + + +def eval_single(annotation_file, result_file): + experiment_name = os.path.splitext(os.path.basename(result_file))[0] + print(experiment_name) + annotations = json.load(open(annotation_file))["data"] + annotations = {(annotation["image_id"], annotation["question"].lower()): annotation for annotation in annotations} + results = [json.loads(line) for line in open(result_file)] + + pred_list = [] + for result in results: + annotation = annotations[(result["question_id"], prompt_processor(result["prompt"]))] + pred_list.append( + { + "pred_answer": result["text"], + "gt_answers": annotation["answers"], + } + ) + + evaluator = TextVQAAccuracyEvaluator() + print("Samples: {}\nAccuracy: {:.2f}%\n".format(len(pred_list), 100.0 * evaluator.eval_pred_list(pred_list))) + + +if __name__ == "__main__": + args = get_args() + + if args.result_file is not None: + eval_single(args.annotation_file, args.result_file) + + if args.result_dir is not None: + for result_file in sorted(os.listdir(args.result_dir)): + if not result_file.endswith(".jsonl"): + print(f"Skipping {result_file}") + continue + eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) diff --git a/evals/evaluation/llava/m4c_evaluator.py b/evals/evaluation/llava/m4c_evaluator.py new file mode 100644 index 00000000..a014bf62 --- /dev/null +++ b/evals/evaluation/llava/m4c_evaluator.py @@ -0,0 +1,330 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import re + +from tqdm import tqdm + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "aren't": "aren't", + "can't": "can't", + "couldve": "could've", + "couldn't": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didn't": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadn't": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasn't": "hasn't", + "haven't": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isn't": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "o'clock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldn't": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "that's": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "there's": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasn't": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "weren't": "weren't", + "whatll": "what'll", + "whatre": "what're", + "what's": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "won't": "won't", + "wouldve": "would've", + "wouldn't": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = 
re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or (re.search(self.COMMA_STRIP, in_text) is not None): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item + + +class TextVQAAccuracyEvaluator: + def __init__(self): + self.answer_processor = EvalAIAnswerProcessor() + + def _compute_answer_scores(self, raw_answers): + """Compute the accuracy (soft score) of human answers.""" + answers = [self.answer_processor(a) for a in raw_answers] + assert len(answers) == 10 + gt_answers = list(enumerate(answers)) + unique_answers = set(answers) + unique_answer_scores = {} + + for unique_answer in unique_answers: + accs = [] + for gt_answer in gt_answers: + other_answers = [item for item in gt_answers if item != gt_answer] + matching_answers = [item for item in other_answers if item[1] == unique_answer] + acc = min(1, float(len(matching_answers)) / 3) + accs.append(acc) + unique_answer_scores[unique_answer] = sum(accs) / len(accs) + + return unique_answer_scores + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in tqdm(pred_list): + pred_answer = self.answer_processor(entry["pred_answer"]) + unique_answer_scores = self._compute_answer_scores(entry["gt_answers"]) + score = unique_answer_scores.get(pred_answer, 0.0) + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class STVQAAccuracyEvaluator: + def __init__(self): + self.answer_processor = EvalAIAnswerProcessor() + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in pred_list: + pred_answer = self.answer_processor(entry["pred_answer"]) + gts = [self.answer_processor(a) for a in entry["gt_answers"]] + score = 1.0 if pred_answer in gts else 0.0 + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class STVQAANLSEvaluator: + def __init__(self): + import editdistance # install with `pip install editdistance` + + self.get_edit_distance = editdistance.eval + + def get_anls(self, s1, s2): + s1 = s1.lower().strip() + s2 = s2.lower().strip() + iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2)) + anls = iou if iou >= 0.5 else 0.0 + return anls + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in pred_list: + anls = 
max(self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]) + pred_scores.append(anls) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy + + +class TextCapsBleu4Evaluator: + def __init__(self): + # The following script requires Java 1.8.0 and pycocotools installed. + # The pycocoevalcap can be installed with pip as + # pip install git+https://github.com/ronghanghu/coco-caption.git@python23 + # Original pycocoevalcap code is at https://github.com/tylin/coco-caption + # but has no python3 support yet. + try: + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer + except ModuleNotFoundError: + print( + "Please install pycocoevalcap module using " + "pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa + ) + raise + + self.tokenizer = PTBTokenizer() + self.scorer = Bleu(4) + + def eval_pred_list(self, pred_list): + # Create reference and hypotheses captions. + gts = {} + res = {} + for idx, entry in enumerate(pred_list): + gts[idx] = [{"caption": a} for a in entry["gt_answers"]] + res[idx] = [{"caption": entry["pred_answer"]}] + + gts = self.tokenizer.tokenize(gts) + res = self.tokenizer.tokenize(res) + score, _ = self.scorer.compute_score(gts, res) + + bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4) + return bleu4 diff --git a/evals/evaluation/llava/summarize_gpt_review.py b/evals/evaluation/llava/summarize_gpt_review.py new file mode 100644 index 00000000..712fef45 --- /dev/null +++ b/evals/evaluation/llava/summarize_gpt_review.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os +from collections import defaultdict + +import numpy as np + + +def parse_args(): + parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.") + parser.add_argument("-d", "--dir", default=None) + parser.add_argument("-v", "--version", default=None) + parser.add_argument("-s", "--select", nargs="*", default=None) + parser.add_argument("-f", "--files", nargs="*", default=[]) + parser.add_argument("-i", "--ignore", nargs="*", default=[]) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + if args.ignore is not None: + args.ignore = [int(x) for x in args.ignore] + + if len(args.files) > 0: + review_files = args.files + else: + review_files = [ + x + for x in os.listdir(args.dir) + if x.endswith(".jsonl") + and ( + x.startswith("gpt4_text") or x.startswith("reviews_") or x.startswith("review_") or "review" in args.dir + ) + ] + + for review_file in sorted(review_files): + config = os.path.basename(review_file).replace("gpt4_text_", "").replace(".jsonl", "") + if args.select is not None and any(x not in config for x in args.select): + continue + if "0613" in config: + version = "0613" + else: + version = "0314" + if args.version is not None and args.version != version: + continue + scores = defaultdict(list) + print(config) + with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: + for review_str in f: + review = json.loads(review_str) + if review["question_id"] in args.ignore: + continue + if "category" in review: + scores[review["category"]].append(review["tuple"]) + scores["all"].append(review["tuple"]) + else: + if "tuple" in review: + scores["all"].append(review["tuple"]) + else: + scores["all"].append(review["score"]) + for k, v in sorted(scores.items()): + stats = 
np.asarray(v).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # print(k, stats, round(stats[1]/stats[0]*100, 1)) + print(k, round(stats[1] / stats[0] * 100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) + print("=================================") diff --git a/examples/VisualQnA/README.md b/examples/VisualQnA/README.md new file mode 100644 index 00000000..591ddd0c --- /dev/null +++ b/examples/VisualQnA/README.md @@ -0,0 +1,267 @@ +# VisualQnA Accuracy Evaluation + +## Data preparation + +Following [LLaVA's instructions](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md). **You MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `vqa_eval`. This also provides a general structure for all datasets. + +After downloading all of them, organize the data as follows in `vqa_eval`. + +```bash +vqa_eval/ +├── gqa +│   ├── answers +│   │   └── llava_gqa_testdev_balanced +│   │   └── llava-v1.5-13b.jsonl +│   ├── data +│   └── llava_gqa_testdev_balanced.jsonl +├── llava-bench-in-the-wild +│   ├── answers +│   │   ├── llava-v1.5-13b.jsonl +│   │   └── llava-v1.5-7b.jsonl +│   └── reviews +│   ├── llava-v1.5-13b-eval1.jsonl +│   ├── llava-v1.5-13b-eval2.jsonl +│   ├── llava-v1.5-13b-eval3.jsonl +│   ├── llava-v1.5-7b-eval1.jsonl +│   ├── llava-v1.5-7b-eval2.jsonl +│   └── llava-v1.5-7b-eval3.jsonl +├── mmbench +│   ├── answers +│   │   └── mmbench_dev_20230712 +│   │   └── llava-v1.5-13b.jsonl +│   └── answers_upload +│   └── mmbench_dev_20230712 +│   └── llava-v1.5-13b.xlsx +├── MME +│   ├── answers +│   │   └── llava-v1.5-13b.jsonl +│   ├── convert_answer_to_mme.py +│   └── llava_mme.jsonl +├── mm-vet +│   ├── answers +│   │   ├── llava-v1.5-13b.jsonl +│   │   └── llava-v1.5-7b.jsonl +│   ├── convert_answers.py +│   ├── llava-mm-vet.jsonl +│   └── results +│   ├── llava-v1.5-13b_gpt-4-cap-int-score-1runs.csv +│   ├── llava-v1.5-13b_gpt-4-cap-int-score-3runs.csv +│   ├── llava-v1.5-13b_gpt-4-cap-score-1runs.csv +│   ├── llava-v1.5-13b_gpt-4-cap-score-3runs.csv +│   ├── llava-v1.5-13b_gpt-4-grade-1runs.json +│   ├── llava-v1.5-13b_gpt-4-grade-3runs.json +│   ├── llava-v1.5-13b.json +│   ├── llava-v1.5-7b_gpt-4-cap-int-score-1runs.csv +│   ├── llava-v1.5-7b_gpt-4-cap-int-score-3runs.csv +│   ├── llava-v1.5-7b_gpt-4-cap-score-1runs.csv +│   ├── llava-v1.5-7b_gpt-4-cap-score-3runs.csv +│   ├── llava-v1.5-7b_gpt-4-grade-1runs.json +│   ├── llava-v1.5-7b_gpt-4-grade-3runs.json +│   └── llava-v1.5-7b.json +├── pope +│   ├── answers +│   │   └── llava-v1.5-13b.jsonl +│   └── llava_pope_test.jsonl +├── scienceqa +│   ├── answers +│   │   ├── llava-v1.5-13b.jsonl +│   │   ├── llava-v1.5-13b_output.jsonl +│   │   └── llava-v1.5-13b_result.json +│   └── llava_test_CQM-A.json +├── seed_bench +│   ├── answers +│   │   └── llava-v1.5-13b +│   │   └── merge.jsonl +│   ├── answers_upload +│   │   └── llava-v1.5-13b.jsonl +│   ├── extract_video_frames.py +│   └── llava-seed-bench.jsonl +├── textvqa +│   ├── answers +│   │   └── llava-v1.5-13b.jsonl +│   └── llava_textvqa_val_v051_ocr.jsonl +├── vizwiz +│   ├── answers +│   │   └── llava-v1.5-13b.jsonl +│   ├── answers_upload +│   │   └── llava-v1.5-13b.json +│ ├── llava_test.jsonl +│ ├── test +│ ├── test.json +└── vqav2 + ├── answers + │   └── llava_vqav2_mscoco_test-dev2015 + │   └── llava-v1.5-13b + │   └── merge.jsonl + ├── answers_upload + │   └── 
llava_vqav2_mscoco_test-dev2015 + │   └── llava-v1.5-13b.json + ├── llava_vqav2_mscoco_test2015.jsonl + ├── llava_vqav2_mscoco_test-dev2015.jsonl + └── test2015 +``` + +## Evaluate VisualQnA + +Our evaluation code comes from [LLaVA project](https://github.com/haotian-liu/LLaVA), thanks for their contribution! + +### Launch VisualQnA Service +Please refer to [VisualQnA](https://github.com/opea-project/GenAIExamples/blob/main/VisualQnA/README.md) to deploy VisualQnA Service. + + +Use cURL command to test VisualQnA service and ensure that it has started properly. +```bash +curl http://${host_ip}:80/v1/visualqna -H "Content-Type: application/json" -d '{ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What'\''s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "https://www.ilankelman.org/stopsigns/australia.jpg" + } + } + ] + } + ], + "max_tokens": 300 + }' +``` + +### Generation and Evaluation + +#### VQAv2 + +1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `vqa_eval/vqav2`. +2. Inference. + +```bash +bash scripts/vqav2.sh +``` + +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `vqa_eval/vqav2/answers_upload`. + + +#### GQA + +1. Download the data following the official instructions [here](https://cs.stanford.edu/people/dorarad/gqa/download.html) and put under `vqa_eval/gqa/data`. +2. Inference. + + +```bash +bash scripts/gqa.sh +``` + +#### VisWiz + +1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `vqa_eval/vizwiz`. +2. inference. + +```bash +bash scripts/vizwiz.sh +``` + +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/1911/my-submission): `vqa_eval/vizwiz/answers_upload`. + + +#### ScienceQA + +1. Under `vqa_eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA). +2. inference. + +```bash +bash scripts/sqa.sh +``` + +#### TextVQA + +1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `eval/textvqa`. +2. inference and evaluate. + +```bash +bash scripts/textvqa.sh +``` + +#### POPE + +1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `vqa_eval/pope`. +2. inference and evaluate. + + +```bash +bash scripts/pope.sh +``` + +#### MME +1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). +2. Downloaded images to `MME_Benchmark_release_version`. +3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `vqa_eval/MME`. +4. inference and evaluate. + +```bash +bash scripts/mme.sh +``` + +#### MMBench + +1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `vqa_eval/mmbench`. +2. inference and evaluate. + +```bash +bash scripts/mmbench.sh +``` + +3. 
Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `vqa_eval/mmbench/answers_upload/mmbench_dev_20230712`. + + +#### MMBench-CN + +1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `vqa_eval/mmbench`. +2. inference and evaluate. + +```bash +bash scripts/mmbench_cn.sh +``` + +3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `vqa_eval/mmbench/answers_upload/mmbench_dev_cn_20231003`. + + +#### SEED-Bench + +1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `vqa_eval/seed_bench/SEED-Bench-image`. +2. Extract the video frame in the middle from the downloaded videos, and put them under `vqa_eval/seed_bench/SEED-Bench-video-image`. +3. inference and evaluate. + +```bash +bash scripts/seed.sh +``` + +4. Optionally, submit the results to the leaderboard: `vqa_eval/seed_bench/answers_upload` using the official jupyter notebook. + + +#### LLaVA-Bench-in-the-Wild + +1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `vqa_eval/llava-bench-in-the-wild`. +2. inference and evaluate. + +```bash +bash scripts/llavabench.sh +``` + + +#### MM-Vet + +1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `vqa_eval/mmvet`. +2. inference and evaluate + +```bash +bash scripts/mmvet.sh +``` + + +### Accuracy Result diff --git a/examples/VisualQnA/inference/mmbench_generation.py b/examples/VisualQnA/inference/mmbench_generation.py new file mode 100644 index 00000000..8e9509e1 --- /dev/null +++ b/examples/VisualQnA/inference/mmbench_generation.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import json +import math +import os +from io import BytesIO + +import pandas as pd +import requests +import shortuuid +from PIL import Image +from tqdm import tqdm + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks.""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def is_none(value): + if value is None: + return True + if type(value) is float and math.isnan(value): + return True + if type(value) is str and value.lower() == "nan": + return True + if type(value) is str and value.lower() == "none": + return True + return False + + +def get_options(row, options): + parsed_options = [] + for option in options: + option_value = row[option] + if is_none(option_value): + break + parsed_options.append(option_value) + return parsed_options + + +all_options = ["A", "B", "C", "D"] + + +def eval_model(args): + questions = pd.read_table(os.path.expanduser(args.question_file)) + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + + cnt = -1 + for index, row in tqdm(questions.iterrows(), total=len(questions)): + options = get_options(row, all_options) + cur_option_char = 
all_options[: len(options)] + + if args.all_rounds: + num_rounds = len(options) + else: + num_rounds = 1 + + for round_idx in range(num_rounds): + cnt += 1 + idx = row["index"] + question = row["question"] + hint = row["hint"] + image = load_image_from_base64(row["image"]) # Assumes image is base64 encoded + if not is_none(hint): + question = hint + "\n" + question + for option_char, option in zip(all_options[: len(options)], options): + question = question + "\n" + option_char + ". " + option + + # Prepare data for the POST request + payload = { + "question": question, + "image": row["image"], # Assuming image is already base64 encoded + "options": options, + } + + # Send POST request + response = requests.post(args.service_url, json=payload) + if response.status_code == 200: + outputs = response.json().get("answer", "").strip() + else: + print(f"Error: Received status code {response.status_code}") + outputs = "" + + ans_id = shortuuid.uuid() + ans_file.write( + json.dumps( + { + "question_id": idx, + "round_id": round_idx, + "prompt": question, + "text": outputs, + "options": options, + "option_char": cur_option_char, + "answer_id": ans_id, + "metadata": {}, + } + ) + + "\n" + ) + ans_file.flush() + + # Rotate options + options = options[1:] + options[:1] + cur_option_char = cur_option_char[1:] + cur_option_char[:1] + + ans_file.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--all-rounds", action="store_true") + parser.add_argument("--lang", type=str, default="en") + parser.add_argument("--service-url", type=str, required=True, help="URL of the VisualQnA service") + args = parser.parse_args() + + eval_model(args) diff --git a/examples/VisualQnA/inference/vqa_generation.py b/examples/VisualQnA/inference/vqa_generation.py new file mode 100644 index 00000000..90fd04f7 --- /dev/null +++ b/examples/VisualQnA/inference/vqa_generation.py @@ -0,0 +1,94 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import math +import os + +import requests +import shortuuid +from PIL import Image +from tqdm import tqdm + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks.""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def eval_model(args): + questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + + cnt = -1 + for line in tqdm(questions, total=len(questions)): + cnt += 1 + idx = line["question_id"] + cur_prompt = line["text"] + image_file = line["image"] + + # Construct the payload for the HTTP request + payload = { + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": cur_prompt}, + {"type": "image_url", "image_url": {"url": f"http://{args.image_folder}/{image_file}"}}, + ], + } + ], + "max_tokens": args.max_new_tokens, + } + + # Send the HTTP request to the VisualQnA service + response = requests.post( + f"http://{args.host_ip}:8888/v1/visualqna", 
headers={"Content-Type": "application/json"}, json=payload + ) + + if response.status_code == 200: + outputs = response.json()["choices"][0]["message"]["content"] + else: + print(f"Failed to get response from VisualQnA service: {response.status_code}") + outputs = "Error in response" + + print(outputs) + ans_id = shortuuid.uuid() + ans_file.write( + json.dumps( + { + "question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": "visualqna_service", + "metadata": {}, + } + ) + + "\n" + ) + ans_file.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--max_new_tokens", type=int, default=128) + args = parser.parse_args() + + eval_model(args) diff --git a/examples/VisualQnA/scripts/convert_gqa_for_eval.py b/examples/VisualQnA/scripts/convert_gqa_for_eval.py new file mode 100644 index 00000000..67a1e06f --- /dev/null +++ b/examples/VisualQnA/scripts/convert_gqa_for_eval.py @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +all_answers = [] +for line_idx, line in enumerate(open(args.src)): + res = json.loads(line) + question_id = res["question_id"] + text = res["text"].rstrip(".").lower() + all_answers.append({"questionId": question_id, "prediction": text}) + +with open(args.dst, "w") as f: + json.dump(all_answers, f) diff --git a/examples/VisualQnA/scripts/convert_mmbench_for_submission.py b/examples/VisualQnA/scripts/convert_mmbench_for_submission.py new file mode 100644 index 00000000..9b6fd350 --- /dev/null +++ b/examples/VisualQnA/scripts/convert_mmbench_for_submission.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os + +import pandas as pd + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str, required=True) + parser.add_argument("--result-dir", type=str, required=True) + parser.add_argument("--upload-dir", type=str, required=True) + parser.add_argument("--experiment", type=str, required=True) + + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + df = pd.read_table(args.annotation_file) + + cur_df = df.copy() + cur_df = cur_df.drop(columns=["hint", "category", "source", "image", "comment", "l2-category"]) + cur_df.insert(6, "prediction", None) + for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): + pred = json.loads(pred) + cur_df.loc[df["index"] == pred["question_id"], "prediction"] = pred["text"] + + cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine="openpyxl") diff --git a/examples/VisualQnA/scripts/convert_mmvet_for_eval.py b/examples/VisualQnA/scripts/convert_mmvet_for_eval.py new file mode 100644 index 00000000..d10f641c --- /dev/null +++ b/examples/VisualQnA/scripts/convert_mmvet_for_eval.py @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +cur_result = {} + +for line in open(args.src): + data = json.loads(line) + qid = data["question_id"] + cur_result[f"v1_{qid}"] = data["text"] + +with open(args.dst, "w") as f: + json.dump(cur_result, f, indent=2) diff --git a/examples/VisualQnA/scripts/convert_seed_for_submission.py b/examples/VisualQnA/scripts/convert_seed_for_submission.py new file mode 100644 index 00000000..29e97875 --- /dev/null +++ b/examples/VisualQnA/scripts/convert_seed_for_submission.py @@ -0,0 +1,75 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str) + parser.add_argument("--result-file", type=str) + parser.add_argument("--result-upload-file", type=str) + return parser.parse_args() + + +def eval_single(result_file, eval_only_type=None): + results = {} + for line in open(result_file): + row = json.loads(line) + results[row["question_id"]] = row + + type_counts = {} + correct_counts = {} + for question_data in data["questions"]: + if eval_only_type is not None and question_data["data_type"] != eval_only_type: + continue + data_type = question_data["question_type_id"] + type_counts[data_type] = type_counts.get(data_type, 0) + 1 + try: + question_id = int(question_data["question_id"]) + except: + question_id = question_data["question_id"] + if question_id not in results: + correct_counts[data_type] = correct_counts.get(data_type, 0) + continue + row = results[question_id] + if row["text"] == question_data["answer"]: + correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 + + total_count = 0 + total_correct = 0 + for data_type in sorted(type_counts.keys()): + accuracy = correct_counts[data_type] / type_counts[data_type] * 100 + if eval_only_type is None: + print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") + + total_count += type_counts[data_type] + total_correct += correct_counts[data_type] + + total_accuracy = total_correct / total_count * 100 + if eval_only_type is None: + print(f"Total accuracy: {total_accuracy:.2f}%") + else: + print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") + + return results + + +if __name__ == "__main__": + args = get_args() + data = json.load(open(args.annotation_file)) + ques_type_id_to_name = {id: n for n, id in data["question_type"].items()} + + results = eval_single(args.result_file) + eval_single(args.result_file, eval_only_type="image") + eval_single(args.result_file, eval_only_type="video") + + with open(args.result_upload_file, "w") as fp: + for question in data["questions"]: + qid = question["question_id"] + if qid in results: + result = results[qid] + else: + result = results[int(qid)] + fp.write(json.dumps({"question_id": qid, "prediction": result["text"]}) + "\n") diff --git a/examples/VisualQnA/scripts/convert_vizwiz_for_submission.py b/examples/VisualQnA/scripts/convert_vizwiz_for_submission.py new file mode 100644 index 00000000..dc99a508 --- /dev/null +++ b/examples/VisualQnA/scripts/convert_vizwiz_for_submission.py @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os + +from evals.evaluation.llava.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): 
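+    # Expects the VizWiz annotation .jsonl, the model's result .jsonl, and the output
+    # path for the answers_upload JSON that is submitted to the EvalAI server.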
+ parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str, required=True) + parser.add_argument("--result-file", type=str, required=True) + parser.add_argument("--result-upload-file", type=str, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(args.result_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + results = {x["question_id"]: x["text"] for x in results} + test_split = [json.loads(line) for line in open(args.annotation_file)] + split_ids = set([x["question_id"] for x in test_split]) + + print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + assert x["question_id"] in results + all_answers.append({"image": x["image"], "answer": answer_processor(results[x["question_id"]])}) + + with open(args.result_upload_file, "w") as f: + json.dump(all_answers, f) diff --git a/examples/VisualQnA/scripts/convert_vqav2_for_submission.py b/examples/VisualQnA/scripts/convert_vqav2_for_submission.py new file mode 100644 index 00000000..6febd6d2 --- /dev/null +++ b/examples/VisualQnA/scripts/convert_vqav2_for_submission.py @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import os + +from evals.evaluation.llava.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--dir", type=str, default="./vqa_eval/vqav2") + parser.add_argument("--ckpt", type=str, required=True) + parser.add_argument("--split", type=str, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + src = os.path.join(args.dir, "answers", args.split, args.ckpt, "merge.jsonl") + test_split = os.path.join(args.dir, "llava_vqav2_mscoco_test2015.jsonl") + dst = os.path.join(args.dir, "answers_upload", args.split, f"{args.ckpt}.json") + os.makedirs(os.path.dirname(dst), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(src)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + + results = {x["question_id"]: x["text"] for x in results} + test_split = [json.loads(line) for line in open(test_split)] + split_ids = set([x["question_id"] for x in test_split]) + + print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + if x["question_id"] not in results: + all_answers.append({"question_id": x["question_id"], "answer": ""}) + else: + all_answers.append({"question_id": x["question_id"], "answer": answer_processor(results[x["question_id"]])}) + + with open(dst, "w") as f: + json.dump(all_answers, open(dst, "w")) diff --git a/examples/VisualQnA/scripts/gpa.sh b/examples/VisualQnA/scripts/gpa.sh new file mode 100644 index 00000000..9e8b7c00 --- /dev/null +++ b/examples/VisualQnA/scripts/gpa.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +SPLIT="llava_gqa_testdev_balanced" +EVAL="vqa_eval" +GQADIR="${EVAL}/gqa/data" + +python3 -m vqa_generation \ + 
--question-file ${EVAL}/gqa/$SPLIT.jsonl \ + --image-folder ${EVAL}/gqa/data/images \ + --answers-file ${EVAL}/gqa/answers/$SPLIT/${CKPT_NAME}/llava_eval.jsonl + +wait + +output_file=${EVAL}/gqa/answers/$SPLIT/${CKPT_NAME}/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +cat ${EVAL}/gqa/answers/$SPLIT/${CKPT_NAME}/llava_eval.jsonl >> "$output_file" + + +mkdir -p $GQADIR/$SPLIT/${CKPT_NAME} +python3 scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/$SPLIT/${CKPT_NAME}/testdev_balanced_predictions.json + +cd $GQADIR +python3 eval/eval_gqa.py --tier $SPLIT/${CKPT_NAME}/testdev_balanced \ + --questions ${EVAL}/gqa/data/questions1.2/testdev_balanced_questions.json diff --git a/examples/VisualQnA/scripts/llavabench.sh b/examples/VisualQnA/scripts/llavabench.sh new file mode 100644 index 00000000..f7cee834 --- /dev/null +++ b/examples/VisualQnA/scripts/llavabench.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/llava-bench-in-the-wild/questions.jsonl \ + --image-folder ${EVAL}/llava-bench-in-the-wild/images \ + --answers-file ${EVAL}/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl + +mkdir -p ${EVAL}/llava-bench-in-the-wild/reviews + +python3 evals/evaluation/llava/eval_gpt_review_bench.py \ + --question ${EVAL}/llava-bench-in-the-wild/questions.jsonl \ + --context ${EVAL}/llava-bench-in-the-wild/context.jsonl \ + --rule moellava/eval/table/rule.json \ + --answer-list ${EVAL}/llava-bench-in-the-wild/answers_gpt4.jsonl \ + ${EVAL}/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl \ + --output ${EVAL}/llava-bench-in-the-wild/reviews/${CKPT_NAME}.jsonl + +python3 evals/evaluation/llava/summarize_gpt_review.py -f ${EVAL}/llava-bench-in-the-wild/reviews/${CKPT_NAME}.jsonl diff --git a/examples/VisualQnA/scripts/mmbench.sh b/examples/VisualQnA/scripts/mmbench.sh new file mode 100644 index 00000000..22785904 --- /dev/null +++ b/examples/VisualQnA/scripts/mmbench.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +SPLIT="mmbench_dev_20230712" + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m mmbench_generation \ + --question-file ${EVAL}/mmbench/$SPLIT.tsv \ + --answers-file ${EVAL}/mmbench/answers/$SPLIT/${CKPT_NAME}.jsonl \ + --single-pred-prompt + +mkdir -p ${EVAL}/mmbench/answers_upload/$SPLIT + +python3 scripts/convert_mmbench_for_submission.py \ + --annotation-file ${EVAL}/mmbench/$SPLIT.tsv \ + --result-dir ${EVAL}/mmbench/answers/$SPLIT \ + --upload-dir ${EVAL}/mmbench/answers_upload/$SPLIT \ + --experiment ${CKPT_NAME} diff --git a/examples/VisualQnA/scripts/mmbench_cn.sh b/examples/VisualQnA/scripts/mmbench_cn.sh new file mode 100644 index 00000000..46273e22 --- /dev/null +++ b/examples/VisualQnA/scripts/mmbench_cn.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +SPLIT="mmbench_dev_cn_20231003" +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + + +python3 -m mmbench_generation \ + --question-file ${EVAL}/mmbench/$SPLIT.tsv \ + --answers-file ${EVAL}/mmbench/answers/$SPLIT/${CKPT_NAME}.jsonl \ + --lang cn \ + --single-pred-prompt + +mkdir -p ${EVAL}/mmbench/answers_upload/$SPLIT + +python
scripts/convert_mmbench_for_submission.py \ + --annotation-file ${EVAL}/mmbench/$SPLIT.tsv \ + --result-dir ${EVAL}/mmbench/answers/$SPLIT \ + --upload-dir ${EVAL}/mmbench/answers_upload/$SPLIT \ + --experiment ${CKPT_NAME} diff --git a/examples/VisualQnA/scripts/mme.sh b/examples/VisualQnA/scripts/mme.sh new file mode 100644 index 00000000..9efc815b --- /dev/null +++ b/examples/VisualQnA/scripts/mme.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + + +python3 -m vqa_generation \ + --question-file ${EVAL}/MME/llava_mme.jsonl \ + --image-folder ${EVAL}/MME/MME_Benchmark_release_version \ + --answers-file ${EVAL}/MME/answers/${CKPT_NAME}.jsonl + +cd ${EVAL}/MME + +python convert_answer_to_mme.py --experiment $CKPT_NAME + +cd eval_tool + +python calculation.py --results_dir answers/$CKPT_NAME diff --git a/examples/VisualQnA/scripts/mmvet.sh b/examples/VisualQnA/scripts/mmvet.sh new file mode 100644 index 00000000..6b222706 --- /dev/null +++ b/examples/VisualQnA/scripts/mmvet.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + + +python3 -m vqa_generation \ + --question-file ${EVAL}/mm-vet/llava-mm-vet.jsonl \ + --image-folder ${EVAL}/mm-vet/images \ + --answers-file ${EVAL}/mm-vet/answers/${CKPT_NAME}.jsonl + +mkdir -p ${EVAL}/mm-vet/results + +python3 scripts/convert_mmvet_for_eval.py \ + --src ${EVAL}/mm-vet/answers/${CKPT_NAME}.jsonl \ + --dst ${EVAL}/mm-vet/results/${CKPT_NAME}.json + + +python3 evals/evaluation/llava/eval_gpt_mmvet.py \ + --mmvet_path ${EVAL}/mm-vet \ + --ckpt_name ${CKPT_NAME} \ + --result_path ${EVAL}/mm-vet/results diff --git a/examples/VisualQnA/scripts/pope.sh b/examples/VisualQnA/scripts/pope.sh new file mode 100644 index 00000000..82b9a42e --- /dev/null +++ b/examples/VisualQnA/scripts/pope.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/pope/llava_pope_test.jsonl \ + --image-folder ${EVAL}/pope/val2014 \ + --answers-file ${EVAL}/pope/answers/${CKPT_NAME}.jsonl \ + +python evals/evaluation/llava/eval_pope.py \ + --annotation-dir ${EVAL}/pope/coco \ + --question-file ${EVAL}/pope/llava_pope_test.jsonl \ + --result-file ${EVAL}/pope/answers/${CKPT_NAME}.jsonl diff --git a/examples/VisualQnA/scripts/seed.sh b/examples/VisualQnA/scripts/seed.sh new file mode 100644 index 00000000..b3f020e8 --- /dev/null +++ b/examples/VisualQnA/scripts/seed.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/seed_bench/llava-seed-bench.jsonl \ + --image-folder ${EVAL}/seed_bench \ + --answers-file ${EVAL}/seed_bench/answers/seed_eval.jsonl + +wait + +output_file=${EVAL}/seed_bench/answers/${CKPT_NAME}/merge.jsonl + +# Clear out the output file if it exists. 
+> "$output_file" + +cat ${EVAL}/seed_bench/answers/seed_eval.jsonl >> "$output_file" + + +# Evaluate +python scripts/convert_seed_for_submission.py \ + --annotation-file ${EVAL}/seed_bench/SEED-Bench.json \ + --result-file $output_file \ + --result-upload-file ${EVAL}/seed_bench/answers_upload/${CKPT_NAME}.jsonl diff --git a/examples/VisualQnA/scripts/sqa.sh b/examples/VisualQnA/scripts/sqa.sh new file mode 100644 index 00000000..8dc121ed --- /dev/null +++ b/examples/VisualQnA/scripts/sqa.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + + +python3 -m vqa_generation \ + --question-file ${EVAL}/scienceqa/llava_test_CQM-A.json \ + --image-folder ${EVAL}/scienceqa/images/test \ + --answers-file ${EVAL}/scienceqa/answers/${CKPT_NAME}.jsonl \ + --single-pred-prompt + +python3 evals/evaluation/llava/eval_science_qa.py \ + --base-dir ${EVAL}/scienceqa \ + --result-file ${EVAL}/scienceqa/answers/${CKPT_NAME}.jsonl \ + --output-file ${EVAL}/scienceqa/answers/${CKPT_NAME}_output.jsonl \ + --output-result ${EVAL}/scienceqa/answers/${CKPT_NAME}_result.json diff --git a/examples/VisualQnA/scripts/textvqa.sh b/examples/VisualQnA/scripts/textvqa.sh new file mode 100644 index 00000000..5295bf0c --- /dev/null +++ b/examples/VisualQnA/scripts/textvqa.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder ${EVAL}/textvqa/train_images \ + --answers-file ${EVAL}/textvqa/answers/${CKPT_NAME}.jsonl + + +python3 -m evals/evaluation/llava/eval_textvqa \ + --annotation-file ${EVAL}/textvqa/TextVQA_0.5.1_val.json \ + --result-file ${EVAL}/textvqa/answers/${CKPT_NAME}.jsonl diff --git a/examples/VisualQnA/scripts/vizwiz.sh b/examples/VisualQnA/scripts/vizwiz.sh new file mode 100644 index 00000000..57e753c1 --- /dev/null +++ b/examples/VisualQnA/scripts/vizwiz.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/vizwiz/llava_test.jsonl \ + --image-folder ${EVAL}/vizwiz/test \ + --answers-file ${EVAL}/vizwiz/answers/${CKPT_NAME}.jsonl + +python3 scripts/convert_vizwiz_for_submission.py \ + --annotation-file ${EVAL}/vizwiz/llava_test.jsonl \ + --result-file ${EVAL}/vizwiz/answers/${CKPT_NAME}.jsonl \ + --result-upload-file ${EVAL}/vizwiz/answers_upload/${CKPT_NAME}.json diff --git a/examples/VisualQnA/scripts/vqav2.sh b/examples/VisualQnA/scripts/vqav2.sh new file mode 100644 index 00000000..c9a95066 --- /dev/null +++ b/examples/VisualQnA/scripts/vqav2.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CKPT_NAME="llava-v1.6-mistral-7b-hf" +CKPT="checkpoints/${CKPT_NAME}" +SPLIT="llava_vqav2_mscoco_test-dev2015" +EVAL="vqa_eval" + +python3 -m vqa_generation \ + --question-file ${EVAL}/vqav2/$SPLIT.jsonl \ + --image-folder ${EVAL}/vqav2/test2015 \ + --answers-file ${EVAL}/vqav2/answers/$SPLIT/${CKPT_NAME}/llava_eval.jsonl + + +wait + +output_file=${EVAL}/vqav2/answers/$SPLIT/${CKPT_NAME}/merge.jsonl + +# Clear out the output file if it exists. 
+> "$output_file" + +cat ${EVAL}/vqav2/answers/$SPLIT/${CKPT_NAME}/llava_evaljsonl >> "$output_file" + +python3 scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt ${CKPT_NAME} --dir ${EVAL}/vqav2