From 32e563ca651c6b4cd44f34414868b5cd119228cd Mon Sep 17 00:00:00 2001 From: Elron Bandel Date: Sun, 12 Jan 2025 18:01:02 +0200 Subject: [PATCH] Update vis bench (#1505) * Simplify reuters dataset and make its preparation faster Signed-off-by: elronbandel * Update vision benchmark example Signed-off-by: elronbandel --------- Signed-off-by: elronbandel --- examples/evaluate_vision_benchmark.py | 18 +++++++++--------- prepare/cards/reuters21578.py | 8 +++----- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/evaluate_vision_benchmark.py b/examples/evaluate_vision_benchmark.py index e43820c989..f8a0aa861f 100644 --- a/examples/evaluate_vision_benchmark.py +++ b/examples/evaluate_vision_benchmark.py @@ -8,13 +8,13 @@ allow_unverified_code=True, ): test_dataset = load_dataset( - "benchmarks.vision[loader_limit=30,max_samples_per_subset=30]", split="test" + "benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]", + split="test", ) # Infer model = CrossProviderInferenceEngine( - model="llama-3-2-11b-vision-instruct", - max_tokens=30, + model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx" ) """ We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: @@ -34,9 +34,9 @@ # | subset | score | score_name | num_of_instances | # |:---------|---------:|:----------------|-------------------:| -# | ALL | 0.15637 | subsets_mean | 150 | -# | doc_vqa | 0.05 | anls | 30 | -# | info_vqa | 0.176852 | anls | 30 | -# | chart_qa | 0.1 | relaxed_overall | 30 | -# | ai2d | 0.3 | exact_match_mm | 30 | -# | websrc | 0.155 | websrc_squad_f1 | 30 | +# | ALL | 0.429583 | subsets_mean | 150 | +# | doc_vqa | 0.79103 | anls | 30 | +# | info_vqa | 0.464885 | anls | 30 | +# | chart_qa | 0.3 | relaxed_overall | 30 | +# | ai2d | 0.2 | exact_match_mm | 30 | +# | websrc | 0.392 | websrc_squad_f1 | 30 | diff --git a/prepare/cards/reuters21578.py b/prepare/cards/reuters21578.py index 68827c3cfc..53155d3a13 100644 --- a/prepare/cards/reuters21578.py +++ b/prepare/cards/reuters21578.py @@ -1,4 +1,3 @@ -from datasets import get_dataset_config_names from unitxt import add_to_catalog from unitxt.blocks import ( LoadHF, @@ -139,9 +138,7 @@ classlabels["ModLewis"] = classlabels["ModApte"] classlabels["ModHayes"] = sorted(classlabels["ModApte"] + ["bfr", "hk"]) -for subset in get_dataset_config_names( - dataset_name, trust_remote_code=settings.allow_unverified_code -): +for subset in classlabels: card = TaskCard( loader=LoadHF(path=f"{dataset_name}", name=subset), preprocess_steps=[ @@ -158,5 +155,6 @@ "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is collected from the Reuters financial newswire service in 1987… See the full description on the dataset page: https://huggingface.co/datasets/reuters21578" ), ) - test_card(card, debug=False) + if subset == "ModHayes": + test_card(card, debug=False) add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)