Update vis bench (#1505)

* Simplify reuters dataset and make its preparation faster

  Signed-off-by: elronbandel <[email protected]>

* Update vision benchmark example

  Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
IBM · Jan 12, 2025 · 32e563c · 32e563c
1 parent fc5656c
commit 32e563c
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 14 deletions.
diff --git a/examples/evaluate_vision_benchmark.py b/examples/evaluate_vision_benchmark.py
@@ -8,13 +8,13 @@
     allow_unverified_code=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.vision[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
+        split="test",
     )
 
 # Infer
 model = CrossProviderInferenceEngine(
-    model="llama-3-2-11b-vision-instruct",
-    max_tokens=30,
+    model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
 )
 """
 We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
@@ -34,9 +34,9 @@
 
 # | subset   |    score | score_name      |   num_of_instances |
 # |:---------|---------:|:----------------|-------------------:|
-# | ALL      | 0.15637  | subsets_mean    |                150 |
-# | doc_vqa  | 0.05     | anls            |                 30 |
-# | info_vqa | 0.176852 | anls            |                 30 |
-# | chart_qa | 0.1      | relaxed_overall |                 30 |
-# | ai2d     | 0.3      | exact_match_mm  |                 30 |
-# | websrc   | 0.155    | websrc_squad_f1 |                 30 |
+# | ALL      | 0.429583 | subsets_mean    |                150 |
+# | doc_vqa  | 0.79103  | anls            |                 30 |
+# | info_vqa | 0.464885 | anls            |                 30 |
+# | chart_qa | 0.3      | relaxed_overall |                 30 |
+# | ai2d     | 0.2      | exact_match_mm  |                 30 |
+# | websrc   | 0.392    | websrc_squad_f1 |                 30 |
diff --git a/prepare/cards/reuters21578.py b/prepare/cards/reuters21578.py
@@ -1,4 +1,3 @@
-from datasets import get_dataset_config_names
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
@@ -139,9 +138,7 @@
 classlabels["ModLewis"] = classlabels["ModApte"]
 classlabels["ModHayes"] = sorted(classlabels["ModApte"] + ["bfr", "hk"])
 
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
+for subset in classlabels:
     card = TaskCard(
         loader=LoadHF(path=f"{dataset_name}", name=subset),
         preprocess_steps=[
@@ -158,5 +155,6 @@
             "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is collected from the Reuters financial newswire service in 1987… See the full description on the dataset page: https://huggingface.co/datasets/reuters21578"
         ),
     )
-    test_card(card, debug=False)
+    if subset == "ModHayes":
+        test_card(card, debug=False)
     add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)