From 32e563ca651c6b4cd44f34414868b5cd119228cd Mon Sep 17 00:00:00 2001
From: Elron Bandel <elronbandel@gmail.com>
Date: Sun, 12 Jan 2025 18:01:02 +0200
Subject: [PATCH] Update vision benchmark (#1505)

* Simplify reuters dataset and make its preparation faster

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Update vision benchmark example

Signed-off-by: elronbandel <elronbandel@gmail.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>
---
 examples/evaluate_vision_benchmark.py | 18 +++++++++---------
 prepare/cards/reuters21578.py         |  8 +++-----
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/examples/evaluate_vision_benchmark.py b/examples/evaluate_vision_benchmark.py
index e43820c989..f8a0aa861f 100644
--- a/examples/evaluate_vision_benchmark.py
+++ b/examples/evaluate_vision_benchmark.py
@@ -8,13 +8,13 @@
     allow_unverified_code=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.vision[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
+        split="test",
     )
 
 # Infer
 model = CrossProviderInferenceEngine(
-    model="llama-3-2-11b-vision-instruct",
-    max_tokens=30,
+    model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
 )
 """
 We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
@@ -34,9 +34,9 @@
 
 # | subset   |    score | score_name      |   num_of_instances |
 # |:---------|---------:|:----------------|-------------------:|
-# | ALL      | 0.15637  | subsets_mean    |                150 |
-# | doc_vqa  | 0.05     | anls            |                 30 |
-# | info_vqa | 0.176852 | anls            |                 30 |
-# | chart_qa | 0.1      | relaxed_overall |                 30 |
-# | ai2d     | 0.3      | exact_match_mm  |                 30 |
-# | websrc   | 0.155    | websrc_squad_f1 |                 30 |
+# | ALL      | 0.429583 | subsets_mean    |                150 |
+# | doc_vqa  | 0.79103  | anls            |                 30 |
+# | info_vqa | 0.464885 | anls            |                 30 |
+# | chart_qa | 0.3      | relaxed_overall |                 30 |
+# | ai2d     | 0.2      | exact_match_mm  |                 30 |
+# | websrc   | 0.392    | websrc_squad_f1 |                 30 |
diff --git a/prepare/cards/reuters21578.py b/prepare/cards/reuters21578.py
index 68827c3cfc..53155d3a13 100644
--- a/prepare/cards/reuters21578.py
+++ b/prepare/cards/reuters21578.py
@@ -1,4 +1,3 @@
-from datasets import get_dataset_config_names
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
@@ -139,9 +138,7 @@
 classlabels["ModLewis"] = classlabels["ModApte"]
 classlabels["ModHayes"] = sorted(classlabels["ModApte"] + ["bfr", "hk"])
 
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
+for subset in classlabels:
     card = TaskCard(
         loader=LoadHF(path=f"{dataset_name}", name=subset),
         preprocess_steps=[
@@ -158,5 +155,6 @@
             "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is collected from the Reuters financial newswire service in 1987… See the full description on the dataset page: https://huggingface.co/datasets/reuters21578"
         ),
     )
-    test_card(card, debug=False)
+    if subset == "ModHayes":
+        test_card(card, debug=False)
     add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)