diff --git a/examples/evaluate_vision_benchmark.py b/examples/evaluate_vision_benchmark.py
index e43820c989..f8a0aa861f 100644
--- a/examples/evaluate_vision_benchmark.py
+++ b/examples/evaluate_vision_benchmark.py
@@ -8,13 +8,13 @@
     allow_unverified_code=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.vision[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
+        split="test",
     )
 
 # Infer
 model = CrossProviderInferenceEngine(
-    model="llama-3-2-11b-vision-instruct",
-    max_tokens=30,
+    model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
 )
 """
 We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
@@ -34,9 +34,9 @@
 
 # | subset   |    score | score_name      |   num_of_instances |
 # |:---------|---------:|:----------------|-------------------:|
-# | ALL      | 0.15637  | subsets_mean    |                150 |
-# | doc_vqa  | 0.05     | anls            |                 30 |
-# | info_vqa | 0.176852 | anls            |                 30 |
-# | chart_qa | 0.1      | relaxed_overall |                 30 |
-# | ai2d     | 0.3      | exact_match_mm  |                 30 |
-# | websrc   | 0.155    | websrc_squad_f1 |                 30 |
+# | ALL      | 0.429583 | subsets_mean    |                150 |
+# | doc_vqa  | 0.79103  | anls            |                 30 |
+# | info_vqa | 0.464885 | anls            |                 30 |
+# | chart_qa | 0.3      | relaxed_overall |                 30 |
+# | ai2d     | 0.2      | exact_match_mm  |                 30 |
+# | websrc   | 0.392    | websrc_squad_f1 |                 30 |
diff --git a/prepare/cards/reuters21578.py b/prepare/cards/reuters21578.py
index 68827c3cfc..53155d3a13 100644
--- a/prepare/cards/reuters21578.py
+++ b/prepare/cards/reuters21578.py
@@ -1,4 +1,3 @@
-from datasets import get_dataset_config_names
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
@@ -139,9 +138,7 @@
 classlabels["ModLewis"] = classlabels["ModApte"]
 classlabels["ModHayes"] = sorted(classlabels["ModApte"] + ["bfr", "hk"])
 
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
+for subset in classlabels:  # dict keys double as the subset names passed to LoadHF
     card = TaskCard(
         loader=LoadHF(path=f"{dataset_name}", name=subset),
         preprocess_steps=[
@@ -158,5 +155,6 @@
             "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is collected from the Reuters financial newswire service in 1987… See the full description on the dataset page: https://huggingface.co/datasets/reuters21578"
         ),
     )
-    test_card(card, debug=False)
+    if subset == "ModHayes":  # test only this subset; all are cataloged below
+        test_card(card, debug=False)
     add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)