Update vision benchmark example

Signed-off-by: elronbandel <[email protected]>
IBM · Jan 12, 2025 · a48396a · a48396a
1 parent cd5f7f1
commit a48396a
Showing 1 changed file with 9 additions and 9 deletions.
diff --git a/examples/evaluate_vision_benchmark.py b/examples/evaluate_vision_benchmark.py
@@ -8,13 +8,13 @@
     allow_unverified_code=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.vision[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
+        split="test",
     )
 
 # Infer
 model = CrossProviderInferenceEngine(
-    model="llama-3-2-11b-vision-instruct",
-    max_tokens=30,
+    model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
 )
 """
 We are using CrossProviderInferenceEngine, an inference engine that supplies API access to providers such as:
@@ -34,9 +34,9 @@
 
 # | subset   |    score | score_name      |   num_of_instances |
 # |:---------|---------:|:----------------|-------------------:|
-# | ALL      | 0.15637  | subsets_mean    |                150 |
-# | doc_vqa  | 0.05     | anls            |                 30 |
-# | info_vqa | 0.176852 | anls            |                 30 |
-# | chart_qa | 0.1      | relaxed_overall |                 30 |
-# | ai2d     | 0.3      | exact_match_mm  |                 30 |
-# | websrc   | 0.155    | websrc_squad_f1 |                 30 |
+# | ALL      | 0.429583 | subsets_mean    |                150 |
+# | doc_vqa  | 0.79103  | anls            |                 30 |
+# | info_vqa | 0.464885 | anls            |                 30 |
+# | chart_qa | 0.3      | relaxed_overall |                 30 |
+# | ai2d     | 0.2      | exact_match_mm  |                 30 |
+# | websrc   | 0.392    | websrc_squad_f1 |                 30 |