Commit
A simple way to create and evaluate given a 'task' in the catalog and python data structure (#1413)

* Added API methods to create and evaluate datasets from Python data structures
* Created simplified QA task and QA metric catalogs
* Added default templates to Task, so the user does not need to specify one
* Added input checks in llm_as_judges and the standard recipe
* Added an example of create_and_evaluate for RAG
* Added a type to the RAG end-to-end response, along with documentation
* Added an option for integer ids in retrieval metrics and moved to integer context_ids
* Improved the Task field mismatch error message
* Added an option to pass a post-processor to create_and_evaluate_dataset
* Added descriptions and default values to classification, plus a classification example
* Reorganized the extractive QA task and templates; updated the JSON
* Documented the default task and improved error messages
* Updated the extraction template and the recommended template
* Simplified and improved LLM as Judge
* Fixed the classification default task
* Fixed unitxt
* Fixed a typo in a template
* Improved documentation
* Simplified the API (removed create_and_evaluate_dataset)
* Changed wikitq back to extractive
* Fixed examples
* Removed the location reference in a template and modified the extractive template

Signed-off-by: Yoav Katz <[email protected]>
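The thread running through these changes is a direct path from in-memory Python data to evaluation scores, with no catalog dataset required. A minimal sketch of that flow, assuming the create_dataset/evaluate API exactly as the example files below use it:

from unitxt.api import create_dataset, evaluate

# Plain Python dicts stand in for a catalog dataset. No template is
# specified, so the task's new default template is used.
test_set = [{"question": "What is 1+1?", "answers": ["2"]}]
dataset = create_dataset(task="tasks.qa.open", test_set=test_set)

results = evaluate(["2"], dataset["test"])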
Showing 65 changed files with 768 additions and 291 deletions.
examples/evaluate_classification_dataset_with_given_predictions.py (36 additions, 0 deletions)
@@ -0,0 +1,36 @@
from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.text_utils import print_dict

logger = get_logger()


classes = ["positive", "negative"]


# Set up classification examples: each instance pairs a text with its gold
# label and the list of candidate classes.
dataset = [
    {"text": "I am happy.", "label": "positive", "classes": classes},
    {"text": "It was a great movie.", "label": "positive", "classes": classes},
    {"text": "I never felt so bad", "label": "negative", "classes": classes},
]

predictions = ["Positive.", "negative.", "negative"]

dataset = create_dataset(
    task="tasks.classification.multi_class",
    test_set=dataset,
    postprocessors=["processors.take_first_word", "processors.lower_case"],
)

evaluated_dataset = evaluate(predictions, dataset["test"])

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "prediction",
            "references",
            "score",
        ],
    )
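The two catalog postprocessors normalize the raw model output before it is compared with the gold label, which is why the prediction "Positive." can still match "positive". As a rough pure-Python illustration of the intended chain (not the actual unitxt processor implementations):

def take_first_word(text):
    # Keep the first whitespace-separated token, dropping trailing punctuation.
    words = text.strip().split()
    return words[0].strip(".,!?") if words else ""

def lower_case(text):
    return text.lower()

print(lower_case(take_first_word("Positive.")))  # -> "positive"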
@@ -0,0 +1,33 @@
from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.text_utils import print_dict

logger = get_logger()

# Set up question answer pairs in a dictionary
dataset = [
    {"question": "What is the capital of Texas?", "answers": ["Austin"]},
    {"question": "What is the color of the sky?", "answers": ["Blue"]},
]

predictions = ["San Antonio", "blue"]

dataset = create_dataset(
    task="tasks.qa.open",
    test_set=dataset,
    metrics=[
        "metrics.qa.open.recommended_no_gpu",
        "metrics.qa.open.recommended_llm_as_judge",
    ],
)

evaluated_dataset = evaluate(predictions, dataset["test"])

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "score",
        ],
    )
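Beyond the per-instance printout, the aggregate results live in a nested "score" dict on each instance. A sketch of reading them, assuming unitxt's usual score["global"] layout (the exact metric keys depend on the metrics requested above):

global_scores = evaluated_dataset[0]["score"]["global"]
print(global_scores["score"], global_scores["score_name"])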
examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py (64 additions, 0 deletions)
@@ -0,0 +1,64 @@
from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.text_utils import print_dict

logger = get_logger()

contexts = [
    "Austin is the capital of Texas.",
    "Houston is in Texas.",
    "Houston is the largest city in the state but not its capital.",
]

# Set up question answer pairs in a dictionary
dataset = [
    {
        "question": "What is the capital of Texas?",
        "question_id": 0,
        "reference_answers": ["Austin"],
        "reference_contexts": [contexts[0]],
        "reference_context_ids": [0],
        "is_answerable_label": True,
    },
    {
        "question": "Which is the largest city in Texas?",
        "question_id": 1,
        "reference_answers": ["Houston"],
        "reference_contexts": [contexts[1], contexts[2]],
        "reference_context_ids": [1, 2],
        "is_answerable_label": True,
    },
]

predictions = [
    {
        "answer": "Houston",
        "contexts": [contexts[2]],
        "context_ids": [2],
        "is_answerable": True,
    },
    {
        "answer": "Houston",
        "contexts": [contexts[2]],
        "context_ids": [2],
        "is_answerable": True,
    },
]

dataset = create_dataset(
    task="tasks.rag.end_to_end",
    test_set=dataset,
    split="test",
    postprocessors=[],
)

evaluated_dataset = evaluate(predictions, dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "score",
        ],
    )
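A design point worth noting: because split="test" is passed to create_dataset here, the chosen split is returned directly, so evaluate receives dataset rather than dataset["test"] as in the earlier examples. Side by side (a sketch, assuming the API exactly as used above):

# Without split=: create_dataset returns a mapping of splits.
ds = create_dataset(task="tasks.qa.open", test_set=test_set)
results = evaluate(predictions, ds["test"])

# With split="test": the test split is returned directly.
ds = create_dataset(task="tasks.rag.end_to_end", test_set=test_set, split="test")
results = evaluate(predictions, ds)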