diff --git a/evaluators/ragas/langevals_ragas/context_f1.py b/evaluators/ragas/langevals_ragas/context_f1.py
index 9aeceae..784fe5b 100644
--- a/evaluators/ragas/langevals_ragas/context_f1.py
+++ b/evaluators/ragas/langevals_ragas/context_f1.py
@@ -69,27 +69,37 @@ def evaluate(self, entry: RagasContextF1Entry) -> SingleEvaluationResult:
             )
         )
 
-        precision_score = precision_scorer.single_turn_score(
-            SingleTurnSample(
-                retrieved_contexts=entry.contexts,
-                reference_contexts=entry.expected_contexts,
+        if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
+            precision_score = 1.0
+        elif len(entry.expected_contexts) == 0 or len(entry.contexts) == 0:
+            precision_score = 0.0
+        else:
+            precision_score = precision_scorer.single_turn_score(
+                SingleTurnSample(
+                    retrieved_contexts=entry.contexts,
+                    reference_contexts=entry.expected_contexts,
+                )
             )
-        )
 
-        recall_scorer = NonLLMContextRecall()
-        recall_scorer.distance_measure = {
-            "levenshtein": DistanceMeasure.LEVENSHTEIN,
-            "hamming": DistanceMeasure.HAMMING,
-            "jaro": DistanceMeasure.JARO,
-            "jaro_winkler": DistanceMeasure.JARO_WINKLER,
-        }[self.settings.distance_measure]
+        if len(entry.expected_contexts) == 0:
+            recall_score = 1.0
+        elif len(entry.contexts) == 0:
+            recall_score = 0.0
+        else:
+            recall_scorer = NonLLMContextRecall()
+            recall_scorer.distance_measure = {
+                "levenshtein": DistanceMeasure.LEVENSHTEIN,
+                "hamming": DistanceMeasure.HAMMING,
+                "jaro": DistanceMeasure.JARO,
+                "jaro_winkler": DistanceMeasure.JARO_WINKLER,
+            }[self.settings.distance_measure]
 
-        recall_score = recall_scorer.single_turn_score(
-            SingleTurnSample(
-                retrieved_contexts=entry.contexts,
-                reference_contexts=entry.expected_contexts,
+            recall_score = recall_scorer.single_turn_score(
+                SingleTurnSample(
+                    retrieved_contexts=entry.contexts,
+                    reference_contexts=entry.expected_contexts,
+                )
             )
-        )
 
         f1_score = (
             2 * (precision_score * recall_score) / (precision_score + recall_score)
diff --git a/evaluators/ragas/langevals_ragas/context_precision.py b/evaluators/ragas/langevals_ragas/context_precision.py
index a6605ab..8baea2d 100644
--- a/evaluators/ragas/langevals_ragas/context_precision.py
+++ b/evaluators/ragas/langevals_ragas/context_precision.py
@@ -56,6 +56,24 @@ class RagasContextPrecisionEvaluator(
     is_guardrail = False
 
     def evaluate(self, entry: RagasContextPrecisionEntry) -> SingleEvaluationResult:
+        if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
+            return RagasResult(
+                score=1.0,
+                cost=None,
+                details="No contexts were retrieved, but none were expected either, so precision is a perfect 1",
+            )
+        if len(entry.expected_contexts) == 0:
+            return RagasResult(
+                score=0.0,
+                cost=None,
+                details="No contexts were expected, yet some were retrieved, so precision is 0",
+            )
+        if len(entry.contexts) == 0:
+            return RagasResult(
+                score=0.0,
+                cost=None,
+                details="No contexts were retrieved, so precision is 0",
+            )
         scorer = NonLLMContextPrecisionWithReference(
             distance_measure=NonLLMStringSimilarity(
                 distance_measure={
diff --git a/evaluators/ragas/langevals_ragas/context_recall.py b/evaluators/ragas/langevals_ragas/context_recall.py
index d2b9e52..493b13d 100644
--- a/evaluators/ragas/langevals_ragas/context_recall.py
+++ b/evaluators/ragas/langevals_ragas/context_recall.py
@@ -55,6 +55,25 @@ class RagasContextRecallEvaluator(
     is_guardrail = False
 
     def evaluate(self, entry: RagasContextRecallEntry) -> SingleEvaluationResult:
+        if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
+            return RagasResult(
+                score=1.0,
+                cost=None,
+                details="No contexts were retrieved, but none were expected either, so recall is a perfect 1",
+            )
+        if len(entry.expected_contexts) == 0:
+            return RagasResult(
+                score=1.0,
+                cost=None,
+                details="No contexts were expected, so nothing could be missed and recall is a perfect 1",
+            )
+        if len(entry.contexts) == 0:
+            return RagasResult(
+                score=0.0,
+                cost=None,
+                details="No contexts were retrieved, so recall is 0",
+            )
+
         scorer = NonLLMContextRecall()
         scorer.distance_measure = {
             "levenshtein": DistanceMeasure.LEVENSHTEIN,
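Note on the convention: all three evaluators above now agree on how degenerate inputs score before any string-distance scorer runs. A minimal standalone sketch of that convention, with a hypothetical helper name (`empty_context_scores` is illustrative only, not part of the codebase):

```python
# Illustrative sketch, not library code: the (precision, recall) convention
# the guards above implement. Returns None when both lists are non-empty,
# i.e. when the real NonLLM string-distance scorers should run instead.
def empty_context_scores(
    retrieved: list[str], expected: list[str]
) -> tuple[float, float] | None:
    if not retrieved and not expected:
        return 1.0, 1.0  # nothing expected, nothing retrieved: perfect on both
    if not expected:
        return 0.0, 1.0  # every retrieval is spurious, but nothing was missed
    if not retrieved:
        return 0.0, 0.0  # something was expected and nothing came back
    return None


assert empty_context_scores([], []) == (1.0, 1.0)
assert empty_context_scores(["ctx"], []) == (0.0, 1.0)
assert empty_context_scores([], ["ctx"]) == (0.0, 0.0)
```

Because every guard returns before a scorer is constructed, these paths return `cost=None`, which the tests below check via `not result.cost`.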
diff --git a/evaluators/ragas/tests/test_ragas.py b/evaluators/ragas/tests/test_ragas.py
index d36d2f5..70fb847 100644
--- a/evaluators/ragas/tests/test_ragas.py
+++ b/evaluators/ragas/tests/test_ragas.py
@@ -177,6 +177,44 @@ def test_context_precision():
     assert not result.cost
 
+def test_context_precision_with_empty_contexts():
+    evaluator = RagasContextPrecisionEvaluator(settings=RagasContextPrecisionSettings())
+
+    result = evaluator.evaluate(
+        RagasContextPrecisionEntry(
+            contexts=[],
+            expected_contexts=[],
+        )
+    )
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 1.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextPrecisionEntry(
+            contexts=[],
+            expected_contexts=[
+                "Paris is the capital of France.",
+                "The Eiffel Tower is one of the most famous landmarks in Paris.",
+            ],
+        )
+    )
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 0.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextPrecisionEntry(
+            contexts=["The Eiffel Tower is located in Paris."],
+            expected_contexts=[],
+        )
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 0.0
+    assert not result.cost
+
+
 def test_context_recall():
     evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())
 
     result = evaluator.evaluate(
@@ -195,6 +233,46 @@ def test_context_recall():
     assert not result.cost
 
+def test_context_recall_with_empty_contexts():
+    evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())
+
+    result = evaluator.evaluate(
+        RagasContextRecallEntry(
+            contexts=[],
+            expected_contexts=[],
+        )
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 1.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextRecallEntry(
+            contexts=[],
+            expected_contexts=[
+                "Paris is the capital of France.",
+                "The Eiffel Tower is one of the most famous landmarks in Paris.",
+            ],
+        )
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 0.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextRecallEntry(
+            contexts=["The Eiffel Tower is located in Paris."],
+            expected_contexts=[],
+        )
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 1.0
+    assert not result.cost
+
+
 def test_context_f1():
     evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())
 
     result = evaluator.evaluate(
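The two hunks above exercise the guards through the public evaluator API. The same paths can be hit outside the test suite; a usage sketch, assuming the import paths implied by the file layout (the diff does not show the test module's import block, so these are inferred):

```python
# Inferred imports: evaluators/ragas/langevals_ragas/context_precision.py
# suggests the package name langevals_ragas; adjust if the layout differs.
from langevals_ragas.context_precision import (
    RagasContextPrecisionEntry,
    RagasContextPrecisionEvaluator,
    RagasContextPrecisionSettings,
)

evaluator = RagasContextPrecisionEvaluator(settings=RagasContextPrecisionSettings())

# Guard path: returns immediately with score 1.0 and cost=None,
# never constructing the NonLLM scorer.
result = evaluator.evaluate(
    RagasContextPrecisionEntry(contexts=[], expected_contexts=[])
)
print(result.score)    # 1.0
print(result.details)  # explains why the empty/empty case is a perfect 1
```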
@@ -214,6 +292,32 @@
     assert result.details
 
+def test_context_f1_with_empty_contexts():
+    evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())
+
+    result = evaluator.evaluate(RagasContextF1Entry(contexts=[], expected_contexts=[]))
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 1.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextF1Entry(contexts=[], expected_contexts=["context"])
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 0.0
+    assert not result.cost
+
+    result = evaluator.evaluate(
+        RagasContextF1Entry(contexts=["context"], expected_contexts=[])
+    )
+
+    assert result.status == "processed"
+    assert result.score is not None and result.score == 0.0
+    assert not result.cost
+
+
 def test_response_context_precision_with_reference():
     evaluator = RagasResponseContextPrecisionEvaluator(settings=RagasSettings())
 
     result = evaluator.evaluate(
@@ -319,7 +423,7 @@ def test_summarization_score():
     )
 
     assert result.status == "processed"
-    assert result.score and result.score > 0.7
+    assert result.score and result.score > 0.6
     assert result.cost and result.cost.amount > 0.0
     assert result.details
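For the F1 expectations above, the arithmetic follows directly from the precision/recall conventions. A worked check, assuming the surrounding f1_score expression guards against a zero denominator (the context_f1.py hunk ends before the full expression, so the guard itself is an assumption):

```python
def f1(precision: float, recall: float) -> float:
    # Harmonic mean with an assumed zero-denominator guard; the diff shows
    # only the start of the real f1_score expression in context_f1.py.
    if precision + recall == 0.0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


assert f1(1.0, 1.0) == 1.0  # contexts=[], expected_contexts=[]
assert f1(0.0, 0.0) == 0.0  # contexts=[], expected_contexts=["context"]
assert f1(0.0, 1.0) == 0.0  # contexts=["context"], expected_contexts=[]
```

Either score being 0 forces the harmonic mean to 0, which is why both one-sided cases in test_context_f1_with_empty_contexts expect 0.0.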