Skip to content

Commit

Permalink
Accept empty contexts and expected_contexts for f1/precision/recall, …
Browse files Browse the repository at this point in the history
…with early returns to prevent ragas from blowing up
  • Loading branch information
rogeriochaves committed Jan 13, 2025
1 parent f4c187e commit 9562047
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 18 deletions.
46 changes: 29 additions & 17 deletions evaluators/ragas/langevals_ragas/context_f1.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,27 +69,39 @@ def evaluate(self, entry: RagasContextF1Entry) -> SingleEvaluationResult:
)
)

precision_score = precision_scorer.single_turn_score(
SingleTurnSample(
retrieved_contexts=entry.contexts,
reference_contexts=entry.expected_contexts,
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
precision_score = 1.0
elif len(entry.expected_contexts) == 0 or len(entry.contexts) == 0:
precision_score = 0.0
else:
precision_score = precision_scorer.single_turn_score(
SingleTurnSample(
retrieved_contexts=entry.contexts,
reference_contexts=entry.expected_contexts,
)
)
)

recall_scorer = NonLLMContextRecall()
recall_scorer.distance_measure = {
"levenshtein": DistanceMeasure.LEVENSHTEIN,
"hamming": DistanceMeasure.HAMMING,
"jaro": DistanceMeasure.JARO,
"jaro_winkler": DistanceMeasure.JARO_WINKLER,
}[self.settings.distance_measure]
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
recall_score = 1.0
elif len(entry.expected_contexts) == 0:
recall_score = 1.0
elif len(entry.contexts) == 0:
recall_score = 0.0
else:
recall_scorer = NonLLMContextRecall()
recall_scorer.distance_measure = {
"levenshtein": DistanceMeasure.LEVENSHTEIN,
"hamming": DistanceMeasure.HAMMING,
"jaro": DistanceMeasure.JARO,
"jaro_winkler": DistanceMeasure.JARO_WINKLER,
}[self.settings.distance_measure]

recall_score = recall_scorer.single_turn_score(
SingleTurnSample(
retrieved_contexts=entry.contexts,
reference_contexts=entry.expected_contexts,
recall_score = recall_scorer.single_turn_score(
SingleTurnSample(
retrieved_contexts=entry.contexts,
reference_contexts=entry.expected_contexts,
)
)
)

f1_score = (
2 * (precision_score * recall_score) / (precision_score + recall_score)
Expand Down
18 changes: 18 additions & 0 deletions evaluators/ragas/langevals_ragas/context_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@ class RagasContextPrecisionEvaluator(
is_guardrail = False

def evaluate(self, entry: RagasContextPrecisionEntry) -> SingleEvaluationResult:
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
return RagasResult(
score=1.0,
cost=None,
details="No contexts retrieved, but also no contexts expected, so that's a perfect precision of 1",
)
if len(entry.expected_contexts) == 0:
return RagasResult(
score=0.0,
cost=None,
details="No contexts expected, yet some were retrieved, precision is 0",
)
if len(entry.contexts) == 0:
return RagasResult(
score=0.0,
cost=None,
details="No contexts retrieved, precision is 0",
)
scorer = NonLLMContextPrecisionWithReference(
distance_measure=NonLLMStringSimilarity(
distance_measure={
Expand Down
20 changes: 20 additions & 0 deletions evaluators/ragas/langevals_ragas/context_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
EvaluatorEntry,
SingleEvaluationResult,
EvaluatorSettings,
EvaluationResultSkipped,
)
from ragas import SingleTurnSample
from .lib.common import (
Expand Down Expand Up @@ -55,6 +56,25 @@ class RagasContextRecallEvaluator(
is_guardrail = False

def evaluate(self, entry: RagasContextRecallEntry) -> SingleEvaluationResult:
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
return RagasResult(
score=1.0,
cost=None,
details="No contexts retrieved, but also no contexts expected, so that's a perfect recall of 1",
)
if len(entry.expected_contexts) == 0:
return RagasResult(
score=1.0,
cost=None,
details="No contexts expected, meaning nothing was missing, so that's a perfect recall of 1",
)
if len(entry.contexts) == 0:
return RagasResult(
score=0.0,
cost=None,
details="No contexts retrieved, recall is 0",
)

scorer = NonLLMContextRecall()
scorer.distance_measure = {
"levenshtein": DistanceMeasure.LEVENSHTEIN,
Expand Down
106 changes: 105 additions & 1 deletion evaluators/ragas/tests/test_ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,44 @@ def test_context_precision():
assert not result.cost


def test_context_precision_with_empty_contexts():
    """Edge cases for non-LLM context precision when either list is empty.

    Both lists empty -> perfect precision (1.0); only one side empty ->
    precision 0.0. All cases short-circuit before ragas, so no cost accrues.
    """
    evaluator = RagasContextPrecisionEvaluator(settings=RagasContextPrecisionSettings())

    # (retrieved contexts, expected contexts, expected precision score)
    cases = [
        ([], [], 1.0),
        (
            [],
            [
                "Paris is the capital of France.",
                "The Eiffel Tower is one of the most famous landmarks in Paris.",
            ],
            0.0,
        ),
        (["The Eiffel Tower is located in Paris."], [], 0.0),
    ]
    for contexts, expected_contexts, expected_score in cases:
        result = evaluator.evaluate(
            RagasContextPrecisionEntry(
                contexts=contexts,
                expected_contexts=expected_contexts,
            )
        )
        assert result.status == "processed"
        assert result.score is not None and result.score == expected_score
        assert not result.cost


def test_context_recall():
evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())

Expand All @@ -195,6 +233,46 @@ def test_context_recall():
assert not result.cost


def test_context_recall_with_empty_contexts():
    """Edge cases for non-LLM context recall when either list is empty.

    No expected contexts means nothing could be missed, so recall is 1.0
    whether or not anything was retrieved; expected-but-none-retrieved is 0.0.
    All cases short-circuit before ragas, so no cost accrues.
    """
    evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())

    # (retrieved contexts, expected contexts, expected recall score)
    cases = [
        ([], [], 1.0),
        (
            [],
            [
                "Paris is the capital of France.",
                "The Eiffel Tower is one of the most famous landmarks in Paris.",
            ],
            0.0,
        ),
        (["The Eiffel Tower is located in Paris."], [], 1.0),
    ]
    for contexts, expected_contexts, expected_score in cases:
        result = evaluator.evaluate(
            RagasContextRecallEntry(
                contexts=contexts,
                expected_contexts=expected_contexts,
            )
        )
        assert result.status == "processed"
        assert result.score is not None and result.score == expected_score
        assert not result.cost


def test_context_f1():
evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())

Expand All @@ -214,6 +292,32 @@ def test_context_f1():
assert result.details


def test_context_f1_with_empty_contexts():
    """Edge cases for context F1 when either list is empty.

    Both lists empty -> F1 of 1.0; one side empty -> F1 of 0.0 (precision or
    recall is zero). All cases short-circuit before ragas, so no cost accrues.
    """
    evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())

    # (retrieved contexts, expected contexts, expected F1 score)
    cases = [
        ([], [], 1.0),
        ([], ["context"], 0.0),
        (["context"], [], 0.0),
    ]
    for contexts, expected_contexts, expected_score in cases:
        result = evaluator.evaluate(
            RagasContextF1Entry(contexts=contexts, expected_contexts=expected_contexts)
        )
        assert result.status == "processed"
        assert result.score is not None and result.score == expected_score
        assert not result.cost


def test_response_context_precision_with_reference():
evaluator = RagasResponseContextPrecisionEvaluator(settings=RagasSettings())

Expand Down Expand Up @@ -319,7 +423,7 @@ def test_summarization_score():
)

assert result.status == "processed"
assert result.score and result.score > 0.7
assert result.score and result.score > 0.6
assert result.cost and result.cost.amount > 0.0
assert result.details

Expand Down

0 comments on commit 9562047

Please sign in to comment.