Add score to all ragas metrics
rogeriochaves committed Nov 22, 2024
1 parent e5c8880 commit be2f3e9
Showing 11 changed files with 90 additions and 26 deletions.
10 changes: 6 additions & 4 deletions evaluators/ragas/langevals_ragas/answer_correctness.py
@@ -4,9 +4,8 @@
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
-    EvaluationResultSkipped,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -18,12 +17,15 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):
 
 class RagasAnswerCorrectnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
     )
 
 
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
+    BaseEvaluator[
+        RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult
+    ]
 ):
     """
     Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
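The practical effect of adding default=0.0 in the hunk above: the score field no longer has to be passed when the result model is constructed, while its description stays available in the field metadata that downstream schema generation reads. A minimal sketch of that behavior, assuming pydantic v2 and a simplified stand-in for the EvaluationResult base class (the real one lives in langevals_core.base_evaluator):

# Sketch only; EvaluationResult below is a simplified stand-in, not the langevals_core class.
from pydantic import BaseModel, Field


class EvaluationResult(BaseModel):
    """Simplified stand-in for langevals_core.base_evaluator.EvaluationResult."""


class RagasAnswerCorrectnessResult(EvaluationResult):
    score: float = Field(
        default=0.0,
        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
    )


# The default makes construction without an explicit score valid.
print(RagasAnswerCorrectnessResult().score)  # 0.0
# The description travels with the field metadata (and into the generated JSON schema).
print(RagasAnswerCorrectnessResult.model_fields["score"].description)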
5 changes: 3 additions & 2 deletions evaluators/ragas/langevals_ragas/answer_relevancy.py
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -15,7 +15,8 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):
 
 class RagasAnswerRelevancyResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer.",
     )
 
 
15 changes: 13 additions & 2 deletions evaluators/ragas/langevals_ragas/context_precision.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextPrecisionEntry(EvaluatorEntry):
     expected_output: str
 
 
+class RagasContextPrecisionResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the precision of the context."
+    )
+
+
 class RagasContextPrecisionEvaluator(
-    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextPrecisionEntry, RagasSettings, RagasContextPrecisionResult
+    ]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.
13 changes: 11 additions & 2 deletions evaluators/ragas/langevals_ragas/context_recall.py
@@ -1,18 +1,27 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRecallEntry(EvaluatorEntry):
     contexts: list[str]
     expected_output: str
 
 
+class RagasContextRecallResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the recall of the context.",
+    )
+
+
 class RagasContextRecallEvaluator(
-    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasContextRecallResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.
15 changes: 13 additions & 2 deletions evaluators/ragas/langevals_ragas/context_relevancy.py
@@ -1,18 +1,29 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRelevancyEntry(EvaluatorEntry):
     output: str
     contexts: list[str]
 
 
+class RagasContextRelevancyResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevancy of the context.",
+    )
+
+
 class RagasContextRelevancyEvaluator(
-    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextRelevancyEntry, RagasSettings, RagasContextRelevancyResult
+    ]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.
15 changes: 13 additions & 2 deletions evaluators/ragas/langevals_ragas/context_utilization.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextUtilizationEntry(EvaluatorEntry):
     contexts: list[str]
 
 
+class RagasContextUtilizationResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the utilization of the context.",
+    )
+
+
 class RagasContextUtilizationEvaluator(
-    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextUtilizationEntry, RagasSettings, RagasContextUtilizationResult
+    ]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.
3 changes: 2 additions & 1 deletion evaluators/ragas/langevals_ragas/faithfulness.py
@@ -21,7 +21,8 @@ class RagasFaithfulnessEntry(EvaluatorEntry):
 
 class RagasFaithfulnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer.",
     )
 
 
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -57,7 +57,7 @@ lingua = [ "langevals-lingua",]
 aws = [ "langevals-aws",]
 huggingface = [ "langevals-huggingface",]
 langevals = [ "langevals-langevals",]
-all = [ "langevals-langevals", "langevals-azure", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
+all = [ "langevals-azure", "langevals-langevals", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
 haystack = [ "langevals-haystack",]
 presidio = [ "langevals-presidio",]
 
8 changes: 3 additions & 5 deletions scripts/generate_evaluator_dependencies.py
@@ -20,11 +20,9 @@
 package_names = []
 for package in evaluator_packages:
     package_name = f"langevals-{package}"
-    optional = "false" if package == "langevals" else "true"
-    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = {optional} }}\n'
-    if package != "langevals":
-        package_names.append(package_name)
-        generated_extras += f'{package} = ["{package_name}"]\n'
+    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
+    package_names.append(package_name)
+    generated_extras += f'{package} = ["{package_name}"]\n'
 
 generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'
 
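With the special-casing of the langevals package removed, every evaluator package is declared optional and gets its own extra, and langevals-langevals is now appended to package_names like the rest, which lines up with its position shifting inside the all list in pyproject.toml above. A standalone sketch of the simplified loop, using a hypothetical package list in place of the script's real directory discovery:

# Hypothetical inputs; the actual script derives evaluator_packages from the evaluators/ directory.
evaluator_packages = ["langevals", "ragas", "lingua"]

generated_dependencies = ""
generated_extras = ""
package_names = []
for package in evaluator_packages:
    package_name = f"langevals-{package}"
    # Every package, including "langevals", is optional and exposed as an extra.
    generated_dependencies += (
        f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
    )
    package_names.append(package_name)
    generated_extras += f'{package} = ["{package_name}"]\n'

generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'

print(generated_dependencies)
print(generated_extras)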
28 changes: 24 additions & 4 deletions ts-integration/evaluators.generated.ts
@@ -1382,7 +1382,12 @@ This metric evaluates whether all of the ground-truth relevant items present in
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the precision of the context.",
+      },
+    },
   },
   "ragas/context_recall": {
     name: `Ragas Context Recall`,
@@ -1410,7 +1415,12 @@ This evaluator measures the extent to which the retrieved context aligns with th
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the recall of the context.",
+      },
+    },
   },
   "ragas/context_relevancy": {
     name: `Ragas Context Relevancy`,
@@ -1438,7 +1448,12 @@ This metric gauges the relevancy of the retrieved context, calculated based on b
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the relevancy of the context.",
+      },
+    },
   },
   "ragas/context_utilization": {
     name: `Ragas Context Utilization`,
@@ -1466,7 +1481,12 @@ This metric evaluates whether all of the output relevant items present in the co
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the utilization of the context.",
+      },
+    },
   },
   "ragas/faithfulness": {
     name: `Ragas Faithfulness`,
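Because each Ragas evaluator now declares a result model with a described score field, the generated TypeScript entries above pick up that description instead of an empty result: {}. A hedged sketch of how such an entry can be derived from pydantic field metadata; the helper below is illustrative, not the repository's actual generator:

from pydantic import BaseModel, Field


class RagasContextPrecisionResult(BaseModel):
    score: float = Field(
        default=0.0,
        description="A score between 0.0 and 1.0 indicating the precision of the context.",
    )


def result_schema(model: type[BaseModel]) -> dict:
    # Mirror the "result: { score: { description: ... } }" shape emitted above.
    return {
        name: {"description": field.description}
        for name, field in model.model_fields.items()
    }


print(result_schema(RagasContextPrecisionResult))
# {'score': {'description': 'A score between 0.0 and 1.0 indicating the precision of the context.'}}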
