diff --git a/evaluators/ragas/langevals_ragas/answer_correctness.py b/evaluators/ragas/langevals_ragas/answer_correctness.py
index 25235fa..5c62e70 100644
--- a/evaluators/ragas/langevals_ragas/answer_correctness.py
+++ b/evaluators/ragas/langevals_ragas/answer_correctness.py
@@ -4,9 +4,8 @@
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
-    EvaluationResultSkipped,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -18,12 +17,15 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):
 
 class RagasAnswerCorrectnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
     )
 
 
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
+    BaseEvaluator[
+        RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult
+    ]
 ):
     """
     Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
diff --git a/evaluators/ragas/langevals_ragas/answer_relevancy.py b/evaluators/ragas/langevals_ragas/answer_relevancy.py
index 305986d..03bca5f 100644
--- a/evaluators/ragas/langevals_ragas/answer_relevancy.py
+++ b/evaluators/ragas/langevals_ragas/answer_relevancy.py
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -15,7 +15,8 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):
 
 class RagasAnswerRelevancyResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer.",
     )
 
 
diff --git a/evaluators/ragas/langevals_ragas/context_precision.py b/evaluators/ragas/langevals_ragas/context_precision.py
index 1148520..a82a36e 100644
--- a/evaluators/ragas/langevals_ragas/context_precision.py
+++ b/evaluators/ragas/langevals_ragas/context_precision.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextPrecisionEntry(EvaluatorEntry):
     expected_output: str
 
 
+class RagasContextPrecisionResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the precision of the context."
+    )
+
+
 class RagasContextPrecisionEvaluator(
-    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextPrecisionEntry, RagasSettings, RagasContextPrecisionResult
+    ]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.
diff --git a/evaluators/ragas/langevals_ragas/context_recall.py b/evaluators/ragas/langevals_ragas/context_recall.py
index 01dd943..fb16b8a 100644
--- a/evaluators/ragas/langevals_ragas/context_recall.py
+++ b/evaluators/ragas/langevals_ragas/context_recall.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRecallEntry(EvaluatorEntry):
@@ -11,8 +13,15 @@ class RagasContextRecallEntry(EvaluatorEntry):
     expected_output: str
 
 
+class RagasContextRecallResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the recall of the context.",
+    )
+
+
 class RagasContextRecallEvaluator(
-    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasContextRecallResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.
diff --git a/evaluators/ragas/langevals_ragas/context_relevancy.py b/evaluators/ragas/langevals_ragas/context_relevancy.py
index 0f3c34b..69a7d75 100644
--- a/evaluators/ragas/langevals_ragas/context_relevancy.py
+++ b/evaluators/ragas/langevals_ragas/context_relevancy.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRelevancyEntry(EvaluatorEntry):
@@ -11,8 +13,17 @@ class RagasContextRelevancyEntry(EvaluatorEntry):
     contexts: list[str]
 
 
+class RagasContextRelevancyResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevancy of the context.",
+    )
+
+
 class RagasContextRelevancyEvaluator(
-    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextRelevancyEntry, RagasSettings, RagasContextRelevancyResult
+    ]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.
diff --git a/evaluators/ragas/langevals_ragas/context_utilization.py b/evaluators/ragas/langevals_ragas/context_utilization.py
index 6f4facf..1a1a1e2 100644
--- a/evaluators/ragas/langevals_ragas/context_utilization.py
+++ b/evaluators/ragas/langevals_ragas/context_utilization.py
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextUtilizationEntry(EvaluatorEntry):
     contexts: list[str]
 
 
+class RagasContextUtilizationResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the utilization of the context.",
+    )
+
+
 class RagasContextUtilizationEvaluator(
-    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextUtilizationEntry, RagasSettings, RagasContextUtilizationResult
+    ]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.
diff --git a/evaluators/ragas/langevals_ragas/faithfulness.py b/evaluators/ragas/langevals_ragas/faithfulness.py
index f74e6cb..37abf45 100644
--- a/evaluators/ragas/langevals_ragas/faithfulness.py
+++ b/evaluators/ragas/langevals_ragas/faithfulness.py
@@ -21,7 +21,8 @@ class RagasFaithfulnessEntry(EvaluatorEntry):
 
 class RagasFaithfulnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer.",
     )
 
 
diff --git a/poetry.lock b/poetry.lock
index fca8d41..0e6bd04 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6136,4 +6136,4 @@ ragas = ["langevals-ragas"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "488f8728c2c3c17d184138042b79444abad09c63062658ed1da6287ad1958482"
+content-hash = "b2627e10c5fa32bdb58919d9ae08a046d7e3e2ab2367f0d54f6899507d1da6fa"
diff --git a/pyproject.toml b/pyproject.toml
index 9639ce4..ac8ed15 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,7 +57,7 @@ lingua = [ "langevals-lingua",]
 aws = [ "langevals-aws",]
 huggingface = [ "langevals-huggingface",]
 langevals = [ "langevals-langevals",]
-all = [ "langevals-langevals", "langevals-azure", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
+all = [ "langevals-azure", "langevals-langevals", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
 haystack = [ "langevals-haystack",]
 presidio = [ "langevals-presidio",]
 
diff --git a/scripts/generate_evaluator_dependencies.py b/scripts/generate_evaluator_dependencies.py
index 617f8af..ac630e1 100644
--- a/scripts/generate_evaluator_dependencies.py
+++ b/scripts/generate_evaluator_dependencies.py
@@ -20,11 +20,9 @@ package_names = []
 
 for package in evaluator_packages:
     package_name = f"langevals-{package}"
-    optional = "false" if package == "langevals" else "true"
-    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = {optional} }}\n'
-    if package != "langevals":
-        package_names.append(package_name)
-        generated_extras += f'{package} = ["{package_name}"]\n'
+    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
+    package_names.append(package_name)
+    generated_extras += f'{package} = ["{package_name}"]\n'
 
 generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'
 
diff --git a/ts-integration/evaluators.generated.ts b/ts-integration/evaluators.generated.ts
index 0f95f2f..dc44a7c 100644
--- a/ts-integration/evaluators.generated.ts
+++ b/ts-integration/evaluators.generated.ts
@@ -1382,7 +1382,12 @@ This metric evaluates whether all of the ground-truth relevant items present in
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the precision of the context.",
+      },
+    },
   },
   "ragas/context_recall": {
     name: `Ragas Context Recall`,
@@ -1410,7 +1415,12 @@ This evaluator measures the extent to which the retrieved context aligns with th
         default: 2048,
      },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the recall of the context.",
+      },
+    },
   },
   "ragas/context_relevancy": {
     name: `Ragas Context Relevancy`,
@@ -1438,7 +1448,12 @@ This metric gauges the relevancy of the retrieved context, calculated based on b
         default: 2048,
      },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the relevancy of the context.",
+      },
+    },
   },
   "ragas/context_utilization": {
     name: `Ragas Context Utilization`,
@@ -1466,7 +1481,12 @@ This metric evaluates whether all of the output relevant items present in the co
         default: 2048,
      },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the utilization of the context.",
+      },
+    },
   },
   "ragas/faithfulness": {
     name: `Ragas Faithfulness`,