diff --git a/evaluators/haystack/langevals_haystack/faithfulness.py b/evaluators/haystack/langevals_haystack/faithfulness.py index 6a0a9a1..c87887f 100644 --- a/evaluators/haystack/langevals_haystack/faithfulness.py +++ b/evaluators/haystack/langevals_haystack/faithfulness.py @@ -3,18 +3,14 @@ # Haystack telemetry breaks for AWS lambdas because it tries to write to home folder which is read-only os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "false" -from typing import Literal from langevals_core.base_evaluator import ( BaseEvaluator, EvaluatorEntry, EvaluationResult, EvaluationResultSkipped, - EvaluatorSettings, SingleEvaluationResult, - Money, - LLMEvaluatorSettings + LLMEvaluatorSettings, ) -from pydantic import BaseModel, Field from haystack.components.evaluators import FaithfulnessEvaluator from langevals_haystack.lib.common import ( @@ -30,14 +26,7 @@ class HaystackFaithfulnessEntry(EvaluatorEntry): class HaystackFaithfulnessSettings(LLMEvaluatorSettings): - model: str = Field( - default="azure/gpt-35-turbo-1106", - description="The model to use for evaluation.", - ) - max_tokens: int = Field( - default=2048, - description="The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - ) + pass class HaystackFaithfulnessResult(EvaluationResult): diff --git a/evaluators/ragas/langevals_ragas/lib/common.py b/evaluators/ragas/langevals_ragas/lib/common.py index 60957e7..6f82f0e 100644 --- a/evaluators/ragas/langevals_ragas/lib/common.py +++ b/evaluators/ragas/langevals_ragas/lib/common.py @@ -6,12 +6,12 @@ from langevals_core.base_evaluator import ( BaseEvaluator, EvaluationResult, - LLMEvaluatorSettings, + EvaluatorSettings, Money, EvaluationResultSkipped, - EvaluatorEntry + EvaluatorEntry, ) -from pydantic import BaseModel, Field +from pydantic import Field from ragas import evaluate from ragas.metrics.base import Metric from ragas.llms import LangchainLLMWrapper @@ -38,34 +38,34 @@ from tqdm.notebook import tqdm as tqdm_notebook from functools import partialmethod -import json -import re from typing import List, Optional from datasets import Dataset from ragas import evaluate from ragas.metrics import faithfulness, Faithfulness from ragas.llms import LangchainLLMWrapper -from ragas.llms.prompt import PromptValue -from langchain_core.callbacks import Callbacks -from pydantic import BaseModel, Field -import litellm -from langchain.schema.output import LLMResult -from langchain_core.outputs.generation import Generation +from pydantic import Field from langevals_core.utils import calculate_total_tokens env_vars = [] -class RagasSettings(LLMEvaluatorSettings): - model: str = Field( - default="azure/gpt-35-turbo-16k", +class RagasSettings(EvaluatorSettings): + model: Literal[ + "openai/gpt-3.5-turbo-16k", + "openai/gpt-4o", + "openai/gpt-4o-mini", + "azure/gpt-35-turbo-16k", + "azure/gpt-4o", + "anthropic/claude-3-5-sonnet-20240620", + ] = Field( + default="openai/gpt-3.5-turbo-16k", description="The model to use for evaluation.", ) embeddings_model: Literal[ "openai/text-embedding-ada-002", "azure/text-embedding-ada-002", ] = Field( - default="azure/text-embedding-ada-002", + default="openai/text-embedding-ada-002", description="The model to use for embeddings.", ) max_tokens: int = Field( diff --git a/ts-integration/evaluators.generated.ts b/ts-integration/evaluators.generated.ts index f44f440..1f1b569 100644 --- a/ts-integration/evaluators.generated.ts +++ b/ts-integration/evaluators.generated.ts @@ -106,7 +106,23 @@ 
export type Evaluators = { }; "haystack/faithfulness": { settings: { - model: string; + model: + | "openai/gpt-3.5-turbo" + | "openai/gpt-3.5-turbo-0125" + | "openai/gpt-3.5-turbo-1106" + | "openai/gpt-4-turbo" + | "openai/gpt-4-0125-preview" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "openai/gpt-4-1106-preview" + | "azure/gpt-35-turbo-1106" + | "azure/gpt-4o" + | "azure/gpt-4-turbo-2024-04-09" + | "azure/gpt-4-1106-preview" + | "groq/llama3-70b-8192" + | "anthropic/claude-3-haiku-20240307" + | "anthropic/claude-3-sonnet-20240229" + | "anthropic/claude-3-opus-20240229"; max_tokens: number; }; }; @@ -503,65 +519,107 @@ export type Evaluators = { }; "ragas/answer_correctness": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/answer_relevancy": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/context_precision": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/context_recall": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/context_relevancy": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/context_utilization": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; "ragas/faithfulness": { settings: { - model: string; - max_tokens: number; + model: + | "openai/gpt-3.5-turbo-16k" + | "openai/gpt-4o" + | "openai/gpt-4o-mini" + | "azure/gpt-35-turbo-16k" + | "azure/gpt-4o" + | "anthropic/claude-3-5-sonnet-20240620"; embeddings_model: | "openai/text-embedding-ada-002" | "azure/text-embedding-ada-002"; + max_tokens: number; }; }; }; @@ -679,13 +737,12 @@ This evaluator assesses the extent to which the generated answer is consistent w optionalFields: [], settings: { model: { - description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-1106", + description: "The model to use for evaluation", + default: 
"openai/gpt-4o-mini", }, max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + description: "Max tokens allowed for evaluation", + default: 8192, }, }, result: {}, @@ -1283,17 +1340,17 @@ This evaluator focuses on assessing how pertinent the generated answer is to the settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1311,17 +1368,17 @@ This evaluator focuses on assessing how pertinent the generated answer is to the settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1339,17 +1396,17 @@ This metric evaluates whether all of the ground-truth relevant items present in settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1367,17 +1424,17 @@ This evaluator measures the extent to which the retrieved context aligns with th settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1395,17 +1452,17 @@ This metric gauges the relevancy of the retrieved context, calculated based on b settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1423,17 +1480,17 @@ This metric evaluates whether all of the output relevant items present in the co settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, }, @@ -1451,17 +1508,17 @@ This evaluator assesses the extent to which the generated answer is consistent w settings: { model: { description: "The model to use for evaluation.", - default: "azure/gpt-35-turbo-16k", + default: "openai/gpt-3.5-turbo-16k", + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "azure/text-embedding-ada-002", - }, }, result: {}, },
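
Taken together, the Python changes above move the model and max_tokens fields out of the individual evaluators: HaystackFaithfulnessSettings now inherits them unchanged from LLMEvaluatorSettings, while RagasSettings drops that base class and pins its allowed models to an explicit Literal list with OpenAI defaults. Below is a minimal sketch of the resulting settings classes; the field names, model lists, and defaults are taken from the diff and the regenerated TypeScript, but the bodies of the shared base classes in langevals_core are assumptions made only for illustration.

```python
from typing import Literal
from pydantic import BaseModel, Field


# Assumed shape of the shared base classes in langevals_core.base_evaluator.
# The model union and the gpt-4o-mini / 8192 defaults are inferred from the
# regenerated evaluators.generated.ts, not shown directly in this diff.
class EvaluatorSettings(BaseModel):
    pass


class LLMEvaluatorSettings(EvaluatorSettings):
    model: Literal[
        "openai/gpt-3.5-turbo",
        "openai/gpt-3.5-turbo-0125",
        "openai/gpt-3.5-turbo-1106",
        "openai/gpt-4-turbo",
        "openai/gpt-4-0125-preview",
        "openai/gpt-4o",
        "openai/gpt-4o-mini",
        "openai/gpt-4-1106-preview",
        "azure/gpt-35-turbo-1106",
        "azure/gpt-4o",
        "azure/gpt-4-turbo-2024-04-09",
        "azure/gpt-4-1106-preview",
        "groq/llama3-70b-8192",
        "anthropic/claude-3-haiku-20240307",
        "anthropic/claude-3-sonnet-20240229",
        "anthropic/claude-3-opus-20240229",
    ] = Field(default="openai/gpt-4o-mini", description="The model to use for evaluation")
    max_tokens: int = Field(default=8192, description="Max tokens allowed for evaluation")


# After this change the Haystack evaluator adds nothing on top of the base class.
class HaystackFaithfulnessSettings(LLMEvaluatorSettings):
    pass


# Ragas keeps its own fields but restricts the model choices and switches the
# defaults from Azure to OpenAI, as in the diff against lib/common.py.
class RagasSettings(EvaluatorSettings):
    model: Literal[
        "openai/gpt-3.5-turbo-16k",
        "openai/gpt-4o",
        "openai/gpt-4o-mini",
        "azure/gpt-35-turbo-16k",
        "azure/gpt-4o",
        "anthropic/claude-3-5-sonnet-20240620",
    ] = Field(
        default="openai/gpt-3.5-turbo-16k",
        description="The model to use for evaluation.",
    )
    embeddings_model: Literal[
        "openai/text-embedding-ada-002",
        "azure/text-embedding-ada-002",
    ] = Field(
        default="openai/text-embedding-ada-002",
        description="The model to use for embeddings.",
    )
    max_tokens: int = Field(
        default=2048,
        description="The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
    )
```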
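
If that reading is right, callers that previously relied on the old Azure defaults keep working but must now opt in to Azure explicitly, since the Azure entries remain in the Literal unions while only the defaults move to OpenAI. A hypothetical usage sketch under the assumptions above:

```python
# Defaults now point at OpenAI models.
default_settings = RagasSettings()
assert default_settings.model == "openai/gpt-3.5-turbo-16k"
assert default_settings.embeddings_model == "openai/text-embedding-ada-002"

# Azure deployments are still selectable, just no longer the default.
azure_settings = RagasSettings(
    model="azure/gpt-35-turbo-16k",
    embeddings_model="azure/text-embedding-ada-002",
)
```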