diff --git a/evals/metrics/__init__.py b/evals/metrics/__init__.py
new file mode 100644
index 00000000..2d5825c6
--- /dev/null
+++ b/evals/metrics/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evals/metrics/ragas/__init__.py b/evals/metrics/ragas/__init__.py
new file mode 100644
index 00000000..f2f02578
--- /dev/null
+++ b/evals/metrics/ragas/__init__.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#
+
+from .ragas import (
+    RAGASContextualPrecisionMetric,
+    RAGASContextualRelevancyMetric,
+    RAGASAnswerRelevancyMetric,
+    RAGASFaithfulnessMetric,
+    RAGASContextualRecallMetric,
+    RagasMetric,
+)
diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
new file mode 100644
index 00000000..d437c5cf
--- /dev/null
+++ b/evals/metrics/ragas/ragas.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#
+
+"""Wrappers that score a single RAG test case with ragas metrics."""
+
+from typing import Dict, Optional, Union
+
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models import BaseLanguageModel
+from langchain_huggingface import HuggingFaceEndpoint
+
+
+def format_ragas_metric_name(name: str):
+    """Return ``name`` followed by the `` (ragas)`` suffix used as a display name."""
+    return f"{name} (ragas)"
+
+
+def _import_ragas_deps():
+    """Import the optional ragas and datasets dependencies on first use.
+
+    Returns:
+        The tuple ``(evaluate, LangchainLLMWrapper, Dataset)``.
+
+    Raises:
+        ModuleNotFoundError: If ``ragas`` or ``datasets`` is not installed.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.llms import LangchainLLMWrapper
+    except ModuleNotFoundError as err:
+        raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") from err
+
+    try:
+        from datasets import Dataset
+    except ModuleNotFoundError as err:
+        raise ModuleNotFoundError("Please install datasets to use this metric. `pip install datasets`.") from err
+
+    return evaluate, LangchainLLMWrapper, Dataset
+
+
+class _SingleRagasMetric:
+    """Shared implementation for wrappers around exactly one ragas metric.
+
+    Subclasses only declare which ragas metric to run, the display name, and
+    which test-case keys feed which dataset columns.
+    """
+
+    # Attribute name in ``ragas.metrics``; also the key of the score in the result.
+    _metric_key = ""
+    # Human-readable name reported through ``__name__``.
+    _display_name = ""
+    # Dataset column name -> key read from the test case dict.
+    _columns: Dict[str, str] = {}
+
+    def __init__(
+        self,
+        threshold: float = 0.3,
+        model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo",
+    ):
+        """Store the configuration.
+
+        Args:
+            threshold: Minimum score for ``is_successful`` to report True.
+            model: A LangChain language model, or a string that is passed to
+                ``HuggingFaceEndpoint`` as ``endpoint_url``.
+        """
+        self.threshold = threshold
+        self.model = model
+        # Forwarded to ragas as-is; only the answer relevancy wrapper lets callers set it.
+        self.embeddings = None
+        # Defined up front so is_successful() does not fail before measure() has run.
+        self.score = None
+        self.success = False
+
+    async def a_measure(self, test_case: Dict):
+        """Async entry point; runs the synchronous ``measure`` directly."""
+        return self.measure(test_case)
+
+    def measure(self, test_case: Dict):
+        """Score ``test_case`` with the configured ragas metric.
+
+        Args:
+            test_case: Dict providing every key named in the values of ``_columns``.
+
+        Returns:
+            The score reported by ragas, also stored in ``self.score``.
+
+        Raises:
+            ModuleNotFoundError: If ``ragas`` or ``datasets`` is not installed.
+            ImportError: If the installed ragas does not provide the metric.
+            KeyError: If ``test_case`` lacks a required key.
+        """
+        evaluate, llm_wrapper, dataset_cls = _import_ragas_deps()
+        from ragas import metrics as ragas_metrics
+
+        metric = getattr(ragas_metrics, self._metric_key, None)
+        if metric is None:
+            raise ImportError(f"The installed ragas does not provide the `{self._metric_key}` metric.")
+
+        # Set LLM model: a string is passed to HuggingFaceEndpoint as its endpoint_url.
+        chat_model = self.model
+        if isinstance(chat_model, str):
+            chat_model = HuggingFaceEndpoint(
+                endpoint_url=chat_model,
+                timeout=600,
+            )
+        chat_model = llm_wrapper(chat_model)
+
+        # Ragas only does dataset-level comparisons, so build a one-row dataset.
+        data = {column: [test_case[key]] for column, key in self._columns.items()}
+        dataset = dataset_cls.from_dict(data)
+
+        scores = evaluate(
+            dataset,
+            metrics=[metric],
+            llm=chat_model,
+            embeddings=self.embeddings,
+        )
+        self.score = scores[self._metric_key]
+        self.success = self.score >= self.threshold
+        return self.score
+
+    def is_successful(self):
+        """Return whether the last measured score met the threshold."""
+        return self.success
+
+    @property
+    def __name__(self):
+        return format_ragas_metric_name(self._display_name)
+
+
+class RAGASContextualPrecisionMetric(_SingleRagasMetric):
+    """This metric checks the contextual precision using Ragas."""
+
+    _metric_key = "context_precision"
+    _display_name = "Contextual Precision"
+    _columns = {
+        "contexts": "retrieval_context",
+        "question": "input",
+        "ground_truth": "expected_output",
+    }
+
+
+class RAGASContextualRelevancyMetric(_SingleRagasMetric):
+    """This metric checks the contextual relevancy using Ragas."""
+
+    _metric_key = "context_relevancy"
+    _display_name = "Contextual Relevancy"
+    _columns = {
+        "contexts": "retrieval_context",
+        "question": "input",
+    }
+
+
+class RAGASAnswerRelevancyMetric(_SingleRagasMetric):
+    """This metric checks the answer relevancy using Ragas."""
+
+    _metric_key = "answer_relevancy"
+    _display_name = "Answer Relevancy"
+    _columns = {
+        "question": "input",
+        "answer": "actual_output",
+        "contexts": "retrieval_context",
+    }
+
+    def __init__(
+        self,
+        threshold: float = 0.3,
+        model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo",
+        embeddings: Optional[Embeddings] = None,
+    ):
+        """Store the configuration.
+
+        Args:
+            threshold: Minimum score for ``is_successful`` to report True.
+            model: A LangChain language model, or a string that is passed to
+                ``HuggingFaceEndpoint`` as ``endpoint_url``.
+            embeddings: Embedding model handed to ragas; None is forwarded unchanged.
+        """
+        super().__init__(threshold=threshold, model=model)
+        self.embeddings = embeddings
+
+
+class RAGASFaithfulnessMetric(_SingleRagasMetric):
+    """This metric checks the faithfulness using Ragas."""
+
+    _metric_key = "faithfulness"
+    _display_name = "Faithfulness"
+    _columns = {
+        "contexts": "retrieval_context",
+        "question": "input",
+        "answer": "actual_output",
+    }
+
+
+class RAGASContextualRecallMetric(_SingleRagasMetric):
+    """This metric checks the context recall using Ragas."""
+
+    _metric_key = "context_recall"
+    _display_name = "Contextual Recall"
+    _columns = {
+        "question": "input",
+        "ground_truth": "expected_output",
+        "contexts": "retrieval_context",
+    }
+
+
+class RagasMetric:
+    """Aggregate metric: the unweighted mean of four ragas metrics.
+
+    Runs contextual precision, contextual recall, faithfulness and answer
+    relevancy on the test case and keeps the individual results in
+    ``score_breakdown``.
+    """
+
+    def __init__(
+        self,
+        threshold: float = 0.3,
+        model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo",
+        embeddings: Optional[Embeddings] = None,
+    ):
+        """Store the configuration shared by the underlying metrics.
+
+        Args:
+            threshold: Minimum mean score for ``is_successful`` to report True.
+            model: A LangChain language model, or a string that each underlying
+                metric passes to ``HuggingFaceEndpoint`` as ``endpoint_url``.
+            embeddings: Embedding model passed to the answer relevancy metric.
+        """
+        self.threshold = threshold
+        self.model = model
+        self.embeddings = embeddings
+        # Defined up front so is_successful() does not fail before measure() has run.
+        self.score = None
+        self.success = False
+        self.score_breakdown = {}
+
+    async def a_measure(self, test_case: Dict):
+        """Async entry point; runs the synchronous ``measure`` directly."""
+        return self.measure(test_case)
+
+    def measure(self, test_case: Dict):
+        """Measure every underlying metric and average the results.
+
+        Args:
+            test_case: Dict with ``input``, ``actual_output``,
+                ``expected_output`` and ``retrieval_context`` entries.
+
+        Returns:
+            The mean of the individual scores, also stored in ``self.score``.
+
+        Raises:
+            ModuleNotFoundError: If ``ragas`` or ``datasets`` is not installed.
+        """
+        # Fail fast with an actionable message before any metric is built.
+        _import_ragas_deps()
+
+        metrics = [
+            RAGASContextualPrecisionMetric(model=self.model),
+            RAGASContextualRecallMetric(model=self.model),
+            RAGASFaithfulnessMetric(model=self.model),
+            RAGASAnswerRelevancyMetric(model=self.model, embeddings=self.embeddings),
+        ]
+        score_breakdown = {metric.__name__: metric.measure(test_case) for metric in metrics}
+        ragas_score = sum(score_breakdown.values()) / len(score_breakdown)
+
+        self.success = ragas_score >= self.threshold
+        self.score = ragas_score
+        self.score_breakdown = score_breakdown
+        return self.score
+
+    def is_successful(self):
+        """Return whether the last measured mean score met the threshold."""
+        return self.success
+
+    @property
+    def __name__(self):
+        return "RAGAS"
diff --git a/requirements.txt b/requirements.txt
index cc3859dd..483cf00a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
+langchain_community
+langchain_huggingface
 lm-eval==0.4.2
+ragas
diff --git a/tests/requirements.txt b/tests/requirements.txt
index cc3859dd..483cf00a 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,2 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
+langchain_community
+langchain_huggingface
 lm-eval==0.4.2
+ragas
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
new file mode 100644
index 00000000..66d30964
--- /dev/null
+++ b/tests/test_ragas.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#
+import unittest
+
+from evals.metrics.ragas import RagasMetric
+
+
+class TestRagasMetric(unittest.TestCase):
+
+    @unittest.skip("need assign localhost id")
+    def test_ragas(self):
+        # Replace this with the actual output from your LLM application
+        actual_output = "We offer a 30-day full refund at no extra cost."
+
+        # Replace this with the expected output from your RAG generator
+        expected_output = "You are eligible for a 30 day full refund at no extra cost."
+
+        # Replace this with the actual retrieved context from your RAG pipeline
+        retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]
+        from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+
+        embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+        metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings)
+        test_case = {
+            "input": "What if these shoes don't fit?",
+            "actual_output": actual_output,
+            "expected_output": expected_output,
+            "retrieval_context": retrieval_context,
+        }
+
+        metric.measure(test_case)
+        print(metric.score)
+
+
+if __name__ == "__main__":
+    unittest.main()