diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index 35449c08..672ac39e 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -1,24 +1,23 @@
+
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
 #
 import os
 from typing import Dict, Optional, Union
-
+from langchain_huggingface import HuggingFaceEndpoint
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models import BaseLanguageModel
-from langchain_huggingface import HuggingFaceEndpoint
+import sys
+sys.path.append('/home/akakne/miniforge3/envs/recsys/bin')
 
 
 def format_ragas_metric_name(name: str):
     return f"{name} (ragas)"
 
-
 class RagasMetric:
     """This metric checks if the output is more than 3 letters."""
-
     def __init__(
         self,
         threshold: float = 0.3,
@@ -26,7 +25,6 @@ def __init__(
         embeddings: Optional[Embeddings] = None,
         metrics: Optional[list[str]] = None,
     ):
-
         self.threshold = threshold
         self.model = model
         self.embeddings = embeddings
@@ -39,10 +37,14 @@ def __init__(
             "context_recall",
             "faithfulness",
             "context_utilization",
-            "reference_free_rubrics_score",
+            # "reference_free_rubrics_score",
         ]
-
+    async def a_measure(self, test_case: Dict):
+        return self.measure(test_case)
+    def measure(self, test_case: Dict):
+        # sends to server
         try:
+            from ragas import evaluate
             from ragas.metrics import (
                 answer_correctness,
                 answer_relevancy,
@@ -51,16 +53,14 @@ def __init__(
                 context_recall,
                 context_utilization,
                 faithfulness,
-                reference_free_rubrics_score,
+                # reference_free_rubrics_score,
             )
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")
-
         try:
             from datasets import Dataset
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")
-
         self.metrics_instance = {
             "answer_correctness": answer_correctness,
             "answer_relevancy": answer_relevancy,
@@ -69,26 +69,24 @@ def __init__(
             "context_recall": context_recall,
             "faithfulness": faithfulness,
             "context_utilization": context_utilization,
-            "reference_free_rubrics_score": reference_free_rubrics_score,
+            # "reference_free_rubrics_score": reference_free_rubrics_score,
         }
-
         # Set LLM model
         openai_key = os.getenv("OPENAI_API_KEY", None)
         if openai_key is not None:
             print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.")
             self.model = None
         if isinstance(self.model, str):
-            print("LLM endpoint: ", self.model)
-            self.chat_model = HuggingFaceEndpoint(
+            print("Loading a HuggingFace Endpoint")
+            chat_model = HuggingFaceEndpoint(
                 endpoint_url=self.model,
-                task="text-generation",
-                max_new_tokens=1024,
-                do_sample=False,
+                timeout=600,
             )
         else:
-            self.chat_model = self.model
-
-        # initialize metrics
+            print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.")
+            chat_model = self.model
+        # Create a dataset from the test case
+        # Convert the Dict to a format compatible with Dataset
         if self.metrics is not None:
             tmp_metrics = []
             # check supported list
@@ -106,10 +104,8 @@ def __init__(
                 if metric == "answer_relevancy" and self.embeddings is None:
                     raise ValueError("answer_relevancy metric need provide embeddings model.")
                 tmp_metrics.append(self.metrics_instance[metric])
-
             self.metrics = tmp_metrics
-
-        else:  # default metrics
+        else:
             self.metrics = [
                 answer_relevancy,
                 faithfulness,
@@ -118,39 +114,31 @@ def __init__(
                 context_precision,
                 context_recall,
             ]
-
-    async def a_measure(self, test_case: Dict):
-        return self.measure(test_case)
-
-    def measure(self, test_case: Dict):
-        from ragas import evaluate
-
-        try:
-            from datasets import Dataset
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError("Please install dataset")
-
-        # Create a dataset from the test case
-        # Convert the Dict to a format compatible with Dataset
-        data = {
-            "question": test_case["question"],
-            "contexts": test_case["contexts"],
-            "answer": test_case["answer"],
-            "ground_truth": test_case["ground_truth"],
+        # Find necessary input fields using the given metrics
+        _required_columns = set()
+        for metric in self.metrics:
+            for column in list(metric._required_columns.values())[0]:
+                _required_columns.add(column)
+        column2field = {
+            "user_input" : "question",
+            "response" : "answer",
+            "reference" : "ground_truth",
+            "retrieved_contexts" : "contexts"
         }
+        _required_fields = [column2field[column] for column in _required_columns]
+        data = {field : test_case[field] for field in _required_fields}
         dataset = Dataset.from_dict(data)
 
+        # evaluate
         self.score = evaluate(
             dataset,
             metrics=self.metrics,
-            llm=self.chat_model,
+            llm=chat_model,
             embeddings=self.embeddings,
         )
         return self.score
-
     def is_successful(self):
         return self.success
-
     @property
     def __name__(self):
         return "RAGAS"
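
For reviewers, a minimal usage sketch of the patched RagasMetric, assuming a TGI-style text-generation endpoint is already serving a model; the endpoint URL, embedding model, sample data, and import path below are placeholders inferred from the file location, not part of this PR:

# Hypothetical usage sketch; endpoint URL, embedding model, and sample data are placeholders.
from langchain_huggingface import HuggingFaceEmbeddings

from evals.metrics.ragas.ragas import RagasMetric  # import path inferred from evals/metrics/ragas/ragas.py

# Passing a string as `model` triggers the HuggingFaceEndpoint branch added in this PR.
metric = RagasMetric(
    threshold=0.3,
    model="http://localhost:8008",  # placeholder endpoint URL
    embeddings=HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"),  # required by answer_relevancy
    metrics=["answer_relevancy", "faithfulness"],
)

# Column values are lists so Dataset.from_dict() builds one row per entry;
# only the fields required by the selected metrics are actually used.
test_case = {
    "question": ["What does this metric evaluate?"],
    "answer": ["It evaluates RAG pipelines with ragas metrics."],
    "ground_truth": ["It runs ragas metrics over RAG outputs."],
    "contexts": [["RagasMetric wraps ragas.evaluate() for RAG evaluation."]],
}

score = metric.measure(test_case)
print(score)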