From f1593ea4243af6dca54a557d29f64f7b511932ca Mon Sep 17 00:00:00 2001
From: adkakne <113945294+adkakne@users.noreply.github.com>
Date: Tue, 24 Sep 2024 00:48:43 -0700
Subject: [PATCH] Minimize requirements for user data for OPEA ragas (#136)

* minimized required fields/columns in user data

Signed-off-by: aasavari

* add bench-target as the prefix of output folder (#133)

Signed-off-by: Yingchun Guo
Signed-off-by: aasavari

* remove examples. (#135)

Co-authored-by: root
Signed-off-by: aasavari

* minor naming correction to maintain consistency

Signed-off-by: aasavari

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: aasavari

* Add hyperlinks and paths validation. (#132)

Signed-off-by: ZePan110
Signed-off-by: aasavari

* added support for older version of ragas

Signed-off-by: aasavari

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: aasavari
Signed-off-by: Yingchun Guo
Signed-off-by: ZePan110
Co-authored-by: Ying Chun Guo
Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com>
Co-authored-by: root
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ZePan110
---
 evals/metrics/ragas/ragas.py | 68 +++++++++++++++++++-----------------
 tests/requirements.txt       |  1 +
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index 35449c08..f8d76664 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
 #
 import os
 from typing import Dict, Optional, Union
@@ -26,7 +25,6 @@ def __init__(
         embeddings: Optional[Embeddings] = None,
         metrics: Optional[list[str]] = None,
     ):
-
         self.threshold = threshold
         self.model = model
         self.embeddings = embeddings
@@ -42,7 +40,13 @@ def __init__(
             "reference_free_rubrics_score",
         ]

+    async def a_measure(self, test_case: Dict):
+        return self.measure(test_case)
+
+    def measure(self, test_case: Dict):
+        # sends to server
         try:
+            from ragas import evaluate
             from ragas.metrics import (
                 answer_correctness,
                 answer_relevancy,
@@ -55,12 +59,10 @@ def __init__(
             )
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")
-
         try:
             from datasets import Dataset
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")
-
         self.metrics_instance = {
             "answer_correctness": answer_correctness,
             "answer_relevancy": answer_relevancy,
@@ -71,7 +73,6 @@ def __init__(
             "context_utilization": context_utilization,
             "reference_free_rubrics_score": reference_free_rubrics_score,
         }
-
         # Set LLM model
         openai_key = os.getenv("OPENAI_API_KEY", None)
         if openai_key is not None:
@@ -81,14 +82,13 @@ def __init__(
             print("LLM endpoint: ", self.model)
             self.chat_model = HuggingFaceEndpoint(
                 endpoint_url=self.model,
-                task="text-generation",
-                max_new_tokens=1024,
-                do_sample=False,
+                timeout=600,
             )
         else:
+            print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.")
             self.chat_model = self.model
-
-        # initialize metrics
+        # Create a dataset from the test case
+        # Convert the Dict to a format compatible with Dataset
         if self.metrics is not None:
             tmp_metrics = []
             # check supported list
@@ -106,10 +106,8 @@ def __init__(
                 if metric == "answer_relevancy" and self.embeddings is None:
                     raise ValueError("answer_relevancy metric need provide embeddings model.")
                 tmp_metrics.append(self.metrics_instance[metric])
-
             self.metrics = tmp_metrics
-
-        else:  # default metrics
+        else:
             self.metrics = [
                 answer_relevancy,
                 faithfulness,
@@ -118,28 +116,34 @@ def __init__(
             context_precision,
             context_recall,
         ]
-
-    async def a_measure(self, test_case: Dict):
-        return self.measure(test_case)
-
-    def measure(self, test_case: Dict):
-        from ragas import evaluate
-
-        try:
-            from datasets import Dataset
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError("Please install dataset")
-
-        # Create a dataset from the test case
-        # Convert the Dict to a format compatible with Dataset
-        data = {
-            "question": test_case["question"],
-            "contexts": test_case["contexts"],
-            "answer": test_case["answer"],
-            "ground_truth": test_case["ground_truth"],
+        # Find necessary input fields using the given metrics
+        _required_columns = set()
+        is_latest = faithfulness
+        column_map = {  # this column maps new naming style in ragas to their old naming style
+            "user_input": "question",
+            "response": "answer",
+            "reference": "ground_truth",
+            "retrieved_contexts": "contexts",
         }
+        for metric in self.metrics:
+            if hasattr(metric, "_required_columns"):
+                for column in list(metric._required_columns.values())[0]:
+                    _required_columns.add(column_map[column])
+            elif hasattr(metric, "evaluation_mode"):
+                from ragas.metrics.base import get_required_columns
+
+                for column in get_required_columns(metric.evaluation_mode):
+                    _required_columns.add(column)
+            else:
+                print("metric has no attribute denoting required columns")
+
+        print("Required columns for given list of metrics are = {}".format(_required_columns))
+
+        # get only necessary columns from test case
+        data = {column: test_case[column] for column in _required_columns}
         dataset = Dataset.from_dict(data)

+        # evaluate
         self.score = evaluate(
             dataset,
             metrics=self.metrics,
diff --git a/tests/requirements.txt b/tests/requirements.txt
index cf468d39..d2cd20b0 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,4 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118
+jieba
 langchain_community
 langchain_huggingface
 lm-eval==0.4.3