Minimize requirements for user data for OPEA ragas (#136)
* minimized required fields/columns in user data

Signed-off-by: aasavari <[email protected]>

* add bench-target as the prefix of output folder (#133)

Signed-off-by: Yingchun Guo <[email protected]>
Signed-off-by: aasavari <[email protected]>

* remove examples. (#135)

Co-authored-by: root <[email protected]>
Signed-off-by: aasavari <[email protected]>

* minor naming correction to maintain consistency

Signed-off-by: aasavari <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: aasavari <[email protected]>

* Add hyperlinks and paths validation. (#132)

Signed-off-by: ZePan110 <[email protected]>
Signed-off-by: aasavari <[email protected]>

* added support for older versions of ragas

Signed-off-by: aasavari <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: aasavari <[email protected]>
Signed-off-by: Yingchun Guo <[email protected]>
Signed-off-by: ZePan110 <[email protected]>
Co-authored-by: Ying Chun Guo <[email protected]>
Co-authored-by: lkk <[email protected]>
Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ZePan110 <[email protected]>
6 people authored Sep 24, 2024
1 parent 8653efb · commit f1593ea
Showing 2 changed files with 37 additions and 32 deletions.
evals/metrics/ragas/ragas.py (68 changes: 36 additions & 32 deletions)
@@ -2,7 +2,6 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

#
import os
from typing import Dict, Optional, Union
@@ -26,7 +25,6 @@ def __init__(
        embeddings: Optional[Embeddings] = None,
        metrics: Optional[list[str]] = None,
    ):

        self.threshold = threshold
        self.model = model
        self.embeddings = embeddings
@@ -42,7 +40,13 @@ def __init__(
            "reference_free_rubrics_score",
        ]

    async def a_measure(self, test_case: Dict):
        return self.measure(test_case)

    def measure(self, test_case: Dict):
        # sends to server
        try:
            from ragas import evaluate
            from ragas.metrics import (
                answer_correctness,
                answer_relevancy,
@@ -55,12 +59,10 @@ def __init__(
            )
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")

        try:
            from datasets import Dataset
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Please install dataset")

        self.metrics_instance = {
            "answer_correctness": answer_correctness,
            "answer_relevancy": answer_relevancy,
@@ -71,7 +73,6 @@ def __init__(
            "context_utilization": context_utilization,
            "reference_free_rubrics_score": reference_free_rubrics_score,
        }

        # Set LLM model
        openai_key = os.getenv("OPENAI_API_KEY", None)
        if openai_key is not None:
@@ -81,14 +82,13 @@ def __init__(
            print("LLM endpoint: ", self.model)
            self.chat_model = HuggingFaceEndpoint(
                endpoint_url=self.model,
                task="text-generation",
                max_new_tokens=1024,
                do_sample=False,
                timeout=600,
            )
        else:
            print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.")
            self.chat_model = self.model

        # initialize metrics
        # Create a dataset from the test case
        # Convert the Dict to a format compatible with Dataset
        if self.metrics is not None:
            tmp_metrics = []
            # check supported list
@@ -106,10 +106,8 @@ def __init__(
                if metric == "answer_relevancy" and self.embeddings is None:
                    raise ValueError("answer_relevancy metric need provide embeddings model.")
                tmp_metrics.append(self.metrics_instance[metric])

            self.metrics = tmp_metrics

        else: # default metrics
        else:
            self.metrics = [
                answer_relevancy,
                faithfulness,
@@ -118,28 +116,34 @@ def __init__(
                context_precision,
                context_recall,
            ]

    async def a_measure(self, test_case: Dict):
        return self.measure(test_case)

    def measure(self, test_case: Dict):
        from ragas import evaluate

        try:
            from datasets import Dataset
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Please install dataset")

        # Create a dataset from the test case
        # Convert the Dict to a format compatible with Dataset
        data = {
            "question": test_case["question"],
            "contexts": test_case["contexts"],
            "answer": test_case["answer"],
            "ground_truth": test_case["ground_truth"],
        # Find necessary input fields using the given metrics
        _required_columns = set()
        is_latest = faithfulness
        column_map = { # this column maps new naming style in ragas to their old naming style
            "user_input": "question",
            "response": "answer",
            "reference": "ground_truth",
            "retrieved_contexts": "contexts",
        }
        for metric in self.metrics:
            if hasattr(metric, "_required_columns"):
                for column in list(metric._required_columns.values())[0]:
                    _required_columns.add(column_map[column])
            elif hasattr(metric, "evaluation_mode"):
                from ragas.metrics.base import get_required_columns

                for column in get_required_columns(metric.evaluation_mode):
                    _required_columns.add(column)
            else:
                print("metric has no attribute denoting required columns")

        print("Required columns for given list of metrics are = {}".format(_required_columns))

        # get only necessary columns from test case
        data = {column: test_case[column] for column in _required_columns}
        dataset = Dataset.from_dict(data)

        # evaluate
        self.score = evaluate(
            dataset,
            metrics=self.metrics,
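The net effect of the ragas.py change: measure() now derives the set of required input columns from the selected metrics (via `_required_columns` on newer ragas releases, or `evaluation_mode` on older ones) and pulls only those keys from the user's test case. A minimal usage sketch of the new contract, assuming ragas and datasets are installed, that the class is exported as `RagasMetric` from `evals.metrics.ragas`, that no OPENAI_API_KEY is set, and that an LLM endpoint is reachable at the placeholder URL; the sample data is illustrative:

from evals.metrics.ragas import RagasMetric  # assumed import path

# faithfulness only needs question, answer, and contexts, so ground_truth
# can now be omitted from the test case entirely.
metric = RagasMetric(
    threshold=0.5,
    model="http://localhost:8080",  # placeholder text-generation endpoint URL
    metrics=["faithfulness"],
)

test_case = {
    "question": ["What is OPEA?"],
    "answer": ["OPEA is an open platform for enterprise AI."],
    "contexts": [["OPEA stands for Open Platform for Enterprise AI."]],
}
print(metric.measure(test_case))

Internally, column_map translates ragas' newer column names (user_input, response, reference, retrieved_contexts) back to the older question/answer/ground_truth/contexts keys, so the same test_case dict works across ragas versions.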
tests/requirements.txt (1 change: 1 addition & 0 deletions)
@@ -1,4 +1,5 @@
bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118
jieba
langchain_community
langchain_huggingface
lm-eval==0.4.3
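The added langchain_huggingface entry presumably supplies the HuggingFaceEndpoint wrapper that ragas.py constructs above; a quick sanity check, assuming that package's standard export (the endpoint URL is a placeholder):

from langchain_huggingface import HuggingFaceEndpoint  # assumed import path

# Mirrors the construction in evals/metrics/ragas/ragas.py.
llm = HuggingFaceEndpoint(
    endpoint_url="http://localhost:8080",  # placeholder TGI endpoint
    task="text-generation",
    max_new_tokens=1024,
    do_sample=False,
    timeout=600,
)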
