DRAFT: Feature/accuracy for qa task #394

Merged: 17 commits, May 11, 2023
12 changes: 10 additions & 2 deletions nlptest/datahandler/datasource.py
@@ -458,9 +458,17 @@ def load_data(self):
data = []
with jsonlines.open(self._file_path) as reader:
for item in reader:
expected_results = item.get("answer_and_def_correct_predictions", item.get("answer", None))
if isinstance(expected_results, (str, bool)):
    expected_results = [str(expected_results)]

data.append(
QASample(original_question=item['question'], original_context=item.get(
'passage', "-"), task=self.task, dataset_name=self._file_path.split('/')[-2])
QASample(
original_question=item['question'],
original_context=item.get('passage', "-"),
expected_results=expected_results,
task=self.task,
dataset_name=self._file_path.split('/')[-2]
)
)

return data
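For reference, a minimal sketch of the jsonlines records this loader accepts; the field names ("question", "passage", "answer", "answer_and_def_correct_predictions") come from the code above, while the values are illustrative only.

# Hypothetical records; "answer_and_def_correct_predictions" is preferred,
# "answer" is the fallback, and a bare string/bool answer is wrapped as a one-item list.
records = [
    {"question": "is the sky blue", "passage": "The sky appears blue ...", "answer": True},
    {"question": "What is the capital of France?", "passage": "-",
     "answer_and_def_correct_predictions": ["Paris"]},
]
# Written one JSON object per line (e.g. with jsonlines.Writer), load_data() turns the first
# record into a QASample with expected_results == ["True"] and the second into ["Paris"];
# dataset_name is taken from the parent folder of the file path.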
6 changes: 5 additions & 1 deletion nlptest/modelhandler/llm_modelhandler.py
@@ -35,16 +35,20 @@ def load_model(cls, hub: str, path: str, *args, **kwargs):
Please install langchain by pip install langchain''')
except ValidationError as e:
error_msg = [err['loc'][0] for err in e.errors()]

raise ConfigError(
f"\nPlease update model_parameters section in config.yml file for {path} model in {hub}.\nmodel_parameters:\n\t{error_msg[0]}: value \n\n{error_msg} is required field(s), please provide them in config.yml "
)


def predict(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
prompt_template = PromptTemplate(**prompt)
llmchain = LLMChain(prompt=prompt_template, llm=self.model)
return llmchain.run(**text)

def predict_raw(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
"""Alias of the 'predict' method"""
return self.predict(text, prompt, *args, **kwargs)

def __call__(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
"""Alias of the 'predict' method"""
return self.predict(text, prompt, *args, **kwargs)
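A minimal usage sketch of the dict shapes this handler expects; the shapes are taken from predict above, while the hub, model path, and loader call are placeholders.

# `prompt` is unpacked into langchain's PromptTemplate, and `text` is unpacked into
# LLMChain.run, so its keys must match the template's input_variables.
prompt = {
    "template": "Context: {context}\nQuestion: {question}\nAnswer in one word.",
    "input_variables": ["context", "question"],
}
text = {"context": "The sky appears blue because ...", "question": "Is the sky blue?"}
# model = <handler>.load_model(hub="openai", path="text-davinci-003")  # placeholder names
# answer = model.predict(text=text, prompt=prompt)  # predict_raw and __call__ behave the same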
2 changes: 1 addition & 1 deletion nlptest/nlptest.py
@@ -277,7 +277,7 @@ def generated_results(self) -> Optional[pd.DataFrame]:
generated_results_df = pd.DataFrame.from_dict(
[x.to_dict() for x in self._generated_results])

return generated_results_df
return generated_results_df.fillna('-')

def augment(self, input_path: str, output_path: str, inplace: bool = False) -> "Harness":
"""
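A short sketch of why the fillna('-') above matters: different sample types serialize different fields, so the concatenated report frame has NaN holes. The column names here are illustrative only.

import pandas as pd

rows = [
    {"test_type": "uppercase", "original": "I live in Berlin", "expected_result": "LOC"},
    {"test_type": "min_bleu_score", "original_question": "Is the sky blue?"},
]
print(pd.DataFrame(rows).fillna("-"))  # missing cells render as '-' instead of NaN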
66 changes: 40 additions & 26 deletions nlptest/transform/__init__.py
@@ -1,5 +1,6 @@
from nlptest.utils.custom_types.sample import QASample, SequenceClassificationSample, NERSample
from ..utils.custom_types import Result, Sample
from .utils import (A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
from .utils import (default_user_prompt, A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
get_substitution_names, hispanic_names, inter_racial_names, male_pronouns, native_american_names,
neutral_pronouns, religion_wise_names, white_names)
from .robustness import BaseRobustness
@@ -617,17 +618,16 @@ def transform(self) -> List[Sample]:
List[Sample]:
A list of `Sample` objects representing the resulting dataset after running the robustness test.
"""
# TODO: get rid of pandas
all_samples = []
for test_name, params in self.tests.items():
data_handler_copy = [x.copy() for x in self._data_handler]

try:
y_true = pd.Series(data_handler_copy).apply(
lambda x: [y.entity for y in x.expected_results.predictions])
except:
y_true = pd.Series(data_handler_copy).apply(
lambda x: [y.label for y in x.expected_results.predictions])
if isinstance(data_handler_copy[0], NERSample):
y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
elif isinstance(data_handler_copy[0], SequenceClassificationSample):
y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
elif isinstance(data_handler_copy[0], QASample):
y_true = pd.Series(data_handler_copy).apply(lambda x: x.expected_results)

y_true = y_true.explode().apply(lambda x: x.split("-")
[-1] if isinstance(x, str) else x)
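As a reading aid, the three branches above pull ground truth from differently shaped expected_results (NERSample: .predictions entries with .entity; SequenceClassificationSample: .predictions entries with .label; QASample: a plain list of answer strings), and the explode/split step just above strips IOB prefixes. A small, self-contained illustration of that last step:

import pandas as pd

y_true = pd.Series([["B-LOC", "O"], ["I-PER"]])  # NER-style ground truth
y_true = y_true.explode().apply(lambda x: x.split("-")[-1] if isinstance(x, str) else x)
print(list(y_true))  # ['LOC', 'O', 'PER']; QA answer strings without '-' pass through unchanged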
@@ -664,29 +664,43 @@ def run(cls, sample_list: Dict[str, List[Sample]], model: ModelFactory, raw_data
raw_data (List[Sample]): The raw dataset.

"""
try:
y_true = pd.Series(raw_data).apply(
lambda x: [y.entity for y in x.expected_results.predictions])
except:
y_true = pd.Series(raw_data).apply(
lambda x: [y.label for y in x.expected_results.predictions])

len(y_true)
X_test = pd.Series(raw_data).apply(lambda x: x.original)
y_pred = X_test.apply(model.predict_raw)

valid_indices = y_true.apply(len) == y_pred.apply(len)
y_true = y_true[valid_indices]
y_pred = y_pred[valid_indices]

y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

if isinstance(raw_data[0], NERSample):
y_true = pd.Series(raw_data).apply(lambda x: [y.entity for y in x.expected_results.predictions])
X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
y_pred = X_test.apply(model.predict_raw)
valid_indices = y_true.apply(len) == y_pred.apply(len)
y_true = y_true[valid_indices]
y_pred = y_pred[valid_indices]
y_true = y_true.explode()
y_pred = y_pred.explode()
y_pred = y_pred.apply(lambda x: x.split("-")[-1])
y_true = y_true.apply(lambda x: x.split("-")[-1])

elif isinstance(raw_data[0], SequenceClassificationSample):
y_true = pd.Series(raw_data).apply(lambda x: [y.label for y in x.expected_results.predictions])
y_true = y_true.apply(lambda x: x[0])
X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
y_pred = X_test.apply(model.predict_raw)
y_true = y_true.explode()
y_pred = y_pred.explode()

elif isinstance(raw_data[0], QASample):
dataset_name = raw_data[0].dataset_name.split('-')[0].lower()
user_prompt = kwargs.get('user_prompt', default_user_prompt.get(dataset_name, ""))
prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt

y_true = pd.Series(raw_data).apply(lambda x: x.expected_results)
X_test = pd.Series(raw_data)
y_pred = X_test.apply(
    lambda sample: model(
        text={'context': sample.original_context, 'question': sample.original_question},
        prompt={"template": prompt_template, 'input_variables': ["context", "question"]}
    )
)
y_pred = y_pred.apply(lambda x: x.strip())

if kwargs['is_default']:
y_pred = y_pred.apply(lambda x: '1' if x in ['pos', 'LABEL_1', 'POS'] else (
'0' if x in ['neg', 'LABEL_0', 'NEG'] else x))


supported_tests = cls.available_tests()

tasks = []
for test_name, samples in sample_list.items():
tasks.append(
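A sketch of what the QASample branch above assembles for a BoolQ-style sample; the user prompt text is only an assumed entry in default_user_prompt.

dataset_name = "BoolQ-test".split('-')[0].lower()   # -> "boolq"
user_prompt = "Answer the question based on the context. Answer True or False."  # assumed
prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt
# Each sample is then sent through the model handler shown earlier:
# model(text={'context': sample.original_context, 'question': sample.original_question},
#       prompt={'template': prompt_template, 'input_variables': ['context', 'question']})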
202 changes: 195 additions & 7 deletions nlptest/transform/accuracy.py
@@ -1,12 +1,13 @@
from abc import ABC, abstractmethod
import asyncio
from typing import Any, Dict, List

from sklearn.metrics import classification_report, f1_score
import asyncio
import logging
import evaluate

from sklearn.metrics import classification_report, f1_score
from nlptest.utils.custom_types import MinScoreOutput, MinScoreSample


class BaseAccuracy(ABC):
"""
Abstract base class for implementing accuracy measures.
@@ -63,7 +64,7 @@ class MinPrecisionScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.

Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -145,7 +146,7 @@ class MinRecallScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum recall score.

Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -228,7 +229,7 @@ class MinF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum F1 score.

Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -372,7 +373,7 @@ class MinMacroF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum macro F1 score.

Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -486,3 +487,190 @@ async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
if progress:
progress.update(1)
return sample_list

class MinEMcore(BaseAccuracy):
"""
Subclass of BaseAccuracy that implements the minimum exact match score.

Attributes:
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
"""

alias_name = "min_exact_match_score"
supported_tasks = ["question-answering"]

@staticmethod
def transform(y_true, params):
"""
Computes the minimum F1 score for the given data.

Args:
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values
params (Dict): parameters for tests configuration

Returns:
List[MinScoreSample]: The transformed data based on the minimum F1 score.
"""

min_score = params["min_score"]

sample = MinScoreSample(
category="accuracy",
test_type="min_macro_f1_score",
expected_results=MinScoreOutput(min_score=min_score)
)

return [sample]

@staticmethod
async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):

"""
Computes the exact match score for the given data and updates each sample with the result.

Args:
sample_list (List[MinScoreSample]): List of samples to be transformed.
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values

"""
progress = kwargs.get("progress_bar", False)

em = evaluate.load("exact_match")
y_true = [x[0] for x in y_true]
result = em.compute(references=y_true, predictions=y_pred)["exact_match"]
for sample in sample_list:
sample.actual_results = MinScoreOutput(min_score=result)
sample.state = "done"
if progress:
progress.update(1)

return sample_list
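A quick illustration of the metric call above: only the first reference per sample is kept, and evaluate's exact_match returns the fraction of predictions that match their reference string exactly. The values are illustrative.

import evaluate

em = evaluate.load("exact_match")
references = ["Paris", "True"]          # first reference per sample, as in the run method
predictions = ["Paris", "False"]
print(em.compute(references=references, predictions=predictions)["exact_match"])  # 0.5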

class MinBLEUcore(BaseAccuracy):
"""
Subclass of BaseAccuracy that implements the minimum BLEU score.

Attributes:
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
"""

alias_name = "min_bleu_score"
supported_tasks = ["question-answering"]

@staticmethod
def transform(y_true, params):
"""
Computes the minimum F1 score for the given data.

Args:
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values
params (Dict): parameters for tests configuration

Returns:
List[MinScoreSample]: The transformed data based on the minimum F1 score.
"""

min_score = params["min_score"]

sample = MinScoreSample(
category="accuracy",
test_type="min_bleu_score",
expected_results=MinScoreOutput(min_score=min_score)
)

return [sample]

@staticmethod
async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):

"""
Computes the corpus-level BLEU score for the given data and updates each sample with the result.

Args:
sample_list (List[MinScoreSample]): List of samples to be transformed.
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values

"""
progress = kwargs.get("progress_bar", False)
em = evaluate.load("bleu")
result = em.compute(references=y_true, predictions=y_pred)
y_true = [[f'The answer is {y}' for y in x] for x in y_true]
y_pred = [f'The answer is {x}' for x in y_pred]

for sample in sample_list:
sample.actual_results = MinScoreOutput(min_score=result["bleu"])
sample.state = "done"
if progress:
progress.update(1)
return sample_list
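For reference, evaluate's bleu takes one list of reference strings per prediction, which is why y_true keeps its nested shape here; the 'The answer is' prefix gives very short answers some shared n-gram context. A minimal illustration with made-up values:

import evaluate

bleu = evaluate.load("bleu")
references = [["The answer is Paris"], ["The answer is True"]]  # one reference list per prediction
predictions = ["The answer is Paris", "The answer is False"]
print(bleu.compute(references=references, predictions=predictions)["bleu"])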

class MinROUGEcore(BaseAccuracy):
"""
Subclass of BaseAccuracy that implements the minimum ROUGE scores.

Attributes:
alias_name (str): The name for config.

Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
"""

alias_name = ["min_rouge1_score","min_rouge2_score","min_rougeL_score","min_rougeLsum_score"]
supported_tasks = ["question-answering"]

@staticmethod
def transform(y_true, params):
"""
Computes the minimum F1 score for the given data.

Args:
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values
params (Dict): parameters for tests configuration

Returns:
List[MinScoreSample]: The transformed data based on the minimum F1 score.
"""

min_score = params["min_score"]

sample = MinScoreSample(
category="accuracy",
test_type="min_bleu_score",
expected_results=MinScoreOutput(min_score=min_score)
)

return [sample]

@staticmethod
async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):

"""
Computes ROUGE scores for the given data and updates each sample with the requested ROUGE variant.

Args:
sample_list (List[MinScoreSample]): List of samples to be transformed.
y_true (List[Any]): True values
y_pred (List[Any]): Predicted values

"""
progress = kwargs.get("progress_bar", False)
em = evaluate.load("rouge")
result = em.compute(references=y_true, predictions=y_pred)

for sample in sample_list:
sample.actual_results = MinScoreOutput(min_score=result[sample.test_type.split('_')[1]])
sample.state = "done"
if progress:
progress.update(1)
return sample_list
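A hedged sketch of how the new QA accuracy tests might be requested; the thresholds are illustrative and the layout mirrors the existing accuracy tests, shown here as the dict the Harness config resolves to. In the ROUGE run method above, the reported value is selected by sample.test_type.split('_')[1], e.g. "min_rouge1_score" maps to result["rouge1"].

accuracy_tests = {
    "min_exact_match_score": {"min_score": 0.70},
    "min_bleu_score":        {"min_score": 0.50},
    "min_rouge1_score":      {"min_score": 0.60},  # rouge2 / rougeL / rougeLsum aliases also exist
}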
2 changes: 1 addition & 1 deletion nlptest/utils/custom_types/output.py
@@ -117,4 +117,4 @@ def __eq__(self, other: "NEROutput"):
raise NotImplementedError()


Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput, List[str], str)