Merge pull request #394 from JohnSnowLabs/feature/accuracy-for-qa-task
DRAFT: Feature/accuracy for qa task
ArshaanNazir authored May 11, 2023
2 parents c9c0772 + dd4d77f commit 3a1ac44
Showing 9 changed files with 261 additions and 43 deletions.
12 changes: 10 additions & 2 deletions nlptest/datahandler/datasource.py
@@ -458,9 +458,17 @@ def load_data(self):
        data = []
        with jsonlines.open(self._file_path) as reader:
            for item in reader:
                expected_results = item.get("answer_and_def_correct_predictions", item.get("answer", None))
                if isinstance(expected_results, str) or isinstance(expected_results, bool): expected_results = [str(expected_results)]

                data.append(
                    QASample(original_question=item['question'], original_context=item.get(
                        'passage', "-"), task=self.task, dataset_name=self._file_path.split('/')[-2])
                    QASample(
                        original_question=item['question'],
                        original_context=item.get('passage', "-"),
                        expected_results=expected_results,
                        task=self.task,
                        dataset_name=self._file_path.split('/')[-2]
                    )
                )

        return data
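For context, a minimal sketch of the normalization this loader now applies to each JSONL record. The record below is made up, but the field names ('question', 'passage', 'answer', 'answer_and_def_correct_predictions') are the ones read in the diff above.

```python
# Illustrative BoolQ-style record; 'answer' is a boolean here.
item = {"question": "is the sky blue", "passage": "The sky appears blue ...", "answer": True}

# Same fallback chain as the loader: prefer curated predictions, else the raw answer.
expected_results = item.get("answer_and_def_correct_predictions", item.get("answer", None))
if isinstance(expected_results, (str, bool)):
    expected_results = [str(expected_results)]

print(expected_results)  # ['True'] -- bare strings/booleans become a one-element list of strings
```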
6 changes: 5 additions & 1 deletion nlptest/modelhandler/llm_modelhandler.py
@@ -35,16 +35,20 @@ def load_model(cls, hub: str, path: str, *args, **kwargs):
                Please install langchain by pip install langchain''')
        except ValidationError as e:
            error_msg = [err['loc'][0] for err in e.errors()]

            raise ConfigError(
                f"\nPlease update model_parameters section in config.yml file for {path} model in {hub}.\nmodel_parameters:\n\t{error_msg[0]}: value \n\n{error_msg} is required field(s), please provide them in config.yml "
            )

    def predict(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        prompt_template = PromptTemplate(**prompt)
        llmchain = LLMChain(prompt=prompt_template, llm=self.model)
        return llmchain.run(**text)

    def predict_raw(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        """Alias of the 'predict' method"""
        return self.predict(text, prompt, *args, **kwargs)

    def __call__(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        """Alias of the 'predict' method"""
        return self.predict(text, prompt, *args, **kwargs)
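As a usage sketch (not part of the diff), this shows the shape of the `text` and `prompt` arguments that `predict` expects. It only formats the prompt locally, so no model call or API key is needed, and it assumes a langchain version where `PromptTemplate` is importable from the top-level package, as in this module.

```python
from langchain import PromptTemplate

prompt = {
    "template": "Context: {context}\nQuestion: {question}\nAnswer:",
    "input_variables": ["context", "question"],
}
text = {"context": "BoolQ passages are short paragraphs.", "question": "Are the passages short?"}

# predict() builds PromptTemplate(**prompt) and runs an LLMChain with llmchain.run(**text);
# formatting the template directly shows the string the chain would send to the model.
print(PromptTemplate(**prompt).format(**text))
```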
2 changes: 1 addition & 1 deletion nlptest/nlptest.py
@@ -277,7 +277,7 @@ def generated_results(self) -> Optional[pd.DataFrame]:
        generated_results_df = pd.DataFrame.from_dict(
            [x.to_dict() for x in self._generated_results])

        return generated_results_df
        return generated_results_df.fillna('-')

    def augment(self, input_path: str, output_path: str, inplace: bool = False) -> "Harness":
        """
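A small, self-contained illustration of what the added `.fillna('-')` does to the generated-results table: columns that a given sample does not populate (for example an expected result on some tasks) render as '-' instead of NaN. The column names here are made up for the example.

```python
import pandas as pd

rows = [
    {"test_type": "uppercase", "expected_result": "PASS"},
    {"test_type": "min_exact_match_score"},  # no expected_result for this row
]
print(pd.DataFrame(rows).fillna('-'))
```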
66 changes: 40 additions & 26 deletions nlptest/transform/__init__.py
@@ -1,5 +1,6 @@
from nlptest.utils.custom_types.sample import QASample, SequenceClassificationSample, NERSample
from ..utils.custom_types import Result, Sample
from .utils import (A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
from .utils import (default_user_prompt, A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
                    get_substitution_names, hispanic_names, inter_racial_names, male_pronouns, native_american_names,
                    neutral_pronouns, religion_wise_names, white_names)
from .robustness import BaseRobustness
@@ -617,17 +618,16 @@ def transform(self) -> List[Sample]:
            List[Sample]:
                A list of `Sample` objects representing the resulting dataset after running the robustness test.
        """
        # TODO: get rid of pandas
        all_samples = []
        for test_name, params in self.tests.items():
            data_handler_copy = [x.copy() for x in self._data_handler]

            try:
                y_true = pd.Series(data_handler_copy).apply(
                    lambda x: [y.entity for y in x.expected_results.predictions])
            except:
                y_true = pd.Series(data_handler_copy).apply(
                    lambda x: [y.label for y in x.expected_results.predictions])
            if isinstance(data_handler_copy[0], NERSample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
            elif isinstance(data_handler_copy[0], SequenceClassificationSample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
            elif isinstance(data_handler_copy[0], QASample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: x.expected_results)

            y_true = y_true.explode().apply(lambda x: x.split("-")
                                            [-1] if isinstance(x, str) else x)
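The explode/split step above flattens per-sample label lists and strips BIO-style prefixes, while leaving non-string values untouched. A standalone sketch with made-up labels:

```python
import pandas as pd

y_true = pd.Series([["B-PER", "I-PER", "O"], ["B-LOC"]])  # made-up NER tags
y_true = y_true.explode().apply(lambda x: x.split("-")[-1] if isinstance(x, str) else x)
print(list(y_true))  # ['PER', 'PER', 'O', 'LOC']
```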
@@ -664,29 +664,43 @@ def run(cls, sample_list: Dict[str, List[Sample]], model: ModelFactory, raw_data
            raw_data (List[Sample]): The raw dataset.
        """
        try:
            y_true = pd.Series(raw_data).apply(
                lambda x: [y.entity for y in x.expected_results.predictions])
        except:
            y_true = pd.Series(raw_data).apply(
                lambda x: [y.label for y in x.expected_results.predictions])

        len(y_true)
        X_test = pd.Series(raw_data).apply(lambda x: x.original)
        y_pred = X_test.apply(model.predict_raw)

        valid_indices = y_true.apply(len) == y_pred.apply(len)
        y_true = y_true[valid_indices]
        y_pred = y_pred[valid_indices]

        y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
        y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

        if isinstance(raw_data[0], NERSample):
            y_true = pd.Series(raw_data).apply(lambda x: [y.entity for y in x.expected_results.predictions])
            X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
            y_pred = X_test.apply(model.predict_raw)
            valid_indices = y_true.apply(len) == y_pred.apply(len)
            y_true = y_true[valid_indices]
            y_pred = y_pred[valid_indices]
            y_true = y_true.explode()
            y_pred = y_pred.explode()
            y_pred = y_pred.apply(lambda x: x.split("-")[-1])
            y_true = y_true.apply(lambda x: x.split("-")[-1])

        elif isinstance(raw_data[0], SequenceClassificationSample):
            y_true = pd.Series(raw_data).apply(lambda x: [y.label for y in x.expected_results.predictions])
            y_true = y_true.apply(lambda x: x[0])
            X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
            y_pred = X_test.apply(model.predict_raw)
            y_true = y_true.explode()
            y_pred = y_pred.explode()

        elif isinstance(raw_data[0], QASample):
            dataset_name = raw_data[0].dataset_name.split('-')[0].lower()
            user_prompt = kwargs.get('user_prompt', default_user_prompt.get(dataset_name, ""))
            prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt

            y_true = pd.Series(raw_data).apply(lambda x: x.expected_results)
            X_test = pd.Series(raw_data)
            y_pred = X_test.apply(lambda sample: model(text={'context': sample.original_context, 'question': sample.original_question}, prompt={"template": prompt_template, 'input_variables': ["context", "question"]}))
            y_pred = y_pred.apply(lambda x: x.strip())

        if kwargs['is_default']:
            y_pred = y_pred.apply(lambda x: '1' if x in ['pos', 'LABEL_1', 'POS'] else (
                '0' if x in ['neg', 'LABEL_0', 'NEG'] else x))

        supported_tests = cls.available_tests()

        tasks = []
        for test_name, samples in sample_list.items():
            tasks.append(
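To make the QA branch above easier to follow, here is a hedged sketch of just the prompt assembly: the dataset name is derived from the sample's `dataset_name`, a per-dataset instruction is looked up in `default_user_prompt`, and the result is appended to a fixed context/question template. The 'boolq' entry below is a stand-in, not the actual contents of `default_user_prompt`.

```python
# Stand-in mapping; the real default_user_prompt dict is imported from .utils in this PR.
default_user_prompt = {"boolq": "Answer the question with True or False."}

dataset_name = "BoolQ-dev".split('-')[0].lower()          # -> 'boolq'
user_prompt = default_user_prompt.get(dataset_name, "")
prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt

print(prompt_template.format(context="<passage>", question="<question>"))
```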
202 changes: 195 additions & 7 deletions nlptest/transform/accuracy.py
@@ -1,12 +1,13 @@
from abc import ABC, abstractmethod
import asyncio
from typing import Any, Dict, List

from sklearn.metrics import classification_report, f1_score
import asyncio
import logging
import evaluate

from sklearn.metrics import classification_report, f1_score
from nlptest.utils.custom_types import MinScoreOutput, MinScoreSample


class BaseAccuracy(ABC):
"""
Abstract base class for implementing accuracy measures.
@@ -63,7 +64,7 @@ class MinPrecisionScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -145,7 +146,7 @@ class MinRecallScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -228,7 +229,7 @@ class MinF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -372,7 +373,7 @@ class MinMacroF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -486,3 +487,190 @@ async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
            if progress:
                progress.update(1)
        return sample_list

class MinEMcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum exact match score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = "min_exact_match_score"
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum exact match score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum exact match score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_macro_f1_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the exact match score for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)

        em = evaluate.load("exact_match")
        y_true = [x[0] for x in y_true]
        result = em.compute(references=y_true, predictions=y_pred)["exact_match"]
        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result)
            sample.state = "done"
            if progress:
                progress.update(1)

        return sample_list
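A standalone sketch of the exact-match computation that `MinEMcore.run` relies on, using the Hugging Face `evaluate` package; the reference and prediction strings are made up. As in `run`, only the first gold answer per sample is compared.

```python
import evaluate

em = evaluate.load("exact_match")
references = ["True", "False"]   # first gold answer per sample, mirroring y_true = [x[0] for x in y_true]
predictions = ["True", "True"]
print(em.compute(references=references, predictions=predictions)["exact_match"])  # 0.5
```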

class MinBLEUcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum BLEU score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = "min_bleu_score"
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum BLEU score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum BLEU score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_bleu_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the BLEU score for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)
        em = evaluate.load("bleu")
        result = em.compute(references=y_true, predictions=y_pred)
        y_true = [[f'The answer is {y}' for y in x] for x in y_true]
        y_pred = [f'The answer is {x}' for x in y_pred]

        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result["bleu"])
            sample.state = "done"
            if progress:
                progress.update(1)
        return sample_list
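A standalone sketch of the BLEU call used in `MinBLEUcore.run`; with `evaluate`, each prediction string is scored against a list of reference strings, which matches the list-of-lists shape of the QA `y_true` values. The example strings are made up.

```python
import evaluate

bleu = evaluate.load("bleu")
references = [["The answer is True"], ["The answer is False"]]   # one list of references per prediction
predictions = ["The answer is True", "The answer is True"]
print(bleu.compute(references=references, predictions=predictions)["bleu"])
```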

class MinROUGEcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum ROUGE score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = ["min_rouge1_score", "min_rouge2_score", "min_rougeL_score", "min_rougeLsum_score"]
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum ROUGE score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum ROUGE score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_bleu_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the ROUGE scores for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)
        em = evaluate.load("rouge")
        result = em.compute(references=y_true, predictions=y_pred)

        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result[sample.test_type.split('_')[1]])
            sample.state = "done"
            if progress:
                progress.update(1)
        return sample_list
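A standalone sketch of the ROUGE call used in `MinROUGEcore.run`; the keys of the result dict ('rouge1', 'rouge2', 'rougeL', 'rougeLsum') are what `sample.test_type.split('_')[1]` is meant to index. The example strings are made up.

```python
import evaluate

rouge = evaluate.load("rouge")
references = ["The answer is True", "The answer is False"]
predictions = ["The answer is True", "The answer is True"]
result = rouge.compute(references=references, predictions=predictions)
print(result["rouge1"], result["rougeL"])
```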
2 changes: 1 addition & 1 deletion nlptest/utils/custom_types/output.py
@@ -117,4 +117,4 @@ def __eq__(self, other: "NEROutput"):
raise NotImplementedError()


Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput, List[str], str)
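For completeness, a minimal illustration of why `List[str]` and `str` were added to the `Result` constraints: QA samples now store their gold answers as plain strings or lists of strings (see the datasource change above), so `Result`-typed fields must admit those types. The snippet below is a simplified, hypothetical stand-in, not the library's definition.

```python
from typing import List, TypeVar

Result = TypeVar("Result", List[str], str)  # simplified: only the newly added constraints

def describe(expected: Result) -> str:
    # Works for both a bare answer string and a list of acceptable answers.
    return f"{type(expected).__name__}: {expected}"

print(describe("True"))
print(describe(["True", "False"]))
```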