Merge pull request #394 from JohnSnowLabs/feature/accuracy-for-qa-task
DRAFT: Feature/accuracy for qa task
ArshaanNazir authored May 11, 2023
2 parents c9c0772 + dd4d77f commit 3a1ac44
Showing 9 changed files with 261 additions and 43 deletions.
12 changes: 10 additions & 2 deletions nlptest/datahandler/datasource.py
@@ -458,9 +458,17 @@ def load_data(self):
        data = []
        with jsonlines.open(self._file_path) as reader:
            for item in reader:
                expected_results = item.get("answer_and_def_correct_predictions", item.get("answer", None))
                if isinstance(expected_results, str) or isinstance(expected_results, bool): expected_results = [str(expected_results)]

                data.append(
                    QASample(original_question=item['question'], original_context=item.get(
                        'passage', "-"), task=self.task, dataset_name=self._file_path.split('/')[-2])
                    QASample(
                        original_question=item['question'],
                        original_context=item.get('passage', "-"),
                        expected_results=expected_results,
                        task=self.task,
                        dataset_name=self._file_path.split('/')[-2]
                    )
                )

        return data
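For context, a minimal sketch of the normalization this loader now applies to each JSONL record. The record below is made up, but the field names ('question', 'passage', 'answer', 'answer_and_def_correct_predictions') are the ones read in the diff above.

```python
# Illustrative BoolQ-style record; 'answer' is a boolean here.
item = {"question": "is the sky blue", "passage": "The sky appears blue ...", "answer": True}

# Same fallback chain as the loader: prefer curated predictions, else the raw answer.
expected_results = item.get("answer_and_def_correct_predictions", item.get("answer", None))
if isinstance(expected_results, (str, bool)):
    expected_results = [str(expected_results)]

print(expected_results)  # ['True'] -- bare strings/booleans become a one-element list of strings
```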
6 changes: 5 additions & 1 deletion nlptest/modelhandler/llm_modelhandler.py
@@ -35,16 +35,20 @@ def load_model(cls, hub: str, path: str, *args, **kwargs):
                Please install langchain by pip install langchain''')
        except ValidationError as e:
            error_msg = [err['loc'][0] for err in e.errors()]

            raise ConfigError(
                f"\nPlease update model_parameters section in config.yml file for {path} model in {hub}.\nmodel_parameters:\n\t{error_msg[0]}: value \n\n{error_msg} is required field(s), please provide them in config.yml "
            )

    def predict(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        prompt_template = PromptTemplate(**prompt)
        llmchain = LLMChain(prompt=prompt_template, llm=self.model)
        return llmchain.run(**text)

    def predict_raw(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        """Alias of the 'predict' method"""
        return self.predict(text, prompt, *args, **kwargs)

    def __call__(self, text: Union[str, dict], prompt: dict, *args, **kwargs):
        """Alias of the 'predict' method"""
        return self.predict(text, prompt, *args, **kwargs)
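As a usage sketch (not part of the diff), this shows the shape of the `text` and `prompt` arguments that `predict` expects. It only formats the prompt locally, so no model call or API key is needed, and it assumes a langchain version where `PromptTemplate` is importable from the top-level package, as in this module.

```python
from langchain import PromptTemplate

prompt = {
    "template": "Context: {context}\nQuestion: {question}\nAnswer:",
    "input_variables": ["context", "question"],
}
text = {"context": "BoolQ passages are short paragraphs.", "question": "Are the passages short?"}

# predict() builds PromptTemplate(**prompt) and runs an LLMChain with llmchain.run(**text);
# formatting the template directly shows the string the chain would send to the model.
print(PromptTemplate(**prompt).format(**text))
```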
2 changes: 1 addition & 1 deletion nlptest/nlptest.py
@@ -277,7 +277,7 @@ def generated_results(self) -> Optional[pd.DataFrame]:
        generated_results_df = pd.DataFrame.from_dict(
            [x.to_dict() for x in self._generated_results])

        return generated_results_df
        return generated_results_df.fillna('-')

    def augment(self, input_path: str, output_path: str, inplace: bool = False) -> "Harness":
        """
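A small, self-contained illustration of what the added `.fillna('-')` does to the generated-results table: columns that a given sample does not populate (for example an expected result on some tasks) render as '-' instead of NaN. The column names here are made up for the example.

```python
import pandas as pd

rows = [
    {"test_type": "uppercase", "expected_result": "PASS"},
    {"test_type": "min_exact_match_score"},  # no expected_result for this row
]
print(pd.DataFrame(rows).fillna('-'))
```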
66 changes: 40 additions & 26 deletions nlptest/transform/__init__.py
@@ -1,5 +1,6 @@
from nlptest.utils.custom_types.sample import QASample, SequenceClassificationSample, NERSample
from ..utils.custom_types import Result, Sample
from .utils import (A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
from .utils import (default_user_prompt, A2B_DICT, asian_names, black_names, country_economic_dict, create_terminology, female_pronouns,
                    get_substitution_names, hispanic_names, inter_racial_names, male_pronouns, native_american_names,
                    neutral_pronouns, religion_wise_names, white_names)
from .robustness import BaseRobustness
@@ -617,17 +618,16 @@ def transform(self) -> List[Sample]:
            List[Sample]:
                A list of `Sample` objects representing the resulting dataset after running the robustness test.
        """
        # TODO: get rid of pandas
        all_samples = []
        for test_name, params in self.tests.items():
            data_handler_copy = [x.copy() for x in self._data_handler]

            try:
                y_true = pd.Series(data_handler_copy).apply(
                    lambda x: [y.entity for y in x.expected_results.predictions])
            except:
                y_true = pd.Series(data_handler_copy).apply(
                    lambda x: [y.label for y in x.expected_results.predictions])
            if isinstance(data_handler_copy[0], NERSample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
            elif isinstance(data_handler_copy[0], SequenceClassificationSample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
            elif isinstance(data_handler_copy[0], QASample):
                y_true = pd.Series(data_handler_copy).apply(lambda x: x.expected_results)

            y_true = y_true.explode().apply(lambda x: x.split("-")
                                            [-1] if isinstance(x, str) else x)
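The explode/split step above flattens per-sample label lists and strips BIO-style prefixes, while leaving non-string values untouched. A standalone sketch with made-up labels:

```python
import pandas as pd

y_true = pd.Series([["B-PER", "I-PER", "O"], ["B-LOC"]])  # made-up NER tags
y_true = y_true.explode().apply(lambda x: x.split("-")[-1] if isinstance(x, str) else x)
print(list(y_true))  # ['PER', 'PER', 'O', 'LOC']
```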
@@ -664,29 +664,43 @@ def run(cls, sample_list: Dict[str, List[Sample]], model: ModelFactory, raw_data
            raw_data (List[Sample]): The raw dataset.
        """
        try:
            y_true = pd.Series(raw_data).apply(
                lambda x: [y.entity for y in x.expected_results.predictions])
        except:
            y_true = pd.Series(raw_data).apply(
                lambda x: [y.label for y in x.expected_results.predictions])

        len(y_true)
        X_test = pd.Series(raw_data).apply(lambda x: x.original)
        y_pred = X_test.apply(model.predict_raw)

        valid_indices = y_true.apply(len) == y_pred.apply(len)
        y_true = y_true[valid_indices]
        y_pred = y_pred[valid_indices]

        y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
        y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

        if isinstance(raw_data[0], NERSample):
            y_true = pd.Series(raw_data).apply(lambda x: [y.entity for y in x.expected_results.predictions])
            X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
            y_pred = X_test.apply(model.predict_raw)
            valid_indices = y_true.apply(len) == y_pred.apply(len)
            y_true = y_true[valid_indices]
            y_pred = y_pred[valid_indices]
            y_true = y_true.explode()
            y_pred = y_pred.explode()
            y_pred = y_pred.apply(lambda x: x.split("-")[-1])
            y_true = y_true.apply(lambda x: x.split("-")[-1])

        elif isinstance(raw_data[0], SequenceClassificationSample):
            y_true = pd.Series(raw_data).apply(lambda x: [y.label for y in x.expected_results.predictions])
            y_true = y_true.apply(lambda x: x[0])
            X_test = pd.Series(raw_data).apply(lambda sample: sample.original)
            y_pred = X_test.apply(model.predict_raw)
            y_true = y_true.explode()
            y_pred = y_pred.explode()

        elif isinstance(raw_data[0], QASample):
            dataset_name = raw_data[0].dataset_name.split('-')[0].lower()
            user_prompt = kwargs.get('user_prompt', default_user_prompt.get(dataset_name, ""))
            prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt

            y_true = pd.Series(raw_data).apply(lambda x: x.expected_results)
            X_test = pd.Series(raw_data)
            y_pred = X_test.apply(lambda sample: model(text={'context': sample.original_context, 'question': sample.original_question}, prompt={"template": prompt_template, 'input_variables': ["context", "question"]}))
            y_pred = y_pred.apply(lambda x: x.strip())

        if kwargs['is_default']:
            y_pred = y_pred.apply(lambda x: '1' if x in ['pos', 'LABEL_1', 'POS'] else (
                '0' if x in ['neg', 'LABEL_0', 'NEG'] else x))

        supported_tests = cls.available_tests()

        tasks = []
        for test_name, samples in sample_list.items():
            tasks.append(
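To make the QA branch above easier to follow, here is a hedged sketch of just the prompt assembly: the dataset name is derived from the sample's `dataset_name`, a per-dataset instruction is looked up in `default_user_prompt`, and the result is appended to a fixed context/question template. The 'boolq' entry below is a stand-in, not the actual contents of `default_user_prompt`.

```python
# Stand-in mapping; the real default_user_prompt dict is imported from .utils in this PR.
default_user_prompt = {"boolq": "Answer the question with True or False."}

dataset_name = "BoolQ-dev".split('-')[0].lower()          # -> 'boolq'
user_prompt = default_user_prompt.get(dataset_name, "")
prompt_template = """Context: {context}\nQuestion: {question}\n """ + user_prompt

print(prompt_template.format(context="<passage>", question="<question>"))
```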
202 changes: 195 additions & 7 deletions nlptest/transform/accuracy.py
@@ -1,12 +1,13 @@
from abc import ABC, abstractmethod
import asyncio
from typing import Any, Dict, List

from sklearn.metrics import classification_report, f1_score
import asyncio
import logging
import evaluate

from sklearn.metrics import classification_report, f1_score
from nlptest.utils.custom_types import MinScoreOutput, MinScoreSample


class BaseAccuracy(ABC):
"""
Abstract base class for implementing accuracy measures.
@@ -63,7 +64,7 @@ class MinPrecisionScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -145,7 +146,7 @@ class MinRecallScore(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -228,7 +229,7 @@ class MinF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -372,7 +373,7 @@ class MinMacroF1Score(BaseAccuracy):
Subclass of BaseAccuracy that implements the minimum precision score.
Attributes:
alias_name (str): The name "min_precision_score" for config.
alias_name (str): The name for config.
Methods:
transform(y_true, y_pred) -> Any: Creates accuracy test results.
@@ -486,3 +487,190 @@ async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
            if progress:
                progress.update(1)
        return sample_list

class MinEMcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum exact match score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = "min_exact_match_score"
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum exact match score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum exact match score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_macro_f1_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the exact match score for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)

        em = evaluate.load("exact_match")
        y_true = [x[0] for x in y_true]
        result = em.compute(references=y_true, predictions=y_pred)["exact_match"]
        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result)
            sample.state = "done"
            if progress:
                progress.update(1)

        return sample_list
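A standalone sketch of the exact-match computation that `MinEMcore.run` relies on, using the Hugging Face `evaluate` package; the reference and prediction strings are made up. As in `run`, only the first gold answer per sample is compared.

```python
import evaluate

em = evaluate.load("exact_match")
references = ["True", "False"]   # first gold answer per sample, mirroring y_true = [x[0] for x in y_true]
predictions = ["True", "True"]
print(em.compute(references=references, predictions=predictions)["exact_match"])  # 0.5
```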

class MinBLEUcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum BLEU score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = "min_bleu_score"
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum BLEU score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum BLEU score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_bleu_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the BLEU score for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)
        em = evaluate.load("bleu")
        result = em.compute(references=y_true, predictions=y_pred)
        y_true = [[f'The answer is {y}' for y in x] for x in y_true]
        y_pred = [f'The answer is {x}' for x in y_pred]

        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result["bleu"])
            sample.state = "done"
            if progress:
                progress.update(1)
        return sample_list
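A standalone sketch of the BLEU call used in `MinBLEUcore.run`; with `evaluate`, each prediction string is scored against a list of reference strings, which matches the list-of-lists shape of the QA `y_true` values. The example strings are made up.

```python
import evaluate

bleu = evaluate.load("bleu")
references = [["The answer is True"], ["The answer is False"]]   # one list of references per prediction
predictions = ["The answer is True", "The answer is True"]
print(bleu.compute(references=references, predictions=predictions)["bleu"])
```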

class MinROUGEcore(BaseAccuracy):
    """
    Subclass of BaseAccuracy that implements the minimum ROUGE score.

    Attributes:
        alias_name (str): The name for config.

    Methods:
        transform(y_true, params) -> Any: Creates accuracy test results.
    """

    alias_name = ["min_rouge1_score", "min_rouge2_score", "min_rougeL_score", "min_rougeLsum_score"]
    supported_tasks = ["question-answering"]

    @staticmethod
    def transform(y_true, params):
        """
        Creates the accuracy test sample with the configured minimum ROUGE score.

        Args:
            y_true (List[Any]): True values
            params (Dict): parameters for tests configuration

        Returns:
            List[MinScoreSample]: The sample holding the expected minimum ROUGE score.
        """

        min_score = params["min_score"]

        sample = MinScoreSample(
            category="accuracy",
            test_type="min_bleu_score",
            expected_results=MinScoreOutput(min_score=min_score)
        )

        return [sample]

    @staticmethod
    async def run(sample_list: List[MinScoreSample], y_true, y_pred, **kwargs):
        """
        Computes the ROUGE scores for the given data.

        Args:
            sample_list (List[MinScoreSample]): List of samples to be transformed.
            y_true (List[Any]): True values
            y_pred (List[Any]): Predicted values
        """
        progress = kwargs.get("progress_bar", False)
        em = evaluate.load("rouge")
        result = em.compute(references=y_true, predictions=y_pred)

        for sample in sample_list:
            sample.actual_results = MinScoreOutput(min_score=result[sample.test_type.split('_')[1]])
            sample.state = "done"
            if progress:
                progress.update(1)
        return sample_list
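A standalone sketch of the ROUGE call used in `MinROUGEcore.run`; the keys of the result dict ('rouge1', 'rouge2', 'rougeL', 'rougeLsum') are what `sample.test_type.split('_')[1]` is meant to index. The example strings are made up.

```python
import evaluate

rouge = evaluate.load("rouge")
references = ["The answer is True", "The answer is False"]
predictions = ["The answer is True", "The answer is True"]
result = rouge.compute(references=references, predictions=predictions)
print(result["rouge1"], result["rougeL"])
```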
2 changes: 1 addition & 1 deletion nlptest/utils/custom_types/output.py
@@ -117,4 +117,4 @@ def __eq__(self, other: "NEROutput"):
raise NotImplementedError()


Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput, List[str], str)
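For completeness, a minimal illustration of why `List[str]` and `str` were added to the `Result` constraints: QA samples now store their gold answers as plain strings or lists of strings (see the datasource change above), so `Result`-typed fields must admit those types. The snippet below is a simplified, hypothetical stand-in, not the library's definition.

```python
from typing import List, TypeVar

Result = TypeVar("Result", List[str], str)  # simplified: only the newly added constraints

def describe(expected: Result) -> str:
    # Works for both a bare answer string and a list of acceptable answers.
    return f"{type(expected).__name__}: {expected}"

print(describe("True"))
print(describe(["True", "False"]))
```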