[alpha] Improvements to ModelWrapper and better QA/Classification implementation #8

Merged 16 commits on Mar 10, 2023
1 change: 1 addition & 0 deletions evalem/evaluators/__init__.py
@@ -1,2 +1,3 @@
# flake8: noqa
from ._base import Evaluator
from .basics import QAEvaluator, TextClassificationEvaluator
26 changes: 25 additions & 1 deletion evalem/evaluators/basics.py
@@ -1,6 +1,13 @@
#!/usr/bin/env python3

from ..metrics import AccuracyMetric, ExactMatchMetric, F1Metric
from ..metrics import (
AccuracyMetric,
ConfusionMatrix,
ExactMatchMetric,
F1Metric,
PrecisionMetric,
RecallMetric,
)
from ._base import Evaluator


@@ -30,6 +37,23 @@ def __init__(self) -> None:
)


class TextClassificationEvaluator(BasicEvaluator):
"""
An evaluator for text classification tasks.
"""

def __init__(self) -> None:
super().__init__(
metrics=[
AccuracyMetric(),
F1Metric(),
PrecisionMetric(),
RecallMetric(),
ConfusionMatrix(),
],
)


def main():
pass

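For context, a minimal usage sketch of the new `TextClassificationEvaluator`. The `evaluate(predictions, references)` call is an assumption about the `Evaluator` base-class interface (not shown in this diff); adjust to the actual entrypoint in `evalem/evaluators/_base.py`.

```python
# Sketch only (not part of this PR). The evaluate(...) entrypoint below is
# hypothetical; the real call signature lives in evalem/evaluators/_base.py.
from evalem.evaluators import TextClassificationEvaluator

predictions = ["POSITIVE", "NEGATIVE", "POSITIVE"]
references = ["POSITIVE", "POSITIVE", "POSITIVE"]

evaluator = TextClassificationEvaluator()
results = evaluator.evaluate(predictions, references)  # hypothetical entrypoint
print(results)  # accuracy, f1, precision, recall, confusion matrix
```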
8 changes: 0 additions & 8 deletions evalem/metrics/basics.py
@@ -101,11 +101,3 @@ def __get_labels(
Get unique list of labels across predictions + references.
"""
return sorted(set(predictions).union(references))


def main():
pass


if __name__ == "__main__":
main()
16 changes: 14 additions & 2 deletions evalem/metrics/semantics.py
@@ -37,6 +37,10 @@ class BertScore(SemanticMetric):
https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
```device```: ```str```
Which device to run the model on? Defaults to "cpu".
```per_instance_score```: ```bool```
If enabled, per-instance precision, recall and f1 scores are also
returned in the computation result.
Otherwise, only the mean precision, recall and f1 are returned.
```debug```: ```bool```
Enable debugging log? Defaults to False.

@@ -68,12 +72,14 @@ class BertScore(SemanticMetric):

def __init__(
self,
model_type: str = "roberta-large",
model_type: str = "bert-base-uncased",
device: str = "cpu",
per_instance_score: bool = False,
debug: bool = False,
) -> None:
super().__init__(metrics="bertscore", device=device, debug=debug)
self.model_type = model_type
self.per_instance_score = per_instance_score

def compute(
self,
@@ -83,13 +89,19 @@ def compute(
) -> MetricOutput:
device = kwargs.pop("device", self.device)
model_type = kwargs.pop("model_type", self.model_type)
return super().compute(
result = super().compute(
predictions=predictions,
references=references,
model_type=model_type,
device=device,
**kwargs,
)
# if per-instance scores are not requested, collapse the per-instance
# lists and report only the mean for precision/recall/f1.
if not self.per_instance_score:
for _key in ["precision", "recall", "f1"]:
result["bertscore"][_key] = np.mean(result["bertscore"][_key])
return result


class BartScore(SemanticMetric):
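A short sketch of how the new `per_instance_score` flag changes the shape of the `BertScore` output. The `result["bertscore"]["f1"]` key layout follows the aggregation code above; the import path assumes `BertScore` is re-exported from `evalem.metrics` (otherwise import it from `evalem.metrics.semantics`), and running it will download the underlying scoring model.

```python
# Sketch only (not part of this PR). Mirrors the aggregation logic above:
# with per_instance_score=False (default), precision/recall/f1 are collapsed
# to their means; with True, the per-instance lists are kept.
from evalem.metrics import BertScore  # assumed re-export; else use evalem.metrics.semantics

predictions = ["the cat sat on the mat", "it is raining"]
references = ["a cat sat on the mat", "it rains"]

mean_scorer = BertScore(model_type="bert-base-uncased", device="cpu")
mean_result = mean_scorer.compute(predictions=predictions, references=references)
# mean_result["bertscore"]["f1"] -> a single float (mean over instances)

per_instance_scorer = BertScore(per_instance_score=True)
full_result = per_instance_scorer.compute(predictions=predictions, references=references)
# full_result["bertscore"]["f1"] -> a list of floats, one per instance
```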
38 changes: 38 additions & 0 deletions evalem/misc/datasets.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from typing import Dict

from datasets import load_dataset


@@ -41,6 +43,42 @@ def get_squad_v2(
return dict(inputs=inputs, references=references)


def get_imdb(
data_type: str = "test",
nsamples: int = 1000,
shuffle: bool = False,
) -> Dict[str, list]:
"""
This loads imdb text classification dataset using HuggingFace datasets module.

Args:
```data_type```: ```str```
Either "train" or "test"
```nsamples```: ```int```
How many samples to load?
Note: If ```nsamples``` is 0 or None, the full split is returned.
```shuffle```: ```bool```
If enabled, shuffles the data prior to sampling/filtering.

Returns:
Returns a dict with 2 keys:
- `inputs`: ```List[str]```, the raw review texts
- `references`: ```List[str]```, the corresponding labels
("NEGATIVE" or "POSITIVE")

"""
nsamples = nsamples or 0
data = load_dataset("imdb")[data_type]
data = data.shuffle(seed=42) if shuffle else data
Reviewer: move seed to a config or a constant.

Collaborator (author): Ah ya. Good call. The framework-level config could be a nice way to manage these seeds.

Collaborator (author): Can I resolve this in the next PR? It doesn't hamper the behavior of the framework at this point.

data = data.select(range(nsamples)) if nsamples > 0 else data

label_map = ["NEGATIVE", "POSITIVE"]
inputs = [(d["text"], label_map[d["label"]]) for d in data]
inputs, references = zip(*inputs)
return dict(inputs=list(inputs), references=list(references))


def main():
pass

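A minimal sketch of calling the new `get_imdb` helper. The `SHUFFLE_SEED` constant is hypothetical; it only illustrates the reviewer's suggestion above to move the hard-coded seed (42) into a constant/config in a follow-up PR and is not used by the current implementation.

```python
# Sketch only. get_imdb's signature comes from the diff above.
from evalem.misc.datasets import get_imdb

SHUFFLE_SEED = 42  # hypothetical constant for the follow-up PR suggested in review

data = get_imdb(data_type="test", nsamples=100, shuffle=True)
inputs, references = data["inputs"], data["references"]
# inputs: list of raw review texts
# references: list of "NEGATIVE"/"POSITIVE" labels, aligned with inputs
```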
6 changes: 5 additions & 1 deletion evalem/models/__init__.py
@@ -1,3 +1,7 @@
# flake8: noqa
from ._base import HFLMWrapper, HFPipelineWrapper, ModelWrapper
from .defaults import DefaultQAModelWrapper
from .defaults import (
DefaultQAModelWrapper,
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
)
97 changes: 81 additions & 16 deletions evalem/models/_base.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Iterable, Type
from typing import Callable, Iterable, Type

from transformers import Pipeline as HF_Pipeline
from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -16,19 +16,47 @@ class ModelWrapper(AbstractBase):
all the upstream models into a nice wrapper.

All the downstream implementation of `ModelWrapper` should implement
the `predict(...)` method.
the `_predict(...)` method which is itself called by `.predict(...)` method.

Args:
```model```:
Input model that's being wrapped for common interface
```debug```: ```bool```
If enabled, debugging logs could be printed
```kwargs```:
- ```inputs_preprocessor```
A `Callable` to apply on inputs.
- ```predictions_postprocessor```
A `Callable` to apply on model outputs/predictions.

Note:
In order to convert to task-specific downstream format, we provide
`_map_predictions(...)` method which user can override. By default,
it is an identity that doesn't change the format egested by the model.
- Override `_preprocess_inputs` method to change data format for
model input. Defaults to identity (no change).
- Override `_postprocess_predictions` to convert predictions to
task-specific downstream format. Defaults to identity (no change).
"""

def __init__(self, model, debug: bool = False, **kwargs) -> None:
def __init__(
self,
model,
debug: bool = False,
**kwargs,
) -> None:
super().__init__(debug=debug)
self.model = model

@abstractmethod
# specifies how the input format conversion is done
self.inputs_preprocessor: Callable = (
kwargs.get("inputs_preprocessor", self._preprocess_inputs)
or self._preprocess_inputs
)

# specifies how the predictions formatting is done
self.predictions_postprocessor: Callable = (
kwargs.get("predictions_postprocessor", self._postprocess_predictions)
or self._postprocess_predictions
)

def predict(
self,
inputs: Iterable,
@@ -45,23 +73,51 @@ def predict(
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()
inputs = self.inputs_preprocessor(inputs, **kwargs)
predictions = self._predict(inputs, **kwargs)
return self.predictions_postprocessor(predictions, **kwargs)

def __call__(
@abstractmethod
def _predict(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)
"""
Core prediction method implemented by each downstream wrapper.

Args:
```inputs```
Represent input dataset whose format depends on
downstream tasks.

Returns:
Iterable of predicted instance
"""
raise NotImplementedError()

def _map_predictions(self, predictions: Iterable):
def _preprocess_inputs(self, inputs: Iterable, **kwargs) -> Iterable:
"""
A helper method to transform inputs suitable for model to ingest.
By default, it's an identity function.
"""
return inputs

def _postprocess_predictions(self, predictions: Iterable, **kwargs):
"""
A helper method to transform predictions from the models
into any downstream format. By default, it's an identity function.
"""
# default -> Identity
return predictions

def __call__(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)


class HFWrapper(ModelWrapper):
"""
@@ -86,8 +142,9 @@ def __init__(
self,
model: Type[PreTrainedModel],
tokenizer: Type[PreTrainedTokenizerBase],
**kwargs,
) -> None:
super().__init__(model=model)
super().__init__(model=model, **kwargs)
self.tokenizer = tokenizer


@@ -113,21 +170,29 @@ class HFPipelineWrapper(HFWrapper):
pipe = hf_pipeline("question-answering")
wrapped_model = HFPipelineWrapper(pipe)

# Or: if you want to specify how to post-process predictions,
# provide the processor explicitly.
wrapped_model = HFPipelineWrapper(
pipeline("question-answering", model="deepset/roberta-base-squad2"),
predictions_postprocessor=lambda xs: list(map(lambda x: x["answer"], xs))
)


# compute predictions
# and pass them to the evaluator along with references
predictions = wrapped_model.predict(<inputs>)
"""

def __init__(self, pipeline: Type[HF_Pipeline], debug: bool = False) -> None:
def __init__(self, pipeline: Type[HF_Pipeline], **kwargs) -> None:
"""
Args:
```pipeline```:
A HuggingFace pipeline object used for prediction
"""
super().__init__(model=pipeline)
super().__init__(model=pipeline, **kwargs)

def predict(self, inputs, **kwargs):
return self._map_predictions(self.model(inputs))
def _predict(self, inputs, **kwargs):
return self.model(inputs, **kwargs)

@property
def pipeline(self) -> HF_Pipeline:
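To make the new pre/post-processing hooks concrete, here is a sketch of wrapping a HuggingFace text-classification pipeline so its dict outputs are reduced to plain label strings. The pipeline task name, example model id, and the `"label"` output key follow the standard transformers API; the wrapper call pattern assumes the interfaces defined above and is illustrative, not part of this PR.

```python
# Sketch only (not part of this PR). Shows the two equivalent customization
# points introduced above: passing a predictions_postprocessor kwarg, or
# subclassing and overriding _postprocess_predictions.
from transformers import pipeline as hf_pipeline

from evalem.models import HFPipelineWrapper

# Option 1: kwargs-based post-processor
wrapped = HFPipelineWrapper(
    hf_pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",  # example model
    ),
    predictions_postprocessor=lambda preds, **kw: [p["label"] for p in preds],
)

# Option 2: subclass and override the hook
class LabelOnlyPipelineWrapper(HFPipelineWrapper):
    def _postprocess_predictions(self, predictions, **kwargs):
        # keep only the predicted label string from each pipeline output dict
        return [p["label"] for p in predictions]

wrapped2 = LabelOnlyPipelineWrapper(hf_pipeline("text-classification"))

predictions = wrapped.predict(["What a great movie!", "Utterly boring."])
# -> e.g. ["POSITIVE", "NEGATIVE"] (labels depend on the underlying model)
```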