diff --git a/evalem/evaluators/__init__.py b/evalem/evaluators/__init__.py
index f730e53..29051a1 100644
--- a/evalem/evaluators/__init__.py
+++ b/evalem/evaluators/__init__.py
@@ -1,2 +1,3 @@
 # flake8: noqa
 from ._base import Evaluator
+from .basics import QAEvaluator, TextClassificationEvaluator
diff --git a/evalem/evaluators/basics.py b/evalem/evaluators/basics.py
index 98b3665..8c8ac91 100755
--- a/evalem/evaluators/basics.py
+++ b/evalem/evaluators/basics.py
@@ -1,6 +1,13 @@
 #!/usr/bin/env python3
 
-from ..metrics import AccuracyMetric, ExactMatchMetric, F1Metric
+from ..metrics import (
+    AccuracyMetric,
+    ConfusionMatrix,
+    ExactMatchMetric,
+    F1Metric,
+    PrecisionMetric,
+    RecallMetric,
+)
 from ._base import Evaluator
 
 
@@ -30,6 +37,23 @@ def __init__(self) -> None:
         )
 
 
+class TextClassificationEvaluator(BasicEvaluator):
+    """
+    An evaluator for text classification tasks.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(
+            metrics=[
+                AccuracyMetric(),
+                F1Metric(),
+                PrecisionMetric(),
+                RecallMetric(),
+                ConfusionMatrix(),
+            ],
+        )
+
+
 def main():
     pass
 
diff --git a/evalem/metrics/basics.py b/evalem/metrics/basics.py
index 0aa24d2..ce6c3b3 100755
--- a/evalem/metrics/basics.py
+++ b/evalem/metrics/basics.py
@@ -101,11 +101,3 @@ def __get_labels(
         Get unique list of labels across predictions + references.
         """
         return sorted(set(predictions).union(references))
-
-
-def main():
-    pass
-
-
-if __name__ == "__main__":
-    main()
diff --git a/evalem/metrics/semantics.py b/evalem/metrics/semantics.py
index ea43909..62b799c 100755
--- a/evalem/metrics/semantics.py
+++ b/evalem/metrics/semantics.py
@@ -37,6 +37,10 @@ class BertScore(SemanticMetric):
             https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
         ```device```: ```str```
             Which device to run the model on? Defaults to "cpu".
+        ```per_instance_score```: ```bool```
+            If enabled, per-instance precision, recall and f1 scores are
+            also returned in the computation result.
+            Else, only the mean precision, recall and f1 are returned.
         ```debug```: ```bool```
             Enable debugging log? Defaults to False.
 
@@ -68,12 +72,14 @@ class BertScore(SemanticMetric):
 
     def __init__(
         self,
-        model_type: str = "roberta-large",
+        model_type: str = "bert-base-uncased",
        device: str = "cpu",
+        per_instance_score: bool = False,
         debug: bool = False,
     ) -> None:
         super().__init__(metrics="bertscore", device=device, debug=debug)
         self.model_type = model_type
+        self.per_instance_score = per_instance_score
 
     def compute(
         self,
@@ -83,13 +89,19 @@ def compute(
     ) -> MetricOutput:
         device = kwargs.pop("device", self.device)
         model_type = kwargs.pop("model_type", self.model_type)
-        return super().compute(
+        result = super().compute(
             predictions=predictions,
             references=references,
             model_type=model_type,
             device=device,
             **kwargs,
         )
+        # if per-instance scores are not requested, collapse the
+        # per-instance lists into a single mean/average value.
+        if not self.per_instance_score:
+            for _key in ["precision", "recall", "f1"]:
+                result["bertscore"][_key] = np.mean(result["bertscore"][_key])
+        return result
 
 
 class BartScore(SemanticMetric):
diff --git a/evalem/misc/datasets.py b/evalem/misc/datasets.py
index a5de18b..fffab03 100755
--- a/evalem/misc/datasets.py
+++ b/evalem/misc/datasets.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+from typing import Dict
+
 from datasets import load_dataset
 
 
@@ -41,6 +43,42 @@ def get_squad_v2(
     return dict(inputs=inputs, references=references)
 
 
+def get_imdb(
+    data_type: str = "test",
+    nsamples: int = 1000,
+    shuffle: bool = False,
+) -> Dict[str, list]:
+    """
+    This loads the imdb text classification dataset using the HuggingFace datasets module.
+
+    Args:
+        ```data_type```: ```str```
+            Either "train" or "test"
+        ```nsamples```: ```int```
+            How many samples to load?
+            Note: If 0 (or None), the full split is returned without
+            any sampling.
+        ```shuffle```: ```bool```
+            If enabled, shuffles the data prior to sampling.
+
+    Returns:
+        Returns a dict with 2 keys:
+            - `inputs`: ```List[str]```, the raw review texts
+            - `references`: ```List[str]```, the corresponding labels
+            ("NEGATIVE" or "POSITIVE")
+
+    """
+    nsamples = nsamples or 0
+    data = load_dataset("imdb")[data_type]
+    data = data.shuffle(seed=42) if shuffle else data
+    data = data.select(range(nsamples)) if nsamples > 0 else data
+
+    label_map = ["NEGATIVE", "POSITIVE"]
+    inputs = [(d["text"], label_map[d["label"]]) for d in data]
+    inputs, references = zip(*inputs)
+    return dict(inputs=list(inputs), references=list(references))
+
+
 def main():
     pass
 
diff --git a/evalem/models/__init__.py b/evalem/models/__init__.py
index 87296b0..8add094 100644
--- a/evalem/models/__init__.py
+++ b/evalem/models/__init__.py
@@ -1,3 +1,7 @@
 # flake8: noqa
 from ._base import HFLMWrapper, HFPipelineWrapper, ModelWrapper
-from .defaults import DefaultQAModelWrapper
+from .defaults import (
+    DefaultQAModelWrapper,
+    QuestionAnsweringHFPipelineWrapper,
+    TextClassificationHFPipelineWrapper,
+)
diff --git a/evalem/models/_base.py b/evalem/models/_base.py
index e035249..8a5ea5c 100644
--- a/evalem/models/_base.py
+++ b/evalem/models/_base.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 from abc import abstractmethod
-from typing import Iterable, Type
+from typing import Callable, Iterable, Type
 
 from transformers import Pipeline as HF_Pipeline
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -16,19 +16,47 @@ class ModelWrapper(AbstractBase):
     all the upstream models into a nice wrapper.
     All the downstream implementation of `ModelWrapper` should implement
-    the `predict(...)` method.
+    the `_predict(...)` method, which is itself called by the `.predict(...)` method.
+
+    Args:
+        ```model```:
+            Input model that's being wrapped for a common interface
+        ```debug```: ```bool```
+            If enabled, debugging logs could be printed
+        ```kwargs```:
+            - ```inputs_preprocessor```
+                A `Callable` to apply on inputs.
+            - ```predictions_postprocessor```
+                A `Callable` to apply on model outputs/predictions.
 
     Note:
-        In order to convert to task-specific downstream format, we provide
-        `_map_predictions(...)` method which user can override. By default,
-        it is an identity that doesn't change the format egested by the model.
+        - Override `_preprocess_inputs` method to change the data format for
+          model input. Defaults to identity (no change).
+        - Override `_postprocess_predictions` to convert predictions to
+          task-specific downstream format. Defaults to identity (no change).
""" - def __init__(self, model, debug: bool = False, **kwargs) -> None: + def __init__( + self, + model, + debug: bool = False, + **kwargs, + ) -> None: super().__init__(debug=debug) self.model = model - @abstractmethod + # specifies how the input format conversion is done + self.inputs_preprocessor: Callable = ( + kwargs.get("inputs_preprocessor", self._preprocess_inputs) + or self._preprocess_inputs + ) + + # specifies how the predictions formatting is done + self.predictions_postprocessor: Callable = ( + kwargs.get("predictions_postprocessor", self._postprocess_predictions) + or self._postprocess_predictions + ) + def predict( self, inputs: Iterable, @@ -45,16 +73,37 @@ def predict( Returns: Iterable of predicted instance """ - raise NotImplementedError() + inputs = self.inputs_preprocessor(inputs, **kwargs) + predictions = self._predict(inputs, **kwargs) + return self.predictions_postprocessor(predictions, **kwargs) - def __call__( + @abstractmethod + def _predict( self, inputs: Iterable, **kwargs, ) -> Iterable[EvaluationPredictionInstance]: - return self.predict(inputs, **kwargs) + """ + Entrypoint method for predicting using the wrapped model + + Args: + ```inputs``` + Represent input dataset whose format depends on + downstream tasks. + + Returns: + Iterable of predicted instance + """ + raise NotImplementedError() - def _map_predictions(self, predictions: Iterable): + def _preprocess_inputs(self, inputs: Iterable, **kwargs) -> Iterable: + """ + A helper method to transform inputs suitable for model to ingest. + By default, it's an identity function. + """ + return inputs + + def _postprocess_predictions(self, predictions: Iterable, **kwargs): """ A helper method to transform predictions from the models into any downstream format. By default, it's an identity function. @@ -62,6 +111,13 @@ def _map_predictions(self, predictions: Iterable): # default -> Identity return predictions + def __call__( + self, + inputs: Iterable, + **kwargs, + ) -> Iterable[EvaluationPredictionInstance]: + return self.predict(inputs, **kwargs) + class HFWrapper(ModelWrapper): """ @@ -86,8 +142,9 @@ def __init__( self, model: Type[PreTrainedModel], tokenizer: Type[PreTrainedTokenizerBase], + **kwargs, ) -> None: - super().__init__(model=model) + super().__init__(model=model, **kwargs) self.tokenizer = tokenizer @@ -113,21 +170,29 @@ class HFPipelineWrapper(HFWrapper): pipe = hf_pipeline("question-answering") wrapped_model = HFPipelineWrapper(pipe) + # Or: if you want to specify how to post-process predictions, + # provide the processor explicitly. + wrapped_model = HFPipelineWrapper( + pipeline("question-answering", model="deepset/roberta-base-squad2"), + predictions_postprocessor=lambda xs: list(map(lambda x: x["answer"], xs)) + ) + + # compute predictions # (format?) 
             predictions = wrapped_model.predict()
 
     """
 
-    def __init__(self, pipeline: Type[HF_Pipeline], debug: bool = False) -> None:
+    def __init__(self, pipeline: Type[HF_Pipeline], **kwargs) -> None:
         """
         Args:
             ```pipeline```:
                 A HuggingFace pipeline object used for prediction
         """
-        super().__init__(model=pipeline)
+        super().__init__(model=pipeline, **kwargs)
 
-    def predict(self, inputs, **kwargs):
-        return self._map_predictions(self.model(inputs))
+    def _predict(self, inputs, **kwargs):
+        return self.model(inputs, **kwargs)
 
     @property
     def pipeline(self) -> HF_Pipeline:
diff --git a/evalem/models/defaults.py b/evalem/models/defaults.py
index 065a790..910bea6 100755
--- a/evalem/models/defaults.py
+++ b/evalem/models/defaults.py
@@ -1,43 +1,56 @@
 #!/usr/bin/env python3
 
-from typing import Iterable, List, Union
+from typing import Iterable, List, Optional, Union
 
 from transformers import pipeline as hf_pipeline
 
-from ..structures import EvaluationPredictionInstance, QAPredictionDTO
-from ._base import HFPipelineWrapper
+from ..structures import PredictionDTO, QAPredictionDTO
+from ._base import HFPipelineWrapper, PreTrainedModel, PreTrainedTokenizerBase
 
 
-class DefaultQAModelWrapper(HFPipelineWrapper):
+class QuestionAnsweringHFPipelineWrapper(HFPipelineWrapper):
     """
-    A default distill-bert-uncased base HF pipeline for
-    Question-Answering task.
-
-    The predictor expects the input format to be a `List[dict]`, where each
-    dict has the following keys:
-        - `context` (str): Paragraph/context fromw which question is asked
-        - `question` (str): Actual question string being asked
-
-    Example input dict:
-        .. code-block: python
-
-            {
-                "context": "There are 7 continents in the world."
-                "question": "How many continents are there?"
-            }
-
-    The `predict(...)` method finally returns `List[QAPredictionDTO]` structure.
+    A HFPipelineWrapper for question-answering.
+
+    Args:
+        ```model```: ```Type[PreTrainedModel]```
+            Which model to use?
+        ```tokenizer```: ```Type[PreTrainedTokenizerBase]```
+            Which tokenizer to use?
+        ```device```: ```str```
+            Which device to run the model on? cpu? gpu? mps?
     """
 
-    def __init__(self, device: str = "cpu") -> None:
-        super().__init__(pipeline=hf_pipeline("question-answering", device=device))
+    _task = "question-answering"
 
-    def _map_predictions(
+    def __init__(
+        self,
+        model: Optional[
+            Union[str, PreTrainedModel]
+        ] = "distilbert-base-cased-distilled-squad",
+        tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
+        device: str = "cpu",
+        hf_params: Optional[dict] = None,
+        **kwargs,
+    ) -> None:
+        self.hf_params = hf_params or {}
+        super().__init__(
+            pipeline=hf_pipeline(
+                self._task,
+                model=model,
+                tokenizer=tokenizer,
+                device=device,
+                **self.hf_params,
+            ),
+            **kwargs,
+        )
+
+    def _postprocess_predictions(
         self,
         predictions: Union[dict, List[dict]],
-    ) -> Iterable[EvaluationPredictionInstance]:
+    ) -> Iterable[QAPredictionDTO]:
         """
-        This helper method converts the pipeline's default output format
+        This method converts the pipeline's default output format
         to the iterable of QAPredictionDTO.
 
         Args:
@@ -64,6 +77,86 @@ def _map_predictions(
         )
 
 
+class DefaultQAModelWrapper(HFPipelineWrapper):
+    """
+    Deprecated: Use `QuestionAnsweringHFPipelineWrapper()`
+    """
+
+    def __init__(self, device: str = "cpu") -> None:
+        raise DeprecationWarning(
+            "Deprecated ModelWrapper. Please use `QuestionAnsweringHFPipelineWrapper`",
+        )
+
+
+class TextClassificationHFPipelineWrapper(HFPipelineWrapper):
+    """
+    A HFPipelineWrapper for text classification.
+
+    Args:
+        ```model```: ```Type[PreTrainedModel]```
+            Which model to use?
+        ```tokenizer```: ```Type[PreTrainedTokenizerBase]```
+            Which tokenizer to use?
+        ```device```: ```str```
+            Which device to run the model on? cpu? gpu? mps?
+    """
+
+    _task = "text-classification"
+
+    def __init__(
+        self,
+        model: Optional[
+            Union[str, PreTrainedModel]
+        ] = "distilbert-base-uncased-finetuned-sst-2-english",
+        tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
+        device: str = "cpu",
+        hf_params: Optional[dict] = None,
+        **kwargs,
+    ) -> None:
+        self.hf_params = hf_params or {}
+        super().__init__(
+            pipeline=hf_pipeline(
+                self._task,
+                model=model,
+                tokenizer=tokenizer,
+                device=device,
+                **self.hf_params,
+            ),
+            **kwargs,
+        )
+        # mapping from int code to actual label name.
+        self.label_map = kwargs.get("label_map", {})
+
+    def _postprocess_predictions(
+        self,
+        predictions: Union[dict, List[dict]],
+    ) -> Iterable[PredictionDTO]:
+        """
+        This method converts the pipeline's default output format
+        to the iterable of PredictionDTO.
+
+        Args:
+            ```predictions```: ```Union[dict, List[dict]]```
+                Predictions provided by the text-classification pipeline.
+
+        Returns:
+            Converted format: ```Iterable[PredictionDTO]```
+        """
+        if isinstance(predictions, dict):
+            predictions = [predictions]
+
+        # Note: Default model here is guaranteed to have these keys.
+        # Use label mapping. If mapping doesn't exist, just use the prediction.
+        predictions = map(
+            lambda p: PredictionDTO(
+                text=self.label_map.get(p["label"], p["label"]),
+                score=p.get("score"),
+            ),
+            predictions,
+        )
+        return list(predictions)
+
+
 def main():
     pass
 
diff --git a/requirements.txt b/requirements.txt
index d4c4cf9..d130660 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ numpy==1.24.2
 pandas==1.5.3
 pytest==7.2.1
 scikit-learn==1.2.1
+sentencepiece==0.1.97
 seqeval==1.2.2
 torch==1.13.1
 transformers==4.26.1
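
Reviewer note (not part of the patch): below is a minimal sketch of how the pieces added in this diff are meant to fit together: the IMDb loader, the text-classification pipeline wrapper, and the new TextClassificationEvaluator. Class and function names are taken from the diff itself; the exact way the evaluator is invoked at the end is an assumption and may need adjusting to the actual `Evaluator` entrypoint.

    .. code-block: python

        from evalem.evaluators import TextClassificationEvaluator
        from evalem.misc.datasets import get_imdb
        from evalem.models import TextClassificationHFPipelineWrapper

        # load a small, shuffled slice of the IMDb test split
        data = get_imdb(data_type="test", nsamples=100, shuffle=True)

        # wraps the default distilbert SST-2 text-classification pipeline added in this diff
        model = TextClassificationHFPipelineWrapper(device="cpu")

        # ModelWrapper.__call__ -> predict(): preprocess, run pipeline, postprocess to PredictionDTO
        predictions = model(data["inputs"])

        # assumption: the evaluator accepts predictions/references directly;
        # adjust if the Evaluator base class exposes a different entrypoint.
        evaluator = TextClassificationEvaluator()
        results = evaluator(predictions=predictions, references=data["references"])
        print(results)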