[alpha] Improvements to ModelWrapper and better QA/Classification implementation #8

Merged 16 commits on Mar 10, 2023
1 change: 1 addition & 0 deletions evalem/evaluators/__init__.py
@@ -1,2 +1,3 @@
# flake8: noqa
from ._base import Evaluator
from .basics import QAEvaluator, TextClassificationEvaluator
26 changes: 25 additions & 1 deletion evalem/evaluators/basics.py
@@ -1,6 +1,13 @@
#!/usr/bin/env python3

from ..metrics import AccuracyMetric, ExactMatchMetric, F1Metric
from ..metrics import (
AccuracyMetric,
ConfusionMatrix,
ExactMatchMetric,
F1Metric,
PrecisionMetric,
RecallMetric,
)
from ._base import Evaluator


@@ -30,6 +37,23 @@ def __init__(self) -> None:
)


class TextClassificationEvaluator(BasicEvaluator):
"""
An evaluator for text classification tasks.
"""

def __init__(self) -> None:
super().__init__(
metrics=[
AccuracyMetric(),
F1Metric(),
PrecisionMetric(),
RecallMetric(),
ConfusionMatrix(),
],
)


def main():
pass

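For context, a minimal usage sketch of the new `TextClassificationEvaluator`. The `evaluate(predictions, references)` call is an assumption about the `Evaluator` base-class interface (not shown in this diff); adjust to the actual entrypoint in `evalem/evaluators/_base.py`.

```python
# Sketch only (not part of this PR). The evaluate(...) entrypoint below is
# hypothetical; the real call signature lives in evalem/evaluators/_base.py.
from evalem.evaluators import TextClassificationEvaluator

predictions = ["POSITIVE", "NEGATIVE", "POSITIVE"]
references = ["POSITIVE", "POSITIVE", "POSITIVE"]

evaluator = TextClassificationEvaluator()
results = evaluator.evaluate(predictions, references)  # hypothetical entrypoint
print(results)  # accuracy, f1, precision, recall, confusion matrix
```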
8 changes: 0 additions & 8 deletions evalem/metrics/basics.py
@@ -101,11 +101,3 @@ def __get_labels(
Get unique list of labels across predictions + references.
"""
return sorted(set(predictions).union(references))


def main():
pass


if __name__ == "__main__":
main()
16 changes: 14 additions & 2 deletions evalem/metrics/semantics.py
@@ -37,6 +37,10 @@ class BertScore(SemanticMetric):
https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
```device```: ```str```
Which device to run the model on? Defaults to "cpu".
```per_instance_score```: ```bool```
If enabled, per-instance precision, recall and f1 scores are also
returned in the computation result.
Otherwise, only the mean precision, recall and f1 are returned.
```debug```: ```bool```
Enable debugging log? Defaults to False.

@@ -68,12 +72,14 @@ class BertScore(SemanticMetric):

def __init__(
self,
model_type: str = "roberta-large",
model_type: str = "bert-base-uncased",
device: str = "cpu",
per_instance_score: bool = False,
debug: bool = False,
) -> None:
super().__init__(metrics="bertscore", device=device, debug=debug)
self.model_type = model_type
self.per_instance_score = per_instance_score

def compute(
self,
@@ -83,13 +89,19 @@ def compute(
) -> MetricOutput:
device = kwargs.pop("device", self.device)
model_type = kwargs.pop("model_type", self.model_type)
return super().compute(
result = super().compute(
predictions=predictions,
references=references,
model_type=model_type,
device=device,
**kwargs,
)
# if per-instance scores are not requested, collapse the per-instance
# lists and report only the mean for precision/recall/f1.
if not self.per_instance_score:
for _key in ["precision", "recall", "f1"]:
result["bertscore"][_key] = np.mean(result["bertscore"][_key])
return result


class BartScore(SemanticMetric):
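A short sketch of how the new `per_instance_score` flag changes the shape of the `BertScore` output. The `result["bertscore"]["f1"]` key layout follows the aggregation code above; the import path assumes `BertScore` is re-exported from `evalem.metrics` (otherwise import it from `evalem.metrics.semantics`), and running it will download the underlying scoring model.

```python
# Sketch only (not part of this PR). Mirrors the aggregation logic above:
# with per_instance_score=False (default), precision/recall/f1 are collapsed
# to their means; with True, the per-instance lists are kept.
from evalem.metrics import BertScore  # assumed re-export; else use evalem.metrics.semantics

predictions = ["the cat sat on the mat", "it is raining"]
references = ["a cat sat on the mat", "it rains"]

mean_scorer = BertScore(model_type="bert-base-uncased", device="cpu")
mean_result = mean_scorer.compute(predictions=predictions, references=references)
# mean_result["bertscore"]["f1"] -> a single float (mean over instances)

per_instance_scorer = BertScore(per_instance_score=True)
full_result = per_instance_scorer.compute(predictions=predictions, references=references)
# full_result["bertscore"]["f1"] -> a list of floats, one per instance
```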
38 changes: 38 additions & 0 deletions evalem/misc/datasets.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python3

from typing import Dict

from datasets import load_dataset


@@ -41,6 +43,42 @@ def get_squad_v2(
return dict(inputs=inputs, references=references)


def get_imdb(
data_type: str = "test",
nsamples: int = 1000,
shuffle: bool = False,
) -> Dict[str, list]:
"""
This loads imdb text classification dataset using HuggingFace datasets module.

Args:
```data_type```: ```str```
Either "train" or "test"
```nsamples```: ```int```
How many samples to load?
Note: If ```nsamples``` is 0 or None, the full split is returned.
```shuffle```: ```bool```
If enabled, shuffles the data prior to sampling/filtering.

Returns:
Returns a dict with 2 keys:
- `inputs`: ```List[str]```, the raw review texts
- `references`: ```List[str]```, the corresponding labels
("NEGATIVE" or "POSITIVE")

"""
nsamples = nsamples or 0
data = load_dataset("imdb")[data_type]
data = data.shuffle(seed=42) if shuffle else data
Reviewer: move seed to a config or a constant.

Collaborator (author): Ah ya. Good call. The framework-level config could be a nice way to manage these seeds.

Collaborator (author): Can I resolve this in the next PR? It doesn't hamper the behavior of the framework at this point.

data = data.select(range(nsamples)) if nsamples > 0 else data

label_map = ["NEGATIVE", "POSITIVE"]
inputs = [(d["text"], label_map[d["label"]]) for d in data]
inputs, references = zip(*inputs)
return dict(inputs=list(inputs), references=list(references))


def main():
pass

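A minimal sketch of calling the new `get_imdb` helper. The `SHUFFLE_SEED` constant is hypothetical; it only illustrates the reviewer's suggestion above to move the hard-coded seed (42) into a constant/config in a follow-up PR and is not used by the current implementation.

```python
# Sketch only. get_imdb's signature comes from the diff above.
from evalem.misc.datasets import get_imdb

SHUFFLE_SEED = 42  # hypothetical constant for the follow-up PR suggested in review

data = get_imdb(data_type="test", nsamples=100, shuffle=True)
inputs, references = data["inputs"], data["references"]
# inputs: list of raw review texts
# references: list of "NEGATIVE"/"POSITIVE" labels, aligned with inputs
```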
6 changes: 5 additions & 1 deletion evalem/models/__init__.py
@@ -1,3 +1,7 @@
# flake8: noqa
from ._base import HFLMWrapper, HFPipelineWrapper, ModelWrapper
from .defaults import DefaultQAModelWrapper
from .defaults import (
DefaultQAModelWrapper,
QuestionAnsweringHFPipelineWrapper,
TextClassificationHFPipelineWrapper,
)
97 changes: 81 additions & 16 deletions evalem/models/_base.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

from abc import abstractmethod
from typing import Iterable, Type
from typing import Callable, Iterable, Type

from transformers import Pipeline as HF_Pipeline
from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -16,19 +16,47 @@ class ModelWrapper(AbstractBase):
all the upstream models into a nice wrapper.

All the downstream implementation of `ModelWrapper` should implement
the `predict(...)` method.
the `_predict(...)` method which is itself called by `.predict(...)` method.

Args:
```model```:
Input model that's being wrapped for common interface
```debug```: ```bool```
If enabled, debugging logs could be printed
```kwargs```:
- ```inputs_preprocessor```
A `Callable` to apply on inputs.
- ```predictions_postprocessor```
A `Callable` to apply on model outputs/predictions.

Note:
In order to convert to task-specific downstream format, we provide
`_map_predictions(...)` method which user can override. By default,
it is an identity that doesn't change the format egested by the model.
- Override `_preprocess_inputs` method to change data format for
model input. Defaults to identity (no change).
- Override `_postprocess_predictions` to convert predictions to
task-specific downstream format. Defaults to identity (no change).
"""

def __init__(self, model, debug: bool = False, **kwargs) -> None:
def __init__(
self,
model,
debug: bool = False,
**kwargs,
) -> None:
super().__init__(debug=debug)
self.model = model

@abstractmethod
# specifies how the input format conversion is done
self.inputs_preprocessor: Callable = (
kwargs.get("inputs_preprocessor", self._preprocess_inputs)
or self._preprocess_inputs
)

# specifies how the predictions formatting is done
self.predictions_postprocessor: Callable = (
kwargs.get("predictions_postprocessor", self._postprocess_predictions)
or self._postprocess_predictions
)

def predict(
self,
inputs: Iterable,
@@ -45,23 +73,51 @@ def predict(
Returns:
Iterable of predicted instance
"""
raise NotImplementedError()
inputs = self.inputs_preprocessor(inputs, **kwargs)
predictions = self._predict(inputs, **kwargs)
return self.predictions_postprocessor(predictions, **kwargs)

def __call__(
@abstractmethod
def _predict(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)
"""
Core prediction method implemented by each downstream wrapper.

Args:
```inputs```
Represent input dataset whose format depends on
downstream tasks.

Returns:
Iterable of predicted instance
"""
raise NotImplementedError()

def _map_predictions(self, predictions: Iterable):
def _preprocess_inputs(self, inputs: Iterable, **kwargs) -> Iterable:
"""
A helper method to transform inputs suitable for model to ingest.
By default, it's an identity function.
"""
return inputs

def _postprocess_predictions(self, predictions: Iterable, **kwargs):
"""
A helper method to transform predictions from the models
into any downstream format. By default, it's an identity function.
"""
# default -> Identity
return predictions

def __call__(
self,
inputs: Iterable,
**kwargs,
) -> Iterable[EvaluationPredictionInstance]:
return self.predict(inputs, **kwargs)


class HFWrapper(ModelWrapper):
"""
@@ -86,8 +142,9 @@ def __init__(
self,
model: Type[PreTrainedModel],
tokenizer: Type[PreTrainedTokenizerBase],
**kwargs,
) -> None:
super().__init__(model=model)
super().__init__(model=model, **kwargs)
self.tokenizer = tokenizer


@@ -113,21 +170,29 @@ class HFPipelineWrapper(HFWrapper):
pipe = hf_pipeline("question-answering")
wrapped_model = HFPipelineWrapper(pipe)

# Or: if you want to specify how to post-process predictions,
# provide the processor explicitly.
wrapped_model = HFPipelineWrapper(
pipeline("question-answering", model="deepset/roberta-base-squad2"),
predictions_postprocessor=lambda xs: list(map(lambda x: x["answer"], xs))
)


# compute predictions
# and pass them to the evaluator along with references
predictions = wrapped_model.predict(<inputs>)
"""

def __init__(self, pipeline: Type[HF_Pipeline], debug: bool = False) -> None:
def __init__(self, pipeline: Type[HF_Pipeline], **kwargs) -> None:
"""
Args:
```pipeline```:
A HuggingFace pipeline object used for prediction
"""
super().__init__(model=pipeline)
super().__init__(model=pipeline, **kwargs)

def predict(self, inputs, **kwargs):
return self._map_predictions(self.model(inputs))
def _predict(self, inputs, **kwargs):
return self.model(inputs, **kwargs)

@property
def pipeline(self) -> HF_Pipeline:
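To make the new pre/post-processing hooks concrete, here is a sketch of wrapping a HuggingFace text-classification pipeline so its dict outputs are reduced to plain label strings. The pipeline task name, example model id, and the `"label"` output key follow the standard transformers API; the wrapper call pattern assumes the interfaces defined above and is illustrative, not part of this PR.

```python
# Sketch only (not part of this PR). Shows the two equivalent customization
# points introduced above: passing a predictions_postprocessor kwarg, or
# subclassing and overriding _postprocess_predictions.
from transformers import pipeline as hf_pipeline

from evalem.models import HFPipelineWrapper

# Option 1: kwargs-based post-processor
wrapped = HFPipelineWrapper(
    hf_pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",  # example model
    ),
    predictions_postprocessor=lambda preds, **kw: [p["label"] for p in preds],
)

# Option 2: subclass and override the hook
class LabelOnlyPipelineWrapper(HFPipelineWrapper):
    def _postprocess_predictions(self, predictions, **kwargs):
        # keep only the predicted label string from each pipeline output dict
        return [p["label"] for p in predictions]

wrapped2 = LabelOnlyPipelineWrapper(hf_pipeline("text-classification"))

predictions = wrapped.predict(["What a great movie!", "Utterly boring."])
# -> e.g. ["POSITIVE", "NEGATIVE"] (labels depend on the underlying model)
```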