Lightning-AI · stancld · Feb 23, 2022 · Feb 14, 2022 · Feb 14, 2022 · Feb 14, 2022
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 from functools import partial
-from typing import Sequence
+from typing import Callable, Sequence
 
+import numpy as np
 import pytest
 import torch
 
@@ -164,3 +166,46 @@ def test_rouge_metric_wrong_key_value_error():
             rouge_keys=key,
             accumulate="best",
         )
+
+
+@pytest.mark.parametrize(
+    "pl_rouge_metric_key",
+    [
+        ("rouge1_precision"),
+        ("rouge1_recall"),
+        ("rouge1_fmeasure"),
+        ("rouge2_precision"),
+        ("rouge2_recall"),
+        ("rouge2_fmeasure"),
+        ("rougeL_precision"),
+        ("rougeL_recall"),
+        ("rougeL_fmeasure"),
+        ("rougeLsum_precision"),
+        ("rougeLsum_recall"),
+        ("rougeLsum_fmeasure"),
+    ],
+)
+def test_rouge_metric_normalizer_tokenizer(pl_rouge_metric_key):
+    normalizer: Callable[[str], str] = lambda text: re.sub(r"[^a-z0-9]+", " ", text.lower())
+    tokenizer: Callable[[str], Sequence[str]] = lambda text: re.split(r"\s+", text)
+
+    rouge_level, metric = pl_rouge_metric_key.split("_")
+    original_score = _compute_rouge_score(
+        preds=_inputs_single_sentence_single_reference.preds,
+        target=_inputs_single_sentence_single_reference.targets,
+        rouge_level=rouge_level,
+        metric=metric,
+        accumulate="best",
+        use_stemmer=False,
+    )
+
+    Scorer = ROUGEScore(
+        normalizer=normalizer, tokenizer=tokenizer, rouge_keys=rouge_level, accumulate="best", use_stemmer=False
+    )
+    Scorer.update(
+        _inputs_single_sentence_single_reference.preds,
+        _inputs_single_sentence_single_reference.targets,
+    )
+    metrics_score = Scorer.compute()
+
+    np.isclose(metrics_score[rouge_level + "_" + metric], original_score, atol=1e-8, equal_nan=True)
@@ -0,0 +1,85 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""An example of how to use ROUGEScore with a user's defined/own normalizer and tokenizer.
+
+To run: python rouge_score-own_normalizer_and_tokenizer.py
+"""
+
+import re
+from pprint import pprint
+from typing import Sequence
+
+from torchmetrics.text.rouge import ROUGEScore
+
+
+class UserNormalizer:
+    """The `UserNormalizer` class is required to normalize a non-alphabet language text input.
+
+    The user's defined normalizer is expected to return string that are fed into the tokenizer.
+    """
+
+    def __init__(self) -> None:
+        self.pattern = r"[^a-z0-9]+"
+
+    def __call__(self, text: str) -> str:
+        """The `__call__` method must be defined for this class. To ensure the functionality, the `__call__` method
+        should obey the input/output arguments structure described below.
+
+        Args:
+            text: Input text.
+
+        Return:
+            Normalized python string object
+        """
+        output_text = re.sub(self.pattern, " ", text.lower())
+
+        return output_text
+
+
+class UserTokenizer:
+    """The `UserNormalizer` class is required to tokenize a non-alphabet language text input.
+
+    The user's defined tokenizer is expected to return ``Sequence[str]`` that are fed into the rouge score.
+    """
+
+    pattern = r"\s+"
+
+    def __call__(self, text: str) -> Sequence[str]:
+        """The `__call__` method must be defined for this class. To ensure the functionality, the `__call__` method
+        should obey the input/output arguments structure described below.
+
+        Args:
+            text: Input text.
+
+        Return:
+            Tokenized sentence
+        """
+        output_tokens = re.split(self.pattern, text)
+
+        return output_tokens
+
+
+_PREDS = ["hello", "hello world", "world world world"]
+_REFS = ["hello", "hello hello", "hello world hello"]
+
+
+if __name__ == "__main__":
+    normalizer = UserNormalizer()
+    tokenizer = UserTokenizer()
+
+    rouge_score = ROUGEScore(normalizer=normalizer, tokenizer=tokenizer)
+
+    rouge_score.update(_PREDS, _REFS)
+
+    pprint(rouge_score.compute())
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 from collections import Counter
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import torch
 from torch import Tensor, tensor
@@ -91,7 +91,12 @@ def _lcs(pred_tokens: Sequence[str], target_tokens: Sequence[str]) -> int:
     return LCS[-1][-1]
 
 
-def _normalize_and_tokenize_text(text: str, stemmer: Optional[Any] = None) -> Sequence[str]:
+def _normalize_and_tokenize_text(
+    text: str,
+    stemmer: Optional[Any] = None,
+    normalizer: Callable[[str], str] = None,
+    tokenizer: Callable[[str], Sequence[str]] = None,
+) -> Sequence[str]:
     """Rouge score should be calculated only over lowercased words and digits. Optionally, Porter stemmer can be
     used to strip word suffixes to improve matching. The text normalization follows the implemantion from `Rouge
     score_Text Normalizition`_
@@ -101,17 +106,27 @@ def _normalize_and_tokenize_text(text: str, stemmer: Optional[Any] = None) -> Se
             An input sentence.
         stemmer:
             Porter stemmer instance to strip word suffixes to improve matching.
+        normalizer:
+            A user's own normalizer function.
+            If this is ``None``, replacing any non-alpha-numeric characters with spaces is default.
+            This function must take a `str` and return a `str`.
+        tokenizer:
+            A user's own tokenizer function. If this is ``None``, spliting by spaces is default
+            This function must take a `str` and return `Sequence[str]`
     """
-    # Replace any non-alpha-numeric characters with spaces.
-    text = re.sub(r"[^a-z0-9]+", " ", text.lower())
 
-    tokens = re.split(r"\s+", text)
+    # If normalizer is none, replace any non-alpha-numeric characters with spaces.
+    text = normalizer(text) if callable(normalizer) else re.sub(r"[^a-z0-9]+", " ", text.lower())
+
+    # If tokenizer is none, spliting by spaces
+    tokens = tokenizer(text) if callable(tokenizer) else re.split(r"\s+", text)
+
     if stemmer:
         # Only stem words more than 3 characters long.
         tokens = [stemmer.stem(x) if len(x) > 3 else x for x in tokens]
 
     # One final check to drop any empty or invalid tokens.
-    tokens = [x for x in tokens if (isinstance(x, str) and re.match(r"^[a-z0-9]+$", x))]
+    tokens = [x for x in tokens if (isinstance(x, str) and len(x) > 0)]
 
     return tokens
 
@@ -167,6 +182,8 @@ def _rouge_score_update(
     rouge_keys_values: List[Union[int, str]],
     accumulate: str,
     stemmer: Optional[Any] = None,
+    normalizer: Callable[[str], str] = None,
+    tokenizer: Callable[[str], Sequence[str]] = None,
 ) -> Dict[Union[int, str], List[Dict[str, Tensor]]]:
     """Update the rouge score with the current set of predicted and target sentences.
 
@@ -184,6 +201,13 @@ def _rouge_score_update(
             Allowed values are ``avg`` and ``best``.
         stemmer:
             Porter stemmer instance to strip word suffixes to improve matching.
+        normalizer:
+            A user's own normalizer function.
+            If this is ``None``, replacing any non-alpha-numeric characters with spaces is default.
+            This function must take a `str` and return a `str`.
+        tokenizer:
+            A user's own tokenizer function. If this is ``None``, spliting by spaces is default
+            This function must take a `str` and return `Sequence[str]`
 
     Example:
         >>> preds = "My name is John".split()
@@ -214,16 +238,18 @@ def _rouge_score_update(
         result_inner: Dict[Union[int, str], Dict[str, Tensor]] = {rouge_key: {} for rouge_key in rouge_keys_values}
         result_avg: Dict[Union[int, str], List[Dict[str, Tensor]]] = {rouge_key: [] for rouge_key in rouge_keys_values}
         list_results = []
-        pred = _normalize_and_tokenize_text(pred_raw, stemmer)
-        pred_Lsum = _normalize_and_tokenize_text(_add_newline_to_end_of_each_sentence(pred_raw), stemmer)
+        pred = _normalize_and_tokenize_text(pred_raw, stemmer, normalizer, tokenizer)
+        pred_Lsum = _normalize_and_tokenize_text(
+            _add_newline_to_end_of_each_sentence(pred_raw), stemmer, normalizer, tokenizer
+        )
 
         for target_raw_inner in target_raw:
-            tgt = _normalize_and_tokenize_text(target_raw_inner, stemmer)
+            tgt = _normalize_and_tokenize_text(target_raw_inner, stemmer, normalizer, tokenizer)
 
             if "Lsum" in rouge_keys_values:
                 # rougeLsum expects "\n" separated sentences within a summary
                 target_Lsum = _normalize_and_tokenize_text(
-                    _add_newline_to_end_of_each_sentence(target_raw_inner), stemmer
+                    _add_newline_to_end_of_each_sentence(target_raw_inner), stemmer, normalizer, tokenizer
                 )
 
             for rouge_key in rouge_keys_values:
@@ -291,6 +317,8 @@ def rouge_score(
     target: Union[str, Sequence[str], Sequence[Sequence[str]]],
     accumulate: Literal["avg", "best"] = "best",
     use_stemmer: bool = False,
+    normalizer: Callable[[str], str] = None,
+    tokenizer: Callable[[str], Sequence[str]] = None,
     rouge_keys: Union[str, Tuple[str, ...]] = ("rouge1", "rouge2", "rougeL", "rougeLsum"),  # type: ignore
 ) -> Dict[str, Tensor]:
     """Calculate `Calculate Rouge Score`_ , used for automatic summarization.
@@ -306,6 +334,13 @@ def rouge_score(
             - ``best`` takes the best fmeasure score obtained between prediction and multiple corresponding references.
         use_stemmer:
             Use Porter stemmer to strip word suffixes to improve matching.
+        normalizer:
+            A user's own normalizer function.
+            If this is ``None``, replacing any non-alpha-numeric characters with spaces is default.
+            This function must take a `str` and return a `str`.
+        tokenizer:
+            A user's own tokenizer function. If this is ``None``, spliting by spaces is default
+            This function must take a `str` and return `Sequence[str]`
         rouge_keys:
             A list of rouge types to calculate.
             Keys that are allowed are ``rougeL``, ``rougeLsum``, and ``rouge1`` through ``rouge9``.
@@ -367,7 +402,13 @@ def rouge_score(
         target = [[target]]
 
     sentence_results: Dict[Union[int, str], List[Dict[str, Tensor]]] = _rouge_score_update(
-        preds, target, rouge_keys_values, stemmer=stemmer, accumulate=accumulate
+        preds,
+        target,
+        rouge_keys_values,
+        stemmer=stemmer,
+        normalizer=normalizer,
+        tokenizer=tokenizer,
+        accumulate=accumulate,
     )
 
     output: Dict[str, List[Tensor]] = {}

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 from torch import Tensor
 from typing_extensions import Literal
@@ -35,6 +35,13 @@ class ROUGEScore(Metric):
     Args:
         use_stemmer:
             Use Porter stemmer to strip word suffixes to improve matching.
+        normalizer:
+            A user's own normalizer function.
+            If this is ``None``, replacing any non-alpha-numeric characters with spaces is default.
+            This function must take a `str` and return a `str`.
+        tokenizer:
+            A user's own tokenizer function. If this is ``None``, spliting by spaces is default
+            This function must take a `str` and return `Sequence[str]`
         accumulate:
             Useful incase of multi-reference rouge score.
             - ``avg`` takes the avg of all references with respect to predictions
@@ -87,6 +94,8 @@ class ROUGEScore(Metric):
     def __init__(
         self,
         use_stemmer: bool = False,
+        normalizer: Callable[[str], str] = None,
+        tokenizer: Callable[[str], Sequence[str]] = None,
         accumulate: Literal["avg", "best"] = "best",
         rouge_keys: Union[str, Tuple[str, ...]] = ("rouge1", "rouge2", "rougeL", "rougeLsum"),  # type: ignore
         compute_on_step: Optional[bool] = None,
@@ -114,6 +123,8 @@ def __init__(
         self.rouge_keys = rouge_keys
         self.rouge_keys_values = [ALLOWED_ROUGE_KEYS[key] for key in rouge_keys]
         self.stemmer = nltk.stem.porter.PorterStemmer() if use_stemmer else None
+        self.normalizer = normalizer
+        self.tokenizer = tokenizer
         self.accumulate = accumulate
 
         # Adding stated dynamically to prevent IndexError during sync function as some lists can be empty.
@@ -143,7 +154,13 @@ def update(  # type: ignore
             target = [[target]]
 
         output: Dict[Union[int, str], List[Dict[str, Tensor]]] = _rouge_score_update(
-            preds, target, self.rouge_keys_values, stemmer=self.stemmer, accumulate=self.accumulate
+            preds,
+            target,
+            self.rouge_keys_values,
+            stemmer=self.stemmer,
+            normalizer=self.normalizer,
+            tokenizer=self.tokenizer,
+            accumulate=self.accumulate,
         )
         for rouge_key, metrics in output.items():
             for metric in metrics: