From b15f586f764e592c1faaabf5c6cc4978b703a44f Mon Sep 17 00:00:00 2001
From: felixgwu <felixgwu@users.noreply.github.com>
Date: Mon, 24 Aug 2020 17:11:52 -0400
Subject: [PATCH] Custom baseline (#74)

* add baseline_path

* update cli option name & plotting

* fix typo
---
 README.md                   |  3 ++-
 bert_score/score.py         | 15 +++++++++++----
 bert_score/scorer.py        | 37 ++++++++++++++++++++++++-------------
 bert_score/utils.py         |  7 +++++--
 bert_score_cli/score.py     |  4 +++-
 bert_score_cli/visualize.py |  4 +++-
 journal/rescale_baseline.md |  2 +-
 7 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index a335454..07f6efc 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 
 Automatic Evaluation Metric described in the paper [BERTScore: Evaluating Text Generation with BERT](https://arxiv.org/abs/1904.09675) (ICLR 2020).
 #### News:
+- The option `--rescale-with-baseline` has been renamed to `--rescale_with_baseline` for consistency with the other options.
 - Updated to version 0.3.5
   - Being compatible with Huggingface's transformers >=v3.0.0 and minor fixes ([#58](https://github.com/Tiiiger/bert_score/pull/58), [#66](https://github.com/Tiiiger/bert_score/pull/66), [#68](https://github.com/Tiiiger/bert_score/pull/68))
   - Several improvements related to efficency ([#67](https://github.com/Tiiiger/bert_score/pull/67), [#69](https://github.com/Tiiiger/bert_score/pull/69))
@@ -116,7 +117,7 @@ where "roberta-large_L17_no-idf_version=0.3.0(hug_trans=2.3.0)" is the hash code
 Starting from version 0.3.0, we support rescaling the scores with baseline scores
 
 ```sh
-bert-score -r example/refs.txt -c example/hyps.txt --lang en --rescale-with-baseline
+bert-score -r example/refs.txt -c example/hyps.txt --lang en --rescale_with_baseline
 ```
 You will get:
 
diff --git a/bert_score/score.py b/bert_score/score.py
index f5a0054..018e73e 100644
--- a/bert_score/score.py
+++ b/bert_score/score.py
@@ -41,6 +41,7 @@ def score(
     lang=None,
     return_hash=False,
     rescale_with_baseline=False,
+    baseline_path=None,
 ):
     """
     BERTScore metric.
@@ -64,6 +65,7 @@ def score(
                   specified when `rescale_with_baseline` is True.
         - :param: `return_hash` (bool): return hash code of the setting
         - :param: `rescale_with_baseline` (bool): rescale bertscore with pre-computed baseline
+        - :param: `baseline_path` (str): path to a customized baseline file; if None, the default baseline for `lang` and `model_type` is used
 
     Return:
         - :param: `(P, R, F)`: each is of shape (N); N = number of input
@@ -145,8 +147,10 @@ def score(
             max_preds.append(all_preds[beg:end].max(dim=0)[0])
         all_preds = torch.stack(max_preds, dim=0)
 
+    use_custom_baseline = baseline_path is not None
     if rescale_with_baseline:
-        baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{lang}/{model_type}.tsv")
+        if baseline_path is None:
+            baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{lang}/{model_type}.tsv")
         if os.path.isfile(baseline_path):
             if not all_layers:
                 baselines = torch.from_numpy(pd.read_csv(baseline_path).iloc[num_layers].to_numpy())[1:].float()
@@ -164,13 +168,15 @@ def score(
         print(f"done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec")
 
     if return_hash:
-        return tuple([out, get_hash(model_type, num_layers, idf, rescale_with_baseline)])
+        return tuple([out, get_hash(model_type, num_layers, idf, rescale_with_baseline,
+                                    use_custom_baseline=use_custom_baseline)])
 
     return out
 
 
 def plot_example(
-    candidate, reference, model_type=None, num_layers=None, lang=None, rescale_with_baseline=False, fname=""
+    candidate, reference, model_type=None, num_layers=None, lang=None, rescale_with_baseline=False,
+    baseline_path=None, fname="",
 ):
     """
     BERTScore metric.
@@ -234,7 +240,8 @@ def plot_example(
     sim = sim[1:-1, 1:-1]
 
     if rescale_with_baseline:
-        baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{lang}/{model_type}.tsv")
+        if baseline_path is None:
+            baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{lang}/{model_type}.tsv")
         if os.path.isfile(baseline_path):
             baselines = torch.from_numpy(pd.read_csv(baseline_path).iloc[num_layers].to_numpy())[1:].float()
             sim = (sim - baselines[2].item()) / (1 - baselines[2].item())
diff --git a/bert_score/scorer.py b/bert_score/scorer.py
index 217da4d..057976e 100644
--- a/bert_score/scorer.py
+++ b/bert_score/scorer.py
@@ -42,6 +42,7 @@ def __init__(
         device=None,
         lang=None,
         rescale_with_baseline=False,
+        baseline_path=None,
     ):
         """
         Args:
@@ -51,8 +52,8 @@ def __init__(
             - :param: `num_layers` (int): the layer of representation to use.
                       default using the number of layer tuned on WMT16 correlation data
             - :param: `verbose` (bool): turn on intermediate status update
-            - :param: `idf` (dict): use idf weighting, can also be a precomputed idf_dict
-            - :param: `idf_sents` (List of str): use idf weighting, can also be a precomputed idf_dict
+            - :param: `idf` (bool): a boolean specifying whether to use idf weighting (this should be True even if `idf_sents` is given)
+            - :param: `idf_sents` (List of str): list of sentences used to compute the idf weights
             - :param: `device` (str): on which the contextual embedding model will be allocated on.
                       If this argument is None, the model lives on cuda:0 if cuda is available.
             - :param: `batch_size` (int): bert score processing batch size
@@ -62,6 +63,7 @@ def __init__(
                       specified when `rescale_with_baseline` is True.
             - :param: `return_hash` (bool): return hash code of the setting
             - :param: `rescale_with_baseline` (bool): rescale bertscore with pre-computed baseline
+            - :param: `baseline_path` (str): path to a customized baseline file; if None, the default baseline for `lang` and `model_type` is used
         """
 
         assert lang is not None or model_type is not None, "Either lang or model_type should be specified"
@@ -106,6 +108,12 @@ def __init__(
         if idf_sents is not None:
             self.compute_idf(idf_sents)
 
+        self._baseline_vals = None
+        self.baseline_path = baseline_path
+        self.use_custom_baseline = self.baseline_path is not None
+        if self.baseline_path is None:
+            self.baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{self.lang}/{self.model_type}.tsv")
+
     @property
     def lang(self):
         return self._lang
@@ -128,22 +136,25 @@ def rescale_with_baseline(self):
 
     @property
     def baseline_vals(self):
-        baseline_path = os.path.join(os.path.dirname(__file__), f"rescale_baseline/{self.lang}/{self.model_type}.tsv")
-        if os.path.isfile(baseline_path):
-            if not self.all_layers:
-                baseline_vals = torch.from_numpy(pd.read_csv(baseline_path).iloc[self.num_layers].to_numpy())[
-                    1:
-                ].float()
+        if self._baseline_vals is None:
+            if os.path.isfile(self.baseline_path):
+                if not self.all_layers:
+                    self._baseline_vals = torch.from_numpy(
+                        pd.read_csv(self.baseline_path).iloc[self.num_layers].to_numpy()
+                    )[1:].float()
+                else:
+                    self._baseline_vals = torch.from_numpy(
+                        pd.read_csv(self.baseline_path).to_numpy()
+                    )[:, 1:].unsqueeze(1).float()
             else:
-                baseline_vals = torch.from_numpy(pd.read_csv(baseline_path).to_numpy())[:, 1:].unsqueeze(1).float()
-        else:
-            raise ValueError(f"Baseline not Found for {self.model_type} on {self.lang} at {baseline_path}")
+                raise ValueError(
+                    f"Baseline not Found for {self.model_type} on {self.lang} at {self.baseline_path}")
 
-        return baseline_vals
+        return self._baseline_vals
 
     @property
     def hash(self):
-        return get_hash(self.model_type, self.num_layers, self.idf, self.rescale_with_baseline)
+        return get_hash(self.model_type, self.num_layers, self.idf, self.rescale_with_baseline, self.use_custom_baseline)
 
     def compute_idf(self, sents):
         """
diff --git a/bert_score/utils.py b/bert_score/utils.py
index d25d049..632ec5a 100644
--- a/bert_score/utils.py
+++ b/bert_score/utils.py
@@ -437,12 +437,15 @@ def length_to_mask(lens):
     return preds
 
 
-def get_hash(model, num_layers, idf, rescale_with_baseline):
+def get_hash(model, num_layers, idf, rescale_with_baseline, use_custom_baseline):
     msg = "{}_L{}{}_version={}(hug_trans={})".format(
         model, num_layers, "_idf" if idf else "_no-idf", __version__, trans_version
     )
     if rescale_with_baseline:
-        msg += "-rescaled"
+        if use_custom_baseline:
+            msg += "-custom-rescaled"
+        else:
+            msg += "-rescaled"
     return msg
 
 
diff --git a/bert_score_cli/score.py b/bert_score_cli/score.py
index 595f85c..6775ea6 100755
--- a/bert_score_cli/score.py
+++ b/bert_score_cli/score.py
@@ -24,8 +24,9 @@ def main():
     parser.add_argument("--nthreads", type=int, default=4, help="number of cpu workers (default: 4)")
     parser.add_argument("--idf", action="store_true", help="BERT Score with IDF scaling")
     parser.add_argument(
-        "--rescale-with-baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines"
+        "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines"
     )
+    parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file")
     parser.add_argument("-s", "--seg_level", action="store_true", help="show individual score of each pair")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
     parser.add_argument("-r", "--ref", type=str, nargs="+", required=True, help="reference file path(s) or a string")
@@ -65,6 +66,7 @@ def main():
         lang=args.lang,
         return_hash=True,
         rescale_with_baseline=args.rescale_with_baseline,
+        baseline_path=args.baseline_path,
     )
     avg_scores = [s.mean(dim=0) for s in all_preds]
     P = avg_scores[0].cpu().item()
diff --git a/bert_score_cli/visualize.py b/bert_score_cli/visualize.py
index bd25f13..70d86a8 100644
--- a/bert_score_cli/visualize.py
+++ b/bert_score_cli/visualize.py
@@ -20,8 +20,9 @@ def main():
     parser.add_argument("-c", "--cand", type=str, required=True, help="candidate sentence")
     parser.add_argument("-f", "--file", type=str, default="visualize.png", help="name of file to save output matrix in")
     parser.add_argument(
-        "--rescale-with-baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines"
+        "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines"
     )
+    parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file")
 
     args = parser.parse_args()
 
@@ -33,6 +34,7 @@ def main():
         num_layers=args.num_layers,
         fname=args.file,
         rescale_with_baseline=args.rescale_with_baseline,
+        baseline_path=args.baseline_path,
     )
 
 
diff --git a/journal/rescale_baseline.md b/journal/rescale_baseline.md
index 3eb653f..077aefd 100644
--- a/journal/rescale_baseline.md
+++ b/journal/rescale_baseline.md
@@ -48,7 +48,7 @@ out = bert_score.score(
 and for the command-line version:
 ```bash
 bert-score -r example/refs.txt -c example/hyps.txt \
-           --lang en --rescale-with-baseline
+           --lang en --rescale_with_baseline
 ```