Adding input documents to the output files (#69)
* Adding input documents to the output files

* Adding missing file
danieldeutsch authored Nov 21, 2020
1 parent ee776dc commit 534e2fd
Showing 4 changed files with 187 additions and 11 deletions.
16 changes: 11 additions & 5 deletions doc/datasets/fabbri2020.md
@@ -1,22 +1,28 @@
# Fabbri 2020
This dataset contains expert and Turker annotations for summaries on the CNN/DailyMail dataset as collected in [1].
The setup command will save the summaries and references for all of the systems and their corresponding annotations and input documents.
See this [Github repository](https://github.com/Yale-LILY/SummEval) for more details.

```bash
sacrerouge setup-dataset fabbri2020 <cnn-tar-path> <dailymail-tar-path> <output-dir>
```
The `<cnn-tar-path>` and `<dailymail-tar-path>` are paths to the downloaded "story" tarfiles from [here](https://cs.nyu.edu/~kcho/DMQA/).
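
For example, assuming the two story archives were downloaded under their default names to the current directory (the paths here are hypothetical), the command could look like:

```bash
sacrerouge setup-dataset fabbri2020 cnn_stories.tgz dailymail_stories.tgz datasets/fabbri2020
```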

The output files are the following:
- `summaries.jsonl`: The model output summaries with their input documents and the ground-truth references
- `summaries-with-crowd.jsonl`: The model output summaries with their input documents, the ground-truth reference, and ten crowdsourced references
- `metrics.jsonl`: The expert and Turker annotations that correspond to `summaries.jsonl` and `summaries-with-crowd.jsonl`
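
As a rough sketch of the record format, each line of `summaries.jsonl` is a JSON object like the one below (field names are taken from the setup code in this commit; the values are invented, and the exact internal shape of `summary` and `references` follows the raw SummEval annotation file):

```json
{
  "summarizer_id": "M5",
  "summarizer_type": "peer",
  "file_path": "cnn/stories/example.story",
  "document": {"text": "Full text of the input article ..."},
  "summary": "...",
  "references": ["..."]
}
```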

Notes:
- The raw data does not identify which reference summary is the original ground-truth reference, but after checking a handful of instances, it appears to always be the first reference in the list.
That first reference is the one included in `summaries.jsonl`. ([Confirmed](https://github.com/Yale-LILY/SummEval/issues/8))
- To make the crowd summaries distinct, each is given a `summarizer_id` of `turker-` followed by a number from 1 to 10.
The summaries identified by `turker-i` were not necessarily all written by the same person and should not be treated as if they were.
- There are many (input document, summary) pairs that were not judged and are not processed by this script.
We still need to decide exactly what to do with them.
Several models have multiple output files (e.g., M5 has `outputs_rouge.aligned.jsonl` and `outputs_rouge+coh.aligned.jsonl`),
but `summaries.jsonl` will only contain one summary per model (e.g., there is only one summary for M5).
We need some way to distinguish the different outputs before all of the aligned document + summary pairs can be put into one file.

## Correlations
Here are the correlations of some of the metrics implemented in this library to the responsiveness scores in this dataset.
103 changes: 103 additions & 0 deletions sacrerouge/datasets/fabbri2020/pair_data.py
@@ -0,0 +1,103 @@
# Modified file from https://github.com/Yale-LILY/SummEval/blob/master/data_processing/pair_data.py
"""
Script for recreating the full model outputs from CNN/DM Story files.
CNN/DM Story files can be downloaded from https://cs.nyu.edu/~kcho/DMQA/
"""
import argparse
import json
import os
from glob import glob
from typing import Optional

from tqdm import tqdm


def parse_story_file(content):
"""
Remove article highlights and unnecessary white characters.
"""
content_raw = content.split("@highlight")[0]
content = " ".join(filter(None, [x.strip() for x in content_raw.split("\n")]))
return content

def annotation_pairing(data_annotations, story_files):
print("Processing file:", data_annotations)
with open(data_annotations) as fd:
dataset = [json.loads(line) for line in fd]

for example in dataset:
story_path = os.path.join(story_files, example["filepath"])

with open(story_path) as fd:
story_content = fd.read()
example["text"] = parse_story_file(story_content)

paired_file = data_annotations.replace("aligned", "aligned.paired")
if os.path.dirname(paired_file):
os.makedirs(os.path.dirname(paired_file), exist_ok=True)
with open(paired_file, "w") as fd:
for example in dataset:
fd.write(json.dumps(example, ensure_ascii=False) + "\n")


def output_pairing(aligned_data, story_files, model_outputs):
"""
Walk data sub-directories and recreate examples
"""
for unpaired_path in glob(f'{aligned_data}/*/aligned/*'):
filename = os.path.basename(unpaired_path)

if not (".jsonl" in filename and "aligned" in filename and os.path.isfile(unpaired_path)):
continue

print("Processing file:", unpaired_path)
with open(unpaired_path) as fd:
dataset = [json.loads(line) for line in fd]

for example in tqdm(dataset):
story_path = os.path.join(story_files, example["filepath"])

with open(story_path) as fd:
story_content = fd.read()
example["text"] = parse_story_file(story_content)

paired_filename = filename.replace("aligned", "aligned.paired")
paired_path = os.path.join(model_outputs, "paired", paired_filename)
os.makedirs(os.path.dirname(paired_path), exist_ok=True)
with open(paired_path, "w") as fd:
for example in dataset:
fd.write(json.dumps(example, ensure_ascii=False) + "\n")


def run_pair_data(data_annotations: Optional[str] = None, model_outputs: Optional[str] = None, story_files: Optional[str] = None):
    if not (data_annotations or model_outputs) or not story_files:
        raise RuntimeError("To run the script, please specify `data_annotations` to pair human annotation data or "
                           "`model_outputs` to pair generated summaries. Story files should be specified in either case.")

if model_outputs:
aligned_data = model_outputs # os.path.join(model_outputs, "aligned")

if data_annotations:
annotation_pairing(data_annotations, story_files)

if model_outputs and story_files:
output_pairing(aligned_data, story_files, model_outputs)

# if __name__ == "__main__":
#     PARSER = argparse.ArgumentParser()
#     PARSER.add_argument("--data_annotations", type=str, help="Path to file with human annotations")
#     PARSER.add_argument("--model_outputs", type=str, help="Path to directory holding model data")
#     PARSER.add_argument("--story_files", type=str, help="Path to directory holding CNNDM story files")
#     ARGS = PARSER.parse_args()
#
#     if not (ARGS.data_annotations or ARGS.model_outputs) or not ARGS.story_files:
#         raise RuntimeError("To run the script, please specify `data_annotations` to pair human annotation data or "
#                            "`model_outputs` to pair generated summaries. Story files should be specified in either case.")
#
#     if ARGS.model_outputs:
#         ARGS.aligned_data = os.path.join(ARGS.model_outputs, "aligned")
#
#     if ARGS.data_annotations:
#         annotation_pairing(ARGS.data_annotations, ARGS.story_files)
#
#     if ARGS.model_outputs and ARGS.story_files:
#         output_pairing(ARGS.aligned_data, ARGS.story_files, ARGS.model_outputs)
67 changes: 62 additions & 5 deletions sacrerouge/datasets/fabbri2020/setup.py
@@ -1,7 +1,11 @@
import os
import shutil
import tarfile
from typing import Any, Dict, List, Tuple

from sacrerouge.common.util import download_file_from_google_drive, download_url_to_file
from sacrerouge.data import Metrics, MetricsDict
from sacrerouge.datasets.fabbri2020.pair_data import run_pair_data
from sacrerouge.io import JsonlReader, JsonlWriter


@@ -12,6 +16,23 @@ def download_human_judgments(output_dir: str, force: bool) -> str:
return file_path


def download_system_outputs(output_dir: str, force: bool) -> str:
expanded_dir = f'{output_dir}/expanded/'
for system_id in range(0, 24):
url = f'https://storage.googleapis.com/sfr-summarization-repo-research/M{system_id}.tar.gz'
file_path = f'{output_dir}/M{system_id}.tar.gz'
download_url_to_file(url, file_path, force)

model_expanded_dir = f'{expanded_dir}/M{system_id}'
if os.path.exists(model_expanded_dir) and force:
shutil.rmtree(model_expanded_dir)

if not os.path.exists(model_expanded_dir):
with tarfile.open(file_path, 'r') as tar:
tar.extractall(expanded_dir)
return expanded_dir


def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Metrics]]:
summaries = []
summaries_with_crowd = []
@@ -26,6 +47,7 @@ def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str,
references = instance['references']
expert_annotations = instance['expert_annotations']
turker_annotations = instance['turker_annotations']
document = instance['text']

# It appears that the first reference is always the ground-truth and the others are crowdsourced.
# This was confirmed in https://github.com/Yale-LILY/SummEval/issues/8.
@@ -47,6 +69,7 @@ def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str,
'summarizer_id': summarizer_id,
'summarizer_type': 'peer',
'file_path': filename,
'document': {'text': document},
'summary': summary,
'references': [references[0]]
})
@@ -55,6 +78,7 @@ def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str,
'summarizer_id': summarizer_id,
'summarizer_type': 'peer',
'file_path': filename,
'document': {'text': document},
'summary': summary,
'references': references
})
@@ -81,16 +105,49 @@ def load_judgments(file_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str,
return summaries, summaries_with_crowd, metrics_list


def setup_documents(cnn_tar: str, dailymail_tar: str, output_dir: str, force: bool) -> None:
cnn_dir = f'{output_dir}/raw/cnndm/cnn'
dm_dir = f'{output_dir}/raw/cnndm/dailymail'
for tar_path, target_path in [(cnn_tar, cnn_dir), (dailymail_tar, dm_dir)]:
if os.path.exists(target_path) and force:
print(f'Removing {target_path}')
shutil.rmtree(target_path)

if not os.path.exists(target_path):
print(f'Untarring {tar_path} (it\'s pretty slow...)')
with tarfile.open(tar_path, 'r') as tar:
tar.extractall(f'{output_dir}/raw/cnndm')


def save_data(data: List[Any], file_path: str) -> None:
with JsonlWriter(file_path) as out:
for item in data:
out.write(item)


def setup(cnn_tar: str, dailymail_tar: str, output_dir: str, force: bool) -> None:
# Download the expert and turker annotations
download_human_judgments(f'{output_dir}/raw', force)

# Download the system outputs
model_output_dir = download_system_outputs(f'{output_dir}/raw/model-outputs', force)

# Untar all of the documents
setup_documents(cnn_tar, dailymail_tar, output_dir, force)

# Pair together the summaries and input documents. The script will output the
# results into "model_annotations.aligned.paired.jsonl"
run_pair_data(data_annotations=f'{output_dir}/raw/model_annotations.aligned.jsonl',
model_outputs=model_output_dir,
story_files=f'{output_dir}/raw')

summaries, summaries_with_crowd, metrics = load_judgments(f'{output_dir}/raw/model_annotations.aligned.paired.jsonl')
save_data(summaries, f'{output_dir}/summaries.jsonl')
save_data(summaries_with_crowd, f'{output_dir}/summaries-with-crowd.jsonl')
save_data(metrics, f'{output_dir}/metrics.jsonl')

# TODO Save all of the documents + summaries that were not judged. I'm not really sure
# what to do with them because several of the models have different output files. We need
# to come up with some way of distinguishing them. For instance, M5 has
# "outputs_rouge.aligned.jsonl" and "outputs_rouge+coh.aligned.jsonl", but the "summaries.jsonl"
# file will just mark whichever summary was judged as "M5"
12 changes: 11 additions & 1 deletion sacrerouge/datasets/fabbri2020/subcommand.py
@@ -11,6 +11,16 @@ class Fabbri2020Subcommand(DatasetSetupSubcommand):
def add_subparser(self, parser: argparse._SubParsersAction):
description = 'Setup the Fabbri 2020 dataset'
self.parser = parser.add_parser('fabbri2020', description=description, help=description)
self.parser.add_argument(
'cnn_tar',
type=str,
help='The path to the downloaded tar file from this link: https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ'
)
self.parser.add_argument(
'dailymail_tar',
type=str,
help='The path to the downloaded tar file from this link: https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs'
)
self.parser.add_argument(
'output_dir',
type=str,
@@ -25,4 +35,4 @@ def add_subparser(self, parser: argparse._SubParsersAction):

@overrides
def run(self, args):
setup.setup(args.cnn_tar, args.dailymail_tar, args.output_dir, args.force)
