diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffb75b1..cf59913 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
+### Added
+- Added [FEQA](https://www.aclweb.org/anthology/2020.acl-main.454/)
 
 ## [v0.1.3](https://github.com/danieldeutsch/sacrerouge/releases/tag/v0.1.3) - 2020-11-25
 ### Added
diff --git a/doc/metrics/feqa.md b/doc/metrics/feqa.md
new file mode 100644
index 0000000..b6afada
--- /dev/null
+++ b/doc/metrics/feqa.md
@@ -0,0 +1,31 @@
+# FEQA
+FEQA [1] is a question-answering-based metric for evaluating the faithfulness of summaries.
+Our implementation uses our [fork](https://github.com/danieldeutsch/feqa) of the [original repository](https://github.com/esdurmus/feqa), which adds a `run.py` file to easily run FEQA with an input and output file.
+
+## Setting Up
+First, create an environment for FEQA to run in (see [here](../../environments/feqa.yml)).
+We had to manually install Cython and numpy before the other requirements because benepar requires both to be installed first.
+
+After the environment is created, install the spacy model and other resources:
+```
+python -m spacy download en_core_web_sm
+python
+>>> import benepar
+>>> import nltk
+>>> benepar.download('benepar_en2')
+>>> nltk.download('stopwords')
+```
+
+Then, set up the metric by cloning the repository and downloading the necessary pre-trained models:
+```
+sacrerouge setup-metric feqa
+```
+
+## Correlations
+This implementation achieves Pearson correlations near 0.0 with the data collected by [2] on both the CNN/DailyMail and XSUM splits.
+I am not sure why yet.
+The data from [1] has not been released yet, so we cannot reproduce the results from the paper.
+
+## References
+[1] Esin Durmus, He He, and Mona Diab. [FEQA: A Question Answering Evaluation Framework for Faithfulness Assessment in Abstractive Summarization](https://www.aclweb.org/anthology/2020.acl-main.454/). ACL 2020.
+
+[2] Alex Wang, Kyunghyun Cho, and Mike Lewis. [Asking and Answering Questions to Evaluate the Factual Consistency of Summaries](https://www.aclweb.org/anthology/2020.acl-main.450.pdf). ACL 2020.
\ No newline at end of file
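For reference, the fork's `run.py` communicates through JSONL files: the wrapper in `sacrerouge/metrics/feqa.py` (later in this diff) writes one `{"document": ..., "summary": ...}` object per line and reads a `score` field back for each line. A minimal sketch of driving the script by hand, assuming it is run from the fork's root with the feqa environment active (the document and summary strings are made-up examples):

```python
# Sketch: drive the fork's run.py directly. The input/output JSONL formats
# mirror what the sacrerouge wrapper writes and reads.
import json
import subprocess

pairs = [
    {'document': 'Document text goes here.', 'summary': 'Summary text goes here.'},
]

with open('input.jsonl', 'w') as out:
    for pair in pairs:
        out.write(json.dumps(pair) + '\n')

# The trailing 8 is the batch size, matching the wrapper's default
subprocess.run('python run.py input.jsonl output.jsonl 8', shell=True, check=True)

with open('output.jsonl') as f:
    scores = [json.loads(line)['score'] for line in f]
print(scores)
```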
diff --git a/doc/metrics/metrics.md b/doc/metrics/metrics.md
index 62e1697..29bb648 100644
--- a/doc/metrics/metrics.md
+++ b/doc/metrics/metrics.md
@@ -6,6 +6,7 @@
 The following metrics have been implemented:
 - [BERTScore](bertscore.md)
 - [BEwT-E](bewte.md)
 - [Decomposed ROUGE](decomposed-rouge.md)
+- [FEQA](feqa.md)
 - [METEOR](meteor.md)
 - [MoverScore](moverscore.md)
 - [Pyramid Score](pyramid-score.md)
diff --git a/environments/feqa.yml b/environments/feqa.yml
new file mode 100644
index 0000000..684df1c
--- /dev/null
+++ b/environments/feqa.yml
@@ -0,0 +1,14 @@
+name: feqa
+dependencies:
+  - python=3.6
+  - pip
+  - pip:
+    - Cython==0.29.15
+    - numpy==1.19.1
+    - benepar==0.1.2
+    - torch==1.5.0
+    - fairseq==0.9.0
+    - nltk==3.5
+    - spacy==2.3.2
+    - tensorflow==1.15.0
+    - transformers==2.8.0
\ No newline at end of file
diff --git a/experiments/feqa/.gitignore b/experiments/feqa/.gitignore
new file mode 100644
index 0000000..6caf68a
--- /dev/null
+++ b/experiments/feqa/.gitignore
@@ -0,0 +1 @@
+output
\ No newline at end of file
diff --git a/experiments/feqa/run.sh b/experiments/feqa/run.sh
new file mode 100644
index 0000000..3fac16f
--- /dev/null
+++ b/experiments/feqa/run.sh
@@ -0,0 +1,19 @@
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+set -e
+
+for dataset in cnndm xsum; do
+  python -m sacrerouge feqa score \
+    --input-files datasets/wang2020/${dataset}.summaries.jsonl \
+    --dataset-reader document-based \
+    --output-jsonl ${DIR}/output/${dataset}/scores.jsonl \
+    --environment_name /shared/ddeutsch/envs/feqa
+
+  python -m sacrerouge correlate \
+    --metrics-jsonl-files datasets/wang2020/${dataset}.metrics.jsonl ${DIR}/output/${dataset}/scores.jsonl \
+    --metrics wang2020_crowd_faithfulness FEQA \
+    --summarizer-type peer \
+    --skip-summary-level \
+    --skip-system-level \
+    --output-file ${DIR}/output/${dataset}/correlations.json
+done
\ No newline at end of file
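The scoring step in this script can also be run from Python through the `FEQA` class added later in this diff. A sketch, assuming a conda environment named `feqa` and a placeholder path to `conda.sh` (the wrapper requires `CONDA_INIT` whenever `environment_name` is given):

```python
# Sketch: Python equivalent of the `sacrerouge feqa score` call above.
import os

from sacrerouge.metrics import FEQA

# The wrapper shells out to a conda environment, so CONDA_INIT must point
# to conda.sh; this path is a placeholder.
os.environ['CONDA_INIT'] = '/opt/miniconda3/etc/profile.d/conda.sh'

metric = FEQA(environment_name='feqa')  # assumed environment name
documents_list = [['The text of the source document.']]
summaries = ['The summary to score for faithfulness.']

# Returns one MetricsDict per summary, e.g. [{'FEQA': 0.85}]
results = metric.score_all(summaries, documents_list)
print(results)
```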
diff --git a/sacrerouge/metrics/__init__.py b/sacrerouge/metrics/__init__.py
index 782757d..c153c58 100644
--- a/sacrerouge/metrics/__init__.py
+++ b/sacrerouge/metrics/__init__.py
@@ -3,6 +3,7 @@
 from sacrerouge.metrics.bertscore import BertScore
 from sacrerouge.metrics.bewte import BEwTE
 from sacrerouge.metrics.bleurt import Bleurt
+from sacrerouge.metrics.feqa import FEQA
 from sacrerouge.metrics.decomposed_rouge import DecomposedRouge
 from sacrerouge.metrics.meteor import Meteor
 from sacrerouge.metrics.moverscore import MoverScore
diff --git a/sacrerouge/metrics/feqa.py b/sacrerouge/metrics/feqa.py
new file mode 100644
index 0000000..719b85f
--- /dev/null
+++ b/sacrerouge/metrics/feqa.py
@@ -0,0 +1,128 @@
+import argparse
+import logging
+import os
+import shutil
+from overrides import overrides
+from subprocess import Popen, PIPE
+from typing import List
+
+from sacrerouge.commands import MetricSetupSubcommand
+from sacrerouge.common import DATA_ROOT, TemporaryDirectory
+from sacrerouge.common.util import download_file_from_google_drive
+from sacrerouge.data import MetricsDict
+from sacrerouge.data.types import DocumentType, SummaryType
+from sacrerouge.io import JsonlReader, JsonlWriter
+from sacrerouge.metrics import Metric, DocumentBasedMetric
+
+logger = logging.getLogger(__name__)
+
+
+@Metric.register('feqa')
+class FEQA(DocumentBasedMetric):
+    def __init__(self,
+                 environment_name: str = None,
+                 feqa_root: str = f'{DATA_ROOT}/metrics/feqa',
+                 batch_size: int = 8) -> None:
+        super().__init__(['summary'], ['documents'])
+        self.environment_name = environment_name
+        self.feqa_root = feqa_root
+        self.batch_size = batch_size
+
+        if self.environment_name is not None:
+            if 'CONDA_INIT' not in os.environ:
+                raise Exception('If `environment_name` is not None, environment variable "CONDA_INIT" must be set to the path to "conda.sh"')
+
+    def _ensure_single_document(self, documents_list: List[List[DocumentType]]):
+        # For now, the code only works if there is 1 input document. The QA model only evaluates against one document,
+        # so I think it may have to fundamentally change to support multi-document inputs
+        for documents in documents_list:
+            assert len(documents) == 1
+
+    def score_multi_all(self,
+                        summaries_list: List[List[SummaryType]],
+                        documents_list: List[List[DocumentType]]) -> List[List[MetricsDict]]:
+        self._ensure_single_document(documents_list)
+
+        with TemporaryDirectory() as temp_dir:
+            input_file = f'{temp_dir}/input.jsonl'
+            output_file = f'{temp_dir}/output.jsonl'
+            with JsonlWriter(input_file) as out:
+                for summaries, documents in zip(summaries_list, documents_list):
+                    assert len(documents) == 1
+                    document = documents[0]
+                    if isinstance(document, list):
+                        document = ' '.join(document)
+                    for summary in summaries:
+                        if isinstance(summary, list):
+                            summary = ' '.join(summary)
+                        out.write({'document': document, 'summary': summary})
+
+            commands = []
+            if self.environment_name is not None:
+                commands.append(f'source {os.environ["CONDA_INIT"]}')
+                commands.append(f'conda activate {self.environment_name}')
+            commands.append(f'cd {self.feqa_root}')
+            commands.append(f'python run.py {input_file} {output_file} {self.batch_size}')
+            command = ' && '.join(commands)
+            logger.info(f'Running FEQA command: "{command}"')
+            process = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
+            stdout, stderr = process.communicate()
+            logger.info(stdout.decode())
+            logger.error(stderr.decode())
+
+            scores = JsonlReader(output_file).read()
+            metrics_list = []
+            index = 0
+            for summaries in summaries_list:
+                metrics_list.append([])
+                for _ in summaries:
+                    metrics_list[-1].append(MetricsDict({'FEQA': scores[index]['score']}))
+                    index += 1
+            return metrics_list
+
+
+@MetricSetupSubcommand.register('feqa')
+class FEQASetupSubcommand(MetricSetupSubcommand):
+    @overrides
+    def add_subparser(self, parser: argparse._SubParsersAction):
+        description = 'Setup the FEQA metric'
+        self.parser = parser.add_parser('feqa', description=description, help=description)
+        self.parser.add_argument('--force', action='store_true', help='Force setting up the metric again')
+        self.parser.set_defaults(subfunc=self.run)
+
+    @overrides
+    def run(self, args):
+        if args.force and os.path.exists(f'{DATA_ROOT}/metrics/feqa'):
+            shutil.rmtree(f'{DATA_ROOT}/metrics/feqa')
+
+        # Clone the github repo
+        if not os.path.exists(f'{DATA_ROOT}/metrics/feqa'):
+            commands = [
+                f'mkdir -p {DATA_ROOT}/metrics',
+                f'cd {DATA_ROOT}/metrics',
+                f'git clone https://github.com/danieldeutsch/feqa',
+            ]
+            command = ' && '.join(commands)
+            process = Popen(command, shell=True)
+            process.communicate()
+
+        # Download the model files
+        gdrive_files = {
+            'qa_models/squad1.0/config.json': '1IwWhQf9MP2G-vOBsQD87kMMEBS0IvcXa',
+            'qa_models/squad1.0/dev-v1.1.json': '1tsWhCsXSxxgkBMBnGB9wkOliJH8K3Prs',
+            'qa_models/squad1.0/evaluate-v1.1.py': '1p-LlVVAGuMYjFckjK5HxdiK5xEuM-2Ev',
+            'qa_models/squad1.0/pytorch_model.bin': '1pWMsSTTwcoX0l75bzNFjvSC7firawp9M',
+            'qa_models/squad1.0/run_squad.py': '1yZKNFU7md4KPGmThPwsp4dt95HkKsArX',
+            'qa_models/squad1.0/special_tokens_map.json': '1rbv75oE5x0rXxtGGXETTvLBoHK5h3Lfj',
+            'qa_models/squad1.0/tokenizer_config.json': '1oPM62qOWofGnaLmlX_CWkYKbZ-KEMtym',
+            'qa_models/squad1.0/train-v1.1.json': '1y9_EgnoBbm0SJeCaNZFfjOyraeA-qfqP',
+            'qa_models/squad1.0/training_args.bin': '1r49Y1Cp2t6_II2xjOyxbvYVvp2EQj3zu',
+            'qa_models/squad1.0/vocab.txt': '1iGZrP6_3PiiH0pcF4zoSbqAsWdFvimfF',
+            'bart_qg/checkpoints/checkpoint_best.pt': '1GFnimonLFgGal1LT6KRgMJZLbxmNJvxF',
+            'bart_qg/checkpoints/dict.src.txt': '17CShx4cUEQTl_gpLapnbMsc7CmDAaV7r',
+            'bart_qg/checkpoints/dict.tgt.txt': '1_dUN7CQZdqPxoiezzWp5yByuEXVJFwce',
+        }
+        for file_path, file_id in gdrive_files.items():
+            download_file_from_google_drive(file_id, f'{DATA_ROOT}/metrics/feqa/{file_path}', force=args.force)
+
+        print('FEQA setup success')
\ No newline at end of file
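To make the expected shapes concrete: `score_multi_all` above takes exactly one document per instance (enforced by `_ensure_single_document`) but allows several summaries per instance, and returns a parallel list of `MetricsDict`s. A sketch with made-up inputs, reusing the placeholder `CONDA_INIT`/environment setup from the earlier sketch:

```python
# Sketch of the shapes score_multi_all expects and returns.
import os

from sacrerouge.metrics import FEQA

os.environ['CONDA_INIT'] = '/opt/miniconda3/etc/profile.d/conda.sh'  # placeholder path
metric = FEQA(environment_name='feqa')  # assumed environment name

documents_list = [
    ['The text of the first source document.'],   # instance 1: exactly one document
    ['The text of the second source document.'],  # instance 2
]
summaries_list = [
    ['First summary of doc 1.', 'Second summary of doc 1.'],  # two summaries for instance 1
    ['Only summary of doc 2.'],
]

metrics_list = metric.score_multi_all(summaries_list, documents_list)
# metrics_list[0] holds two MetricsDicts like {'FEQA': ...}; metrics_list[1] holds one
```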
diff --git a/sacrerouge/tests/metrics/feqa_test.py b/sacrerouge/tests/metrics/feqa_test.py
new file mode 100644
index 0000000..42d5843
--- /dev/null
+++ b/sacrerouge/tests/metrics/feqa_test.py
@@ -0,0 +1,53 @@
+import os
+import pytest
+
+from sacrerouge.common.testing.metric_test_cases import DocumentBasedMetricTestCase
+from sacrerouge.common.testing.util import sacrerouge_command_exists
+from sacrerouge.metrics import FEQA
+
+
+@pytest.mark.skipif('FEQA_ENV' not in os.environ, reason='FEQA_ENV environment variable not set')
+class TestFEQA(DocumentBasedMetricTestCase):
+    def test_example(self):
+        # Tests to make sure we get the same output as running the example in the original repository
+        documents_list = [
+            [
+                "The world's oldest person has died a \
+                few weeks after celebrating her 117th birthday. \
+                Born on March 5, 1898, the greatgrandmother had lived through two world \
+                wars, the invention of the television and the \
+                first successful powered aeroplane."
+            ],
+            [
+                "The world's oldest person has died a \
+                few weeks after celebrating her 117th birthday. \
+                Born on March 5, 1898, the greatgrandmother had lived through two world \
+                wars, the invention of the television and the \
+                first successful powered aeroplane."
+            ]
+        ]
+        summaries = [
+            "The world's oldest person died in 1898",
+            "The world's oldest person died after her 117th birthday"
+        ]
+
+        metric = FEQA(environment_name=os.environ['FEQA_ENV'])
+
+        # The original iPython notebook reports 0.8875 for the second score, but our first score matches theirs.
+        # I assume the difference is caused by some minor version change (e.g., the example uses spacy model
+        # 2.1.0 but ours uses 2.3.1) since the first score matches.
+        expected_output = [
+            {'FEQA': 0.674074074074074},
+            {'FEQA': 0.85},
+        ]
+        actual_output = metric.score_all(summaries, documents_list)
+
+        assert len(expected_output) == len(actual_output)
+        for expected, actual in zip(expected_output, actual_output):
+            assert expected == pytest.approx(actual, 1e-4)
+
+    def test_command_exists(self):
+        assert sacrerouge_command_exists(['feqa'])
+
+    def test_setup_command_exists(self):
+        assert sacrerouge_command_exists(['setup-metric', 'feqa'])
\ No newline at end of file
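The test above is skipped unless `FEQA_ENV` names the conda environment to use. A minimal sketch of running it programmatically, with an assumed environment name and a placeholder `conda.sh` path (the shell equivalent is `FEQA_ENV=feqa CONDA_INIT=/path/to/conda.sh pytest sacrerouge/tests/metrics/feqa_test.py`):

```python
# Sketch: run the gated FEQA test with the required environment variables set.
import os
import subprocess

env = dict(os.environ)
env['FEQA_ENV'] = 'feqa'  # assumed conda environment name
env['CONDA_INIT'] = '/opt/miniconda3/etc/profile.d/conda.sh'  # placeholder path

subprocess.run(['pytest', 'sacrerouge/tests/metrics/feqa_test.py'], env=env, check=True)
```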