From 3ec54412a916055a77d2df39df7b0c1cc6fa1a4b Mon Sep 17 00:00:00 2001
From: lkk <33276950+lkk12014402@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:52:02 +0800
Subject: [PATCH] support document summarization evaluation with microservice. (#34)

* support summarization evaluation with microservice.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add summarization directory.

* fix typo.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 evals/metrics/summarization/__init__.py      |   2 +
 evals/metrics/summarization/summarization.py | 101 +++++++++++++
 evals/metrics/summarization/template.py      | 147 +++++++++++++++++++
 3 files changed, 250 insertions(+)
 create mode 100644 evals/metrics/summarization/__init__.py
 create mode 100644 evals/metrics/summarization/summarization.py
 create mode 100644 evals/metrics/summarization/template.py

diff --git a/evals/metrics/summarization/__init__.py b/evals/metrics/summarization/__init__.py
new file mode 100644
index 00000000..916f3a44
--- /dev/null
+++ b/evals/metrics/summarization/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evals/metrics/summarization/summarization.py b/evals/metrics/summarization/summarization.py
new file mode 100644
index 00000000..10441956
--- /dev/null
+++ b/evals/metrics/summarization/summarization.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import collections
+import json
+import logging
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+import requests
+from requests.exceptions import RequestException
+from rouge import Rouge
+
+from .template import SummarizationTemplate
+
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
+logger = logging.getLogger(__name__)
+
+LLM_JUDGE_METRICS = {
+    "Relevance": SummarizationTemplate.generate_relevance,
+    "Coherence": SummarizationTemplate.generate_coherence,
+    "Consistency": SummarizationTemplate.generate_consistency,
+    "Fluency": SummarizationTemplate.generate_fluency,
+}
+
+
+class SummarizationMetric:
+    """The summarization metric not only uses your LLM endpoint (application) to generate summaries for evaluation,
+    but also uses an LLM judge to rate whether your LLM (application) produces summaries that are relevant,
+    coherent, consistent, and fluent."""
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        llm_judge: Optional[str] = None,
+    ):
+        """
+        Args:
+            model: your LLM endpoint (application) used to generate summaries
+            llm_judge: LLM endpoint used to judge the generated summaries
+        """
+
+        self.model = model
+        self.headers = {"Content-Type": "application/json"}
+        self.llm_judge = llm_judge
+        self.metrics = collections.defaultdict(list)
+        self.rouge = Rouge()
+
+    def rouge_scores(self, text1, text2):
+        eval_rouge = self.rouge.get_scores(text1, text2)
+        self.metrics["rouge-1|F-Score"].append(eval_rouge[0]["rouge-1"]["f"])
+        self.metrics["rouge-2|F-Score"].append(eval_rouge[0]["rouge-2"]["f"])
+        self.metrics["rouge-l|F-Score"].append(eval_rouge[0]["rouge-l"]["f"])
+
+    def llm_scores(self, document, summary):
+        for metric in LLM_JUDGE_METRICS:
+            req = {
+                "inputs": LLM_JUDGE_METRICS[metric](document, summary),
+                "parameters": {"max_new_tokens": 5, "do_sample": False},
+            }
+
+            try:
+                response = requests.post(
+                    f"{self.llm_judge}",
+                    headers=self.headers,
+                    data=json.dumps(req),
+                )
+                response.raise_for_status()
+                response = response.json()
+            except RequestException as e:
+                logger.info(str(e))
+                continue
+
+            score = response["generated_text"].strip()
+            self.metrics[metric].append(int(score))
+
+    def summarize(self, document: str, ref_summary: str, **generation_kwargs):
+        req = {"inputs": SummarizationTemplate.generate_summary(document), "parameters": generation_kwargs}
+
+        try:
+            response = requests.post(
+                f"{self.model}",
+                headers=self.headers,
+                data=json.dumps(req),
+            )
+            response.raise_for_status()
+            response = response.json()
+        except RequestException as e:
+            logger.info(str(e))
+            return None
+        gen_summary = response["generated_text"]
+
+        # get metrics
+        self.rouge_scores(gen_summary, ref_summary)
+        if self.llm_judge is not None:
+            self.llm_scores(document, gen_summary)
+
+    @property
+    def average_score(self):
+        return {metric: sum(self.metrics[metric]) / len(self.metrics[metric]) for metric in self.metrics}
diff --git a/evals/metrics/summarization/template.py b/evals/metrics/summarization/template.py
new file mode 100644
index 00000000..362f079d
--- /dev/null
+++ b/evals/metrics/summarization/template.py
@@ -0,0 +1,147 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class SummarizationTemplate:
+    @staticmethod
+    def generate_summary(document):
+        return f"""Provide a concise summary of the following document:
+
+{document}
+
+"""
+
+    @staticmethod
+    def generate_relevance(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Relevance(1-5) - selection of important content from the source. \
+The summary should include only important information from the source document. \
+Annotators were instructed to penalize summaries which contained redundancies and excess information.
+
+Evaluation Steps:
+
+1. Read the summary and the source document carefully.
+2. Compare the summary to the source document and identify the main points of the article.
+3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
+4. Assign a relevance score from 1 to 5.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Relevance
+"""
+
+    @staticmethod
+    def generate_coherence(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Coherence(1-5) - the collective quality of all sentences. \
+We align this dimension with the DUC quality question of structure and coherence \
+whereby "the summary should be well-structured and well-organized. \
+The summary should not just be a heap of related information, but should build from sentence to sentence to a \
+coherent body of information about a topic."
+
+Evaluation Steps:
+
+1. Read the article carefully and identify the main topic and key points.
+2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
+and if it presents them in a clear and logical order.
+3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Coherence
+"""
+
+    @staticmethod
+    def generate_consistency(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Consistency(1-5) - the factual alignment between the summary and the summarized source. \
+A factually consistent summary contains only statements that are entailed by the source document. \
+Annotators were also asked to penalize summaries that contained hallucinated facts.
+
+Evaluation Steps:
+
+1. Read the article carefully and identify the main facts and details it presents.
+2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
+3. Assign a score for consistency based on the Evaluation Criteria.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Consistency
+"""
+
+    @staticmethod
+    def generate_fluency(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
+1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
+2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
+3: Good. The summary has few or no errors and is easy to read and follow.
+
+Evaluation Steps:
+
+Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Fluency
+"""
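
Usage note (hypothetical, not part of the patch): a minimal sketch of how the new metric might be driven end to end. The endpoint URLs and the sample texts are placeholders; the endpoints are assumed to speak a TGI-style /generate API that accepts {"inputs": ..., "parameters": {...}} and returns {"generated_text": ...}, which is the request/response shape summarization.py expects. Class and method names come from the code above.

# Hypothetical usage sketch: URLs and sample texts below are placeholders.
# Requires the third-party `rouge` package for the ROUGE F-scores.
from evals.metrics.summarization.summarization import SummarizationMetric

metric = SummarizationMetric(
    model="http://localhost:8008/generate",      # endpoint that writes the summaries
    llm_judge="http://localhost:8009/generate",  # optional endpoint that scores them
)

document = "..."     # source text to summarize (placeholder)
ref_summary = "..."  # reference summary used for ROUGE (placeholder)

# One call generates a summary, appends ROUGE-1/2/L F-scores, and, because a judge
# endpoint is configured, appends Relevance/Coherence/Consistency/Fluency scores too.
metric.summarize(document, ref_summary, max_new_tokens=128, do_sample=False)

print(metric.average_score)  # per-metric averages over all summarize() calls so far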