From 3ec54412a916055a77d2df39df7b0c1cc6fa1a4b Mon Sep 17 00:00:00 2001
From: lkk <33276950+lkk12014402@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:52:02 +0800
Subject: [PATCH] support document summarization evaluation with microservice. (#34)

* support summarization evaluation with microservice.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add summarization directory.

* fix typo.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 evals/metrics/summarization/__init__.py      |   2 +
 evals/metrics/summarization/summarization.py | 101 +++++++++++++
 evals/metrics/summarization/template.py      | 147 +++++++++++++++++++
 3 files changed, 250 insertions(+)
 create mode 100644 evals/metrics/summarization/__init__.py
 create mode 100644 evals/metrics/summarization/summarization.py
 create mode 100644 evals/metrics/summarization/template.py

diff --git a/evals/metrics/summarization/__init__.py b/evals/metrics/summarization/__init__.py
new file mode 100644
index 00000000..916f3a44
--- /dev/null
+++ b/evals/metrics/summarization/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evals/metrics/summarization/summarization.py b/evals/metrics/summarization/summarization.py
new file mode 100644
index 00000000..10441956
--- /dev/null
+++ b/evals/metrics/summarization/summarization.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import collections
+import json
+import logging
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+import requests
+from requests.exceptions import RequestException
+from rouge import Rouge
+
+from .template import SummarizationTemplate
+
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
+logger = logging.getLogger(__name__)
+
+LLM_JUDGE_METRICS = {
+    "Relevance": SummarizationTemplate.generate_relevance,
+    "Coherence": SummarizationTemplate.generate_coherence,
+    "Consistency": SummarizationTemplate.generate_consistency,
+    "Fluency": SummarizationTemplate.generate_fluency,
+}
+
+
+class SummarizationMetric:
+    """The summarization metric not only uses your LLM endpoint (application) to generate summaries for evaluation,
+    but also uses an LLM judge to rate whether your LLM (application) produces summaries that are relevant,
+    coherent, consistent, and fluent."""
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        llm_judge: Optional[str] = None,
+    ):
+        """
+        Args:
+            model: your LLM endpoint (application) used to generate summaries
+            llm_judge: LLM endpoint used to judge the generated summaries
+        """
+
+        self.model = model
+        self.headers = {"Content-Type": "application/json"}
+        self.llm_judge = llm_judge
+        self.metrics = collections.defaultdict(list)
+        self.rouge = Rouge()
+
+    def rouge_scores(self, text1, text2):
+        eval_rouge = self.rouge.get_scores(text1, text2)
+        self.metrics["rouge-1|F-Score"].append(eval_rouge[0]["rouge-1"]["f"])
+        self.metrics["rouge-2|F-Score"].append(eval_rouge[0]["rouge-2"]["f"])
+        self.metrics["rouge-l|F-Score"].append(eval_rouge[0]["rouge-l"]["f"])
+
+    def llm_scores(self, document, summary):
+        for metric in LLM_JUDGE_METRICS:
+            req = {
+                "inputs": LLM_JUDGE_METRICS[metric](document, summary),
+                "parameters": {"max_new_tokens": 5, "do_sample": False},
+            }
+
+            try:
+                response = requests.post(
+                    f"{self.llm_judge}",
+                    headers=self.headers,
+                    data=json.dumps(req),
+                )
+                response.raise_for_status()
+                response = response.json()
+            except RequestException as e:
+                logger.info(str(e))
+                continue
+
+            score = response["generated_text"].strip()
+            self.metrics[metric].append(int(score))
+
+    def summarize(self, document: str, ref_summary: str, **generation_kwargs):
+        req = {"inputs": SummarizationTemplate.generate_summary(document), "parameters": generation_kwargs}
+
+        try:
+            response = requests.post(
+                f"{self.model}",
+                headers=self.headers,
+                data=json.dumps(req),
+            )
+            response.raise_for_status()
+            response = response.json()
+        except RequestException as e:
+            logger.info(str(e))
+            return None
+        gen_summary = response["generated_text"]
+
+        # get metrics
+        self.rouge_scores(gen_summary, ref_summary)
+        if self.llm_judge is not None:
+            self.llm_scores(document, gen_summary)
+
+    @property
+    def average_score(self):
+        return {metric: sum(self.metrics[metric]) / len(self.metrics[metric]) for metric in self.metrics}
diff --git a/evals/metrics/summarization/template.py b/evals/metrics/summarization/template.py
new file mode 100644
index 00000000..362f079d
--- /dev/null
+++ b/evals/metrics/summarization/template.py
@@ -0,0 +1,147 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class SummarizationTemplate:
+    @staticmethod
+    def generate_summary(document):
+        return f"""Provide a concise summary of the following document:
+
+{document}
+
+"""
+
+    @staticmethod
+    def generate_relevance(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Relevance(1-5) - selection of important content from the source. \
+The summary should include only important information from the source document. \
+Annotators were instructed to penalize summaries which contained redundancies and excess information.
+
+Evaluation Steps:
+
+1. Read the summary and the source document carefully.
+2. Compare the summary to the source document and identify the main points of the article.
+3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
+4. Assign a relevance score from 1 to 5.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Relevance
+"""
+
+    @staticmethod
+    def generate_coherence(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Coherence(1-5) - the collective quality of all sentences. \
+We align this dimension with the DUC quality question of structure and coherence \
+whereby "the summary should be well-structured and well-organized. \
+The summary should not just be a heap of related information, but should build from sentence to sentence to a \
+coherent body of information about a topic."
+
+Evaluation Steps:
+
+1. Read the article carefully and identify the main topic and key points.
+2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
+and if it presents them in a clear and logical order.
+3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Coherence
+"""
+
+    @staticmethod
+    def generate_consistency(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Consistency(1-5) - the factual alignment between the summary and the summarized source. \
+A factually consistent summary contains only statements that are entailed by the source document. \
+Annotators were also asked to penalize summaries that contained hallucinated facts.
+
+Evaluation Steps:
+
+1. Read the article carefully and identify the main facts and details it presents.
+2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
+3. Assign a score for consistency based on the Evaluation Criteria.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Consistency
+"""
+
+    @staticmethod
+    def generate_fluency(document, summary):
+        return f"""You will be given one summary written for an article. Your task is to rate the summary on one metric.
+Please make sure you read and understand these instructions very carefully.
+Please keep this document open while reviewing, and refer to it as needed.
+
+Evaluation Criteria:
+
+Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
+1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
+2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
+3: Good. The summary has few or no errors and is easy to read and follow.
+
+Evaluation Steps:
+
+Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
+
+Example:
+
+Source Text:
+
+{document}
+
+Summary:
+
+{summary}
+
+Evaluation Form (scores ONLY):
+
+- Fluency
+"""
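
Usage note (hypothetical, not part of the patch): a minimal sketch of how the new metric might be driven end to end. The endpoint URLs and the sample texts are placeholders; the endpoints are assumed to speak a TGI-style /generate API that accepts {"inputs": ..., "parameters": {...}} and returns {"generated_text": ...}, which is the request/response shape summarization.py expects. Class and method names come from the code above.

# Hypothetical usage sketch: URLs and sample texts below are placeholders.
# Requires the third-party `rouge` package for the ROUGE F-scores.
from evals.metrics.summarization.summarization import SummarizationMetric

metric = SummarizationMetric(
    model="http://localhost:8008/generate",      # endpoint that writes the summaries
    llm_judge="http://localhost:8009/generate",  # optional endpoint that scores them
)

document = "..."     # source text to summarize (placeholder)
ref_summary = "..."  # reference summary used for ROUGE (placeholder)

# One call generates a summary, appends ROUGE-1/2/L F-scores, and, because a judge
# endpoint is configured, appends Relevance/Coherence/Consistency/Fluency scores too.
metric.summarize(document, ref_summary, max_new_tokens=128, do_sample=False)

print(metric.average_score)  # per-metric averages over all summarize() calls so far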