feat: add RAG Evals Runner and Needle in a Haystack Evaluation (#945)
* add initial niah evaluation 
* add readme
* add niah metrics
* add main RAG evals runner
* add evaluations Python package
jalling97 authored Sep 3, 2024
1 parent 03bf0fd commit c191b54
Showing 9 changed files with 648 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -44,3 +44,6 @@ vocabulary.json

# go binaries
api/main

# evaluation artifacts
.deepeval-cache.json
71 changes: 71 additions & 0 deletions src/leapfrogai_evals/README.md
@@ -0,0 +1,71 @@
# LeapfrogAI Evaluations

This document covers how to use the evaluations available in LeapfrogAI. As more evaluations are added, these instructions will be updated.

## Running the Evaluations
The LeapfrogAI RAG evaluation system assumes the following:

- LeapfrogAI is deployed
- A valid LeapfrogAI API key is set (for more info, see the [API README](/src/leapfrogai_api/README.md))

Set the following environment variables:

```bash
LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev/openai/v1 for development>
LEAPFROGAI_API_KEY=<LeapfrogAI API key>
MODEL_TO_EVALUATE="vllm" # can also be provided as "model" to the __init__ for the runner
```
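
These can be exported in the shell or, since `python-dotenv` is a dependency of this package, placed in a `.env` file (exactly where the runner loads a `.env` from is an assumption). For example, in bash:

```bash
export LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
export LEAPFROGAI_API_KEY="<your LeapfrogAI API key>"
export MODEL_TO_EVALUATE="vllm"
```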

Running `main.py` will, by default, run all of the currently available evaluations:

```bash
# from within the src/leapfrogai_evals dir
python -m pip install .
python main.py
```
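
The evaluations can also be invoked programmatically. A minimal sketch, assuming the `model` keyword behaves as described for `MODEL_TO_EVALUATE` above:

```python
from leapfrogai_evals.runners.niah_runner import NIAH_Runner

# "model" can be passed directly instead of setting MODEL_TO_EVALUATE (see note above)
runner = NIAH_Runner(model="vllm")
runner.run_experiment()
```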

## Needle in a Haystack (NIAH)

A Needle in a Haystack evaluation measures how well the LeapfrogAI RAG system performs on tasks that require finding a specific piece of information (the "needle") within a large body of text (the "haystack").

This evaluation can be used to check both the retrieval and generation stages of RAG (a minimal scoring sketch follows the list below):

- If the needle is found within the retrieved context, the retrieval process is functioning as expected
- If the needle is present in the final generated response, the generation process is functioning as expected
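
In both cases the check reduces to a presence test. A minimal sketch, assuming the check is a simple substring match and that each trial exposes the retrieved context and the final response as strings (the helper below is hypothetical, not part of the runner):

```python
def score_trial(needle: str, retrieved_context: str, final_response: str) -> dict:
    """Hypothetical helper: binary scores for a single NIAH trial via substring checks."""
    return {
        "retrieval_score": float(needle in retrieved_context),  # did retrieval surface the needle?
        "response_score": float(needle in final_response),      # did the final answer contain it?
    }

print(score_trial("Whiskey137", "...Doug's secret code is: Whiskey137...", "The secret code is Whiskey137."))
```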

### Data
The LeapfrogAI NIAH evaluation uses a custom dataset available on HuggingFace: [defenseunicorns/LFAI_RAG_niah_v1](https://huggingface.co/datasets/defenseunicorns/LFAI_RAG_niah_v1)

LFAI_RAG_niah_v1 contains 120 context entries intended for use in Needle in a Haystack RAG evaluations.

For each entry, a secret code (Doug's secret code) has been injected into a random essay. This secret code is the "needle" that the LLM is asked to find.

Example:

```json
{
"context_length":512,
"context_depth":0.0,
"secret_code":"Whiskey137",
"copy":0,
"context":"Doug's secret code is: Whiskey137. Remember this. Venture funding works like gears. A typical startup goes through several rounds of funding, and at each round you want to take just enough money to reach the speed where you can shift into the next gear.\n\nFew startups get it quite right. Many are underfunded. A few are overfunded, which is like trying to start driving in third gear."
}
```
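
To inspect the data locally, the dataset can be pulled with the `datasets` library (already a dependency of this package); this snippet is illustrative and not part of the evaluation code:

```python
from datasets import load_dataset

# Download the NIAH dataset from HuggingFace and list its splits and features
niah_data = load_dataset("defenseunicorns/LFAI_RAG_niah_v1")
print(niah_data)
```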

### Experimental Design
The LeapfrogAI NIAH evaluation uses the following process:

- build a vector store (the haystack) and upload 10 contextless documents (as padding)
- for a subset of the data (10 datapoints by default):
- create an assistant
- upload the contextual document (containing the needle) to the vector store
- prompt the LLM to provide the secret code hidden in the context
- record the following:
- whether or not the needle text was returned by the retrieval step of RAG
- whether or not the needle text was returned by the LLM's final response
- delete the contextual document from the vector store
- delete the assistant
- delete the contextless documents
- delete the vector store

The retrieval and response rates are then averaged across the copies of the experiment to generate the final retrieval and response scores.
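
Below is a rough sketch of what a single trial can look like against the OpenAI-compatible LeapfrogAI API. This is illustrative only, not the actual `NIAH_Runner` implementation: the client setup, file name, assistant instructions, and prompt wording are assumptions, and the retrieval check (inspecting the run's file_search step) is omitted for brevity.

```python
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.environ["LEAPFROGAI_API_URL"], api_key=os.environ["LEAPFROGAI_API_KEY"]
)

# Build the haystack: a vector store that holds the contextual (needle) document.
vector_store = client.beta.vector_stores.create(name="niah-haystack")
needle_file = client.files.create(
    file=open("needle_context.txt", "rb"), purpose="assistants"  # hypothetical file name
)
client.beta.vector_stores.files.create(
    vector_store_id=vector_store.id, file_id=needle_file.id
)

# Create an assistant that searches the haystack, then ask it for the secret code.
assistant = client.beta.assistants.create(
    name="niah-assistant",
    model=os.environ["MODEL_TO_EVALUATE"],
    instructions="Answer using only the provided documents.",
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)
thread = client.beta.threads.create()
client.beta.threads.messages.create(
    thread_id=thread.id, role="user", content="What is Doug's secret code?"
)
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# Score the final response (the needle value here matches the dataset example above).
response = client.beta.threads.messages.list(thread_id=thread.id).data[0].content[0].text.value
response_score = float("Whiskey137" in response)

# Clean up: delete the assistant, the uploaded document, and the vector store.
client.beta.assistants.delete(assistant_id=assistant.id)
client.files.delete(file_id=needle_file.id)
client.beta.vector_stores.delete(vector_store_id=vector_store.id)
```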
Empty file.
70 changes: 70 additions & 0 deletions src/leapfrogai_evals/main.py
@@ -0,0 +1,70 @@
import deepeval
from deepeval.test_case import LLMTestCase
import logging

from leapfrogai_evals.runners.niah_runner import NIAH_Runner
from leapfrogai_evals.metrics.niah_metrics import NIAH_Retrieval, NIAH_Response

ALL_EVALS = ["LFAI_NIAH"]


class RAGEvaluator:
"""A class that handles running all of the LeapfrogAI RAG evaluations"""

def __init__(self):
self.eval_list = None
self.test_case_dict = None
self.niah_test_cases = None
self.eval_options = ALL_EVALS

    def set_evaluations(self, evals_list=None) -> None:
        """Set the evaluations that will be run via a list"""
        if not evals_list:
            logging.info("Setting eval list to ALL")
            self.eval_list = ALL_EVALS
        else:
            self.eval_list = evals_list
        # TODO: Add other evals options

def run_evals(self, *args, **kwargs) -> None:
"""Run all of the selected evaluations"""
logging.info("Running the following evaluations:")
        for eval_name in self.eval_list:
            logging.info(f" - {eval_name}")
if "LFAI_NIAH" in self.eval_list:
self._niah_evaluation(*args, **kwargs)
# TODO: add more evaluations

def _niah_evaluation(self, *args, **kwargs) -> None:
"""Run the Needle in a Haystack evaluation"""
self.niah_test_cases = []

niah_runner = NIAH_Runner(*args, **kwargs)
niah_runner.run_experiment()

# build test cases out of the niah_dataset
for row in niah_runner.niah_data:
self.niah_test_cases.append(
LLMTestCase(
input=niah_runner.message_prompt,
actual_output=row["response"],
context=[row["context"]],
additional_metadata={
"retrieval_score": row["retrieval_score"],
"response_score": row["response_score"],
},
)
)

# run metrics
retrieval_metric = NIAH_Retrieval()
response_metric = NIAH_Response()

deepeval.evaluate(
test_cases=self.niah_test_cases, metrics=[retrieval_metric, response_metric]
)


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
evaluator = RAGEvaluator()
evaluator.set_evaluations()
evaluator.run_evals()
Empty file.
111 changes: 111 additions & 0 deletions src/leapfrogai_evals/metrics/niah_metrics.py
@@ -0,0 +1,111 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
import asyncio


class NIAH_Retrieval(BaseMetric):
"""A metric for measuring the retrieval score from the LFAI Needle in a Haystack Evaluation"""

def __init__(
self,
threshold: float = 1.0,
async_mode: bool = True,
):
self.threshold = threshold
self.async_mode = async_mode

def measure(self, test_case: LLMTestCase) -> int:
"""
Records the niah retrieval score from the test case
This function checks for the presence of a retrieval_score (provided by the niah_runner)
and sets a boolean determined by said score. The score is calculated in the runner to keep the
runner self-contained as a means of running the entire evaluation on its own. For simplicity,
the score is copied here for integration with DeepEval.
params:
-------
test_case: LLMTestCase
A test case object built from the results of a needle in a haystack evaluation run.
test_case should contain an additional metadata field that returns a dictionary with
the field "retrieval_score"
returns:
-------
int
A score that is equal to the "retrieval_score" from the test_case
"""
self.score = test_case.additional_metadata["retrieval_score"]
self.success = self.score >= self.threshold

if self.success:
self.reason = f"Retrieval in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
else:
self.reason = f"Retrieval in the NIAH evaluation scored less than the threshold score of {self.threshold}"

return self.score

async def a_measure(self, test_case: LLMTestCase) -> int:
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.measure, test_case)

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Needle in a Haystack (NIAH) Retrieval"


class NIAH_Response(BaseMetric):
"""A metric for measuring the response score from the LFAI Needle in a Haystack Evaluation"""

def __init__(
self,
threshold: float = 1.0,
async_mode: bool = True,
):
self.threshold = threshold
self.async_mode = async_mode

def measure(self, test_case: LLMTestCase) -> int:
"""
Records the niah response score from the test case
This function checks for the presence of a response_score (provided by the niah_runner)
and sets a boolean determined by said score. The score is calculated in the runner to keep the
runner self-contained as a means of running the entire evaluation on its own. For simplicity,
the score is copied here for integration with DeepEval.
params:
-------
test_case: LLMTestCase
A test case object built from the results of a needle in a haystack evaluation run.
test_case should contain an additional metadata field that returns a dictionary with
the field "response_score"
returns:
-------
int
A score that is equal to the "response_score" from the test_case
"""
self.score = test_case.additional_metadata["response_score"]
self.success = self.score >= self.threshold

if self.success:
self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
else:
self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}"

return self.score

async def a_measure(self, test_case: LLMTestCase) -> int:
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.measure, test_case)

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Needle in a Haystack (NIAH) Response"
37 changes: 37 additions & 0 deletions src/leapfrogai_evals/pyproject.toml
@@ -0,0 +1,37 @@
[project]
name = "leapfrogai-evals"
description = "A framework for running evaluations in LeapfrogAI"

# x-release-please-start-version
version = "0.11.0"
# x-release-please-end

dependencies = [
"deepeval == 1.1.1",
"openai == 1.42.0",
"tqdm == 4.66.5",
"python-dotenv == 1.0.1",
"seaborn == 0.13.2",
"datasets == 2.21.0",
"huggingface-hub == 0.24.6"
]
requires-python = "~=3.11"
readme = "README.md"

[tool.pip-tools]
generate-hashes = true

[tool.setuptools.packages.find]
where = ["../"]
include = ["leapfrogai_evals*"]
namespaces = false

[tool.pytest.ini_options]
addopts = ["--import-mode=importlib"]

[tool.ruff]
target-version = "py311"

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
Empty file.