Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: FaithfulnessEvaluator specifies inputs explicitly #7548

Merged
merged 2 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 45 additions & 37 deletions haystack/components/evaluators/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,40 @@
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Few-shot examples that go into the prompt whenever the caller supplies none.
# Every entry pairs one "inputs" mapping (a question, its contexts, a response)
# with the expected "outputs" (the statements extracted from the response and
# one score per statement).
# NOTE(review): this is a single module-level object; treat it as read-only.
_DEFAULT_EXAMPLES = [
    # Example 1: two statements, both scored 1.
    {
        "inputs": {
            "questions": "What is the capital of Germany and when was it founded?",
            "contexts": [
                "Berlin is the capital of Germany and was founded in 1244.",
            ],
            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
        },
        "outputs": {
            "statements": [
                "Berlin is the capital of Germany.",
                "Berlin was founded in 1244.",
            ],
            "statement_scores": [1, 1],
        },
    },
    # Example 2: a single statement scored 0 (the context talks about Germany only).
    {
        "inputs": {
            "questions": "What is the capital of France?",
            "contexts": [
                "Berlin is the capital of Germany.",
            ],
            "responses": "Paris",
        },
        "outputs": {
            "statements": [
                "Paris is the capital of France.",
            ],
            "statement_scores": [0],
        },
    },
    # Example 3: mixed result, first statement scored 1 and second scored 0.
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": [
                "Rome is the capital of Italy.",
            ],
            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
        },
        "outputs": {
            "statements": [
                "Rome is the capital of Italy.",
                "Rome has more than 4 million inhabitants.",
            ],
            "statement_scores": [1, 0],
        },
    },
]


class FaithfulnessEvaluator(LLMEvaluator):
"""
Expand Down Expand Up @@ -50,7 +84,8 @@ def __init__(
Creates an instance of FaithfulnessEvaluator.

:param examples:
Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Default examples will be used if none are provided.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions", "contexts", and "responses".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expand Down Expand Up @@ -81,38 +116,7 @@ def __init__(
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or [
{
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 1],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
"statement_scores": [1, 0],
},
},
]
self.examples = examples or _DEFAULT_EXAMPLES
self.api = api
self.api_key = api_key

Expand All @@ -126,19 +130,23 @@ def __init__(
)

@component.output_types(results=List[Dict[str, Any]])
def run(self, **inputs) -> Dict[str, Any]:
def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
"""
Run the LLM evaluator.

:param inputs:
The input values to evaluate. The keys are the input names and the values are lists of input values.
:param questions:
A list of questions.
:param contexts:
A list of lists of contexts. Each list of contexts corresponds to one question and one response.
julian-risch marked this conversation as resolved.
Show resolved Hide resolved
:param responses:
A list of responses.
:returns:
A dictionary with the following outputs:
- `score`: Mean faithfulness score over all the provided input answers.
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
result = super().run(**inputs)
result = super().run(questions=questions, contexts=contexts, responses=responses)

# calculate average statement faithfulness score per query
for res in result["results"]:
Expand Down
35 changes: 33 additions & 2 deletions test/components/evaluators/test_faithfulness_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List

import pytest
Expand Down Expand Up @@ -108,10 +109,15 @@ def generator_run(self, *args, **kwargs):
questions = ["Which is the most popular global sport?", "Who created the Python language?"]
contexts = [
[
"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
"The popularity of sports can be measured in various ways, including TV viewership, social media "
"presence, number of participants, and economic impact. Football is undoubtedly the world's most "
"popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
"Messi, drawing a followership of more than 4 billion people."
],
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
"language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
"programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = [
Expand All @@ -127,3 +133,28 @@ def generator_run(self, *args, **kwargs):
],
"score": 0.75,
}

def test_run_missing_parameters(self, monkeypatch):
    """Check that calling ``run()`` with no inputs raises a ``TypeError``."""
    # Presumably a placeholder key is needed so the evaluator can be built
    # without real credentials -- confirm against the component's __init__.
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    evaluator = FaithfulnessEvaluator()

    expected_message = "missing 3 required positional arguments"
    with pytest.raises(TypeError, match=expected_message):
        evaluator.run()

@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY"),
    reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.integration
def test_live_run(self):
    """
    Run the evaluator end to end on a single question/context/response triple.

    Skipped unless OPENAI_API_KEY is set in the environment. The response
    repeats one fact from the context and contradicts another, so the test
    expects two statements scored ``[1, 0]`` and an overall score of 0.5.
    """
    evaluator = FaithfulnessEvaluator()
    result = evaluator.run(
        questions=["What is Python and who created it?"],
        contexts=[["Python is a programming language created by Guido van Rossum."]],
        responses=["Python is a programming language created by George Lucas."],
    )

    # Aggregate scores.
    assert result["score"] == 0.5
    assert result["individual_scores"] == [0.5]

    # Per-response breakdown for the only response that was evaluated.
    first_result = result["results"][0]
    assert first_result["score"] == 0.5
    assert first_result["statement_scores"] == [1, 0]

    statements = first_result["statements"]
    assert "programming language" in statements[0]
    assert "George Lucas" in statements[1]