
Commit

Prevent failing on greetings, division by zero errors, add reasoning and better descriptions
rogeriochaves committed Jul 23, 2024
1 parent 6a16cfe commit 00e7c9a
Showing 3 changed files with 96 additions and 62 deletions.
@@ -16,6 +16,7 @@
Money,
)


class QueryResolutionConversationMessageEntry(EvaluatorEntry):
input: str
output: str
@@ -33,15 +34,18 @@ class QueryResolutionConversationSettings(BaseModel):
"openai/gpt-4-turbo",
"openai/gpt-4-0125-preview",
"openai/gpt-4-1106-preview",
"openai/gpt-4o",
"openai/gpt-4o-mini",
"azure/gpt-35-turbo-1106",
"azure/gpt-4-turbo-2024-04-09",
"azure/gpt-4-1106-preview",
"azure/gpt-4o",
"groq/llama3-70b-8192",
"anthropic/claude-3-haiku-20240307",
"anthropic/claude-3-sonnet-20240229",
"anthropic/claude-3-opus-20240229",
] = Field(
default="azure/gpt-35-turbo-1106",
default="openai/gpt-4o-mini",
description="The model to use for evaluation",
)
max_tokens: int = Field(
@@ -53,10 +57,7 @@ class QueryResolutionConversationSettings(BaseModel):
class QueryResolutionConversationResult(EvaluationResult):
score: float
passed: bool = Field(default=True)
- details: Optional[str] = Field(
- default="2 querries were resolved in this conversation"
- )
+ details: Optional[str]


class QueryResolutionConversationEvaluator(
@@ -67,7 +68,7 @@ class QueryResolutionConversationEvaluator(
]
):
"""
- This evaluator checks if all the querries of the user were resolved by the LLM.
+ This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user.
"""

name = "Query Resolution Conversation Evaluator"
@@ -120,7 +121,6 @@ def evaluate(
List[dict[str, str]],
trim_messages(messages, litellm_model, max_tokens=max_tokens),
)
- print(messages)

response = litellm.completion(
model=litellm_model,
@@ -130,55 +130,62 @@
"type": "function",
"function": {
"name": "query_resolution_evaluator",
"description": "Evaluate if all of the querries were resolved",
"description": "Evaluate if all of the queries were answered",
"parameters": {
"type": "object",
"properties": {
"querries_total": {
"type": "number",
"description": "Number of total user querries in the dialogue",
"reasoning": {
"type": "string",
"description": "Reasoning for the answer",
},
"querries_resolved": {
"queries_total": {
"type": "number",
"description": "Number of resolved user querries in the dialogue",
"description": "Number of total user queries in the dialogue, greetings and non-requests do not count",
},
"were_resolved": {
"type": "boolean",
"description": "True if all querries were resolved, false if not",
"queries_answered": {
"type": "number",
"description": "Number of resolved user queries in the dialogue",
},
},
"required": [
"were_resolved",
"querries_total",
"querries_resolved",
"reasoning",
"queries_total",
"queries_answered",
],
},
},
},
],
tool_choice={"type": "function", "function": {"name": "query_resolution_evaluator"}}, # type: ignore
tool_choice={
"type": "function",
"function": {"name": "query_resolution_evaluator"},
},
)
response = cast(ModelResponse, response)
choice = cast(Choices, response.choices[0])
arguments = json.loads(
cast(Message, choice.message).tool_calls[0].function.arguments
)
- print(choice)

- cost = completion_cost(completion_response=response, prompt=prompt)

- passed: bool = cast(bool, arguments["were_resolved"])
- total_querries: int = arguments["querries_total"]
- resolved_querries: int = arguments["querries_resolved"]
- resolution_ratio: float = resolved_querries / total_querries
+ reasoning: str = arguments["reasoning"]
+ passed: bool = arguments["queries_answered"] == arguments["queries_total"]
+ total_queries: int = arguments["queries_total"]
+ resolved_queries: int = arguments["queries_answered"]
+ resolution_ratio: float = (
+ 1
+ if resolved_queries == 0 and total_queries == 0
+ else resolved_queries / max(total_queries, 1)
+ )
+ cost = completion_cost(completion_response=response)
details: str = (
- f"There were {total_querries} querries in total and {resolved_querries} of them were resolved in the conversation."
+ f"There were {total_queries} queries in total and {resolved_queries} of them were resolved in the conversation. Reasoning: {reasoning}"
)

return QueryResolutionConversationResult(
passed=passed,
score=resolution_ratio,
details=details,
cost=Money(amount=cost, currency="USD") if cost else None,
)
)
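In summary, the scoring logic above no longer trusts a model-reported boolean: it derives passed from the two counts and guards the ratio against division by zero, so a greetings-only conversation (zero queries) scores 1.0 instead of raising an error. A minimal standalone sketch of that arithmetic follows; the helper name resolution_score is hypothetical, and the real computation lives inline in QueryResolutionConversationEvaluator.evaluate:

def resolution_score(queries_total: int, queries_answered: int) -> tuple[float, bool]:
    # A conversation with no countable queries (e.g. greetings only) is treated as fully resolved.
    if queries_total == 0 and queries_answered == 0:
        return 1.0, True
    # Clamp the divisor so an inconsistent tool-call response cannot raise ZeroDivisionError.
    score = queries_answered / max(queries_total, 1)
    return score, queries_answered == queries_total

For example, resolution_score(0, 0) gives (1.0, True), matching the new greetings test below, while resolution_score(3, 1) gives (about 0.33, False).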
70 changes: 62 additions & 8 deletions evaluators/langevals/tests/test_query_resolution_evaluator.py
@@ -7,15 +7,41 @@
QueryResolutionConversationEntry,
QueryResolutionConversationSettings,
QueryResolutionConversationResult,
- QueryResolutionConversationEvaluator
+ QueryResolutionConversationEvaluator,
)


def test_query_resolution_conversation_evaluator_pass_for_simple_greetings():
response1 = QueryResolutionConversationMessageEntry(
input="Hey, how are you?",
output="Hello, I am an assistant and I don't have feelings",
)
conversation = QueryResolutionConversationEntry(conversation=[response1])
settings = QueryResolutionConversationSettings(
model="openai/gpt-4o-mini", max_tokens=10000
)
evaluator = QueryResolutionConversationEvaluator(settings=settings)
result = evaluator.evaluate(conversation)

assert result.status == "processed"
assert result.score == 1
assert result.passed == True
assert result.details


def test_query_resolution_conversation_evaluator_pass():
- response1 = QueryResolutionConversationMessageEntry(input="Hey, how are you?", output="Hello, I am an assistant and I don't have feelings")
- response2 = QueryResolutionConversationMessageEntry(input="Okay, is there a president in the Netherlands? Also, tell me what is the system of government in the Netherlands?", output="There is no president in the Netherlands. The system of government is constitutional monarchy.")
+ response1 = QueryResolutionConversationMessageEntry(
+ input="Hey, how are you?",
+ output="Hello, I am an assistant and I don't have feelings",
+ )
+ response2 = QueryResolutionConversationMessageEntry(
+ input="Okay, is there a president in the Netherlands? Also, tell me what is the system of government in the Netherlands?",
+ output="There is no president in the Netherlands. The system of government is constitutional monarchy.",
+ )
conversation = QueryResolutionConversationEntry(conversation=[response1, response2])
- settings = QueryResolutionConversationSettings(model='openai/gpt-3.5-turbo-1106', max_tokens=10000)
+ settings = QueryResolutionConversationSettings(
+ model="openai/gpt-4o-mini", max_tokens=10000
+ )
evaluator = QueryResolutionConversationEvaluator(settings=settings)
result = evaluator.evaluate(conversation)

@@ -24,11 +50,20 @@ def test_query_resolution_conversation_evaluator_pass():
assert result.passed == True
assert result.details


def test_query_resolution_conversation_evaluator_fail():
- response1 = QueryResolutionConversationMessageEntry(input="Hey, how are you?", output="Hello, I am an assistant and I don't have feelings")
- response2 = QueryResolutionConversationMessageEntry(input="Okay, is there a president in the Netherlands? Also, what equals 2 + 2? How many paws does a standard dog have?", output="There is no president in the Netherlands.")
+ response1 = QueryResolutionConversationMessageEntry(
+ input="Hey, how are you?",
+ output="Hello, I am an assistant and I don't have feelings",
+ )
+ response2 = QueryResolutionConversationMessageEntry(
+ input="Okay, is there a president in the Netherlands? Also, what equals 2 + 2? How many paws does a standard dog have?",
+ output="There is no president in the Netherlands.",
+ )
conversation = QueryResolutionConversationEntry(conversation=[response1, response2])
- settings = QueryResolutionConversationSettings(model='openai/gpt-3.5-turbo-1106', max_tokens=10000)
+ settings = QueryResolutionConversationSettings(
+ model="openai/gpt-4o-mini", max_tokens=10000
+ )
evaluator = QueryResolutionConversationEvaluator(settings=settings)
result = evaluator.evaluate(conversation)

@@ -38,12 +73,31 @@ def test_query_resolution_conversation_evaluator_fail():
assert result.details


def test_query_resolution_conversation_evaluator_fails_with_i_dont_know():
response1 = QueryResolutionConversationMessageEntry(
input="What time is it?",
output="Sorry, I don't have any information about the current time",
)
conversation = QueryResolutionConversationEntry(conversation=[response1])
settings = QueryResolutionConversationSettings(
model="openai/gpt-4o-mini", max_tokens=10000
)
evaluator = QueryResolutionConversationEvaluator(settings=settings)
result = evaluator.evaluate(conversation)

assert result.status == "processed"
assert result.score == 0.0
assert result.passed == False
assert result.details


def test_product_sentiment_polarity_evaluator_skipped_for_non_product_related_outputs():
response1 = QueryResolutionConversationMessageEntry(input="", output="")
response2 = QueryResolutionConversationMessageEntry(input="", output="")
conversation = QueryResolutionConversationEntry(conversation=[response1, response2])
- settings = QueryResolutionConversationSettings(model='openai/gpt-3.5-turbo-1106', max_tokens=10000)
+ settings = QueryResolutionConversationSettings(
+ model="openai/gpt-4o-mini", max_tokens=10000
+ )
evaluator = QueryResolutionConversationEvaluator(settings=settings)
result = evaluator.evaluate(conversation)


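For a concrete picture of the new tool-call contract, here is a hypothetical arguments payload for the failing conversation above, mapped to the result the same way evaluate now does. The counts and reasoning text are illustrative, not taken from an actual model run:

import json

# Hypothetical tool-call arguments: the greeting is not counted as a query,
# and only the question about the president was answered.
raw_arguments = '{"reasoning": "The 2 + 2 and dog paws questions were never answered.", "queries_total": 3, "queries_answered": 1}'
arguments = json.loads(raw_arguments)

passed = arguments["queries_answered"] == arguments["queries_total"]  # False
score = arguments["queries_answered"] / max(arguments["queries_total"], 1)  # about 0.33
details = (
    f"There were {arguments['queries_total']} queries in total and "
    f"{arguments['queries_answered']} of them were resolved in the conversation. "
    f"Reasoning: {arguments['reasoning']}"
)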