-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathmost_basic_eval.py
163 lines (136 loc) · 6.56 KB
/
most_basic_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Apache Software License 2.0
#
# Copyright (c) ZenML GmbH 2024. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import string
from openai import OpenAI
from utils.openai_utils import get_openai_api_key
def preprocess_text(text):
    """Normalize *text* so that strings can be compared word by word.

    The input is lowercased, every character listed in
    ``string.punctuation`` is deleted, each run of whitespace is collapsed
    to a single space, and leading/trailing whitespace is trimmed.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # such as "anti-gravity" are fused into a single word.
    without_punctuation = text.lower().translate(punctuation_remover)
    return re.sub(r"\s+", " ", without_punctuation).strip()
def tokenize(text):
    """Return the list of words in *text* after ``preprocess_text`` normalization."""
    normalized = preprocess_text(text)
    return normalized.split()
def retrieve_relevant_chunks(query, corpus, top_n=2):
    """Return the ``top_n`` chunks of *corpus* most similar to *query*.

    Similarity is the Jaccard index of the two token sets: the number of
    distinct tokens shared by the query and the chunk divided by the number
    of distinct tokens appearing in either.

    Args:
        query: The question text.
        corpus: An iterable of text chunks.
        top_n: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_n`` chunks, highest similarity first. Chunks
        with equal similarity keep their relative order from *corpus*.
    """
    query_tokens = set(tokenize(query))
    similarities = []
    for chunk in corpus:
        chunk_tokens = set(tokenize(chunk))
        union = query_tokens | chunk_tokens
        # When both the query and the chunk tokenize to nothing the union is
        # empty; score that pair as 0.0 instead of dividing by zero.
        similarity = len(query_tokens & chunk_tokens) / len(union) if union else 0.0
        similarities.append((chunk, similarity))
    # list.sort is stable, also with reverse=True, so ties keep corpus order.
    similarities.sort(key=lambda x: x[1], reverse=True)
    return [chunk for chunk, _ in similarities[:top_n]]
def answer_question(query, corpus, top_n=2):
    """Answer *query* using the most relevant chunks of *corpus* as context.

    The ``top_n`` best-matching chunks are retrieved with
    ``retrieve_relevant_chunks``, joined with newlines, and sent together
    with the question to the ``gpt-3.5-turbo`` chat model.

    Args:
        query: The question to answer.
        corpus: An iterable of text chunks to search.
        top_n: Maximum number of chunks to include in the context.

    Returns:
        The model's reply with surrounding whitespace removed, a fixed
        fallback sentence when no chunk was retrieved, or an empty string
        when the model returned no text content.
    """
    relevant_chunks = retrieve_relevant_chunks(query, corpus, top_n)
    if not relevant_chunks:
        return "I don't have enough information to answer the question."
    context = "\n".join(relevant_chunks)
    client = OpenAI(api_key=get_openai_api_key())
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": f"Based on the provided context, answer the following question: {query}\n\nContext:\n{context}",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        model="gpt-3.5-turbo",
    )
    # The OpenAI SDK types ``message.content`` as optional; treat a missing
    # reply as empty text rather than failing with AttributeError on None.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()
# Sci-fi themed corpus about "ZenML World"
corpus = [
    "The luminescent forests of ZenML World are inhabited by glowing Zenbots that emit a soft, pulsating light as they roam the enchanted landscape.",
    "In the neon skies of ZenML World, Cosmic Butterflies flutter gracefully, their iridescent wings leaving trails of stardust in their wake.",
    "Telepathic Treants, ancient sentient trees, communicate through the quantum neural network that spans the entire surface of ZenML World, sharing wisdom and knowledge.",
    "Deep within the melodic caverns of ZenML World, Fractal Fungi emit pulsating tones that resonate through the crystalline structures, creating a symphony of otherworldly sounds.",
    "Near the ethereal waterfalls of ZenML World, Holographic Hummingbirds hover effortlessly, their translucent wings refracting the prismatic light into mesmerizing patterns.",
    "Gravitational Geckos, masters of anti-gravity, traverse the inverted cliffs of ZenML World, defying the laws of physics with their extraordinary abilities.",
    "Plasma Phoenixes, majestic creatures of pure energy, soar above the chromatic canyons of ZenML World, their fiery trails painting the sky in a dazzling display of colors.",
    "Along the prismatic shores of ZenML World, Crystalline Crabs scuttle and burrow, their transparent exoskeletons refracting the light into a kaleidoscope of hues.",
]
# Store every chunk in normalized form (lowercase, punctuation removed,
# whitespace collapsed); this is also the form printed during evaluation.
corpus = list(map(preprocess_text, corpus))
# Evaluation data: each entry pairs a question with its reference answer.
eval_data = [
    {"question": question, "expected_answer": expected_answer}
    for question, expected_answer in (
        (
            "What creatures inhabit the luminescent forests of ZenML World?",
            "The luminescent forests of ZenML World are inhabited by glowing Zenbots.",
        ),
        (
            "What do Fractal Fungi do in the melodic caverns of ZenML World?",
            "Fractal Fungi emit pulsating tones that resonate through the crystalline structures, creating a symphony of otherworldly sounds in the melodic caverns of ZenML World.",
        ),
        (
            "Where do Gravitational Geckos live in ZenML World?",
            "Gravitational Geckos traverse the inverted cliffs of ZenML World.",
        ),
    )
]
def evaluate_retrieval(question, expected_answer, corpus, top_n=2):
    """Check whether retrieval for *question* surfaces any expected word.

    Retrieves the ``top_n`` chunks for the question, prints the question,
    the expected answer and the retrieved chunks, and scores the retrieval
    as successful when at least one retrieved chunk shares at least one
    whole word with the expected answer.

    Args:
        question: The question used as the retrieval query.
        expected_answer: The reference answer whose words are looked for.
        corpus: An iterable of text chunks to search.
        top_n: Maximum number of chunks to retrieve.

    Returns:
        ``True`` if any retrieved chunk contains a word of the expected
        answer, ``False`` otherwise.
    """
    relevant_chunks = retrieve_relevant_chunks(question, corpus, top_n)
    print(f"Question: {question}")
    print(f"Expected Answer: {expected_answer}")
    print("Retrieved Chunks:")
    for chunk in relevant_chunks:
        print(f"- {chunk}")
    expected_tokens = set(tokenize(expected_answer))
    # Compare whole normalized words on both sides. A substring test against
    # the raw chunk would count "a" as present in "cat" and would depend on
    # the chunk already being lowercased and stripped of punctuation.
    score = any(
        not expected_tokens.isdisjoint(tokenize(chunk))
        for chunk in relevant_chunks
    )
    print(f"Retrieval Score: {score}")
    print()
    return score
def evaluate_generation(question, expected_answer, generated_answer):
    """Ask the ``gpt-4`` chat model to judge *generated_answer*.

    The model is instructed to reply 'YES' or 'NO' depending on whether the
    generated answer is relevant and accurate with respect to the question
    and the expected answer. The question, both answers and the judgment are
    printed.

    Args:
        question: The question that was asked.
        expected_answer: The reference answer.
        generated_answer: The answer to be judged.

    Returns:
        ``True`` if the judge's reply begins with the word "yes"
        (case-insensitive, ignoring leading quotes or other punctuation),
        ``False`` otherwise, including when the reply has no text content.
    """
    client = OpenAI(api_key=get_openai_api_key())
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are an evaluation judge. Given a question, an expected answer, and a generated answer, your task is to determine if the generated answer is relevant and accurate. Respond with 'YES' if the generated answer is satisfactory, or 'NO' if it is not.",
            },
            {
                "role": "user",
                "content": f"Question: {question}\nExpected Answer: {expected_answer}\nGenerated Answer: {generated_answer}\nIs the generated answer relevant and accurate?",
            },
        ],
        model="gpt-4",
    )
    # The OpenAI SDK types ``message.content`` as optional; treat a missing
    # reply as empty text rather than failing with AttributeError on None.
    content = chat_completion.choices[0].message.content
    judgment = (content or "").strip().lower()
    print(f"Question: {question}")
    print(f"Expected Answer: {expected_answer}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Judgment: {judgment}")
    print()
    # Accept replies such as "yes", "yes.", "'yes'" or "yes, it is accurate":
    # exact equality with "yes" would score those positive verdicts as
    # failures. The word boundary keeps words like "yesterday" from matching.
    return re.match(r"\W*yes\b", judgment) is not None
# Run both evaluations over every item and keep the per-item outcomes.
retrieval_scores = []
generation_scores = []
for item in eval_data:
    question = item["question"]
    expected_answer = item["expected_answer"]
    # Retrieval is scored first, then an answer is generated and judged.
    retrieval_scores.append(evaluate_retrieval(question, expected_answer, corpus))
    generated_answer = answer_question(question, corpus)
    generation_scores.append(
        evaluate_generation(question, expected_answer, generated_answer)
    )
# Scores are booleans, so their sum is the number of successes.
retrieval_accuracy = sum(retrieval_scores) / len(retrieval_scores)
generation_accuracy = sum(generation_scores) / len(generation_scores)
print(f"Retrieval Accuracy: {retrieval_accuracy:.2f}")
print(f"Generation Accuracy: {generation_accuracy:.2f}")