-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
170 lines (131 loc) · 5.1 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import os
import chromadb
import PyPDF2
import streamlit as st
import yaml
from chromadb.utils import embedding_functions
from openai import OpenAI
def extract_text_from_pdf(pdf_path):
with open(pdf_path, "rb") as file:
reader = PyPDF2.PdfReader(file)
text = ""
for page in reader.pages:
text += page.extract_text()
return text
def embed_content_in_chunks(content, ai_client):
chunk_size = 800
chunk_overlap = 200
chunks = [
content[i : i + chunk_size]
for i in range(0, len(content), chunk_size - chunk_overlap)
]
embeddings = []
for chunk in chunks:
response = ai_client.embeddings.create(
input=chunk, model="text-embedding-ada-002"
)
embeddings.append(response.data[0].embedding)
return chunks, embeddings
def load_questions_and_answers(json_path):
with open(json_path, "r") as file:
data = json.load(file)
questions = {k: f"{k}: {v}" for k, v in data["questions"].items()}
return questions, data["answers"]
def get_relevant_content(collection, user_answer, actual_answer, question):
combined_query = f"{actual_answer}{question}"
# Perform the queries and collect results
results = collection.query(query_texts=combined_query, n_results=5)
# Extract relevant documents from results
relevant_content = ("\n\n".join(results["documents"][0]))
# Return content focusing on what the student missed
return relevant_content if any(relevant_content) else ""
def load_prompts():
with open("prompts.yaml", "r") as file:
return yaml.safe_load(file)
def get_feedback(ai_client, user_answer, question, relevant_content, actual_answer):
prompts = load_prompts()
# Prepare the feedback prompt
feedback_prompt = prompts["feedback_prompt"].format(
question=question,
user_answer=user_answer,
)
# System prompt for feedback
feedback_system_prompt = f"""
You are an expert educator providing constructive feedback to students.
Use the following information to inform your feedback:
Actual Answer:
{actual_answer}
Relevant Course Content:
{relevant_content}
Provide brief feedback in 2-3 sentences, focusing on key strengths and areas for improvement. Your response should not exceed 150 tokens, so keep it short.
"""
# Call the AI API for feedback
feedback_response = ai_client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": feedback_system_prompt},
{"role": "user", "content": feedback_prompt},
],
temperature=0.1,
max_tokens=500,
)
feedback = feedback_response.choices[0].message.content.strip()
# Prepare the grading prompt
grading_prompt = prompts["grading_prompt"].format(
question=question,
user_answer=user_answer,
)
# System prompt for grading
grading_system_prompt = f"""
You are an expert grader assessing student answers based on their alignment with the actual answer.
Actual Answer:
{actual_answer}
"""
# Call the AI API for grading
grading_response = ai_client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": grading_system_prompt},
{"role": "user", "content": grading_prompt},
],
temperature=0.1,
max_tokens=5,
)
grade = grading_response.choices[0].message.content.strip()
# Combine feedback and grade
formatted_response = f"**Feedback:** {feedback}\n\n**Grade:** {grade}"
return formatted_response
@st.cache_resource
def get_or_create_chroma_collection(_db_client, module_content_fp, _ai_client):
collection_name = "module_content"
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
api_key=os.getenv("OPENAI_API_KEY2"), model_name="text-embedding-ada-002"
)
try:
collection = _db_client.get_collection(
name=collection_name, embedding_function=embedding_function
)
print("Using existing ChromaDB collection.")
except chromadb.errors.InvalidCollectionException:
print("No existing ChromaDB collection found. Creating a new one...")
pdf_text = extract_text_from_pdf(module_content_fp)
chunks, embeddings = embed_content_in_chunks(pdf_text, _ai_client)
collection = _db_client.create_collection(
name=collection_name, embedding_function=embedding_function
)
collection.add(
documents=chunks,
embeddings=embeddings,
ids=[f"embedding_{i}" for i in range(len(embeddings))],
)
print("New collection created and embeddings added to ChromaDB.")
return collection
def group_question(questions):
grouped = {}
for q_id, question in questions.items():
base_id = q_id[0] # Get the first character of the question ID
if base_id not in grouped:
grouped[base_id] = []
grouped[base_id].append((q_id, question)) # Append a tuple instead of two separate arguments
return grouped