backend.py
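"""Backend helpers for a Semantic Scholar chat app built on LlamaIndex.

Builds (or loads) a persisted VectorStoreIndex over papers fetched from
Semantic Scholar, and exposes chat and citation query engines over it.
"""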
import logging
import os
import pickle
import random
import sys

from llama_index import (
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.embeddings import OpenAIEmbedding
from llama_index.evaluation import DatasetGenerator
from llama_index.llms import OpenAI
from llama_index.memory import ChatMemoryBuffer
from llama_index.query_engine import CitationQueryEngine
from llama_index.retrievers import BM25Retriever  # referenced by the commented-out retriever below

from utils import SemanticScholarReader

# basicConfig already attaches a stdout handler to the root logger; adding a
# second StreamHandler would log every message twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

def create_index(research_space, num_papers, full_text):
    """Build or load a persisted VectorStoreIndex for a research space."""
    service_context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=OpenAI(model="gpt-3.5-turbo", temperature=0),
        embed_model=OpenAIEmbedding(embed_batch_size=10),
    )
    # instantiate the SemanticScholarReader
    s2_reader = SemanticScholarReader(api_key=os.environ["SEMANTIC_SCHOLAR_API_KEY"])
    path_to_store = (
        f"./citation_{research_space.replace(' ', '_')}"
        f"_{num_papers}_full_text_{full_text}"
    )
    if not os.path.exists(path_to_store):
        # first run for this configuration: fetch papers from Semantic Scholar
        logging.info(
            f"Creating index for research space: {research_space} "
            f"with {num_papers} papers at: {path_to_store}"
        )
        documents = s2_reader.load_data(
            research_space, limit=num_papers, full_text=full_text
        )
        try:
            index = VectorStoreIndex.from_documents(
                documents, service_context=service_context
            )
        except Exception as e:
            # fall back to abstracts only if full-text ingestion fails
            logging.info(f"Error creating index: {e}")
            documents = s2_reader.load_data(research_space, limit=num_papers)
            index = VectorStoreIndex.from_documents(
                documents, service_context=service_context
            )
        index.storage_context.persist(persist_dir=path_to_store)
        # dump the documents alongside the index so they can be reloaded later
        with open(path_to_store + "/documents.pkl", "wb") as f:
            pickle.dump(documents, f)
    else:
        logging.info(
            f"Loading index for research space from existing index: {path_to_store}"
        )
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=path_to_store),
            service_context=service_context,
        )
        # load the documents persisted next to the index
        with open(path_to_store + "/documents.pkl", "rb") as f:
            documents = pickle.load(f)
    logging.info("Done loading index")
    return index, documents

def generate_sample_questions(doc_papers):
    """Generate three sample questions from one randomly chosen paper."""
    paper = random.sample(doc_papers, 1)
    data_generator = DatasetGenerator.from_documents(paper)
    eval_questions = data_generator.generate_questions_from_nodes(3)
    return eval_questions

def prompt_for_chat(research_space):
    """Build the system prompt for the chat engine."""
    _prompt = (
        f"You are a chatbot, able to have normal interactions, as well as talk "
        f"about papers from {research_space}. Use your memory and make sure to "
        f"cite the source of your knowledge."
    )
    return _prompt

def get_chat_engine(index, research_space):
    """Create a context chat engine with a bounded conversation memory."""
    memory = ChatMemoryBuffer.from_defaults(token_limit=2000)
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        memory=memory,
        system_prompt=prompt_for_chat(research_space),
    )
    return chat_engine

def citation_query_engine(index, k, streaming, citation_chunk_size):
    """Create a CitationQueryEngine that cites the sources it retrieves."""
    logging.info("Done creating index, loading chat engine . . .")
    chat_engine = CitationQueryEngine.from_args(
        index,
        # an alternative sparse retriever can be swapped in here:
        # retriever=BM25Retriever.from_defaults(index, similarity_top_k=k),
        similarity_top_k=k,
        streaming=streaming,
        # controls how granular citation sources are; the default is 512
        citation_chunk_size=citation_chunk_size,
    )
    logging.info("Done loading chat engine")
    return chat_engine
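
# Minimal usage sketch. Assumes OPENAI_API_KEY and SEMANTIC_SCHOLAR_API_KEY
# are set in the environment; the research space string and parameter values
# below are illustrative, not prescribed by this module.
if __name__ == "__main__":
    index, documents = create_index(
        "large language models", num_papers=20, full_text=False
    )
    engine = citation_query_engine(
        index, k=3, streaming=False, citation_chunk_size=512
    )
    response = engine.query("What are the main evaluation benchmarks?")
    print(response)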