-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweaviateRetriever.py
119 lines (109 loc) · 3.57 KB
/
weaviateRetriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import weaviate
from langchain_openai import OpenAIEmbeddings
from llama_index.core import SimpleDirectoryReader
from langchain_weaviate.vectorstores import WeaviateVectorStore
from dotenv import load_dotenv
from path import Path
import os
# Pull settings from a local .env file into the process environment before
# any os.getenv() lookups below.
load_dotenv()

weaviate_url = os.getenv("WEAVIATE_URL")

# Username/password credentials read from WCS_USERNAME / WCS_PASSWORD.
_client_auth = weaviate.auth.AuthClientPassword(
    username=os.getenv("WCS_USERNAME"),
    password=os.getenv("WCS_PASSWORD"),
)

# Module-level client built with `weaviate.Client`; create_vectordatabase()
# uses it for schema management and batch import.
client = weaviate.Client(url=weaviate_url, auth_client_secret=_client_auth)
def create_vectordatabase():
    """Rebuild the ``MoodleBot`` Weaviate class from the PDFs in ``SELECTED_FILES``.

    The function:

    1. Deletes **every** class in the Weaviate schema
       (``client.schema.delete_all()``) and creates the ``MoodleBot`` class.
    2. Loads each ``.pdf`` file found directly inside the directory named by
       the ``SELECTED_FILES`` environment variable with
       ``SimpleDirectoryReader``.
    3. Batch-imports one object per loaded document, carrying its ``doc_id``,
       ``page`` label, ``file`` name and ``text``.

    Progress information (schemas, file names, counts) is printed to stdout.

    Raises:
        ValueError: If ``SELECTED_FILES`` is not set. This is checked before
            the existing schema is deleted so a misconfiguration does not
            wipe the database.
    """
    class_name = "MoodleBot"

    # Validate configuration up front: everything after this point is
    # destructive.
    selected_dir = os.getenv("SELECTED_FILES")
    if not selected_dir:
        raise ValueError("SELECTED_FILES environment variable is not set")

    def _text_property(name, description, skip):
        """Build one text property with its text2vec-openai vectorizer flags."""
        return {
            "name": name,
            "description": description,
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {"skip": skip, "vectorizePropertyName": False}
            },
        }

    client.schema.delete_all()
    print(client.schema.get())

    # Vectorize with text2vec-openai (model "ada", version "002"). Only the
    # `text` property is embedded; `doc_id`, `file` and `page` are skipped.
    # The embedded property is named `text` because that is the key the
    # imported objects use below and the `text_key` the retriever reads.
    moodlebot_schema = {
        "class": class_name,
        "description": "A collection of documents for MoodleBot",
        "vectorizer": "text2vec-openai",
        "moduleConfig": {
            "text2vec-openai": {
                "model": "ada",
                "modelVersion": "002",
                "type": "text",
            }
        },
        "properties": [
            _text_property("doc_id", "Document ID", skip=True),
            _text_property("file", "Filename", skip=True),
            _text_property("page", "Page number", skip=True),
            _text_property("text", "Content of the Slide", skip=False),
        ],
    }
    client.schema.create_class(moodlebot_schema)
    print(client.schema.get())

    records = []
    for path in Path(selected_dir).iterdir():
        if path.suffix != ".pdf":
            continue
        print(f"Processing {path.name}...")
        docs = SimpleDirectoryReader(input_files=[path]).load_data()
        records.extend(
            {
                "doc_id": d.doc_id,
                "page": d.metadata["page_label"],
                "file": d.metadata["file_name"],
                "text": d.text,
            }
            for d in docs
        )
    print(len(records))

    # Leaving the `with` block ends the batch context for the queued objects.
    with client.batch as batch:
        for record in records:
            batch.add_data_object(record, class_name)

    # NOTE(review): `totalResults` comes from a plain object listing, so it
    # may reflect only the page returned by the server rather than the full
    # class size — confirm, or switch to an aggregate meta count.
    count = client.data_object.get(class_name=class_name)["totalResults"]
    print(count)
# Username/password credentials read from WCS_USERNAME / WCS_PASSWORD.
_retriever_auth = weaviate.auth.AuthClientPassword(
    username=os.getenv("WCS_USERNAME"),
    password=os.getenv("WCS_PASSWORD"),
)

# Module-level connection used by get_docs(). Hosts come from the
# WEAVIATE_POD (HTTP) and WEAVIATE_GRPC (gRPC) environment variables; both
# channels are configured as non-secure on fixed ports 8080 / 50051.
weaviate_client = weaviate.connect_to_custom(
    http_host=os.getenv("WEAVIATE_POD"),
    http_port=8080,
    http_secure=False,
    grpc_host=os.getenv("WEAVIATE_GRPC"),
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=_retriever_auth,
)
def get_docs(prompt):
    """Return the documents retrieved from the ``MoodleBot`` index for *prompt*.

    A ``WeaviateVectorStore`` is built on the module-level ``weaviate_client``
    with a fresh ``OpenAIEmbeddings`` instance, reading document text from the
    ``text`` property. Its retriever is created with no extra arguments.

    Args:
        prompt: The query passed to the retriever's ``invoke``.

    Returns:
        Whatever ``retriever.invoke(prompt)`` returns.
    """
    store = WeaviateVectorStore(
        client=weaviate_client,
        index_name="MoodleBot",
        text_key="text",
        embedding=OpenAIEmbeddings(),
    )
    return store.as_retriever().invoke(prompt)