-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_data.py
47 lines (37 loc) · 1.22 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import sys
import os
# Locate the input document. The data directory is resolved relative to this
# file (made absolute first) rather than to the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(root_dir, 'beirut_data')

# os.listdir() returns entries in arbitrary order and can include non-PDF
# files, so filter by extension and sort to make the selection deterministic.
pdf_names = sorted(
    name for name in os.listdir(data_dir) if name.lower().endswith('.pdf')
)
if not pdf_names:
    # Fail with an explicit message instead of an IndexError on an empty list.
    raise FileNotFoundError(f'No PDF files found in {data_dir!r}')
file_path = os.path.join(data_dir, pdf_names[0])

# Load documents
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(file_path)
# `documents` is consumed by the ingestion step further down in this script.
documents = loader.load()
# Load documents into vector store
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama

client = weaviate.connect_to_local()
try:
    # If the collection exists, get it and don't create it. An explicit
    # existence check avoids using unrelated server errors as control flow.
    if client.collections.exists('docs'):
        collection = client.collections.get(name='docs')
    else:
        collection = client.collections.create(
            name='docs',
            properties=[
                Property(name='text', data_type=DataType.TEXT),
            ],
        )

    # Embed every loaded document with the 'all-minilm' model and store the
    # text together with its vector.
    # NOTE(review): re-running this script adds the same objects again; no
    # de-duplication is performed here.
    with collection.batch.dynamic() as batch:
        for d in documents:
            # NOTE(review): str(d) serializes the whole Document object
            # (presumably page content plus metadata) -- confirm whether
            # d.page_content alone was intended.
            text = str(d)
            response = ollama.embeddings(
                model='all-minilm',
                prompt=text,
            )
            batch.add_object(
                properties={'text': text},
                vector=response['embedding'],
            )

    # Batch imports do not raise per object; surface any failures explicitly
    # instead of finishing silently with missing data.
    failed = collection.batch.failed_objects
    if failed:
        print(
            f'{len(failed)} object(s) failed to import into "docs"',
            file=sys.stderr,
        )
finally:
    # Always release the connection to the local Weaviate instance.
    client.close()