utils.py
import glob
import os
import re

import lancedb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import LanceDB
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
# File where we record the names of the PDFs already ingested, so we can
# skip rebuilding the vector store when the data directory hasn't changed.
output_file = "saved_file_names.txt"
data_dir = "data"
files_to_read = glob.glob(os.path.join(data_dir, "*.pdf"))
lancedb_name = "lance_database"

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# TODO: try OpenAIEmbeddings() instead of HuggingFaceEmbeddings.
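# A minimal sketch of that swap, assuming the langchain_openai package is
# installed and OPENAI_API_KEY is set in the environment (neither is part of
# this repo):
#
#     from langchain_openai import OpenAIEmbeddings
#     embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#
# The rest of this module only relies on the generic LangChain embeddings
# interface, so nothing else would need to change.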


def get_books_we_read():
    """Return the list of PDF paths found under the data directory."""
    return files_to_read


def create_new_db_or_read_existing():
    """Return an existing LanceDB vector store if the PDFs in the data
    directory match the ones we ingested before; otherwise build a new one.

    The check compares file names only, so it is not foolproof (a changed
    file with the same name won't trigger a rebuild).
    """
    if not files_to_read:
        raise SystemExit("No books to read :((")
    if os.path.exists(output_file):
        print("Found saved file names, loading from that")
        with open(output_file, "r") as file:
            saved_file_names = [name.strip() for name in file.readlines()]
        if set(saved_file_names) == set(files_to_read):
            return read_lancedb()
    # Otherwise write a new LanceDB, since these books aren't stored yet
    # (this will take longer), and record the file names for next time.
    print("No saved file names found, writing new lancedb")
    docsearch = write_new_lancedb()
    with open(output_file, "w") as file:
        for filename in files_to_read:
            file.write(f"{filename}\n")
    return docsearch


def read_lancedb():
    """Open the existing LanceDB database and wrap it as a vector store."""
    db = lancedb.connect(lancedb_name)
    return LanceDB(connection=db, embedding=embeddings)


def write_new_lancedb():
    """Load, clean, chunk, and embed the PDFs, then persist them to LanceDB."""
    all_docs = []
    for file_path in tqdm(files_to_read, desc="Reading books"):
        # Load each document (one Document per page) using PyPDFLoader.
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        all_docs.extend(docs)
    docs = all_docs

    def clean_text(text):
        # Collapse newlines, underscores, and runs of whitespace into single
        # spaces so chunks read as continuous text.
        cleaned = re.sub(r"\n+", " ", text)
        cleaned = re.sub(r"_", " ", cleaned)
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return cleaned
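
    # For example (made-up input, shown for illustration):
    #   clean_text("foo\n\nbar__baz")  ->  "foo bar baz"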
    for doc in docs:
        doc.page_content = clean_text(doc.page_content)

    all_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    for doc in tqdm(docs, desc="Splitting documents"):
        chunks = text_splitter.split_documents([doc])
        all_chunks.extend(chunks)

    # Sanity checks: preview a couple of chunks and print the embedding
    # dimensionality (384 for all-MiniLM-L6-v2).
    for i, chunk in enumerate(all_chunks[100:102], start=101):
        print(f"Chunk {i}:\n{chunk.page_content}\n")
    query = "Tell me the length of the embeddings for this document"  # for kicks
    print(len(embeddings.embed_documents([query])[0]))

    db = lancedb.connect(lancedb_name)
    docsearch = LanceDB.from_documents(all_chunks, embeddings, connection=db)
    return docsearch
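

# A minimal usage sketch (the query string below is made up for illustration):
# build or load the vector store, then run a similarity search over it.
if __name__ == "__main__":
    docsearch = create_new_db_or_read_existing()
    hits = docsearch.similarity_search("What is this book about?", k=3)
    for hit in hits:
        print(hit.page_content[:200])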