-
Notifications
You must be signed in to change notification settings - Fork 0
/
intake_book.py
138 lines (119 loc) · 5.05 KB
/
intake_book.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import os
import re
import psycopg2
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from tqdm import tqdm
import keys
from sentence_transformers import SentenceTransformer, util
# Constants
MIN_PARAGRAPH_LENGTH = 100 # Minimum length of paragraph to consider
BATCH_SIZE = 250 # Size of each batch for database insertion
# Initialize the sentence transformer model
model = SentenceTransformer('llmrails/ember-v1')
# Define a class to represent a document
class Document:
def __init__(self, text, author, source, part, json_data, embedding=None):
self.text = text
self.author = author
self.source = source
self.part = part
self.json_data = json_data
self.embedding = embedding
# Function to create embeddings from text
def create_embeddings(text):
embeddings = model.encode(text, convert_to_tensor=True)
embedding_list = embeddings.flatten().tolist()
return embedding_list
# Function to open a database connection
def open_db():
return psycopg2.connect(keys.postgres_connection_params)
# Function to normalize spaces in text
def normalize_spaces(text):
return re.sub(r'\s+', ' ', text)
# Function to count the number of paragraphs in an EPUB book
def count_paragraphs(book):
count = 0
for item in book.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
soup = BeautifulSoup(item.content, 'html.parser')
for p in soup.find_all('p'):
if len(normalize_spaces(p.get_text().strip())) > MIN_PARAGRAPH_LENGTH:
count += 1
return count
# Function to create vector lists from paragraphs in an EPUB file
def vector_lists_from_paragraphs_from_epub(epub_path):
book = epub.read_epub(epub_path)
total_paragraphs = count_paragraphs(book)
paragraphs = []
# Extracting global metadata
author = book.get_metadata('DC', 'creator')
title = book.get_metadata('DC', 'title')
author = author[0][0] if author else "Unknown Author"
title = title[0][0] if title else "Unknown Title"
with tqdm(total=total_paragraphs, desc=f"Processing '{title}' by '{author}") as pbar:
paragraph_counter = 1
for item in book.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
soup = BeautifulSoup(item.content, 'html.parser')
for p in soup.find_all('p'):
text = normalize_spaces(p.get_text().strip())
if len(text) > MIN_PARAGRAPH_LENGTH:
percent_through = (paragraph_counter / total_paragraphs) * 100
location = round(percent_through / 100, 2) # Removed comma
embedding = create_embeddings(text)
pg = Document(text=text, author=author, source=title, part=location, json_data={}, embedding=embedding)
paragraphs.append(pg)
pbar.update(1)
paragraph_counter += 1
return paragraphs
# Function to insert document data into a database
def put(documents):
data_to_insert = [(doc.text, doc.author, doc.source, doc.part, json.dumps(doc.json_data), doc.embedding) for doc in documents]
db = open_db()
with db as conn:
with conn.cursor() as cur:
with tqdm(total=len(data_to_insert), desc="Inserting Data into Vector Storage") as pbar:
for data_chunk in chunked(data_to_insert, BATCH_SIZE):
cur.executemany("""
INSERT INTO books (text, author, source, part, json_data, vector_embedding)
VALUES (%s, %s, %s, %s, %s, %s);
""", data_chunk)
conn.commit()
pbar.update(len(data_chunk))
# Helper function to yield chunks of data
def chunked(iterable, size):
for i in range(0, len(iterable), size):
yield iterable[i:i + size]
# Main function to embed and upload books
def embed_and_upload_books():
# Create necessary directories if they don't exist
os.makedirs('books', exist_ok=True)
os.makedirs('finished_books', exist_ok=True)
os.makedirs('error_books', exist_ok=True)
failures = ""
for book in os.listdir('books'):
file_path = f'books/{book}'
print(f"Starting '{book}'.")
try:
paragraphs = vector_lists_from_paragraphs_from_epub(file_path)
if len(paragraphs) > 50:
put(paragraphs)
print(f"{len(paragraphs)} uploaded. {file_path}")
os.rename(file_path, f'finished_books/{book}')
else:
fail = f"{file_path} too short\n"
failures += fail
print(fail)
os.rename(file_path, f'error_books/{book}')
except Exception as e:
fail = f"Failure on '{file_path}': {e}\n"
failures += fail
print(fail)
os.rename(file_path, f'error_books/{book}')
print(failures)
# Execute the main function if the script is run directly
if __name__ == "__main__":
embed_and_upload_books()