Skip to content

Commit

Permalink
Merge branch 'main' into feature/links
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlisoSouza authored Mar 28, 2024
2 parents f71a523 + 046774a commit 9cdf2fb
Show file tree
Hide file tree
Showing 8 changed files with 468 additions and 274 deletions.
6 changes: 5 additions & 1 deletion app/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def index_file_data(content_base: Dict) -> bool:
os.environ.get("AWS_STORAGE_SECRET_KEY")
)
text_splitter = TextSplitter(character_text_splitter())
manager = IndexerFileManager(file_downloader, main_app.content_base_indexer, text_splitter)
manager = IndexerFileManager(
file_downloader,
main_app.content_base_indexer,
text_splitter,
)
index_result: bool = manager.index_file_url(content_base)
NexusRESTClient().index_succedded(
task_succeded=index_result,
Expand Down
1 change: 1 addition & 0 deletions app/handlers/content_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class ContentBaseIndexRequest(BaseModel):
extension_file: str
task_uuid: str
content_base: str
load_type: str = None


class ContentBaseIndexResponse(BaseModel):
Expand Down
8 changes: 5 additions & 3 deletions app/indexer/indexer_file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,19 @@ class IndexerFileManager:
def __init__(self,
file_downloader: IFileDownloader,
content_base_indexer: IDocumentIndexer,
text_splitter: ITextSplitter
text_splitter: ITextSplitter,
) -> None:
self.file_downloader = file_downloader
self.content_base_indexer = content_base_indexer
self.text_splitter = text_splitter

def index_file_url(self, content_base) -> bool:
def index_file_url(self, content_base, **kwargs) -> bool:
load_type = content_base.get("load_type")
docs: List[Document] = load_file_url_and_split_text(
content_base.get("file"),
content_base.get('extension_file'),
self.text_splitter
self.text_splitter,
load_type=load_type
)
document_pages: List[Document] = add_file_metadata(docs, content_base)
try:
Expand Down
7 changes: 5 additions & 2 deletions app/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ def load_file_url_and_get_pages_text(file_url: str, file_type: str) -> List[Docu
return data_loader.load()


def load_file_url_and_split_text(file_url: str, file_type: str, text_splitter: ITextSplitter) -> List[Document]:
def load_file_url_and_split_text(file_url: str, file_type: str, text_splitter: ITextSplitter, **kwargs) -> List[Document]:

load_type = kwargs.get("load_type", None)

loader = supported_loaders_cls.get(file_type)
data_loader = DataLoaderCls(loader, file_url)
data_loader = DataLoaderCls(loader=loader, file=file_url, load_type=load_type)
return data_loader.load_and_split_text(text_splitter)
28 changes: 22 additions & 6 deletions app/loaders/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from langchain.document_loaders import (
TextLoader, PyPDFLoader, UnstructuredExcelLoader,
UnstructuredWordDocumentLoader, Docx2txtLoader, UnstructuredURLLoader
UnstructuredWordDocumentLoader, Docx2txtLoader, UnstructuredURLLoader, PDFMinerLoader
)
from langchain.schema.document import Document
from typing import Callable, List, Union
Expand All @@ -20,13 +20,19 @@ def load(self):


class DataLoaderCls:
def __init__(self, loader: DocumentLoader, file: str) -> None:
self.loader = loader(file)
def __init__(self, loader: DocumentLoader, file: str, **kwargs) -> None:
load_type = kwargs.get("load_type")
self.loader = self.get_load_type(loader, load_type, file)
self.file = file

def get_load_type(self, loader, load_type, file):
    """Instantiate ``loader`` for ``file``, honouring the requested load type.

    Args:
        loader: Callable (typically a loader class) that is invoked with
            keyword arguments to build the concrete loader instance.
        load_type: Optional backend selector. Only ``"pdfminer"`` is
            special-cased; every other value (including ``None``) builds
            the loader with its defaults.
        file: File path or URL, forwarded as the ``file`` keyword argument.

    Returns:
        Whatever ``loader`` returns when called.
    """
    if load_type == "pdfminer":
        # The backend choice travels through the ``loader`` keyword.
        return loader(loader="pdfminer", file=file)
    return loader(file=file)

def load(self) -> List[Document]:
    """Return the result of calling ``load()`` on the wrapped ``self.loader``."""
    return self.loader.load()

def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:
    """Delegate to ``self.loader.load_and_split_text`` with ``text_splitter``.

    Args:
        text_splitter: Splitter object handed unchanged to the wrapped loader.

    Returns:
        Whatever the wrapped loader's ``load_and_split_text`` returns.
    """
    return self.loader.load_and_split_text(text_splitter)

Expand Down Expand Up @@ -90,8 +96,18 @@ def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]:


class PDFLoader(DocumentLoader):
def __init__(self, file: str) -> None:
self.loader = PyPDFLoader(file)
def __init__(
self,
file: str,
**kwargs,
) -> None:
loader_class = kwargs.get("loader")
self.loader = self.get_loader_class(loader_class)(file)

def get_loader_class(self, loader_class):
    """Map a backend name to the PDF loader class to instantiate.

    Args:
        loader_class: Backend selector. ``"pdfminer"`` selects
            ``PDFMinerLoader``; any other value (including ``None``)
            selects ``PyPDFLoader``.

    Returns:
        The loader class itself, not an instance.
    """
    if loader_class == "pdfminer":
        return PDFMinerLoader
    return PyPDFLoader

def load(self) -> List[Document]:
pages = self.loader.load_and_split()
Expand Down
11 changes: 11 additions & 0 deletions app/tests/test_document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,17 @@ def test_load_file_url_and_split_text(self):
docs = load_file_url_and_split_text(file_path, file_type, self.text_splitter)
self.assertEqual(list, type(docs))

def test_load_file_url_and_split_text_pdf_miner(self):
    """Loading a PDF with ``load_type="pdfminer"`` returns a ``list``."""
    file_path = f'{self.path}/{self.file_name}.pdf'
    file_type = "pdf"
    docs = load_file_url_and_split_text(
        file_path,
        file_type,
        self.text_splitter,
        load_type="pdfminer",
    )
    # Same exact-type check as the neighbouring default-loader test.
    self.assertEqual(list, type(docs))

def test_load_file_url_and_get_pages_text(self): # this function is deprecated
file_path = f'{self.path}/{self.file_name}.pdf'
file_type = "pdf"
Expand Down
Loading

0 comments on commit 9cdf2fb

Please sign in to comment.