diff --git a/Dockerfile b/Dockerfile index ddb6f86..8eadd8c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,10 @@ COPY pyproject.toml poetry.lock ./ RUN poetry config virtualenvs.create false && \ poetry install --no-dev +RUN apt update && apt install libmagic1 -y +RUN poetry add python-magic +RUN python -m nltk.downloader punkt averaged_perceptron_tagger -d /usr/share/nltk_data + COPY . . EXPOSE 8000 diff --git a/app/handlers/nexus.py b/app/handlers/nexus.py index 1ea94d8..e598795 100644 --- a/app/handlers/nexus.py +++ b/app/handlers/nexus.py @@ -16,10 +16,18 @@ def __init__(self) -> None: def index_succedded(self, task_succeded: bool, nexus_task_uuid: str, file_type: str) -> None: endpoint = f'{self.base_url}/api/v1/content-base-file' + + if file_type == "txt": + ftype = "text" + elif file_type == "urls": + ftype = "link" + else: + ftype = "file" + data = { "status": int(task_succeded), "task_uuid": nexus_task_uuid, - "file_type": "text" if file_type == "txt" else "file", + "file_type": ftype, } response = requests.patch(url=endpoint, data=json.dumps(data), headers=self.headers) response.raise_for_status() diff --git a/app/loaders/__init__.py b/app/loaders/__init__.py index 0fcef6b..8234ed6 100644 --- a/app/loaders/__init__.py +++ b/app/loaders/__init__.py @@ -1,9 +1,24 @@ import os -from app.loaders.loaders import (DataLoader, txt_loader, pdf_loader, docx_loader, xlsx_loader, DataLoaderCls, PDFLoader, DocxLoader, TxtLoader, XlsxLoader) +from app.loaders.loaders import ( + DataLoader, + txt_loader, + pdf_loader, + docx_loader, + xlsx_loader, +) +from app.loaders.loaders import ( + DataLoaderCls, + PDFLoader, + DocxLoader, + TxtLoader, + XlsxLoader, + URLsLoader, +) from langchain.schema.document import Document from typing import List from app.text_splitters import ITextSplitter + supported_loaders = { 'txt': txt_loader, 'pdf': pdf_loader, @@ -20,9 +35,9 @@ 'txt': TxtLoader, 'xlsx': XlsxLoader, 'xls': XlsxLoader, + 'urls': URLsLoader, } - def 
load_file_and_get_raw_text(file_name: str, file_type: str) -> str: file_path = f'{os.environ.get("FILE_PATH")}/{file_name}' loader = supported_loaders.get(file_type) @@ -47,6 +62,5 @@ def load_file_url_and_split_text(file_url: str, file_type: str, text_splitter: I load_type = kwargs.get("load_type", None) loader = supported_loaders_cls.get(file_type) - data_loader = DataLoaderCls(loader=loader, file=file_url, load_type=load_type) return data_loader.load_and_split_text(text_splitter) diff --git a/app/loaders/loaders.py b/app/loaders/loaders.py index b5ef277..f0a51c2 100644 --- a/app/loaders/loaders.py +++ b/app/loaders/loaders.py @@ -5,11 +5,10 @@ from langchain.document_loaders import ( TextLoader, PyPDFLoader, UnstructuredExcelLoader, - UnstructuredWordDocumentLoader, Docx2txtLoader, - PDFMinerLoader + UnstructuredWordDocumentLoader, Docx2txtLoader, UnstructuredURLLoader, PDFMinerLoader ) from langchain.schema.document import Document -from typing import Callable, List +from typing import Callable, List, Union from app.text_splitters import ITextSplitter @@ -65,7 +64,7 @@ def txt_loader(file: str) -> Callable: class TxtLoader(DocumentLoader): def _get_file(self, file: str): - if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file: + if os.environ.get("AWS_STORAGE_BUCKET_NAME") in file: # pragma: no cover response = requests.get(file) if response.status_code == 200: file_path = f"/tmp/{uuid.uuid4()}.txt" @@ -189,7 +188,7 @@ def __init__(self, file:str) -> None: tmp_file, _ = self._get_temp_file(file) self.loader = UnstructuredExcelLoader(tmp_file, mode="single") - def _get_temp_file(self, file_url: str): + def _get_temp_file(self, file_url: str): # pragma: no cover result = urlparse(file_url) filename = result.path.strip("/") file_path, message = urlretrieve(file_url, f"/tmp/{filename}") @@ -211,3 +210,31 @@ def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]: for chunk in text_chunks: split_pages.append(Document(page_content=chunk, 
metadata=metadatas)) return split_pages + + +class URLsLoader(DocumentLoader): + def _urls(self, urls: Union[List[str], str]): + if isinstance(urls, str): + return [urls] + return urls + + def __init__(self, urls: Union[List[str], str]) -> None: + self.urls = self._urls(urls) + self.loader = UnstructuredURLLoader(urls=self.urls) + + def load(self) -> List[Document]: + return self.loader.load() + + def load_and_split_text(self, text_splitter: ITextSplitter) -> List[Document]: + split_pages = [] + + pages = self.loader.load_and_split() + for page in pages: + page_content = page.page_content.lower() + metadatas = page.metadata + metadatas.update({"full_page": page_content}) + + text_chunks = text_splitter.split_text(page_content) + for chunk in text_chunks: + split_pages.append(Document(page_content=chunk, metadata=metadatas)) + return split_pages diff --git a/app/tests/test_document_loader.py b/app/tests/test_document_loader.py index 8d0b15a..469fb18 100644 --- a/app/tests/test_document_loader.py +++ b/app/tests/test_document_loader.py @@ -5,6 +5,8 @@ PDFLoader, DocxLoader, TxtLoader, + URLsLoader, + XlsxLoader, pdf_loader, txt_loader, docx_loader, @@ -122,11 +124,35 @@ def test_load_xlsx(self): raw_text = data_loader.raw_text() self.assertEqual(type(raw_text), str) + @mock.patch("app.loaders.loaders.XlsxLoader._get_temp_file") + def test_load_xlsx_cls(self, mock_file_url): + file_path = f'{self.path}/{self.file_name}.xlsx' + mock_file_url.return_value = (file_path, "") + xlsx_loader = XlsxLoader(file_path) + split_pages: List[Document] = xlsx_loader.load_and_split_text(self.text_splitter) + self.assertEqual(list, type(split_pages)) + def test_pdf_loader_cls(self): file_path = f'{self.path}/{self.file_name}.pdf' pdf_loader = PDFLoader(file_path) split_pages: List[Document] = pdf_loader.load_and_split_text(self.text_splitter) self.assertEqual(list, type(split_pages)) + + def test_urls_loader_cls(self): + urls_loader = 
URLsLoader("https://en.wikipedia.org/wiki/Unit_testing") + split_pages: List[Document] = urls_loader.load() + self.assertEqual(list, type(split_pages)) + + def test_urls_loader_and_split_cls(self): + urls_loader = URLsLoader("https://en.wikipedia.org/wiki/Unit_testing") + split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter) + self.assertEqual(list, type(split_pages)) + + def test_urls_list_loader_and_split_cls(self): + urls = ["https://en.wikipedia.org/wiki/Unit_testing"] + urls_loader = URLsLoader(urls) + split_pages: List[Document] = urls_loader.load_and_split_text(self.text_splitter) + self.assertEqual(list, type(split_pages)) def test_docx_loader_cls(self): file_path = f'{self.path}/{self.file_name}.docx' @@ -134,6 +160,7 @@ def test_docx_loader_cls(self): split_pages: List[Document] = docx_loader.load_and_split_text(self.text_splitter) self.assertEqual(list, type(split_pages)) + @mock.patch.dict(os.environ, {"AWS_STORAGE_BUCKET_NAME": "file-path"}) def test_txt_loader_cls(self): file_path = f'{self.path}/{self.file_name}.txt' docx_loader = TxtLoader(file_path) diff --git a/app/util.py b/app/util.py index a4c0aa7..3f148f0 100644 --- a/app/util.py +++ b/app/util.py @@ -7,11 +7,11 @@ class ContentHandler(EmbeddingsContentHandler): content_type = "application/json" accepts = "application/json" - def transform_input(self, inputs: list[str], model_kwargs: dict) -> bytes: + def transform_input(self, inputs: list[str], model_kwargs: dict) -> bytes: # pragma: no cover input_str = json.dumps({"inputs": inputs, **model_kwargs}) return input_str.encode("utf-8") - def transform_output(self, output: bytes) -> list[list[float]]: + def transform_output(self, output: bytes) -> list[list[float]]: # pragma: no cover response_json = json.loads(output.read().decode("utf-8")) return response_json["vectors"] diff --git a/poetry.lock b/poetry.lock index 4575e1d..a8b6c09 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is 
automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -2209,6 +2209,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -3000,6 +3001,22 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "xlrd" +version = "2.0.1" +description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"}, + {file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"}, +] + +[package.extras] +build = ["twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", 
"pytest-cov"] + [[package]] name = "xlsxwriter" version = "3.1.9" @@ -3117,4 +3134,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "dbcf8ae11992ad0b797cb45145f896aade2d7434fb513e06c9818ce853e8c154" +content-hash = "ef2a8fe841c241406573f67e3b818b0fdbfe71a059fa9c3f59ca3b548a6e4ca8" diff --git a/pyproject.toml b/pyproject.toml index 90a4c82..17d5b4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ pydantic = "2.3.0" celery = "^5.3.6" redis = "^5.0.1" sentry-sdk = {extras = ["fastapi"], version = "^1.35.0"} +xlrd = "^2.0.1" pdfminer-six = "^20231228"