From 78214456a69c60c0aaf48a221e20250c83115d5e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 6 Dec 2024 17:50:22 +0100 Subject: [PATCH 1/9] feat: Add unstructured document handler Added unstructured library and handling of certain document types through their library Feature COG-685 --- .../document_types/UnstructuredDocument.py | 26 +++++++++++++++++++ .../processing/document_types/__init__.py | 1 + cognee/tasks/documents/classify_documents.py | 2 ++ 3 files changed, 29 insertions(+) create mode 100644 cognee/modules/data/processing/document_types/UnstructuredDocument.py diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py new file mode 100644 index 000000000..c48423653 --- /dev/null +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -0,0 +1,26 @@ +from io import StringIO + +from cognee.modules.chunking.TextChunker import TextChunker +from .Document import Document + +class UnstructuredDocument(Document): + type: str = "unstructured" + + def read(self, chunk_size: int): + def get_text(): + from unstructured.partition.auto import partition + elements = partition(self.raw_data_location) + in_memory_file = StringIO("\n\n".join([str(el) for el in elements])) + in_memory_file.seek(0) + + while True: + text = in_memory_file.read(1024) + + if len(text.strip()) == 0: + break + + yield text + + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + + yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py index 9682cc101..2e862f4ba 100644 --- a/cognee/modules/data/processing/document_types/__init__.py +++ b/cognee/modules/data/processing/document_types/__init__.py @@ -3,3 +3,4 @@ from .TextDocument import TextDocument from .ImageDocument import ImageDocument from .AudioDocument import AudioDocument +from 
.UnstructuredDocument import UnstructuredDocument diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index d007b6888..abef4ea9e 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -5,12 +5,14 @@ AudioDocument, ImageDocument, TextDocument, + UnstructuredDocument, ) from cognee.modules.data.operations.get_metadata import get_metadata EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents "txt": TextDocument, + "docx": UnstructuredDocument, "png": ImageDocument, # Image documents "dwg": ImageDocument, "xcf": ImageDocument, From 62db3f8598466438cb72d797db9c58920b40de64 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 8 Dec 2024 14:37:50 +0100 Subject: [PATCH 2/9] feat: Remove the need for libmagic for unstructured documents Remove the need for libmagic for unstructured documents by providing mime_type information Feature COG-685 --- .../modules/data/processing/document_types/Document.py | 1 + .../processing/document_types/UnstructuredDocument.py | 2 +- cognee/tasks/documents/classify_documents.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 773fc30c8..45441dcce 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -6,6 +6,7 @@ class Document(DataPoint): name: str raw_data_location: str metadata_id: UUID + mime_type: str def read(self, chunk_size: int) -> str: pass diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index c48423653..68ccbe1f2 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ 
-9,7 +9,7 @@ class UnstructuredDocument(Document): def read(self, chunk_size: int): def get_text(): from unstructured.partition.auto import partition - elements = partition(self.raw_data_location) + elements = partition(self.raw_data_location, content_type=self.mime_type) in_memory_file = StringIO("\n\n".join([str(el) for el in elements])) in_memory_file.seek(0) diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index abef4ea9e..47beeb917 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -13,6 +13,14 @@ "pdf": PdfDocument, # Text documents "txt": TextDocument, "docx": UnstructuredDocument, + "doc": UnstructuredDocument, + "odt": UnstructuredDocument, + "xls": UnstructuredDocument, + "xlsx": UnstructuredDocument, + "ppt": UnstructuredDocument, + "pptx": UnstructuredDocument, + "odp": UnstructuredDocument, + "ods": UnstructuredDocument, "png": ImageDocument, # Image documents "dwg": ImageDocument, "xcf": ImageDocument, @@ -50,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]: title = f"{data_item.name}.{data_item.extension}", raw_data_location = data_item.raw_data_location, name = data_item.name, + mime_type = data_item.mime_type, metadata_id = metadata.id ) documents.append(document) From 53b7806ccb774b8db86be2880e523997a8caa8d8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 8 Dec 2024 14:42:08 +0100 Subject: [PATCH 3/9] chore: Update pyproject file with unstructured library Add unstructured library as docs optional extension to pyproject.toml Chore COG-685 --- poetry.lock | 489 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 2 files changed, 490 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 8a0326236..9c11fd43a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -726,6 +726,17 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version 
= "5.2.0" +description = "Universal encoding detector for Python 3" +optional = true +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.4.0" @@ -1213,6 +1224,24 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "deepdiff" +version = "8.0.1" +description = "Deep Difference and Search of any Python object/data. Recreate objects by adding adding deltas to each other." +optional = true +python-versions = ">=3.8" +files = [ + {file = "deepdiff-8.0.1-py3-none-any.whl", hash = "sha256:42e99004ce603f9a53934c634a57b04ad5900e0d8ed0abb15e635767489cbc05"}, + {file = "deepdiff-8.0.1.tar.gz", hash = "sha256:245599a4586ab59bb599ca3517a9c42f3318ff600ded5e80a3432693c8ec3c4b"}, +] + +[package.dependencies] +orderly-set = "5.2.2" + +[package.extras] +cli = ["click (==8.1.7)", "pyyaml (==6.0.1)"] +optimize = ["orjson"] + [[package]] name = "deepeval" version = "2.0.1" @@ -1482,6 +1511,20 @@ files = [ dnspython = ">=2.0.0" idna = ">=2.0.0" +[[package]] +name = "emoji" +version = "2.14.0" +description = "Emoji for Python" +optional = true +python-versions = ">=3.7" +files = [ + {file = "emoji-2.14.0-py3-none-any.whl", hash = "sha256:fcc936bf374b1aec67dda5303ae99710ba88cc9cdce2d1a71c5f2204e6d78799"}, + {file = "emoji-2.14.0.tar.gz", hash = "sha256:f68ac28915a2221667cddb3e6c589303c3c6954c6c5af6fefaec7f9bdf72fdca"}, +] + +[package.extras] +dev = ["coverage", "pytest (>=7.4.4)"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -2321,6 +2364,27 @@ files = [ {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, ] +[[package]] +name = 
"html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "htmlmin2" version = "0.1.13" @@ -2823,6 +2887,17 @@ files = [ [package.dependencies] ply = "*" +[[package]] +name = "jsonpath-python" +version = "1.0.6" +description = "A more powerful JSONPath implementation in modern python" +optional = true +python-versions = ">=3.6" +files = [ + {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, + {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, +] + [[package]] name = "jsonpointer" version = "3.0.0" @@ -3464,6 +3539,160 @@ typing-extensions = ">=4.5.0" typing-inspect = ">=0.8.0" wrapt = "*" +[[package]] +name = "lxml" +version = "5.3.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", 
hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7"}, + {file = "lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80"}, + {file = "lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3"}, + {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b"}, + {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", 
hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be"}, + {file = "lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9"}, + {file = "lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d"}, + {file = "lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30"}, + {file = "lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f"}, + {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a"}, + {file = 
"lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b"}, + {file = "lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957"}, + {file = "lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d"}, + {file = "lxml-5.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8f0de2d390af441fe8b2c12626d103540b5d850d585b18fcada58d972b74a74e"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1afe0a8c353746e610bd9031a630a95bcfb1a720684c3f2b36c4710a0a96528f"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56b9861a71575f5795bde89256e7467ece3d339c9b43141dbdd54544566b3b94"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:9fb81d2824dff4f2e297a276297e9031f46d2682cafc484f49de182aa5e5df99"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2c226a06ecb8cdef28845ae976da407917542c5e6e75dcac7cc33eb04aaeb237"}, + {file = "lxml-5.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7d3d1ca42870cdb6d0d29939630dbe48fa511c203724820fc0fd507b2fb46577"}, + {file = "lxml-5.3.0-cp36-cp36m-win32.whl", hash = "sha256:094cb601ba9f55296774c2d57ad68730daa0b13dc260e1f941b4d13678239e70"}, + {file = "lxml-5.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:eafa2c8658f4e560b098fe9fc54539f86528651f61849b22111a9b107d18910c"}, + {file = "lxml-5.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cb83f8a875b3d9b458cada4f880fa498646874ba4011dc974e071a0a84a1b033"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:25f1b69d41656b05885aa185f5fdf822cb01a586d1b32739633679699f220391"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23e0553b8055600b3bf4a00b255ec5c92e1e4aebf8c2c09334f8368e8bd174d6"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ada35dd21dc6c039259596b358caab6b13f4db4d4a7f8665764d616daf9cc1d"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:81b4e48da4c69313192d8c8d4311e5d818b8be1afe68ee20f6385d0e96fc9512"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:2bc9fd5ca4729af796f9f59cd8ff160fe06a474da40aca03fcc79655ddee1a8b"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07da23d7ee08577760f0a71d67a861019103e4812c87e2fab26b039054594cc5"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ea2e2f6f801696ad7de8aec061044d6c8c0dd4037608c7cab38a9a4d316bfb11"}, + {file = "lxml-5.3.0-cp37-cp37m-win32.whl", hash = "sha256:5c54afdcbb0182d06836cc3d1be921e540be3ebdf8b8a51ee3ef987537455f84"}, + {file = "lxml-5.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2901429da1e645ce548bf9171784c0f74f0718c3f6150ce166be39e4dd66c3e"}, + {file = "lxml-5.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c56a1d43b2f9ee4786e4658c7903f05da35b923fb53c11025712562d5cc02753"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ee8c39582d2652dcd516d1b879451500f8db3fe3607ce45d7c5957ab2596040"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdf3a3059611f7585a78ee10399a15566356116a4288380921a4b598d807a22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:146173654d79eb1fc97498b4280c1d3e1e5d58c398fa530905c9ea50ea849b22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = 
"sha256:0a7056921edbdd7560746f4221dca89bb7a3fe457d3d74267995253f46343f15"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9e4b47ac0f5e749cfc618efdf4726269441014ae1d5583e047b452a32e221920"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f914c03e6a31deb632e2daa881fe198461f4d06e57ac3d0e05bbcab8eae01945"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:213261f168c5e1d9b7535a67e68b1f59f92398dd17a56d934550837143f79c42"}, + {file = "lxml-5.3.0-cp38-cp38-win32.whl", hash = "sha256:218c1b2e17a710e363855594230f44060e2025b05c80d1f0661258142b2add2e"}, + {file = "lxml-5.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:315f9542011b2c4e1d280e4a20ddcca1761993dda3afc7a73b01235f8641e903"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1ffc23010330c2ab67fac02781df60998ca8fe759e8efde6f8b756a20599c5de"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2b3778cb38212f52fac9fe913017deea2fdf4eb1a4f8e4cfc6b009a13a6d3fcc"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0c7a688944891086ba192e21c5229dea54382f4836a209ff8d0a660fac06be"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:747a3d3e98e24597981ca0be0fd922aebd471fa99d0043a3842d00cdcad7ad6a"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86a6b24b19eaebc448dc56b87c4865527855145d851f9fc3891673ff97950540"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b11a5d918a6216e521c715b02749240fb07ae5a1fefd4b7bf12f833bc8b4fe70"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b87753c784d6acb8a25b05cb526c3406913c9d988d51f80adecc2b0775d6aa"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = 
"sha256:109fa6fede314cc50eed29e6e56c540075e63d922455346f11e4d7a036d2b8cf"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:02ced472497b8362c8e902ade23e3300479f4f43e45f4105c85ef43b8db85229"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:6b038cc86b285e4f9fea2ba5ee76e89f21ed1ea898e287dc277a25884f3a7dfe"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:7437237c6a66b7ca341e868cda48be24b8701862757426852c9b3186de1da8a2"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7f41026c1d64043a36fda21d64c5026762d53a77043e73e94b71f0521939cc71"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:482c2f67761868f0108b1743098640fbb2a28a8e15bf3f47ada9fa59d9fe08c3"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1483fd3358963cc5c1c9b122c80606a3a79ee0875bcac0204149fa09d6ff2727"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2dec2d1130a9cda5b904696cec33b2cfb451304ba9081eeda7f90f724097300a"}, + {file = "lxml-5.3.0-cp39-cp39-win32.whl", hash = "sha256:a0eabd0a81625049c5df745209dc7fcef6e2aea7793e5f003ba363610aa0a3ff"}, + {file = "lxml-5.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:89e043f1d9d341c52bf2af6d02e6adde62e0a46e6755d5eb60dc6e4f0b8aeca2"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce"}, + {file = 
"lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:94d6c3782907b5e40e21cadf94b13b0842ac421192f26b84c45f13f3c9d5dc27"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c300306673aa0f3ed5ed9372b21867690a17dba38c68c44b287437c362ce486b"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d9b952e07aed35fe2e1a7ad26e929595412db48535921c5013edc8aa4a35ce"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:01220dca0d066d1349bd6a1726856a78f7929f3878f7e2ee83c296c69495309e"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2d9b8d9177afaef80c53c0a9e30fa252ff3036fb1c6494d427c066a4ce6a282f"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:20094fc3f21ea0a8669dc4c61ed7fa8263bd37d97d93b90f28fc613371e7a875"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ace2c2326a319a0bb8a8b0e5b570c764962e95818de9f259ce814ee666603f19"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92e67a0be1639c251d21e35fe74df6bcc40cba445c2cda7c4a967656733249e2"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5350b55f9fecddc51385463a4f67a5da829bc741e38cf689f38ec9023f54ab"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c1fefd7e3d00921c44dc9ca80a775af49698bbfd92ea84498e56acffd4c5469"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:71a8dd38fbd2f2319136d4ae855a7078c69c9a38ae06e0c17c73fd70fc6caad8"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:97acf1e1fd66ab53dacd2c35b319d7e548380c2e9e8c54525c6e76d21b1ae3b1"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:68934b242c51eb02907c5b81d138cb977b2129a0a75a8f8b60b01cb8586c7b21"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b710bc2b8292966b23a6a0121f7a6c51d45d2347edcc75f016ac123b8054d3f2"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18feb4b93302091b1541221196a2155aa296c363fd233814fa11e181adebc52f"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3eb44520c4724c2e1a57c0af33a379eee41792595023f367ba3952a2d96c2aab"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:609251a0ca4770e5a8768ff902aa02bf636339c5a93f9349b48eb1f606f7f3e9"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:516f491c834eb320d6c843156440fe7fc0d50b33e44387fcec5b02f0bc118a4c"}, + {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml-html-clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.11)"] + [[package]] name = "makefun" version = "1.15.6" @@ -4388,6 +4617,20 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = 
"sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "1.52.0" @@ -4506,6 +4749,17 @@ files = [ deprecated = ">=1.2.6" opentelemetry-api = "1.27.0" +[[package]] +name = "orderly-set" +version = "5.2.2" +description = "Orderly set" +optional = true +python-versions = ">=3.8" +files = [ + {file = "orderly_set-5.2.2-py3-none-any.whl", hash = "sha256:f7a37c95a38c01cdfe41c3ffb62925a318a2286ea0a41790c057fc802aec54da"}, + {file = "orderly_set-5.2.2.tar.gz", hash = "sha256:52a18b86aaf3f5d5a498bbdb27bf3253a4e5c57ab38e5b7a56fa00115cd28448"}, +] + [[package]] name = "orjson" version = "3.10.12" @@ -5768,6 +6022,20 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-iso639" +version = "2024.10.22" +description = "ISO 639 language codes, names, and other associated information" +optional = true +python-versions = ">=3.8" +files = [ + {file = "python_iso639-2024.10.22-py3-none-any.whl", hash = "sha256:02d3ce2e01c6896b30b9cbbd3e1c8ee0d7221250b5d63ea9803e0d2a81fd1047"}, + {file = "python_iso639-2024.10.22.tar.gz", hash = "sha256:750f21b6a0bc6baa24253a3d8aae92b582bf93aa40988361cd96852c2c6d9a52"}, +] + +[package.extras] +dev = ["black (==24.10.0)", "build (==1.2.1)", "flake8 (==7.1.1)", "pytest (==8.3.3)", "requests (==2.32.3)", "twine (==5.1.1)"] + [[package]] name = "python-json-logger" version = "2.0.7" @@ -5779,6 +6047,17 @@ files = [ {file = "python_json_logger-2.0.7-py3-none-any.whl", hash = "sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd"}, ] +[[package]] +name = "python-magic" +version = "0.4.27" +description = "File type identification using libmagic" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "python-magic-0.4.27.tar.gz", 
hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, + {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, +] + [[package]] name = "python-multipart" version = "0.0.17" @@ -5790,6 +6069,22 @@ files = [ {file = "python_multipart-0.0.17.tar.gz", hash = "sha256:41330d831cae6e2f22902704ead2826ea038d0419530eadff3ea80175aec5538"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.1" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +files = [ + {file = "python_oxmsg-0.0.1-py3-none-any.whl", hash = "sha256:8ea7d5dda1bc161a413213da9e18ed152927c1fda2feaf5d1f02192d8ad45eea"}, + {file = "python_oxmsg-0.0.1.tar.gz", hash = "sha256:b65c1f93d688b85a9410afa824192a1ddc39da359b04a0bd2cbd3874e84d4994"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing-extensions = ">=4.9.0" + [[package]] name = "pytz" version = "2024.2" @@ -6064,6 +6359,106 @@ urllib3 = ">=1.26.14,<3" fastembed = ["fastembed (==0.3.6)"] fastembed-gpu = ["fastembed-gpu (==0.3.6)"] +[[package]] +name = "rapidfuzz" +version = "3.10.1" +description = "rapid fuzzy string matching" +optional = true +python-versions = ">=3.9" +files = [ + {file = "rapidfuzz-3.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f17d9f21bf2f2f785d74f7b0d407805468b4c173fa3e52c86ec94436b338e74a"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b31f358a70efc143909fb3d75ac6cd3c139cd41339aa8f2a3a0ead8315731f2b"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4f43f2204b56a61448ec2dd061e26fd344c404da99fb19f3458200c5874ba2"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9d81bf186a453a2757472133b24915768abc7c3964194406ed93e170e16c21cb"}, + {file = 
"rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3611c8f45379a12063d70075c75134f2a8bd2e4e9b8a7995112ddae95ca1c982"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c3b537b97ac30da4b73930fa8a4fe2f79c6d1c10ad535c5c09726612cd6bed9"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231ef1ec9cf7b59809ce3301006500b9d564ddb324635f4ea8f16b3e2a1780da"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed4f3adc1294834955b7e74edd3c6bd1aad5831c007f2d91ea839e76461a5879"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:7b6015da2e707bf632a71772a2dbf0703cff6525732c005ad24987fe86e8ec32"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1b35a118d61d6f008e8e3fb3a77674d10806a8972c7b8be433d6598df4d60b01"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:bc308d79a7e877226f36bdf4e149e3ed398d8277c140be5c1fd892ec41739e6d"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f017dbfecc172e2d0c37cf9e3d519179d71a7f16094b57430dffc496a098aa17"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win32.whl", hash = "sha256:36c0e1483e21f918d0f2f26799fe5ac91c7b0c34220b73007301c4f831a9c4c7"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:10746c1d4c8cd8881c28a87fd7ba0c9c102346dfe7ff1b0d021cdf093e9adbff"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win_arm64.whl", hash = "sha256:dfa64b89dcb906835e275187569e51aa9d546a444489e97aaf2cc84011565fbe"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:92958ae075c87fef393f835ed02d4fe8d5ee2059a0934c6c447ea3417dfbf0e8"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ba7521e072c53e33c384e78615d0718e645cab3c366ecd3cc8cb732befd94967"}, + {file = 
"rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d02cbd75d283c287471b5b3738b3e05c9096150f93f2d2dfa10b3d700f2db9"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efa1582a397da038e2f2576c9cd49b842f56fde37d84a6b0200ffebc08d82350"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f12912acee1f506f974f58de9fdc2e62eea5667377a7e9156de53241c05fdba8"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666d5d8b17becc3f53447bcb2b6b33ce6c2df78792495d1fa82b2924cd48701a"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26f71582c0d62445067ee338ddad99b655a8f4e4ed517a90dcbfbb7d19310474"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8a2ef08b27167bcff230ffbfeedd4c4fa6353563d6aaa015d725dd3632fc3de7"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:365e4fc1a2b95082c890f5e98489b894e6bf8c338c6ac89bb6523c2ca6e9f086"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1996feb7a61609fa842e6b5e0c549983222ffdedaf29644cc67e479902846dfe"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:cf654702f144beaa093103841a2ea6910d617d0bb3fccb1d1fd63c54dde2cd49"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec108bf25de674781d0a9a935030ba090c78d49def3d60f8724f3fc1e8e75024"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win32.whl", hash = "sha256:031f8b367e5d92f7a1e27f7322012f3c321c3110137b43cc3bf678505583ef48"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:f98f36c6a1bb9a6c8bbec99ad87c8c0e364f34761739b5ea9adf7b48129ae8cf"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win_arm64.whl", hash = 
"sha256:f1da2028cb4e41be55ee797a82d6c1cf589442504244249dfeb32efc608edee7"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1340b56340896bede246f612b6ecf685f661a56aabef3d2512481bfe23ac5835"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2316515169b7b5a453f0ce3adbc46c42aa332cae9f2edb668e24d1fc92b2f2bb"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e06fe6a12241ec1b72c0566c6b28cda714d61965d86569595ad24793d1ab259"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d99c1cd9443b19164ec185a7d752f4b4db19c066c136f028991a480720472e23"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1d9aa156ed52d3446388ba4c2f335e312191d1ca9d1f5762ee983cf23e4ecf6"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:54bcf4efaaee8e015822be0c2c28214815f4f6b4f70d8362cfecbd58a71188ac"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0c955e32afdbfdf6e9ee663d24afb25210152d98c26d22d399712d29a9b976b"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:191633722203f5b7717efcb73a14f76f3b124877d0608c070b827c5226d0b972"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:195baad28057ec9609e40385991004e470af9ef87401e24ebe72c064431524ab"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0fff4a6b87c07366662b62ae994ffbeadc472e72f725923f94b72a3db49f4671"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4ffed25f9fdc0b287f30a98467493d1e1ce5b583f6317f70ec0263b3c97dbba6"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d02cf8e5af89a9ac8f53c438ddff6d773f62c25c6619b29db96f4aae248177c0"}, + {file = 
"rapidfuzz-3.10.1-cp312-cp312-win32.whl", hash = "sha256:f3bb81d4fe6a5d20650f8c0afcc8f6e1941f6fecdb434f11b874c42467baded0"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-win_amd64.whl", hash = "sha256:aaf83e9170cb1338922ae42d320699dccbbdca8ffed07faeb0b9257822c26e24"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-win_arm64.whl", hash = "sha256:c5da802a0d085ad81b0f62828fb55557996c497b2d0b551bbdfeafd6d447892f"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fc22d69a1c9cccd560a5c434c0371b2df0f47c309c635a01a913e03bbf183710"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38b0dac2c8e057562b8f0d8ae5b663d2d6a28c5ab624de5b73cef9abb6129a24"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fde3bbb14e92ce8fcb5c2edfff72e474d0080cadda1c97785bf4822f037a309"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9141fb0592e55f98fe9ac0f3ce883199b9c13e262e0bf40c5b18cdf926109d16"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:237bec5dd1bfc9b40bbd786cd27949ef0c0eb5fab5eb491904c6b5df59d39d3c"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18123168cba156ab5794ea6de66db50f21bb3c66ae748d03316e71b27d907b95"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b75fe506c8e02769cc47f5ab21ce3e09b6211d3edaa8f8f27331cb6988779be"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9da82aa4b46973aaf9e03bb4c3d6977004648c8638febfc0f9d237e865761270"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c34c022d5ad564f1a5a57a4a89793bd70d7bad428150fb8ff2760b223407cdcf"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = 
"sha256:1e96c84d6c2a0ca94e15acb5399118fff669f4306beb98a6d8ec6f5dccab4412"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e8e154b84a311263e1aca86818c962e1fa9eefdd643d1d5d197fcd2738f88cb9"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:335fee93188f8cd585552bb8057228ce0111bd227fa81bfd40b7df6b75def8ab"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-win32.whl", hash = "sha256:6729b856166a9e95c278410f73683957ea6100c8a9d0a8dbe434c49663689255"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-win_amd64.whl", hash = "sha256:0e06d99ad1ad97cb2ef7f51ec6b1fedd74a3a700e4949353871cf331d07b382a"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-win_arm64.whl", hash = "sha256:8d1b7082104d596a3eb012e0549b2634ed15015b569f48879701e9d8db959dbb"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:779027d3307e1a2b1dc0c03c34df87a470a368a1a0840a9d2908baf2d4067956"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:440b5608ab12650d0390128d6858bc839ae77ffe5edf0b33a1551f2fa9860651"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82cac41a411e07a6f3dc80dfbd33f6be70ea0abd72e99c59310819d09f07d945"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:958473c9f0bca250590200fd520b75be0dbdbc4a7327dc87a55b6d7dc8d68552"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ef60dfa73749ef91cb6073be1a3e135f4846ec809cc115f3cbfc6fe283a5584"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7fbac18f2c19fc983838a60611e67e3262e36859994c26f2ee85bb268de2355"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a0d519ff39db887cd73f4e297922786d548f5c05d6b51f4e6754f452a7f4296"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:bebb7bc6aeb91cc57e4881b222484c26759ca865794187217c9dcea6c33adae6"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe07f8b9c3bb5c5ad1d2c66884253e03800f4189a60eb6acd6119ebaf3eb9894"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:bfa48a4a2d45a41457f0840c48e579db157a927f4e97acf6e20df8fc521c79de"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2cf44d01bfe8ee605b7eaeecbc2b9ca64fc55765f17b304b40ed8995f69d7716"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e6bbca9246d9eedaa1c84e04a7f555493ba324d52ae4d9f3d9ddd1b740dcd87"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win32.whl", hash = "sha256:567f88180f2c1423b4fe3f3ad6e6310fc97b85bdba574801548597287fc07028"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:6b2cd7c29d6ecdf0b780deb587198f13213ac01c430ada6913452fd0c40190fc"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win_arm64.whl", hash = "sha256:9f912d459e46607ce276128f52bea21ebc3e9a5ccf4cccfef30dd5bddcf47be8"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ac4452f182243cfab30ba4668ef2de101effaedc30f9faabb06a095a8c90fd16"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:565c2bd4f7d23c32834652b27b51dd711814ab614b4e12add8476be4e20d1cf5"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:187d9747149321607be4ccd6f9f366730078bed806178ec3eeb31d05545e9e8f"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:616290fb9a8fa87e48cb0326d26f98d4e29f17c3b762c2d586f2b35c1fd2034b"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:073a5b107e17ebd264198b78614c0206fa438cce749692af5bc5f8f484883f50"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:39c4983e2e2ccb9732f3ac7d81617088822f4a12291d416b09b8a1eadebb3e29"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ac7adee6bcf0c6fee495d877edad1540a7e0f5fc208da03ccb64734b43522d7a"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:425f4ac80b22153d391ee3f94bc854668a0c6c129f05cf2eaf5ee74474ddb69e"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65a2fa13e8a219f9b5dcb9e74abe3ced5838a7327e629f426d333dfc8c5a6e66"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:75561f3df9a906aaa23787e9992b228b1ab69007932dc42070f747103e177ba8"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd062490537e97ca125bc6c7f2b7331c2b73d21dc304615afe61ad1691e15d5"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfcc8feccf63245a22dfdd16e222f1a39771a44b870beb748117a0e09cbb4a62"}, + {file = "rapidfuzz-3.10.1.tar.gz", hash = "sha256:5a15546d847a915b3f42dc79ef9b0c78b998b4e2c53b252e7166284066585979"}, +] + +[package.extras] +all = ["numpy"] + [[package]] name = "redis" version = "5.2.0" @@ -7725,6 +8120,97 @@ files = [ {file = "ujson-5.10.0.tar.gz", hash = "sha256:b3cd8f3c5d8c7738257f1018880444f7b7d9b66232c64649f562d7ba86ad4bc1"}, ] +[[package]] +name = "unstructured" +version = "0.16.10" +description = "A library that prepares raw documents for downstream ML tasks." 
+optional = true +python-versions = "<3.13,>=3.9.0" +files = [ + {file = "unstructured-0.16.10-py3-none-any.whl", hash = "sha256:738fc020fb4d9dfd1a3e54fee255221f7f916afafa16ff4e1a7a14495ba5b5ce"}, + {file = "unstructured-0.16.10.tar.gz", hash = "sha256:61c4a447514ab5d6f8629fde2da03833cf29e0bee26a1d3b901ac57d3b5d523a"}, +] + +[package.dependencies] +backoff = "*" +beautifulsoup4 = "*" +chardet = "*" +dataclasses-json = "*" +emoji = "*" +filetype = "*" +html5lib = "*" +langdetect = "*" +lxml = "*" +nltk = "*" +numpy = "<2" +psutil = "*" +python-iso639 = "*" +python-magic = "*" +python-oxmsg = "*" +rapidfuzz = "*" +requests = "*" +tqdm = "*" +typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" + +[package.extras] +all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +csv = ["pandas"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] +epub = ["pypandoc"] +huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] +image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +md = ["markdown"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] +org = ["pypandoc"] +paddleocr = ["paddlepaddle (==3.0.0b1)", "unstructured.paddleocr (==2.8.1.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", 
"unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] +rst = ["pypandoc"] +rtf = ["pypandoc"] +tsv = ["pandas"] +xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-client" +version = "0.25.9" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.8" +files = [ + {file = "unstructured-client-0.25.9.tar.gz", hash = "sha256:fcc461623f58fefb0e22508e28bf653a8f6934b9779cb4a90dd68d77a39fb5b2"}, + {file = "unstructured_client-0.25.9-py3-none-any.whl", hash = "sha256:c984c01878c8fc243be7c842467d1113a194d885ab6396ae74258ee42717c5b5"}, +] + +[package.dependencies] +certifi = ">=2023.7.22" +charset-normalizer = ">=3.2.0" +cryptography = ">=3.1" +dataclasses-json = ">=0.6.4" +deepdiff = ">=6.0" +httpx = ">=0.27.0" +idna = ">=3.4" +jsonpath-python = ">=1.0.6" +marshmallow = ">=3.19.0" +mypy-extensions = ">=1.0.0" +nest-asyncio = ">=1.6.0" +packaging = ">=23.1" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests = ">=2.31.0" +requests-toolbelt = ">=1.0.0" +six = ">=1.16.0" +typing-extensions = ">=4.7.1" +typing-inspect = ">=0.9.0" +urllib3 = ">=1.26.18" + +[package.extras] +dev = ["pylint (==3.1.0)"] + [[package]] name = "uri-template" version = "1.3.0" @@ -8261,6 +8747,7 @@ type = ["pytest-mypy"] [extras] deepeval = ["deepeval"] +docs = ["unstructured"] falkordb = ["falkordb"] filesystem = ["botocore"] groq = ["groq"] @@ -8278,4 +8765,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "9aea4fbf426c44acda7af97be059598057eef35f51e32ce60a771ac9ab2f57af" +content-hash = "c9a760447a62b3c71fa84f20a614b6d3c5725b3869fc87f78b03eb2c80841ce1" diff --git a/pyproject.toml b/pyproject.toml index dc99ed65c..f03789833 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ llama-index-core = {version = "^0.11.22", optional = true} deepeval = {version 
= "^2.0.1", optional = true} transformers = "^4.46.3" pymilvus = {version = "^2.5.0", optional = true} +unstructured = {version = "^0.16.10", optional = true} [tool.poetry.extras] filesystem = ["s3fs", "botocore"] @@ -89,6 +90,7 @@ falkordb = ["falkordb"] groq = ["groq"] langfuse = ["langfuse"] milvus = ["pymilvus"] +docs = ["unstructured"] [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" From 07d9330e4a7c8331ea0939cc8185d2f74adb2350 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 8 Dec 2024 14:53:19 +0100 Subject: [PATCH 4/9] feat: Add UnstructuredLibraryImportError Added exception when unstructured libary is called but not installed Feature COG-685 --- cognee/modules/data/exceptions/__init__.py | 9 +++++++++ cognee/modules/data/exceptions/exceptions.py | 11 +++++++++++ .../processing/document_types/UnstructuredDocument.py | 8 +++++++- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 cognee/modules/data/exceptions/__init__.py create mode 100644 cognee/modules/data/exceptions/exceptions.py diff --git a/cognee/modules/data/exceptions/__init__.py b/cognee/modules/data/exceptions/__init__.py new file mode 100644 index 000000000..fa8468c88 --- /dev/null +++ b/cognee/modules/data/exceptions/__init__.py @@ -0,0 +1,9 @@ +""" +Custom exceptions for the Cognee API. + +This module defines a set of exceptions for handling various data errors +""" + +from .exceptions import ( + UnstructuredLibraryImportError, +) \ No newline at end of file diff --git a/cognee/modules/data/exceptions/exceptions.py b/cognee/modules/data/exceptions/exceptions.py new file mode 100644 index 000000000..3b1aac52c --- /dev/null +++ b/cognee/modules/data/exceptions/exceptions.py @@ -0,0 +1,11 @@ +from cognee.exceptions import CogneeApiError +from fastapi import status + +class UnstructuredLibraryImportError(CogneeApiError): + def __init__( + self, + message: str = "Import error. 
Unstructured library is not installed.", + name: str = "UnstructuredModuleImportError", + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + ): + super().__init__(message, name, status_code) \ No newline at end of file diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 68ccbe1f2..62632cd08 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -2,13 +2,19 @@ from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document +from cognee.modules.data.exceptions import UnstructuredLibraryImportError + class UnstructuredDocument(Document): type: str = "unstructured" def read(self, chunk_size: int): def get_text(): - from unstructured.partition.auto import partition + try: + from unstructured.partition.auto import partition + except ModuleNotFoundError: + raise UnstructuredLibraryImportError + elements = partition(self.raw_data_location, content_type=self.mime_type) in_memory_file = StringIO("\n\n".join([str(el) for el in elements])) in_memory_file.seek(0) From 596b3edf7244544445ef0a254b3248ec0432eb5c Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 8 Dec 2024 15:18:42 +0100 Subject: [PATCH 5/9] test: Add test for Unstructured pptx document type Added pptx example file and tested Unstructured pptx document type handling Test COG-685 --- .../documents/UnstructuredDocument_test.py | 22 ++++++++++++++++++ cognee/tests/test_data/example.pptx | Bin 0 -> 39894 bytes 2 files changed, 22 insertions(+) create mode 100644 cognee/tests/integration/documents/UnstructuredDocument_test.py create mode 100644 cognee/tests/test_data/example.pptx diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py new file mode 100644 index 000000000..56c3c827f --- 
/dev/null +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -0,0 +1,22 @@ +import os +import uuid + +from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument + +def test_UnstructuredDocument(): + docx_file_path = os.path.join( + os.sep, + *(os.path.dirname(__file__).split(os.sep)[:-2]), + "test_data", + "example.pptx", + ) + + pptx_document = UnstructuredDocument( + id=uuid.uuid4(), name="example.pptx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(), + mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + + for paragraph_data in pptx_document.read(chunk_size=1024): + assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }' + assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }' + assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }' diff --git a/cognee/tests/test_data/example.pptx b/cognee/tests/test_data/example.pptx new file mode 100644 index 0000000000000000000000000000000000000000..3e49ae3df04e862e6b4b334dce04fb453e9b3d38 GIT binary patch literal 39894 zcmeFYV~}Ot*0!0pZCBd1ZQHh4Y1?+CZQGfZR;8^<+wQD-&WS$J{l$ssc>CwOf6SP3 ztQk9Et=Rj%#y!Sd@>0McC;$);5C9EIJ*oiz@j>~T+uJ+SIh&iKg1o{7T~tx8NpAd*MeJFvlEEgi2*tAqPf=GY z0qSthCDjV80)9b#SQ=n8nYWlT)$(72G)#DT--1sVORmTzfx$&8E8Vsk=}>n<*zt#; zh+Un^4(tm~2}~O19+bA9b1fF;P;23B4PbMTMhfuDs*qWCHLRMKSyBO{HgZuY3>s*N zkAqyKPmafy^a8O){=I0`1k2W!#PzL<8@3B0+MNG8Q5s3mX+Ss$jFr635VOkEqI4B6 zF{Y>qG+^SZKsD6}r1}goF}DFHc&HKGt0E(oYr1e~T2r>>hr@am)df!P9i?rYucS64 z#K=8r)?v%G+fgfT_CkomSUH5t5Z(#15rh&!PqE32J9wPhG&} z;~Q*^QXd=IQ!(fk{)D%wUqoI{Kn+_*cDG*NEA-niluyCRBc00qv;aeaSLYb6V!;~C ztkX~N+EdlmOV72c^g8Gyt3p|UX_s&DwujGYhYA6g&>R5#c-lY3WA2!Pg`ojjXedlE zJv$|8>vX9>ElnsAgOmv@%GajEC_)%D_;q<-uMZl?jt&||M&NVs_?`})b_Vvq`LrNv zjPTNf=(<05FYWk5e0iQPh6-xV+)Z|UKaU%>eBS5aevw{1FU5YkzdSv&<$gX75JP)u z?dC!9{eVPP|TQR$41h6ueD+ht&1CoT9j? 
zrO`^Xgvg{07~u%wj&Q+QItH>KVTYK-Pz@JzoM3po=p*@wzNO13Ag(3XtV;&Jb}=g` z;U?^6Xagco!}Vx~1WlJ>Zq_>da)v^3(GEea zWP}B%(ccK*6H6hlwBKH=o4i9s>v#m}$&f9mAc@NF*-eP0-B)(0COPxoybpOz4q4Y7 z4107#wJzp#2!&oA*pBgXu5^H0uDiv0E@v+dV|CT054Y0MCzn~^*Euqc&yPKYG1r!q zL~YAEqp9fWaC!}SN5g-IzB7jyP*ev*l1z<=L%lOBT>C^RA?zI>5vkP2phyM>K&LgN zxcNX9={tGcQqm>*4SyF48gd-yG@e@7ZxBxpz`_iihqBWC&Sanq$X&!d-6C}>m-HKJ zGpJpF-E(%^<`?vA9C=zi{jF!8K3a+|gRt&om_n40p$6;OE!JlQ+dC751Fvj#75j=# z73pvVlk+cE+E|#o9Zc9njPCH(!5Vvc{XS+xR)M9Hfahzx~!Eu##w9t2g@@VOjVJ0;t zOgf`G`F5z>5v1rt-(y9^6_1=bQ-aP;=BCu>Z%=+dup22qq_?$G(sYOJ$JGW_erIER z#YLHJmpBf?X~AJ$Qq>tSze2;;&Zud+rUqe|u775UF1E7H%wZSQeC z)Y!CCYBl+B2G49u3Ea${gBQjV#+z|@3xRYkSV7(Myo=y$i#cpk`B9-HZzarn+*y8ppFdyApieG z-WmUqceyWlNBOtBYdM~O$vc;8N0UY1;N%Q3%`B2t1d2FknVpa)4!yoQ+Zm&N3vP%F zpr(H6#k}!ptqXroe<@!xMWk#Z>jqRK2MN)==jHpRr&nyD@e39JPv30S{>3q?M$!$n65J?g3VfIAS z{05cmo1=dEX-JK%hsv)YMUF(-SCCteOdsvSA+V&fFJf2rzld>g1n253S7J@nI9+bAB`02~k?5BEd^j36_Dh^-DTYY-APPw4HF#%q>Z}KhIa2vBOv`fIo@GIIq zS!#Z#TxWcUf^J>7cIb8l*->BHf^LP4;!GSh7mF+CnXp_rLkxjRDeGBzekdhO;C8wKq?ZR6B=cSu1&#o4 zoQg$r0Nvb8I>gP`CuKw6D^rwIKdz{F+C9-++v4Usa%l^5)~OOsTljvV#Pdx_pVxU| z8dGbqUAFjKU!ka`?aImC}kCfn&Em$VM&8dxDupc(P4U0&KtC(`0)LYsMjJt=rTrJ zqJDlpe!(>3iz|)r3z^c4c!qN>W3<$G?cH!l4`{|>;+B;{^geb5$6@BVI{e6R&3uG2 zMhlX@G1P<@6+8l+%nRVC{b$QpS~%icAc-yMwlWd~s-?ZCD8hV^NII5NjCuI%*QjhQ z=#jGb@3^*2QjZw$R~=i0EjvjUL@Ma`-|s^4-gzI6T<*QDvprU#Y~Fjb%fgp(x<}ID zyg7)EQhu(s%3pv2v!uf^<_#I}h5s@#s1AbWn8*P%yHTPpc!wYzi%Qgh5Xs|2Autfs zVW@0sZ{Q+VFQ%`L!xnim$nxKQHOeaeyu6j0Rs5L6>L{K8=9Z>Dn$5iSXsGM3+ z(yvgjQmOjLex=0`*CP>{xGhwux9-H5aTnrztbbaK-09@yfn81cE*(@Se&1K@2qE4; zvupB~FSPKjts)4~Ko#jEtI85NPUiqP`ePGoP7&cFcV=z{i?mh zd&`%?Z&)I`i~}6gEwnN`j^4O)8hcW{>eo#JHzCk9Mcimf$Dkah{MIf%>EQ%zTHJpn znvT(|#kLCYZf{$HZ#D~Sa{FF-ZW+G1%0q%(_10j)+we^HN8vTFGv|JMDZJHxRd~XG zCGfU(&L&PW22RcobTvn(Ya@A56&?p_R))`u z{e)+bW(KNN{BfyQLyNSP2vrB8FeQFs7k;PL^>!e_Vxo=s3@ITJ1TLADmH%rxxx83_ zz(H1$B=|=%{W=QU+`>uef&*+lHS~zQQKYimL{cn z4JmvFUdT0lgy?XY1EVBZHu+$Igz<2b+W>nz4$Omh5J6?(3x8`KXx~_7$;bNR5BVRq 
z`bH@Fh9^`N=vP^KQU~Jji-=TRfxg`=0uy~Cy}Z=B8!Uw>w9gq{2c|hE`-JAPii;_Z z?V9{_1jX7&<8yGVK@ObDv1og0phf-*mj)mUjedIo0XnJx(4#4&P^rY? zerO32%SSFJeuEkI7{nl z=CR;+O0>WW*fFh&%K|aVN!!%GIp;VVa!3IlVWUg27PQnRB#pOZNAE zHCz^4F%V9bzQkswY=FWpRC@wzMg`usuT^v63THFkF;1?G>|YS_yt^0VAyA8*Qj^z! zN7o=`)SK*PDes>FH0y5PmBtjOMF3OpMgz^aJ)_b7&h33G@$UQiG4IG!{{)nmnaPjx zyX8%_;!f4Ye?_N1K{0%HQF(JhUT%~44mAmvHTlPY7d8CtK{Gcq0{f%&CbgtxukZ*U zGG{LU%pM*+9R1cb0{tu;{lK1IIF5e*4hhaO&mB3lE|2_q^qrvmRs>I zMz8BXrjssvxAiG|2_}cqgB1ps=^^_}hU1VqX65a{;F7s!Ejx(DFga$`?ezk3==F0d z*D&Y>e4MHJLazj6%yn(c6pHz`Y~QK9w^7{V9(cByq={|FE^Y;3FA;LG#l5L6-H=g7 z^{!|vwI6qWtdoA4)8Nys8XaKf8hW^P6jl8;&)rv&aBLY^u{3{>;7ulR%S@=kGyW$S zyKIU5Pf*LZN>=Zmpw^P*)cxO`*w$4(n*RjJ7A^h%2KCxM*{$820=j&ke4p|EC_kH* z4tK?u^0U+bx7?ok|D^nSjwaUsZ~dqJ6OEppgzBe<0ly-9`3X**{#y+{jtNlnuKmlDcc!uk! zPA{-@k%2u$lq#JeEnY}K>yYLMXEvOWH4`E1&9BB<5UaZXrOyFz@OkmLNhT46UoSG|Av8gK~6 zK2@LkuRA3M{BO?!|39CFlC_1g$-f@L-}zq#W~;s{?y(~H26P9Gd1 zF|Eg2908Np8+y*DeOn>)eyJv*auCcY?u}~FeRplr2yeOR?ZZSAey8Vvk(N0 zHz6)3dcQr`G0Y$xHdHVq1*MsTWjRh=+v2TAk4wn_nFvnbA*!lUVG4pNqZs+ENJ*)0 z_l;!)P1hw@GJ0t{g()dkbSZ34w6Lp-cSN^Z6*g~+#DW8rm3=nJ06H(TU4iub-ad{S z&gqxUL>NVFRcEcN!@hJrEvA}Njnuf@3t$tLB34^mnP4Yvh-rXOJ;W(vW{5%q+^5IPY6`h7qBe5ta_kv&-z`1X4NWrrcV{*wxro+`JyaH!28_@ z-F2W;PUJb%s+N4m@_wf-<{8!YiHy!18zoKUD&EMVv`-L`j6ciCFcA)z6uYg5dC@rZ zNm32bN~TwIdGlgpET{LrW8C%n(Uoi5@HF+D?rHEL8oKNy^giS24vJDzZH5o%-3xZ6 z+KgOr8wBxYPxxmn^3KuIp~D+2mG9MwFEW7W3#FSD+@y(^ zUSUfEI2$t71tWv7h#s&x$QxBHx?G+d0AwPmEIdgos@I%(kEW-afFKU6+LaJg#3brO zC}MI=|777|etacGcDV2C!cESxPkU(8N?=>ZAvUW^?{n1*B#oBB_p;*FH`=ivg&##b zAJQ#BeMr3_`UVaIAJ}g0(f7jq9dvlB-XM_ke0AS1-pn08i#i`E4|p2^+XH`m9A2FB z{gICVjn1VTU;qGm=>H}kl>Z0$_zxxhzeVI6kmlB$25YZX+n1pru zf!hG8C%nv6$U|U#=c5Ud4QWMsTXbxphm4uvCh0_;r8b};?1Q$ueFNv;*U$RGh}_u4 zNa5*_*cgLw>A{|vLoN#2jjPlAs0Ah_2*H}h>$UPA(`wpl;f zc!2%$I7+DoA6vd|mi51?>AyXN(;p#~Ht?`>adx8nkHtTpLfht#r{MYW6b8OrZ^h)_ z97z@Sq`9j1!n1(+Nl5CzAkqAcsj{c`;kpFuW*_uo6B1DXAQkcBh`BC=>yPS05VWZu zd1%W*SqjnV?3j!v2=geWUW;!hSYgvf*d=Zyw%+VrT#?ejObFxv`Z>-rp~WXtOSZbm 
zzoUFR)b}@=*rs({1@?9tQ=e_1#*#AtJ7UuA&BDhvN&p;o8y%E{Xx*u1GHz?e)x;Ec zYQU0n#f%-OvZsG8FsWrD6-|tMV30;k?LWT478= z%rB(=3``(J5k!f*4^aHLa|{{vg%fly z?kCf(sd3)Zlpw`~Bll&Ot~dO80#U9t#)#R$lXQT9K!U+Enp+9|wjl17FbUq``%}1+ z0Ta#^qIMiK`zQkAA;_`oK?CY}#9~^dy|hF_Nj)&N!d=t{^ZN?gqS6__*T?G4L#9n4 zm+UL8F==d)SH@=5j9f!?T%4J)2{5ZVhX}VQXcDbT>Z@o(8+Jxk6OmU`%41E@-`>?& zk=cFpHVs&Ip*pn!-B8@`&hzN@u6v?JY_8l{@E+~2-C^;1hWBu3@p}4@f4RR63?VbS z_hZS~x4QR($(0NvH?Sb;pkg2;>0dHu2#usw>syLo;Ij|3Jvz_EF!CfBn&aL+S7W&Ix_CxMWm;`I>5v&nDwzX0EXs^q#pbf^_7R zeP{l+M6T>9meO>h+AT_(!d;`s+~It?>%#3mRgN|r)OpFoe9x!3g=XsD_l$sy;+RX^ z10^pV%To_1RJnjcvJZ@<+eX@*du|uYocuG8rc3l9N(*yE~)K!-zQhxp^A(SQFpWqLIw6(L954-*D24s?fD*y6g0W}}jF^(d6QSP3LO~yA`WOK6psU)o( zTrYtsQIoffJ(f}!VVFcpq`_(!vmXo8YE8%gv02Ml!;gSR*M|YiNPi)W8->d&`POYcHwM$#gcBI9it0TDJ{# zt0gkgMpDr><5{d|>`i43)@5}zCQHK^)vPQrG%U?DrfcfMi^aorMIqf_lzQFr0K_mh z`=fh$Zzm_}jCivwqT!Y8AEs)2WHG1DR&*zqkHz^JjwB7zq7uUASYdVxe$S(!1`?QD z?{CL7mwdi2T3L~!OgBoS&~aq>bLwg_&^rFhmb{I2$+;(~!Fs|zU_%{kQQW{u0*rAw z!A~E4bBzpjDcz%ccYnz&6&E1S4F&)JFZ|!g{BM)UkKDt7nBAe3Y)cGeVtoC=waxprZQY&YMFhC5Fq~pMiRN!%_M_|nB^A< z4%~d}l*f?(%WZz`Zae`0A+xajDR63?iT zhR}qZka&ZW#RWEjJ{%!E`i|~Vjm*EA0ZtsmUT|AnPoTJm>+`^d)0n*-u)?^HWhbDP z5aZZsYCg}P)*jBNS2hs^$-yRL2E*P#(o9siPi=>DIK(ES8xdIe&Wn4)*o!ecAK@b< zC=vrHha(RGn6}@{DXG3+3&(bs7VOR-J|{>b3kn@M2vLusq$u-1Tu+{H!aHk_ybw~V zSUU!Dm;i^?NE@RhQ#xJor|=QwE-MymM2CRU7Xi@W@-^Oo&=G-{pmzHu=0Lf$g5Kn# zj!4R76U^&Y*OEj3Kpe2B9T2H zlj(PV&ta+3b<~~{(GOFIGZe<_2O(2l0(r5gt=SReJ(^n5$_$@EnR}Rgl(ODnq$r z($};N1*N?t!I)Sn;ci#CEpw6raX=r<*DW__ouUhrJmQ>#jN@2tz6AYr)2@)4)Woz6 zuB&f4V?EBGv*sx|tWRI7!vt>820TAO9W|yTiP;k(Q8_v+UP8a$eg-~{JFP81UW#^3KDNXMZvXl4f0RV~Ad#5$*AM^G?yvOkgF z&YTjWCN69bjL`#(B96tP+43am-wMBs56y2fwa9G(IQL}Omq;loH84jyq1ENIP|1-JPI|zy+IbKb$wrvftckpby1Jx0Jllorv>W%_A+zo zzMi{RN5w50KEkWbEgL*SbIl$aIZ}1y{_QWFQSo{P|I)diiTwI0G}26d2U3zd@C(z* z?o*Q7eLK4xO72l5TZ0zG0Z{k)C`gq;VcAX4bq$lj7!k^Dv_F>(Zx{tkIe^EfQPYt@ zx^?X3$r<%(k&+^;w^w}Hs9y_yK7H!NhPy2L6BRT_l#Kx0pClQ_bUmcVUm(?;CRi<< 
zcbJ+g53nIRa~VzF>iP3{cKz5u^xU{{Fgt7Ov#M?R*mD;GHhIIcfQZ*=SK>0! z^7G=T8~646GUrOl-qXIc6-z~)-V3J{x_Y=EIl6x^`*+Yers~kq zTUk4y>UcZli?Kz&UPKyyjErF1IK@(ZJ0L2A!{YWb;Y7idRx-4HFD8LGSQ~th3Qy6xmY6^Sd*4YOe|pDmN-_H z%+PL=zaU=zfZDvqzq}e26#+gX(CBEYgD587=k@!gp0C%Z*woNMlp$pk07*jfaJ^$zGiq4$2?L1e zLmH6E)6kt;oR4PtPorM;!qjrAWO$*U(p1O=59$fGzJJdJ#P)tBKY!H@*Z+UO>R+1t zFIHWgnf{)c&^~v}WdFjdH+aZBf}ty!6J$X1qXidRQ!KJ+_;f*^6dJ z>^zx<1c5z$tHYSALv#rXfljqET&lgZcPKjDC2?gX!e~DUUd1qzj*9#lnE=N0$jJ!2 z8d*-cG0HIDphVR036{ypc!Qp9VF3*Il6^LHS-`ZTUrQvq18^Ka|2d|6jCe{KAFN%B8ik~QZ7@(4?ay1b*z$DOvZ2I zMoQ>Rw)r~>x7D*pbOd>G0`##t+e8pMS-s+%60Utybb@v9W@jXpi6qUR5Qi2wknzlr zdr0V~UQjHVsm)n|1~zS!X4g^Kii7E@lK~z~ci`xhEN3O`fbdzpL*Kb9vAR%NTcPEQ z%m@Ppo~GpN1(k#I9?>fw!9>sSvI%m{5IPAg;UI??Va`?^41NZB<)@Ap`56q*Zdd{3sNc>r~ZI9yEQL zmuD?~emkPaT$qvUYK17Y;a8l+PG|U9b;52a4NYUemoMmbTGig8r@}5~;P9_uBn^y{ zE%}Wnv~H4&;t-4Udcsv6mgx}fmr&e&sPv06{X!z&Dvk`W!cvS<4$pS8%p^9QywU-X z9zbcOmU5H5$3m_?1(Kpez$ylIUh?rnj!{gQWg`zq>1I-ej08z;8CY-;byBkE&)nOm z98`x1*aL^v+{_;xEhdKJPyy09nv0#&5Q?qpl$H4Fkjt93g;zy2ZUuw$vlPkA_R%i`h3qnD`L zGHTcS0VtXiDjTOEc$cOvw2I6(l7@W2U19{tiF(tOWUo&04d@ zW!RZB9%6k;h$&ulUXqxOqt?uhkv1+>x@aVB>I~+9E?xB%~%G@c}ToJ3g zcK6lKcGn+(*3 zlx%HMyEO^AWT=%+byF$p&?*48KLV-HrY+yhGuS-TJRk$pOV7tcE4!*;{qXIfe@c4m zq538C5o|DhHXir>#d^ZmEjwkTc(&Yv-b>byJ&zi;AkTyU(cbDDRa909$1Y&o@;F{?$j&F&Ydq=ldCts0`9 zQq&{OlZ5^Yd4mZQdsTB0UWe}?A1D4VmSbqAnMN(fkpfnL=NyoPw&NsnpfB=Xk(Koj zT_y^G9SSlP#z{eLVfZqGo5)VnQQYDKtyR6~ea(1$R;Qzj3QkL^IUQBh#uWs{5Ya}^ z(anY_Rg5Ww%rP{~tCaT5mUbaC?9DUgRM@MO7+8XP%Yp?}ixw4Q7XNGps;Zi+{AY9d zSo6u~zcvT2Q3hcD-OK?_+1(uvK#XEHB=Lye)5@8V25*zFs-oC*p_O1rYh;`u&2?~$6tGh$aYlf@#U7iMgQk+ z`7h1>7wcbjynnF1;kYV_;`7xP6zy`oqXU7QB4M6l-doMtmZsy1Z1l5~sHjG>o;#M_ z%(I(mG51^&8%=a}`&I>l#LC_nQkxEMkm+f5&uw6&JGUEObq1`Z_G2JRuWXmBQyR+ z5hd^dQ$(LPG7K@|HzUJ63j||ZcFH~{t>e~&5frYO70bqfA3c%>-y`%v9RfL;t&agH zI51sokdeG;=!@*e(970vHz?*HsT0*tP6BmL_7L0h@L>dCsg-1owP7c`2KaUb{ls4)<_)+dhai`edcW zz2deZEq8Pdr~?Wq8*c=HDr&@L?py&nL9x@_3mOG*62i})#J?JA611E-UNm@$ZWr-I z{S)>>_;f#l@J6&Z7f1X 
z_Un#M{|2@hnl8i%*xbMvIm-@Ex_F`VKA~Q&7=GyE%71#6?O8pQqtmi}Kh8w6s_OJp z?H~_K1z#F8IsR7N0wF_GYVSTx^X_p16?DcNyxt{xXBk52SO4g&d4rPxQU1-#4k5dv2x&-88FslvRo3!57m%rHLpb)-2n#+srvP1W-ZPa zOX-U`s+btlz%a3KGLAy*FRUA(H1``aWBZ`wz{t@ldNX(Y$nEvQ=)HdR3V|F1byxCZ zlf0Up7pmE-!yh(Mb*J_p;nwC>4IV*ruN)XMQgg@g|Cg^ubWIJDs|hBkA(<*;b}n)l-XSsO+2k0yrk2ZrsxTTRtMw<4qCAQZ&$Ad#AxVqs zY!z9ZCe$;MDzI9z89Iz7m{&sk)h3qteEIm7%foERCJ=*MtgRlW-+72CZ4_ylnknbk zFrB1^F?2e7ukcZfDuVgBjBxwXPMuvpxB1A2Ri&a`hq(!dVFo_N@ZaDWnkx*4t?5iB zXd*Fh$0IP89*kLtD=;-Ce_D#FGU;80W>lK~iKGJ?Nvkr|XK*B04qMikR@(Gr%#J&f z60D&!EwC|FAI$VnHAPvdr<9r;Su>PAZM7)==DSSraE_hW@jfSC(|y2>wf0=GAC*doccc8KxJdnVnyYtW%KPtcUaljnyQ+5pz%^oO?N5&tB}qWSJa*GVi|ia?%lNI zUf8{A$E~Q>bV^mCqsQ@O@vl|9&_K}}+^xb7IAgw{*SeC|jU8+aw_H7Md&~Hq`@c}C z`CJ>5EPCI+G%ZoTXDmo_2tn?A)zNt;2h`X>W-~Mjqu5{KdTv|{a=tfoXUGd>7@pDY z4@7NWfz?+L;yaWxq67m!d zPT8tDGO$8lb?@Tg_Kyfrvta2zRdSbTl4vxf^glKAQ%+N?E=I;bMe@aaFqZZJ<3ARDh*O(3mI~v_8kG-NFwV{VJo&VsdHZWJAX$6kXTTqG~4@GOaS zcA1eTo_NM%Juo7Qa27pK!WGF9rWGN#8J))WQ^ooOplu~23n@5cB{c5v!I6V+Q#+t8 zw6x?ro<7J={rw{0=)u5<-0>oPu;)Lo`;Ym(*f@FWg$euUoqym5U+`uBfS72N@lU*X|pEG8hW9mTUXAoaV^ zz5yf53FUzaU(DaEfIOTu4R0R)$A%t=k1dX!(-=k@qy?vy11_vqlV`N3Cm7-sF8lm8 z9xV>MEFz2}*{n_Nd}LhLf!MmFaxzJMXYeh1yQ>0@hk<|OStmTpf|OPxpgDGRgeo^- zsIdcSauva*U#DZ9;g7A6@cLblsgz6wrPFT%B0B0w9AhHg=(;QM_=E}sN9UL;o^Zf( zVshlB``-{bU`bPe5%t6hGG|?!89r3FNxhfwUSbd!_hw%s;nC@Qzq>GE`M7_Y-tC3X zR_KXNkcW%?bU9XRb&uXP@Csk9JB*Yfai8K>mSB^}hB2de1+AwwK}+vS!ka`ZWZ|3l zLVU0zCI4{=Qf|+LFv?>tq4%wQva2s3+qQAM>ftPZcrmA|ye{PiSv)yW(cNOt(mtJ| zdP#x#UdO-Bvphr2sDWP`9{Fl=JS+<~3{?IFZ`yH{=d8mpCi^0(+)Vmp$EsaIFm*>! 
zdQ!ehg@{_@7}F690YE1CKqajfzW}7iwSfQYy#a1~)E-6c9(#s$qjMWB+Z2duTxh1gtXbI(*tsuxiA+6JfWV{)aY_I&m zhcYoudoW}#NW?-Es8UOZXo6#mQ9GxwpEO`<+(od$eUTh7q`F{aHZVeQ+l0C=v|74P ze!8SLIn7hKAr@^Jg(1k3itY~O)>VO)w&y|{;&SY&gvWjc#)+U#ckljPHYg{?G zPnAuT;~mD|nw+coO|{gc4hLd`$z&26&Wg#dT_H{R;G#3E1LB=Z@EI@NldLJrCac=~QZSQe!M2gkjo_#=%)!k)) z;(qzqWsu}H^&eT*<2DT+p}b0ajU1u8zJuak{MbEa;GR1=V?my+WVr`-%uMGjZpn=hi!x-0 z#D;E5BF9H{Y|HY%a=JFoUes0qVKnS;Q`cCIZhNoopGD>9(fH7ofbU1;K~=p^sdk!Q zn?v>O5UP}vUXx6UrJF~icQvGPyq_PLQt5?Cr_OTkWaq6Fp&bL(S+mAt& z_bB-})D`j?0Op~dfp%1U#|wvKUAVW%MOAOYfW@ZiFEr2LtMh=*=9k(50Uc>SO3~ur zNd!331xRL~_V~Y=`*d}y;jTXwm38=%s7-=v`Pd-WXjK$girmP{#&8LnO}g{T>mA9T z$8Z^%P3G$Rmx<*xDWl#WFL8rdNNK5pcy0y{s%6`B9{yaK)^#ys!_<5U_jbTNRKD)naESl&hZTgE&$`nQ>wJg7;5CmDqNe5E z6s)H85JIhMv$KGvYvW;GzbYxu?;QZ-DlOl*h?G?9D3$)>$`P8b$KGpS81v zerl?gds}d-R{OBJ)=HFO?j(5!?;6qJaa5q2oatDZQAl~B?~NsG0e}ikjv#3lDEaq8 z#q^pFgqtpWfz6)7<|n6tjp2b@>#M_%j;~V$Hbsk8R4!^ZD%|YcghWPa z4n;sB9XdK1JlPc$T^|AH)itf%C-ivLHSN)%rxmt)J1^zBdKULDl4~k@sPMd`-A=sl zt|{sv)fHX<6(Zru@SnEIJM+OUsP7Aa_sOwu`NypokI5B>wiqueV>$H?@89X-z~4i3 zg%4X0?(jTK_WG&segiX*gpdp#sfeQ^M|D!AR>Q6!6;OtIP3QI1&zeTQ9{^OC-f`It z6kmQi8O-BxS*#{f2fpYi!Zm!^)KIYPJcrV}S>_%?BU4cLXd&q6;&2jdC2@Ci98GiP z$+VR;ssW+B8NmVz+nM1?0z9lxG%T6KAFRcE*E8&mGv*oRO$^hKp~HE>qVtW-(U3OW z%y2Z!CnNkTxw zrsLww`Zv?j+W2!~z~|420UffJ7Lyj)cvBe#0;vGbuH^-OsYaS0B8~*tJviVO&9%Kd z2~ov-5-Q0IV^GrpD5P*Ac=&0zr_m2DoGu@yRAMVw%xQ?6<;^-PEd(->$ty!c-*yhZ z2%AV~KP5ki5M)o_w)fxrB}>i|3f}=)jlWSJ1Kl#DElUv|Ejzf^2XSnV1PM}){0=(c z|0&gMuL#B~wZ!4*wS>#g@v~NE%1ka@?)NRVe@~oovJf>C84MAD@sn4&f^Fx)sM7j@ z-$wP3f}kU%&IDspc(ki#n}*yO>cHOIh(gY{D0@b9%PDs7QTJ?OBkA0F5|g;`8J9sF z3wOo!DrQot-XMoK+8gv%ai&^*0yf#7MzxVLu&Y63w~WSqal|3JE(-f~grG6gKDD-BQsS@O5ix9=h|CTu9#>4E~U zG1n$ko~L$rO!>LM-2qfS<-9V{Xj5AuP|`+p8G#MHYF$9YR>86v$bYtu#*v$yMfh2v#8wRE@FqHb2Fz9r>1wRkcWbLaf_~nmd1qYJY6yw zovT#&v})=NjXIkW`=|Q40qQPNr=I^KjQicEklik1PwYs|H3%x6ukSSoOHOS+@)=!D z?J%;6d;j|o@`5`UYL2<@HH;Bizw(|Z=xb4b#yzA=-k6hHy>20R6qtb{2+fX2;H(BjZXFTSqJ>v84xPN&0M 
zF;w*)I$~Y&+13*6=~kM8M9TOwIqO9&4OTr$>qzfLBW!{6_6gH^`wRst{r+SnV7-FkC$!@-R`$tX}()A@8B&n2FQ z?_cbDdsk6YW(qe>J|#~t*8#py!X6aMEfyIY9Brxb{d_w=<+G&8Q%jOMhV3mYXI4!% zT2&!%Js~sC+GW7jc@jBO-vZ)A1Al3UDvgZ{&=+^bb8#kdjv*sEP!(4LqjHx+;~M#gk%$0LlMa9j zgy?DM%~|6>mQECA1c_G_4m$D(kKcji`TP6Jd%Prla}Ggv^w>2~(vIiC2T?4MyEBmU zI$MBvhI9ZA&ING@bRqT(CCO*04jWw?a6+L+MU!I#0|-V@r<=~1C<(A=piiQQltNID zsF2$u-LKX}w!P(q8PhL~oA4LU`-qQj9CUXaMHLy+0^Hq0hf=}>7;4I!lI zs3{Cu!jxzSdVh#H&3}i5ZV;U&H*PWb5k1SE)gFo5pg$XA3P?p$EnDO3mMrO&g0Y6A zr7OvR$Tqj4OJ`@yU+<8Xp%+v?Zwv>TD6>Wh zqIaEvsc|67?!NsN(TCoXH!Kv=f=^YGOfB9VRAurJAa^m6Z&m;t!`e1NC646M(kLk7 z$Q0%e`K_H}b*!kT8!c1_3s};_aIn3sU)I4M^Fq4H59ur*4WcH0j7^>=V@$9_(=BPf z==b6TSoF*?Yn3wwAcvlDSO}yPvb(GHJzou_<0e7WwS1y2Dtc;SBP!(ER5*XLK2e(h zr@Ek}*`$wPY)2jYKwHcI)!ti2#no)lgSdNe3mV)dXb2jd0Kwhe-6cqZyIXK~2?UqM zLU4C?C%8Mn7lW7yDH{{HJH(6_ihzuk3KC6YOow&SN+8~05NS9chK7D7@wO*`*ed^Tw!$` zpYCaas$lv3}_5w{U%mWXp>LZ0siSASXcprUJboGK`u8}*JX5V?lv z``chujM-%Gz^e#F3whEV;CA3K~(;DyBl&Zkh zd1|Ti5@#Lmij+K%GC8*EknRr;4d29KlLfa6Q|&LXRhPZfb38JT%au&Oy}TbBmIQbVu?uxiJMLZxNa(&&k{im^e1Py*NKMD&?5L&_w1PBNM{r`Jm`7xgUA7OE@=lIW}aMaA>4H0m&Xw$;f$@2ie0dr$^ zDWfl6p7m5a;ppI5PkO~de#st9Y!qC~0B&z84UeKIAe&`|O7=&DkZ+9krhQJZedDg* zObV~r=zhejvym}{_`Vb-tv7QCLecn3+h>~Q=jv&RnJ9YLQ$z6Jc{JZrmEB$L2ZHmT zxt(LA1j^H*VDZoSXxRALj74ToC2_n8N%xO@7Gal#a)lBt$Qox3arn_}F&bTko$Pso zPeSdR^7T%_Af)f6%U1f2>!HN@BA&r z_GMKmhcG|Icbknbm~gAsekRFb$R*#tUNEm3N7Iw8b?R@KcN(yovmTJ+mcR;e*lCB~ zQG71f;1C%aO!3>*robZh=vOT!H;p8d11<}rFK|=B&3mtsP^1eCtmLfgfy@Nayb3bP z7Q;1?dj+?G*y9h60eVLz9pn3qBe6_GsvsU6%RxsxJN7-#%g~p!1 zB#dlwzSUwFZe-H6Q72zz=dGYMMb^ABv=TeQa&7WhMNKm)OL6m(-d!{^s@>PBTQTht zP)fcMgVs&39Tl4}2n%)^UWA~Dg4z2Jw@1`u^PrKWIfuh!^c#6c&X?dapG4e z&oE(ipd?KSDw{j~sQYu_#vq760-{t!f$_zz%g|@nV08AvYs$*m->n@nRy$c)K8^D` zZ@UuicEZD9$jOl!=!J5jSMcVE2%bn&di$`!$|92jTy4iN0y7!i@j4L@x>hRQvk##Q zSmLY1({De(0WL8~L}Af~GP`(v^<6t#q=`vh;x;Z(;SioMwR=pykvonNrUAY-Pl`*i}8m47q|UP^-5(1Jk&LEFI62+;yum z_d^y?!2&{-F7*xsSN5`ns8T53l4fEKfz!Qd@e5y-wdJAi&cbIPWb4kd;n_J}8Mfu& 
z>dqoyAmr`Vv*x)zUSYE3p$qHg(Bvt>5AQv$4#NshTs5)d**IP?Vd0TGmgg2(a}s(e#zfpsZ^;7P?6* zPMyk6HtnxPuY{L`Ihr3!t_h_Ly}NPvQZgE(g6GzGitpn>;SB%AR#7r zfy#!e^!#=J`}w7if)plsrh$2u7tg(!Tu!lR$Q1fX>Qb2K^dw~^1@HM-&JX#*QEH1b z`(Z~pb?Bt#?>t&W{9>j1!1VgLno^iZ^+HB!ZtxytnQBvDV^}6uLU3HokYC>G>6&v+ zQW||-(?tN;_wroz3aQXAU@18<&#kdw&vKV#uPTQ@WL1F1p~@*5N*ac$9M)&SGhpeZ zxL4{mi~$!NVEf3I%$kwGKqH?3CUGOB69c~H{@TSaM<7!vu?}5%#%>`Kf5{MG?Ts<@ zoDWTOESQv=v|=mhhPoTC#D3^WG?WdE~Zn+EbSjBk+Rby)8Ls0-%L`r94s$ zJ=q{a;gfV<${A;@JZ-+Zsn@!rPnioByi7Bs5pOzXRg>Kb&6^IAv*P)JrnGNsGrpjz ze%wS!n#o+NphW$)`ujVYhhz1DR(W16@()2B(S0dXIvizE1uBS!k1|Tu(X2o z7|H>SHLWY*-Fkz~Y1e9Iw1Si4hOlOVxHM{K8ZNA9=nHf(<+wd%=1o|kf$lX*ntM0L zo6QnhD-Eyd&k7aa2O>1}YhIn|=*(b%>xv45X!>Vk>#Z$tp5+gUhZJ(0sV77A_a#W3 z$AhY~W@(io6*lW{v=*``syn<$yIIf6Mlg(3)<8lW@&gnEh>R^Oz6xnu6CwzS7p` z&FOCg27#;8yR(Do_qiTA6QtWg8s4qBG@9Mo#WaVJ_7(NWJ?_263wuTe!_vas7e z2TbwvDq;*T%1B7sJu_L3M;Ltgk;HaCk(DF|4B#UFf3Q^jL#F@Fe8s_@^XGhJ-*)C_ zZhH8I(~gy%u^7bw`}|j6Zu+OHn=(>!(4?N{H1J^a(}~R!2fzrfViX--2r;pN8cTv0 ze_Uy)eOTFkR&Za@WH1Viov0l88;^14PepJcw;ZpBv&%g3HB(&*qvTK^LAp`*iX;*W z%Lct%W6$L9=e!Mgvf%7ca~{*zVqWk{N!(E(@L!WiYj@57m|#H-0`)T?A$S-ki6Zm0_!eSQ>=r}}Ei1pZ3pZVGV5fj?LwgFiLaoxM zt#QIJ+NZ*370~RMiPqr~_RMYoHi7iL^{*WaNz-ewked|y9W4XjJ|Y$3eA-ac8e zbiTVkA3fTCE~u06LJ*9YLHN{(42R4~=&U7m5fu{BW1MYlZde-*Ex%lk4!A3sBE7k!Vqy(srY{3ulf52S zJ-72aeHaBcHQ0j0q>F~o zUBD9eD~u;-)H!N5XI#UEZRW$)OF$u;{IbsoLInM>uBm-qDY)<^EM~K^#J*_c3MxYZ z=px{kC>X1RzMx1x<5<(~4HemowgDmW!i|;?huEK%)gmb9+u{<(0n6*i3+m#ZESC!8 zRY7}zt2nlLEq(|B3QScmR^>ryrH?op#39;K6jb6d7$sfq3IXWFyzk8=*+?G&)zGEv zvok-c>U>pIzYOa(rNh?2DeA7NZC8Id1*%LoGfxVEOqSxn5V=_R}6-Z2BB5JM#!A>oKnpFyyF;1(eg(b|PdqgqNR z>sudGMl^`LbcyGxZPL;)Dkui5AM!EI0yHwqmS~O=$O(&D^#~aVi#qkNoQqex^x~aU zz`{Kmom0@l)s}34(-O6nZFJ|k)vi&hV`s>;Z^tXg;ym+U;ZBXt4PfE5_hZt`O=J!M zFLGv(+6>3x+06vY6%26UFjqijKdf1ynU(4|a(zQ8F~ozyvR{-a=Jn_LlE&qRTtXwt zO7>Ab$m3aXSQmsvu5sPa*7jbHdCFQ2VpJm~oOY)%hhBAqE^|E;5@q|sc&d8G%G=$_+^iqlLOie5w zE=W-nG_V&?^v06e1R!~xT5^evL(1@qrTUUhZsXri%-cs57UjD}H44zIXWmtda*d6A 
zEzr{9a4Tc5uWyCfTIh$Ta=OADu$|H(|73|9x864n`Gpc;)MRd_J#XznLkr(dz+Mg`z^TG+JK3?RUU0lF}h~ ztJj--2vAy8S?&k|G4sBSy3%CE!I;+caM!8Dc6?XPyjz6I zqE8GM6i}3;;=hP^jwmGdHWq*l$)n7^vG*|^;f#FTH#Ty!kS=C#6HHij<-DFl+ zyJY2(fPmoupymT=OHlBViDrg4YKEq2@AULm-{9fqVdBoRH<_C|nFy|q3Fyr#riV(s zvUr>pn9nX%SJV_BhRVZ-TFf@Ev)k8vvN8QsVZkyBV6SCol?JZbMgv_IBO$OqIg?5w zd7YeAWqrIlDr7y?U=YoN5S*3!}+Lai%m_$h&Wx!0`Ks}n-R=s7fw!P)I@ zyqD+w^}5z8ObpV>Po1zIz|u@;<+{Z*0+eaI;Q9MK`S{5^nXVAY4mWi-h()Q$;xnn2 z`6eS0<6g}4A?Kw${0LE;>NA?f0<+URh5z4&D1Ip2{~siuI_@5ZX zuSMHDwC_gKCGy!S^5{Jy8IHm8h}C4X0{Q8()z6%;hn%MH_ULNo_hqlixU+?NE4Iga zna9E~$S}zX2vZM7KiwYtod?IigdN^MrCXuh-`z?qFq6gA*GZASz7AZTjw~~S>@fKt z7ANB2fr@e6zj~;-*5vEA=^Qzb07XBN8F+IYQ_*OIEzB5-rq0uULg{a}8uB8cjXo8d zxNCBWi#Wg_DJ^n|W4!3zKKXvy86SEWS%py|GbUErScvicQLsr0)`%LK`6_+enQ7c} zNR_Df!LpLNwJ6eNK<$5(Wm0xx6VnMg5M=(!ev)vy~pFjv)#C7JQ~GDp(7IhKv~&P2?@BZ}g$YevdSMYKB8!|rJF_EV+i@^HWNygWZ! zyWL#gO%GVnk3=>>KUU=O5;tnLHVk=?ZqE!p0`NRiMLluf9mE|*!kgJDsz zh__3?MO-=~DmHk-IMl7PP|bb7QlLgd(wM2087{6}TZ!o2Sd9v_E)Szsj0%UXx2uU0 zrgWjSanetb=O)PK4;uuV1?lLcw|r&vH$qT5c?TsL9gWiuuz}1&5Fw(R@}?6do3Zn~ zwD7gSAxO+NFejhjCp>}@vcqAt2y25&H0;={>?$&*wNR0r5wB7+k^$@?ec9PFb`%$? z*8TdXTwXOmDgTEpOIzN|vy%bp$v0K-&VrZ&jJPcffy$wu!K}uYA|Yg&dCv@ZzD*u0 zBNBcA=T#5$})##D4Nt|0yCgeX*p7dHw~U zSxc&PQIh;Xw8rV0TJuA-i`?dDVVuJ0nrIqr|HW6JSqq;RjfrY(k7?2z(5yx8`I@}D zf@_h>37JgFqgjj6X*Kos_@h}1Pn}&v`0;pwZxTfdrVI?H%q>&t#&$(6kti=D|3#WF z(DVR7L`uqGmV06OC z2~S3Cao&O_oV=8_=5yr%UFij-DzG?bN~CpjGE6lTrmM)6s{O} z-p>cxn%2s3u>R({*2?TwYEfqawzu7(Xgcp(KvnLM%j4BL#| zF^-#jjA09dLOO|M;%X8+j!8l3laG;!vL%jFb|ao_r&3{lypAE(u|z8> zD)0e%wTz8TRdeWCr~tiM0^@L#C*BvAKmfg3hUD}{(GqI99=%#rP0dx$t7?~kUM+$W zp;9!d6BCbKEye{ECN`5Zl|Zi+Bswd!NV5! 
z`(Ul$k2LK$ort1>nkK~(-Sve6=B&@xUiKm12DdYO;#;|$R~JfK@)bUNhu?SQgb|>PfXhF0=K0&{+g*X=% z&tkXW`YP-y&`V7&n@(3UiU;2Q=<8Sw2mC2zZ?4b1r@iOeF77ABV4}V%j-hy_VJ$+h zi%e`3bd6SnS9!R0k&&PXXE-@st&E}^!SbSM>%u7&X3_Y05tV{GY$OPL%#&2iw4-}C zRFce14bQFYyj~@e>D&quejDY;%6W!QDynj+ABQgFD$!vvmxH1;7@Wl@1HYl0lHo`1 z4|G!_8QSE)d-p1AgshXZmm+47MO24FIa?GLYt~+6DWQBwxRV;UHptB%U=76-iLmWQ1?BIb#;XNyvZ?9!boR3oOlPnUvq)s2ni5;gvLsw?pADB%nj)KYPMVU{z% zvf$*aai;cyw~JW@+|sMgj}HB?DRh%nBHKCbT^g5AUuXc7y`>%a>&lbZ*Vcei&Gwjq z;dEWeXcQ8Qn78O=<|n3EmjS3^(-uD847o9sq$`c>vz}Mg<0Hvt8>|l)9g0{<%h10H zSqP#3_9{jF{^%2@!}+P`RJ5+&YC>IQmzg?3Fb(aTI(=z*oRH4}r|Y^E$9v)83cak# z{Jqc<%V9>vTSDO>a$CCuE*j+U-8qjtC9;;>;l>57!EU zD@>5O!8;gUKmxe`T*m0i#W!LL0+<1ay^bRY&_v(>ACAHcou#?!p56zloYv?|l*GjY12 zoz`QDP?eTSaCst@hU)?s0ITi<;p0mcW{(Nu6?i<-q1_8wJksyGOO82!spJ+%XQn&{ zz*KTJ=SEx^E3f~!vjOJ=Ep4M^$Co$3c??I`bzW##Ih0X5Z4s_}ND343MU@M@?^Mt} zj>dAW;x6+Au}p%>CHpjQi!;ZJ6dUP393 zntRnF-*(`kHt*Pnd5)_!4}Bd!=(RN~%E2b}dZbeX;4;H{SDJ02fKzBvNhZ~L{`%15 zG+R1Pp{rn{x76i?ZqaLor{L5?p|{Y*mP{sY{gJ=Ncz?| zj|-a&?<99m_vMZN|3I`>Fa=1;54}HAhDVWU0y#ct0%2N*Bg>pE;ZvnSqVl=&ZFih* zi6sy0!X(pWKO%=t#`QKi(@?XyJ?vV7j`zK=a+p_mRCJ|zprUIy0A=2O94Par$}4?J z!88vGD~0rj(g}T^%NiKpozZ~ZMP-XCzA3$HXf$s$gDoeB(5hSu&RcC*GON8=czartRA8HC z^lohlB7hcI#u{umz$7~q@>-}dS|;|n1>qJBi#`?L%_noTpy=I%=ZL}*Z-s4Rgm_dY zbkDmaz|516Ti*4k9;XV#gxJWDbI#e5lO>l%7yv``q6x{JJEo@I^lV6MZ0r_c%Cnfm zIY3ng6-{g8a=JfOdp#t+G#nX9QSDU$dh z`8U%Oj+G9NlK+X`H>j=*y_Hee%;n_ry;f)#@yc@E3raMDNv(k$2MF3(6_@A-FEi>w zSV3kP;g|%hGUA7zFb-~4K3b}*Zmm}jS9ib*=eRk4E7E}+iYVAQr-GgUs}QbY_*SJw z)|1qm@|NV5ly-q$XpM@D6_B^`uB{~`gVjPZWMa}6|KUe=TEN7Vv(rBFf(Xf>QQT6)a5{3O z_7b?hU&!2VAta}u0zo4aj*i3UF4ZljK>f6C3#+m;HIe-iMO+0ww$i7 zjD6Y!IiKBQuw((}baqj?eG5^ssA#uo{?gAnn570qZBn+sAM2;Do~|<^WgebtM~mQ; z_(o8+#VE9>cMDTx<+Qy5?qgc82+AUofydNErC@48=TBzbwDS$iOAPiz+xlZ3254x-vwxg!p4HOD_r9cw_}&VNE<}lHW(_2<-a`n zV8&fID^}e2=7_}Wc_c}ARj!8l=BB;W%Ni+y2s`q3^1*NRB9d6LN+5#+zLp4K^xUNo zO(>!?HaCh7XvZ{Y@c3DkXOTp8PJbbivCVv5)z8EsA<(Q*TsSFjdkZnx$MdlR%O>i> zOCHZN9^x5JLrzi3k+t99h1GV`nQr8ft*)w+RQPQvmaAH2 
zvzT?`($Rz$k-!aEPUddSwx5r~xOL8g6jh#eSKaMf9?_ZYBs!d;*qV>dS(2q;WM?yE z@r*m`ltxo#aMi7zMn@@@j`$~6rQ~U)koDFgh!`;GYXsw1**N#6Vw(3ifEcxLQ4CU{ zYG^hk?}Cyy)H=t;W^y*xespFL4(p1b3SJWaYHHnnK%AGLz2J|#?b=9@pn9k9{=*-}~oXV^C?ZMdHdZT%GDn6;uLp8)T8F$nBaEGR`$aiRJ6Wfcj^ zq&R8{RCUAK2$NcZx$n-HbU6%kG+NuWH5c1anyX5O8s<~6oYsi}hie8i&a~`C&yz&M z^d{CA*cnbjfDYsw+DE+IVvR%b)o-;A)^@#=*k&@bF6 zRAiHRVjVjfWo94Zs?CU7xg6s;uq(E~gzv-lRq(DGa5eTvloUrP%Dx5HVSAk+OBB`7 z=2MI*)si%ChkWV{aB&vbP1%zGKe6KPFitJ&`Xt{%hs||Pht73wfs#a^#8|G1um=SW z&-fMr{7YA>31c}Zco(C!D#9ETIP_yJMkfGItHbf`0FW-ObyWc=JG^#jLT5zQsd#e?NGR-~WJMeg7E~f0w;3RS-IlL5Y27 z>Xl5*!Z-sCM^75jVSn`Mt}7y^qhOfT)je;7-aYbK zf)n3H0&+r1GbNW~7-??>?IMd$N>Yl~gDh4`a|jng*sM2;&7Rw(`)#au)v4k4h~90Q zvW_+9DEzPiUia5fwn*tI&C1-8-?E9j?Ij zyxyXnTT#O%%RNKdwm{bz1kOUzNX&_6Ezxz@9ZC9mIruMw2U(iFsN^0Ipf-f|&6ZR# z+#{K5dvoSqT*WSbT702hNHqk)BJC64!K#?xT*!>b_<(NgZIb`G;}saiR}qCosDv0e z*138VP6Bk2yq6dx$-M;<;+DliGd;GY@g#Qz@N#P%>PJYrWFI1-y*mV(6}+)wS^w=n zehhZ+CJR!2WKUxOzKIxTrDXfdzgmvPixsCeaH{$DUYMpbc8VxOqugI+1}6h^N-^%#OXTSW&P-utuTGAllh z+vN1%AQt*7yHaJa$_Sb`tD^JY{xr*eQG%M|o zDlMUTKlM)4DbbJOl6i4RX<1Cn4KorNp7U9E9j{SeM1l>x3q~f*8>_1Z!|utMz)!n_L8t&!;B(1#F-Tfm6?&fHMX3g%&#bxg8OKGRe>v{DA?vy&Tz#S`w(ifgh zW9_Dt$`d0GT*bBHlZ4;D_m?3sMZ{!&eJ(x`LSw$g$$Pm-G4%e@JHTkn57yFA5v z#NMLiQqK4Ju6&bDFbX3f3_(m$JC2D=#izF;LLYJi;<0fD9@q%Q0BrnNAF);{(pa*A zu8Z_Sl-qlcm;`<#3``8l1&lo9$7j_9JyqTFy~IN1b{QFEf) z1Wb7e>-Gew5so0!fqNjm?yECC)1c8p_;DodD9LebfB^B6n9U(R)|S*Zcgq?gSS)TTZFH_ijhdL z=F|1h%CJKrqEC0V;-EPj_9xKE#6QN^9co3_nbz} z+#uv*C9cg`mv07u)_;9=Pd-~+66MT%!j*ZB9F}4E7O^>KTHWIVMy=o=asNR#RICIYxAeeBT5Uj)a==u(+O!oEl-x-7d?K4g>wWMN&SC>J2BVGGnZ0 z3%NdaCY3}vMS%WnOr?dSVh(T;#N2S{x%3h4*;gK{CoLDjHdyy|19DB zoEfSN*sgG(cmllyV-JODnS2-pbG;?yML;SF$FWu-=+vFhEAg=>_=!iN zC-Nd-Nto8Wa`4)*L|s3P&r9(kmxaV|M-ej80fFfz$#G|b@#Imja|NYjU7^zTjpi?Z zA$+uc5EL(d%R)60-Nq%-SUdC7^t+M#!b`5m+AdFDG$l@ElhMIUry_;lO+;OM1DR;VN_)Ow(WKkOf`hepYB z%35CE*k*!ALGzapfP~pbca0*5fqlhR#EpG*R-^&_szYT=W6Ue5Xs#CU`iK=qazJ^O zp_=(58}wYDF?MCoGMxTf^TBYycc}7;0*67oy6&><^Iee-kfg2k7X|U|QNsYiF9=(C zk&Isy+y 
zTW^kUez<1qq8C~G-ZgNodw2{f+!LxBP*}2PPG1p;j@j5S=Q0IkA_U?DGLmp<{3SWC z_$0C=b%qehtTOrnb1>~x>sLc<2+>^&Cqp=#vTvsJ)(fa&zX{dqm$G5pOgO}xgLPug zsU0LHh?{c*CSWJDKZYox2dLF1-i}3FB;l@G9N^8{yUjSvmp0JejZuc0? zZa;AXa}rv4CKosiPX-RDV}T=xCHZmzFTgE*E6nM0D26gcay^#5$Z<_QA-jVCjZ0)e z>Yd4yN$xUzYntmR{)0+o*F48ZE`{3Exj%Z@++h^MTW<|ydJI4);4J86>DhX%3XL7$ep0-^R;}k>T1=)&YYLw#Su2_ zSKsj7-k0kk#MxgtKOpr)zd*#r<5->S$VEzO%jf)Tg=D$QFU)Ztnjb1N=80ajuO(e# zF=nMgc_8Krx0Y4qs&{X_OraV6ZTCZL;O!YVM&RlYFI6nhR^2&O*rLc@Ds$ztif)Q? zhV}&IRPjy*(%Uwu%FJ8zug~9lQhD(VyV|P;@=2U|DZnW67PY44C2f3mePQ|PUE97I z$K0!Mz1(}D6U%M<9@$SHak(R_!*?|uiooXiKGXk9&MOrb^9cuTCLsEsnO}KVh!7Tb6Q#! zAEGV-2VLZpXz`z^7(0N&_2!|NlplF|)_Xy8tx$fZLRD3cSRR_+*RrvdvvLkeRPaYp z!xlb?X4-p`La-YXxsxkn;3(J4oUnHrh&QyCw9yvB2Ht=uPw?K~7J)ARH0qdPZ|q`t z)$Sx7N6bZzQV-cV~2u%q_$Bt1X@!MdwwFjoU$7N6Peh<`=`sbsXyGnc14LvZ zaXM2p#@9!hvw6g4{TZL}##}^g_&^V=%X&W5>S2o4JrF+XN&>r8^Q(b^Z{TEpMTfc z|8~5iiGkCvj*scLoCS7#OU46{wUs>cNGl4h(J;xpy)2Q*1+B1zkSG#Uwn35BV&wPo zRBWH6P|8;a%c;bH$$439Vp)vF){y;iR1z`KZ%vgj_AYi$bqAtQ{Av6_^WY{5ThnT_ z*k4&ybKSQ4ozbslYCD9y+T`g`RY8qgip{Dg$iRqfP(DWM^7%Y*q>%K=mZnlk- zPTi=S2XuX1Ul*Zo-v`kJ!4eN_6UA?BDn3?jEfLzh#P_CIdTzeb3Ax&C_E)XS%5$x(JmW=F;6q3=iuVXU$OXgs6A&M%Ox+P94CG(3 z%EpUjKHyj*L;Dj}C$D7oy2;tOR7VHyErXldR*>~t4_;0~4p*M-WVoMyC+yjQBiT7$ zEvXm7S=m7`lcx3;`SKDLDx$9{pyOq^hLoRgj=4k^P6+*Yy1c)m#+7bgb36paEzRv~ zjWDxy#!(pe$o999M!Z?O<}XQ*yR+T!>Axig47`s-x(R;KP}g zzJndnNgRQ!vCK#IGjX+F@h8$jaVLjdM|*jE&2miL?+k|*+l)zML6^z@b{8QbB_4_t zO2y9$S_j{+jW(@w`PS8z1%9=6-W)@E<@=7|mn6XQcl%Rl25#vT*^4{CD*|=s+YJBY z1Oh3DMhMNL1RBsULe9tb7YE}-{JGgBGll9O_X_+%a_KoHQs5-1VT>Oh#?J02N^*w- z_32n$vq;)~yl?@+u5j+o#M|pbqs$BGx1cm)SQ(|S1u|hezf}f~k$Cp7 zXWzNa4MDwYz@KL^A{?+S3fzPDcso(;he_T$1Va{Gp1Z%qK&MoZMD0pK$=pPxW-0L(?6T}zaudckmfKgDPf33F` z^#;jJ=fIGZlQJhE_Fxn_`ZZiDC|eS?HzvC0^astrDn$8{z%l9r{50Y4Wl5{|No_Xx z!4+vk)qMVwfj4>EW9o@_MG#S?oV4wJZwO@##acZ91kv%==ax+*8VVy^3?`ZyCH;6J z3^*E%Y}O--Y)6Ny^DiH}4?y`|5Cr&TZ2F%%i2g(e4J_?->`kmJe`KsB&Rgzqpme8e zFB13hN(sCYmF+;ah&Qdtqp{y2+*Zf!hF}Owts#KpopD;U>4C0>`E)PGd+%oG 
zDV>Srq{YrzH6cVz?X2teveW`KOjz6{Zpy1!(zU{ZD%3eBVTr7wPH0+7q-74HLjXWBcFDd(#<^lxkxFnMWDz{oaYWSdHa)Dj(Z7+z73$ptZgJP{E}5qE?BbU(Vs zp^|?`;^PtEOsx5l7Gm8(v-~n~WG~A?@5mQf&I0{qc}}yDG!|sO%K>P9J3Jrh2p3d@senvzOw{mn&@6MX`^&{qPOnyBMl%F`_-~E z*o9ea3OsW#Eq(UUG*VCP_2tk)7n+D0KSBf1xrdPz=E4g{^uUT;feq*AsX9%R1fS6W zy+-yM%~>Z+DwT|=ZLG>+mL>yz`@pYr_SAH>h6+;4{wF2O*$YxcacxDh(}4J66`#;g za6DZzs-#E9>UFPLInO>*1Ot?cQms}U%egeSo_UTJj&`aDR~Tv@w;Ss~Huz5S+wrM=VXBGJcIM*3&T9eIe_S(!`& zEL9|itiZ4|ia2AEG^Ji1DVH@%I=LqgrEl1vo`bul#i<^=bo2sJSC~0s{i48mX#@kD zZOi+;x`-R49xTY|Ln3H8{Rg=M zN9O7QhO4ao-DM+Ikxne&IhzHVDSPTL^f#4d^00k1F2Y(X#bLWHuXgo$>JM)`2Sabg z7`7Gzo2E{on`XN|^1(LURBVPlFx{~*uoC3uMdj*KfIUmZQP`)xR8kie|17?-1;{P=1Ht&D(v$X9Q2S>&7%5-%ZKkHx#p zp}>Y+8+-J6wrBj_MF?b2hw#j>EZM%M7WfR-k(2n&R)p11f`i6b?V-+!tb$fOaI7h?6 z`CsGef6KJ~ylr@#CO;h>e#Dtd#)hTa8PTDZfK_Va7QTQ{qjf9+60CJ}(Vs6(5dd^& ztIqJ>E-XMfn8T}lo-%l-@oI7G-{`k8ispr)?=><_347A(N#gtC8!Ni%McgU($)0yDX1p?30%_%t|I1v0xTipeGT$ypD z9E=_Fg^O5J))V21+>4{We`=R#FsY(Kd=)j#d1h+}PiI315P*Rc&kgHXw2lHW=M3V# z5X3zv_iTa)f-WU+7z_N2gC+~PhLjn0jZ*i%8S%3G@&$A-q6q2U`#D5d9kq>11X^tA z2$88IrUN~xKz$dqHTojdTEnx-1L&5LZ@hOoO&?UCbsTBe?x`SUb(q-JzPT65+DhWX ztr1zzG)b@IHhh;+z5Q`}LRHN+!UP| zc(E=R{`4ibU7jtuV3q1;7P(tnif&b*6DfVg75GdEY{? zQi{6CqOMA}C6PITVv5=I7P=;zeHZRck%HO9A@>|R0KPtbg33k^XKm-yg}0EE8cBX; zt}iPD7>E*fD;%n~BpA{A-O4DZM7SY@?;uT~Jgf6ep~kV*ds*4}kQxz(Cnn;8qb9o>tk{C-Y7V0c|}S??s;RtgyLpL!P0mgD20* zhzV6Lz!uIijq#JSiitd@7R#=)hZ9MXHqYKh4`oe_LLHf=9F-jTu?7zPz}X=N(5>fx-)pD|T3Oly2QRI6F4hKi8jrJ^f%pO7{U$

r{h$9ft7*Nbvw!yS^AoVzK8_u`OSsJmVmdN zYHX{IM1VHQmqujW@adTo$lcmpS1^`opEW{x*?o0YIg;=FY1M8 zG-{SRs*&0ktEzjp8P9)8sxe!Ai^jxu5#dmF1 z+~4vRMS{L~s|6jazl&V~FUV5HpE#^}yNrRATja_hl-EoS8rbXQ{@hXA%VX5yGBbpU zBBZ1}{pQPU{wtK8gK(4MAL_Nsm|ao_94%}RA3t|cFmw=f5EvL3kSh5W#Uk37;Z)%D zpbU(QJx*+YKdi0o8SIS>EDU}={s@BoiJ*d)Jdy!KF#89>BCz!jgda7R{zQ0M5T5uq z0&T!S{Sm>^%HF{4zsopJe)ub)r#00d3oQORz5wrh{sH$>GyVkqp&@?~eOhk$QS5&m z-Dppc<^Nit{ZFv`C&h{%D>VN)ps@Z4_JrjR_+RnJKXIQ{p8UNxF|dE){_Dn@$M5-{ za8D~Y{@$Azgnxni$@9r8{5#R7wY+{OS_bUg$Is~r%x^n{{(2StJJAdBzrg*2=&y2M ze|Evs>LkAtg`)Ti@)OoSYLWa2_wPi%0aMKX@xK2-^jCqCKaroBrT+B>#f^gy;`Xgg@c_ooMvCzrg*2=r5y%KarpA?fji+ zrovy4pAh}Alk-oweJZs+8t`!TTrGe^;lcc_zPq{2Uff zp#M85`9bwbR>_~8`ZS~BcNx8M_zT<tnwEOWgd=)1QWlexFb)`2BkKPXj}LB0LRu{65RH{O}Xu z=g08Z0MnmfPoo8o5t3iW-pAkKC4Zjy^kxy^*IB@4*Fis7{5Q1l=b2A$H-7&FSVMtJ c6aT!md@BJ7%+Ln`p#uJk0lpS(_~TFi7c}=W)Bpeg literal 0 HcmV?d00001 From 5567370214b100c32254c4fe60755fce3dbe8a0d Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 9 Dec 2024 09:32:28 +0100 Subject: [PATCH 6/9] chore: Update gh actions to install docs extra Update library gh actions to install docs extra to test unstructured integration tests Chore COG-685 --- .github/workflows/test_python_3_10.yml | 2 +- .github/workflows/test_python_3_11.yml | 2 +- .github/workflows/test_python_3_9.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index dc1594db4..30d266379 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -47,7 +47,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction + run: poetry install --no-interaction -E docs - name: Run unit tests run: poetry run pytest cognee/tests/unit/ diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index 0ef9bf910..1bf8b50f0 100644 --- a/.github/workflows/test_python_3_11.yml +++ 
b/.github/workflows/test_python_3_11.yml @@ -47,7 +47,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction + run: poetry install --no-interaction -E docs - name: Run unit tests run: poetry run pytest cognee/tests/unit/ diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml index adce159ad..cac4d0cd5 100644 --- a/.github/workflows/test_python_3_9.yml +++ b/.github/workflows/test_python_3_9.yml @@ -47,7 +47,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction + run: poetry install --no-interaction -E docs - name: Run unit tests run: poetry run pytest cognee/tests/unit/ From df289deb185ce660eb889e3f59c1ae14ea0390bc Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 9 Dec 2024 09:49:26 +0100 Subject: [PATCH 7/9] chore: Update dependencies to handle different document types Update unstructured so it would install support for different document types Chore COG-685 --- .../documents/AudioDocument_test.py | 2 +- .../documents/ImageDocument_test.py | 2 +- .../integration/documents/PdfDocument_test.py | 3 +- .../documents/TextDocument_test.py | 2 +- poetry.lock | 105 +++++++++++++++++- pyproject.toml | 4 +- 6 files changed, 112 insertions(+), 6 deletions(-) diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index a35e3892b..da8b85d0b 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -27,7 +27,7 @@ def test_AudioDocument(): document = AudioDocument( - id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4() + id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="", ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): for ground_truth, paragraph_data in zip( diff --git 
a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index 9f5952c40..8a8ee8ef3 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -16,7 +16,7 @@ def test_ImageDocument(): document = ImageDocument( - id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4() + id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="", ) with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT): diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index fbfe236db..ac57eaf75 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -17,7 +17,8 @@ def test_PdfDocument(): "artificial-intelligence.pdf", ) document = PdfDocument( - id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4() + id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(), + mime_type="", ) for ground_truth, paragraph_data in zip( diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 46adee094..f663418f5 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size): input_file, ) document = TextDocument( - id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4() + id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="", ) for ground_truth, paragraph_data in zip( diff --git a/poetry.lock b/poetry.lock index 9c11fd43a..dcba97b55 100644 --- 
a/poetry.lock +++ b/poetry.lock @@ -1525,6 +1525,17 @@ files = [ [package.extras] dev = ["coverage", "pytest (>=7.4.4)"] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = true +python-versions = ">=3.8" +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -4655,6 +4666,20 @@ typing-extensions = ">=4.11,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opentelemetry-api" version = "1.27.0" @@ -5885,6 +5910,17 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"] model = ["milvus-model (>=0.1.0)"] +[[package]] +name = "pypandoc" +version = "1.14" +description = "Thin wrapper for pandoc." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "pypandoc-1.14-py3-none-any.whl", hash = "sha256:1315c7ad7fac7236dacf69a05b521ed2c3f1d0177f70e9b92bfffce6c023df22"}, + {file = "pypandoc-1.14.tar.gz", hash = "sha256:6b4c45f5f1b9fb5bb562079164806bdbbc3e837b5402bcf3f1139edc5730a197"}, +] + [[package]] name = "pyparsing" version = "3.2.0" @@ -6008,6 +6044,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-docx" +version = "1.1.2" +description = "Create, read, and update Microsoft Word .docx files." +optional = true +python-versions = ">=3.7" +files = [ + {file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"}, + {file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +typing-extensions = ">=4.9.0" + [[package]] name = "python-dotenv" version = "1.0.1" @@ -6085,6 +6136,23 @@ click = "*" olefile = "*" typing-extensions = ">=4.9.0" +[[package]] +name = "python-pptx" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." 
+optional = true +python-versions = ">=3.8" +files = [ + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" +XlsxWriter = ">=0.5.7" + [[package]] name = "pytz" version = "2024.2" @@ -8141,18 +8209,26 @@ filetype = "*" html5lib = "*" langdetect = "*" lxml = "*" +markdown = {version = "*", optional = true, markers = "extra == \"md\""} +networkx = {version = "*", optional = true, markers = "extra == \"xlsx\""} nltk = "*" numpy = "<2" +openpyxl = {version = "*", optional = true, markers = "extra == \"xlsx\""} +pandas = {version = "*", optional = true, markers = "extra == \"csv\" or extra == \"tsv\" or extra == \"xlsx\""} psutil = "*" +pypandoc = {version = "*", optional = true, markers = "extra == \"epub\" or extra == \"odt\" or extra == \"org\" or extra == \"rst\" or extra == \"rtf\""} +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"doc\" or extra == \"docx\" or extra == \"odt\""} python-iso639 = "*" python-magic = "*" python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"ppt\" or extra == \"pptx\""} rapidfuzz = "*" requests = "*" tqdm = "*" typing-extensions = "*" unstructured-client = "*" wrapt = "*" +xlrd = {version = "*", optional = true, markers = "extra == \"xlsx\""} [package.extras] all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] @@ -8498,6 +8574,33 @@ files = [ {file = "wrapt-1.17.0.tar.gz", hash = 
"sha256:16187aa2317c731170a88ef35e8937ae0f533c402872c1ee5e6d079fcf320801"}, ] +[[package]] +name = "xlrd" +version = "2.0.1" +description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"}, + {file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"}, +] + +[package.extras] +build = ["twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." +optional = true +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + [[package]] name = "xxhash" version = "3.5.0" @@ -8765,4 +8868,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "c9a760447a62b3c71fa84f20a614b6d3c5725b3869fc87f78b03eb2c80841ce1" +content-hash = "c1f30981f79db94213a89aec3207f0b4775944968e97dda8aa49c3aa143ce7b5" diff --git a/pyproject.toml b/pyproject.toml index f03789833..0bbf545c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,9 @@ llama-index-core = {version = "^0.11.22", optional = true} deepeval = {version = "^2.0.1", optional = true} transformers = "^4.46.3" pymilvus = {version = "^2.5.0", optional = true} -unstructured = {version = "^0.16.10", optional = true} +unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.10", optional = true } + + 
[tool.poetry.extras] filesystem = ["s3fs", "botocore"] From d7d559f4f7c985ed501896900da404fa4ab1360d Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 9 Dec 2024 15:20:50 +0100 Subject: [PATCH 8/9] test: Add tests for different document types Add tests for unstructured reading for different document types Test COG-685 --- .../documents/UnstructuredDocument_test.py | 62 +++++++++++++++++- cognee/tests/test_data/example.csv | 3 + cognee/tests/test_data/example.docx | Bin 0 -> 5950 bytes cognee/tests/test_data/example.xlsx | Bin 0 -> 9210 bytes 4 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 cognee/tests/test_data/example.csv create mode 100644 cognee/tests/test_data/example.docx create mode 100644 cognee/tests/test_data/example.xlsx diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 56c3c827f..7f6e20ba0 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -4,19 +4,77 @@ from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument def test_UnstructuredDocument(): - docx_file_path = os.path.join( + # Define file paths of test data + pptx_file_path = os.path.join( os.sep, *(os.path.dirname(__file__).split(os.sep)[:-2]), "test_data", "example.pptx", ) + docx_file_path = os.path.join( + os.sep, + *(os.path.dirname(__file__).split(os.sep)[:-2]), + "test_data", + "example.docx", + ) + + csv_file_path = os.path.join( + os.sep, + *(os.path.dirname(__file__).split(os.sep)[:-2]), + "test_data", + "example.csv", + ) + + xlsx_file_path = os.path.join( + os.sep, + *(os.path.dirname(__file__).split(os.sep)[:-2]), + "test_data", + "example.xlsx", + ) + + # Define test documents pptx_document = UnstructuredDocument( - id=uuid.uuid4(), name="example.pptx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(), + 
id=uuid.uuid4(), name="example.pptx", raw_data_location=pptx_file_path, metadata_id=uuid.uuid4(), mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation" ) + docx_document = UnstructuredDocument( + id=uuid.uuid4(), name="example.docx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(), + mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + + csv_document = UnstructuredDocument( + id=uuid.uuid4(), name="example.csv", raw_data_location=csv_file_path, metadata_id=uuid.uuid4(), + mime_type="text/csv" + ) + + xlsx_document = UnstructuredDocument( + id=uuid.uuid4(), name="example.xslx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(), + mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + + # Test PPTX for paragraph_data in pptx_document.read(chunk_size=1024): assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }' assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }' assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }' + + # Test DOCX + for paragraph_data in docx_document.read(chunk_size=1024): + assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }' + assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }' + assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }' + + # TEST CSV + for paragraph_data in csv_document.read(chunk_size=1024): + assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }' + assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \ + f'Read text doesn\'t match expected text: {paragraph_data.text}' + assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }' + + # Test XLSX + for paragraph_data in xlsx_document.read(chunk_size=1024): + assert 36 == 
paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }' + assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }' + assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }' diff --git a/cognee/tests/test_data/example.csv b/cognee/tests/test_data/example.csv new file mode 100644 index 000000000..004bbf122 --- /dev/null +++ b/cognee/tests/test_data/example.csv @@ -0,0 +1,3 @@ +A,A,A,A,A +A,A,A,"A,A",A +A,A,A,"A,A",A diff --git a/cognee/tests/test_data/example.docx b/cognee/tests/test_data/example.docx new file mode 100644 index 0000000000000000000000000000000000000000..81ff7f2d4f7d9ef9950390bc3ec8e00bd01befe9 GIT binary patch literal 5950 zcmaJ_1z3}9+orp_yQI58N}ACKgGq-7jBccr5@}@8I64(j8I80cC?SlN5`l@7^gsMU zUjOfXuN}{h-N$x5d+zg!`?^j89drx|6dW8Jln`|dbCer~kNj-y?dawsBzXN^lLpsD z7a@6sIHR!Lh0U0erj8t`$1KE8VAG_{pi5$N8P|ZpJPCpsrbXFWmp%Qukdbl3B%c?P zy>NY{s{w_Zul**WJc23fb^U$n5%L|w1P)^Rp~^rNN#VE`ay73Q2kdL@iVg0y3@||% z-cIB0;2*u1_6sXHamH#)*KP8{3jAhP=#PlMOk;$3G_Dpa1mby@s)UgF@|3$f0J#KN z`PtV#M$VPbZ#zh+?3FjgtwbuG>NqFpY+ST>l+Kq{v@5fb9*u!Ja-s z_F!*E!9aJnCq^LsXkm)55CX!VClr4A ze^4mfcqoz2T={V(`eQ2ks*CC!z%WoLD!Fi6zAZk7quT6y zu`BM7=UB1?8ALdzutp@!rh0~ow^E^sBx`P>vbayh)JAKDS^(v7`|Tb|8i=u~*MrTW zand|~A4%X)9=CQJWwEBH>gf_@R_T=g^_#K(I!37NzZt& z)gU3GY1v%3ddtXp2pJ|vf}lDbLCX-{QR_v>*G4|s?&b$=XFoBVPE*xw?0fWZ!-8?H z&ifDOuf62qG?VCzih=_FM=z1w^pdTo=e3jaQinZySV-<1U*O?8(%Wy?YLIemseF2VpV4^6)PFkDm5KDJ!$wN(fGc%%dAW0K3tUfMNPe89>-(q zkq-hqPo1^WY~KVMpuL+QP?{j0roOT+A8Umbz_X>u12ZzlAdi9yF|IP9N7c6+@x}3X zlv6b9=>~I%>5D8+9G7<#(KhzvN59NC@J*fspIbThyOBOY>;E1DUSj%?KKRK((oU8= zkl@?E*~3Y3P@k6caVPlcwmD{UDaJ9Tj0<1XDDE!b4=b_ZH^e~@K`M-=2Lp1N)*bYH za#NzrW$FMLCkw}%3dRvwd^DcjPyAE#Yr;7U-=2#h31|IB!pUzEZtv&g3wHm-`Dg=P zw=hNQ@_ZT24BP}e^-EcpprsLaXUmc)e)k;5Y5=K`2D#J%L9Go1Q*U&~V{5!uM6 z-a}r0pY9CEZv(*I4nqGbi^$CPb9eOcz24iuN@S`Ya)TtX$N6mU7fMY@QcFG5Joye$ zMDm)uByk6Mf~fIKVRYv*AI5g}{qvt-a1w)>L^eyzwN!{(2rq@FylKXFzkYPI(R 
zjI3~L{3Cnm$uSz$JbHr+1dc>Gri$XzXHV}L@rU)~nQ+gr6hOa5o#|sypmV`Z(avj! zA~K~OchtLs9>mgDjVjR5r7lH!Fh0`EXjdltUYF!RUfWq4QB;mEu5 zy4y;{&(=1m^YhALwMMa*sRFL)I1v)=>_u*t{2$$XUHyL1ag&H0poI@ylH?(xuCrb& zh__djT2Ev2g;$;fm!5d@^JyZCC^owk8zlpc@Gxo=Ek?zazI<`ODmE#LKQ%0I{%L`( zLU_ZF4A8(r#85Z4a6>MRN{yrXnSO4E;D@;Z<;?CoQEz!R-&uSb3iA3fyziKcW3IpAMSOb076VMT(4> zaHh|P%3R5ipd!Rmc`X^UC2(fLOV=8K-P!QeeKU`s0#LRJ7nCB#jFN+G!+(+CRk0ZP z&tn{O_*e0vv+Z2X*11{l=~QFbn+9``*v#8;bScO-EuW3h_v3Al!)ihyPP{XN zEUvt=tULx)9=S_-j^@-U5c0G_JR84hokHLX<~sRC8-OuDo_&ODYTn z{4jw)&QQwe?ZCRjj7g7ecfw;`)#Q8LcN`-r!#MFU^}rtOLl_*Cd7nN=MDMVKjhzBv z_i1RdMEHE4@z`dzOjwANEW*ZN9=^32>an#4RB_cP=jY4P48)9TQ{>zX$t9g!_%RKA#K)%0Wkc?f=aJeo5D_Y&M z6wuIJp0zE)qTG)Ly_e~)v>+r{NFy&-rA^<>oB6gq>Q19*yt!0-r)&)fa8|Uq<|{9X zwxABn$PCuIlYC&aqN&*!k0-QXI;82LUQ>Oq4#Rnn_RE~^YUojdjstHn0~3_Py?KW8 z3np+jy+fztHVxt4QcHUo;WK+T9~<=+<(fSoNcVW%HqkvF4IyY@N0W<~gR84G6!u`E z@KP=^7uB1ltl583wFy$AtJB@|?HSBqRaXThnU=W|ib}w;W+zjFUksGtd=Shx<%SXy zLCgD^#DSK)D|VaWeZk?mzspH++M^U_D@Hg|fJO4g!Hg9nA|law<>)AF*&T|jPUl>t zbpkJbsW(StBKNEb-6};_DV>~CZ=7OTZq_xBLp4DV4UYyWJp_A*)xaR$;Xb zFZ%KCqL#loQd`O8#4S^A^y`eJGfT2z>H#E=<|N}X=$A*Wt`*nvEGJpz1WvSIZ0H4` zOMmTDjg!|_f4G=2&(=I%B6*9Dw@^QdQg5>}AVF<^Eyji=;8wj+Ri#2+yz1SB`Am2a z-nlt*Pze9}z=&e=j~c>Hcng5&K&c0Ag+KQ z;HTDAr#{j7YNtLEElv;m?S>`qC2s78KQQ1w&Q`efw{NQv+i5o^jwPwQ8?0bq6l=i- zt56Q>0Lr>mIH=#*#4xQim7}Uz)5eW`uVDRxZXz;$L5eZ~xqF6x+UI-zE}kF%8~-jT zuDL7hOs_12>kiQ&p?NwrfN!5wkFOFCCDJ+L;u7KS^EwZxD(7^2H%9<|Rkma4AKf+3 z@#%Yin2HRIT1m12)GePw+W&mz+ z%1{Mr%wk;QBNMkMzdK)7TZ#rfXb7I3K5+*<%C>tj&S-5p{$o6w8X*oB2!^$Lc!-nk zihSK8N>{)At;!6f$CxQOc_6URllcPM%Y;rmg@7&y9QeWQm@X$w(8bK#tXw^h+9wSo0hKBa*r1!n@8Ed z$aibo1{!cl8N2YXO@2nzhrg#F>8EHH`y7+KpBQd~$;7Yh`68V2O;QnQwBf{4VC3zF z(aK#cfeB_8+WUhJa(UBu-J&Vzrxe`t7^jZxS$GYk=)6$zE3w28GsL&^k1ddO8^xj+ zKNIU$@f8#xKj^c&dL*YVfSgf<125R<+S+?Tba0S>MU)#;;|#^LRg*PNsV*twEASi} zZ5H*>b8SmScc!)@>ez+#*Dc&vXx9fQ*MXc88a4{bcfNn>Cy4)Zfcp3bxjFh=_Yu9O zAn+`5cv!59I|rcS_))q7{Y=W;D4(D1p(pvMw#qg;%RvvO5wjfpoNCbd#&)GpkKMzz 
z4Flrz|UM2y;$VUAG|2ZsY;R&GI3;zmIfMr_3;fm9h`c=p4WviSO0UV21|b9%bCj$21O zfU9LL{Etk$SCj_feoY5;drv$jR8B#=`;TI{?|upRd431_4KOW_vKhp_{8VgKk|r6V zPD5}8Hp|n7vj$VnOfoD-bAWMzUTO@acZ}N)FW0g#C50ZCYHcMt=u|9`d1!|#V27LG z_@ACU^JQr5a$#OXFm6FCr|Ebi&YXhom-Okq);|`^T!3%IxFS-`UQ1lkorG9fTR)lB zJZwBqN3C5XXYm1 z-%4p*q?Qa6`U2i=2*qy#reSLek8bZ~Z?ro!)@7`_g77C(&Qg;u*Xtb?|28;_r^UxjkeppIg z!Qn|8?9TAg+>VTS0o4z;C9*ScNG77jAr%`DUC923vQL{!)v_Q^JVy}#-LmSU4QZW- z8LcJrq&%4^nlR74+iF+J)wstABcG9^Iz9zkrfk|HiIf-?k{}bu| zZP9%keSPnHIRC0Kjt0Yk9wma-eai8Lucex;#if+l3Fx^%#m;%@9qpMz*B$YUx_+&4 zr4^?TCwZni$$}!j(PuNp)g#`95VVq#zJ7am#UnVJ+qhoIt*|7n=`G9iky9rLlO(pG z6%{G9+DEgwYDSI%FmkLVOwOc)rVRAB<`Wm}A-$p16boq4s{qFn8}y$pQ!PIdc_>sf zh2@fpH!`3N>|-|gVca3o+c9ZIqhpin4WaqAo3ZzVzQ&NM`B-@kDvV-iutuJn!A|@+ z)o~!M`KQr{R;z^yq<#{(kbio@gyqk%5aG_pD8VJRN;N*VVc$ z1IeWDwTh51V43lzUI~YDC#s*Uh3A38ww7xg^UL{teouABG%P4L6obGm(LbbzB*)lkfI8WYMZ5YAdPB+PV8gytc%$z+e5_x z4G{&k#aVmAW~gm+PwQ?;$wm;}k7II3TT~s)^3!CR3Y_yc1f;>5g^KaIt7|x&QHRCN z?$?bf?;PXY1#Z94fjQ9fK4xz`sZ)dr;9R zP=1S*H#NFzvGT8WL%RGEcvCjI&XC{6j{K|tC+PhNzbS%TYfHav5LpHP3;(y$^e6hJ z)NmbEzfBh72Ku*b^5+e14(Hd3#BU=*lJrmd#GiM%*`@rI2Ue{AG1~tLzghRMTbbWx zNBD2}@5bg&@Xh*leKhaSiPKra7B XsvGEFA~``p!9%{nkZ!J`xL*AqLx|wr+xJaCdiiOOQZt8VMHMT^e^8Ah^2(m*4~m?j9UMkj6W>1-I}zbKiY4 zlbQDm?mfM#&ZmndZbkWs0$-^ai$wg#n7-NwINOx}TQ{$E1kdUt=w!*^~58TRG8hV4ZHrRsp zLAOk3c)4d?`OPYp;Ar_Q=GHXMoQ*Y;kHPsCn;cvewa*H^yFsxkm9{#QiCT-xU7A8C zi18E8ypid%RnD;Z4h-G<;YG!3+K_7O*wm5jhYwx zhI>+Wh%WCkK(~SSY%IsCUjVtUV)%36dW4w4}8inKyOLJUBH@wDUk z7fw7JU2IJp9c_PFuYY3(25Q2fT>i6LrJ9mLKPP4f>W^?vk1TgQtVLH2>O;+g=SU+B zOw06CJpLDJB+RY)+FuknU>(D}PR57aulX=GVR0^dSjr-?;kFuglR#d6d0>I&(5`-6p& z*Mx0P;T;-;v_gUW1D}|0m@8%Yuq;IYQz8gbkkO`jvf(OuKg!g#{GD~|;>2QmWN3a3 zy3>Cb89`rBPAXUc009vIcn`SAj z=FFgf;hz56)0$Az+|(r2w>Q~zN5%R;yi*Y(bD5fv^Afb z$c5n*Pw%hs@`Kg@?JaIini`A2Y6a_88HGe>^AZpI`S%WsW5kuo*u<2+XS6%E zM=2&lcT20m@eH2Y>9p3EZ7Gz|uk1q|GVoaDb*DB84Ff7#uzWO`R6kwoJnR@X=?A&ujNvaA-<_ub@zQR*y;Kifv%e#pHZREv7V{h zq{9ZgFi*JGO~F7AED 
zR3Ujdd3^Xc#R|Zi4v28!8bvVmD(knbxwu*0D^zT@2iGP=KY4!RT7&Umo%RRhiCx;Q z$jc^g-0+YI;O_Rr%tZuxQfOjj8iXeX@%zoW{!>AQ-HV*QW5NgKfgaJZVf2oTEIS3a zVPGOX=yYE&=8+SfCS5T4%CWvdGd5VrEBBspyYtzMD}IYy8KEq@>~mhm?n$Dv+2Am7 zZK;5kbYOsl+B?a&b}03PavO%)F_!GLBtA7{2it==eYhAN3vtdexf3hoclAZ4 z6+&~N2ZNy?`QW)K%#aRl_d|)qndzuu6PcM|h6B#4LNDT*b4IiWO%1pM&nEUSV_%Dh z!9H!0h8U z_hwGe;rRKm7!;8|aXcXRljb_BKF=1Ut*@=NQGKms1M7y*zB6ls7&)MR5H4-C5sHS` z{FcxH*rH&UPe6373z{{(cw3Ccq~c&5Y2cDAJihf?CxjfhnJ3`ZF+Yv?Y`!R?zn9-I zgK5Q5)ZbV?goiX6YbOTZ@`Ay4LswE=Qtw%+6H7izK8JDf5BV(lN4qb#bjRrbG}9ge z%uZ#f&KRWp87BR%H(agE?aeuUzjOVZ=%LPd6o~+C7s1^NG?2#w-&P#m>iVQz5}4L3 zCz-Ud=@9sahdZ&I1m_u@(4`tE5#GsJ)4)hMlV4#j>Nnrqj-*2G|NR^7^X0PL9ShM|QYH zWqnXse(&#sWj}&6^cD72+AcqpLQZIz8GisNEa6kPg<#Mcee}@lUYhs#L!@LOtLgYG z#cEZAG>JC`!6vFce%w?vhv+Ht0Y2sJDjQr0lIYQlYg!-b=<@5f`syX$h3={ylTj^w zH_qh}A>Z_wxn{)1_P4sFd)y&t=D?qRLoW?M(2;V;d*W_UH1gG+27TY_v6-Pn3eoi` zHiRvEJ9K&QsN2;vQwn3n8=O+o^i}vE#Ub7pUG;80zfJ2za}0lV%25Af3)zRb(1kpW zfZ8w$wAS@ks5KHy;&1!Hb1X_N4;J!p4fXmK6pjp3n=fotrR7{cg|UY=+IX}q;hFXB z#^S7^g-CQHGED;G(JKzs)D^X<1t7Z7$K2w8z-MZ-a+@OqXev`k^`D{`h1UC*>1L6z z9mBBcjXr5Gq04LD)QAWDD4~gL;^Zz$(Km|FCQ-ZcH0d8vY=fHA7}XV_(-s7*!AOd|4d+Sj^V7gzw6<4EUoM5O!%^P za|Ms1)9?HDy(EG)zsnt!II^W>QnK_@s))MN>J8Hq`Y7WdjQcZ~7dic$lKqJm?d8R${&_qx>{^)8cY55l*LcG)hrQk=EW#%9#W)WNTqM!OSw2(N$096LRA%2fm> zowr1^MNB2w9<$3=0^0YS97GrA7zoAEBN0bQm%&2$ONkx-Yv`M{#EQoP$D%M6^ z9Anmmyql$yaxtP)V{ec$c{kQ;PWju?_wiNxn!YmIc`;uz8e`}KmX-F*Q*JsK0wU&;~!Jhbx5m;+Qx5Uz5d3`}hwuX03 zEQZoL-!llscGN7>0{kv#Vq2HoK@5_PWtunsv9G=pi@)Q?AtS?8v7=5nB3~OK%B$w9 zaZI3Xu&8ku5(*jtIQSjjkGIk}6}ooE$yr~s_@Be#%Dm~oN!pGXn2e}rc<>k{_kaN* zz{lyG-6soYF`Dm42Yp1oreL-8w#r@Uc1o-Seeu!+8O+~2-G8u{H4v{cpRmn1*B!jP zx|FcVX!i{or9b(;o2$}Z2jQVgJ|2S~Xp{8tgKM9TM&vA82DUP^*uYFs-(>Qov#uo5 zux&ZhzhK~Aikm{yl`;o+XGbbtYIyR{=b09TyRqQCpQ^f7KKG1^nN_xIscg+GM}<$k z^<;V*Bb6p!va2h0vRXBF;J>Tkp_7KdxHwB_tY|?3bRz^BVo2f|SpH_Pn4vxGsy(f` zpO&vH;|xLJw8z8qgj}t^>VSkxEJ=h}kS6b#;*~aJv`1H?Ok~J-C-IbLz^IyV@KDCN zsxP)Wd)c^If9QBs#6}bC`Z0!chBKI8gH>ca^?JH`A 
zPeJtb>Qnt94!4aqNX^VNytcPEyM%oUo(L^8b9w{?>Yv<+NuJd9I`2C8YZ#_p=$Y1*D+!PLmecvuFikitBD=zGR`ua>jHO8TRDvp|K7F69RLUDS|fC ztDSy)7+52#q+KNK?VyN>>wp#B5x^?a*5+VAjExA43US#V=L*WI#Y=$PdTbhvS}n9k z&Il#P(g;VXx)-FI<@WMLRDYgUVWA*YS&fpSbwuD8DQR@ zb4}U{7uiOy_%1b9&s^;AsLbY4ZOh(*fj#df-@Y;D7X{nH$@cy;w~|h~a$n@fkHF;y z9#$kYl{x+!dp8+Dv9oH17sEN9b=_XF9?0k1+>NfZ&s!jR1c`b(1qha^?W#&TrqM9o z&5#Ty^BYC2B!uIrojDc{=v) zHlY@cVvKMd%7s&hUxY&qs`O6Ky}`B9q@t%CH0T&h^A4&90w|3x&3Z^SbmsnrI%S9{ z@*-u;B#(KIlq{x35GFsfPQbNIThr4&MtZ@sL@$3vd3n%e3DiFSblX9$UUufcdl*HU z%MQz&Xnqv60hkxZqwZU*$aVw@>ag%l`7cF`Y9iSfB;7`6g+Tm=YqE>Vn+1LBM+(x@ za~o98i;J|XaIGezA-d**tw>xZALj71vd{wj8H0&rz5q3W`FLZeK{~!EU4BWr9+-#z zW9d?We7msbi6i{qzPgOjHWGLb;rf<0Cd~DGv3mz5l3_V`E+?_cnT;*i(ks?0nfmmM zy4=goU7{QkE8O`lxJ@cNG=Y_%nWPw9A2n}D21mDmKQoiC@mqS)x%e23mvzH@V;kUY z!9ix)p%7Ro14lPE0dQ;n8YtO&BX?BgMK$vqVbhL*cy?*yn~4H7fzhD!mvhZVXJr=x znl~z|i?WdN$Ji^4K!!nD6$Q2i35=qUiC3$IWoCZ;e0aY0pFF;3xH#{E!7a*1RLU;9 z9dXpTL$%t1{h@+&kzz1T+f2hd=|Fh9DWeyp?(qgaMVFlVD~1IEVdMpZL}Zluy`uWI z>OE%f@10#C@AklHLPEuqLWvHS7A-F(zjuVvf+(rc(j1SVQsl%-1D|}ozK`@ekHBfkR>h*k zbWmd3J*mP8F-uGYu-$S%MfmUD^b&S5wjnePpA2=S3I8SJKVwsnmASbqh~xL}cSkc@ zThFmV2GhT0?kUi-nXCDRdRf+;v_jUV;zMcL4xhm?W4GE@!IAR2H4#|8Tsk{~*yPvU z(Onsj-?EGsA*Ic{b=B>RrU_Mc9-?(@xvw?}Eq78L?;Y=_KR3yb)YB+P4{BYBa2%-| zoOJ1RMsBvWJF&C993D;aOW-yXR3_5*938D2_CiQaVpOBwXc81LH*j*V^iQUb703`3 zWQyWxlhMZYhA_(6#sNF4+d~MIvVjKbi_dGbl`B-(wSDTz+Q)peSeOy#O4*=D#(95x&{ zcrGe#NGs|G7*z#DZQaMFVZ9?uB=SLr5uso7)_JFe>Iyr+3<}fVWj89%dGVHm#uXgM z@iL{h`oO#1wL0GB;=R3sy@M%;QXU6Ieo+MXhCjdk4WrGu=5p*(@G-AZir?(le8;MY zon7T$s^SRe^WHT7Mm3ZYQ?Vw`S6O271SBC;Ha`izaVV^@p>qek_;m4*0 zyu#QKDzYFNg-Kez{q!RAkdyb-w;m*#f__<6Aaos&WGal)R`ADMR-3ta$n+rRD=AAh zok*Zms%+aJVT^lBNsZVNpUlf8KH_q3R~ZtRo<&|p(GPw!l=J>R1=MPY_wt558ux=% z;P2J<#0kQg1~L57_vG`Pv5!GrP1*NxlE5Ve^gZ$%>K-u$n45ad5H(L^F z7eSqAN8r9`%w0E?rR5JCmI>!Ky2#lz>-lhs+Xdo{Wz`$o#oj3Kh*ZU|9X$vHAGe&~ zxQJkr7^t?75aN_G&-*XC{!uBR9}VcqLT$1o)cRsUbF5~Lrm8NEP9P3bM;G&-VK1~C z@!!-cbm^iK4Q)d>aR=5A?=ccRVgg3r1nX+czJEhFS69~@zoMHFK{aCYconrItZ-dB 
zv;z7*CX`lGkyO?}W2Mf=Ed|7+eokxaNjHYpfkzS$A!=H_-QYlLRt6jL=Jrv46xTQ= zObg67)%0vjD3SNKax}6^O*qp&49YU(Y$;*qH7kiKk$r5I(M~I5L5w~`)^0UrwgIsz zEgYJvz_FYOD@b3$o6ZJEu}HW%C1jFn=*t!^hQ>nh;J z%Qx$f^cg!j{a5&)OZLAlJ5k9I>H>$I zz}``yfk}kyfoMUMCa<+FYE57g-7S?%O!Mn$-&H0YE~qq3&n>zy#ZYhO{W8|=S zE1HGW^t%8fpLHNNUf#=#o{+P@hzch3MQ8V{?8lN@*^5%KNs{dE$1fko&t;zG9!g0z z(>)BtZ#e>$BA3HXSTBuD@qStN<8&nRPSrB^Vma&HwXNjX8c|h@t;l^^Xuz;oFH8e- zSr}Dy_U~50Ow`P&d?|XFzp;rNeMwrz)=7>gOt-_GBGhcPng}UmPsuK|vF+Xvj+;Uk zl%#usvX5n~y@p^%UFB_XX(d<`d|1=o%>sOY>xHSr7NbSYQIXs zE;7Sc9N@~f-Kp!C9s|aQfHsUcgSH2SHYe((?pEz6&tAMs18pW4Z{|2P!LIRwCIrNV zRPOqG$Xbzpq?45HW?j}?ka}#86Rs_I8E7oyV}H3tR6Y>Ln=@TCa|sz~h3^`=(?9nx z6%0w+`*Qi+9cYktYiP+w_o7<&>HmtkFwfYaDbasEW%2J_`*;5jXD(C~{>tF5_27RK z{_NAB(D+kz_*cTe*2w-$*bW`*|6eitmCvt*lRr3}Ko64qQbzfe_}4t~AH-s4{|e@Q z%^Ls8;n)1@9~>B<@gVf?f6d1J%HY=s@(%_jP Date: Mon, 9 Dec 2024 16:37:46 +0100 Subject: [PATCH 9/9] test: Update typo in unstructured test Update typo for file name in test Test COG-685 --- cognee/tests/integration/documents/UnstructuredDocument_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 7f6e20ba0..418b18810 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -50,7 +50,7 @@ def test_UnstructuredDocument(): ) xlsx_document = UnstructuredDocument( - id=uuid.uuid4(), name="example.xslx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(), + id=uuid.uuid4(), name="example.xlsx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(), mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" )