Merge pull request #269 from topoteretes/COG-685-more-document-types

Cog 685 more document types
topoteretes · Dec 9, 2024 · 5ffbebd · 5ffbebd
2 parents ce96431 + acf5952
commit 5ffbebd
Show file tree

Hide file tree

Showing 20 changed files with 751 additions and 8 deletions.
diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml
@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true
 
       - name: Install dependencies
-        run: poetry install --no-interaction
+        run: poetry install --no-interaction -E docs
 
       - name: Run unit tests
         run: poetry run pytest cognee/tests/unit/

diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml
@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true
 
       - name: Install dependencies
-        run: poetry install --no-interaction
+        run: poetry install --no-interaction -E docs
 
       - name: Run unit tests
         run: poetry run pytest cognee/tests/unit/

diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml
@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true
 
       - name: Install dependencies
-        run: poetry install --no-interaction
+        run: poetry install --no-interaction -E docs
 
       - name: Run unit tests
         run: poetry run pytest cognee/tests/unit/

diff --git a/cognee/modules/data/exceptions/__init__.py b/cognee/modules/data/exceptions/__init__.py
@@ -0,0 +1,9 @@
+"""
+Custom exceptions for the Cognee API.
+
+This module defines a set of exceptions for handling various data errors.
+"""
+
+from .exceptions import (
+    UnstructuredLibraryImportError,
+)
diff --git a/cognee/modules/data/exceptions/exceptions.py b/cognee/modules/data/exceptions/exceptions.py
@@ -0,0 +1,11 @@
+from cognee.exceptions import CogneeApiError
+from fastapi import status
+
+class UnstructuredLibraryImportError(CogneeApiError):
+    """Error raised when the `unstructured` library cannot be imported.
+
+    The defaults describe a missing installation and use HTTP status 422
+    (Unprocessable Entity); all three values are forwarded positionally to
+    CogneeApiError.
+    """
+
+    # NOTE(review): the default `name` ("UnstructuredModuleImportError") differs
+    # from the class name (UnstructuredLibraryImportError) - confirm whether
+    # that is intentional before relying on it for error matching.
+    def __init__(
+            self,
+            message: str = "Import error. Unstructured library is not installed.",
+            name: str = "UnstructuredModuleImportError",
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+    ):
+        super().__init__(message, name, status_code)
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
@@ -6,6 +6,7 @@ class Document(DataPoint):
     name: str
     raw_data_location: str
     metadata_id: UUID
+    mime_type: str
 
     def read(self, chunk_size: int) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -0,0 +1,32 @@
+from io import StringIO
+
+from cognee.modules.chunking.TextChunker import TextChunker
+from .Document import Document
+from cognee.modules.data.exceptions import UnstructuredLibraryImportError
+
+
+class UnstructuredDocument(Document):
+    """Document whose text is extracted with the `unstructured` library.
+
+    The file at `raw_data_location` is parsed with `partition`, passing
+    `mime_type` as the content type, and the extracted text is handed to
+    TextChunker for chunking.
+    """
+
+    type: str = "unstructured"
+
+    def read(self, chunk_size: int):
+        """Yield the chunks TextChunker produces from the document's text.
+
+        Args:
+            chunk_size: Chunk size passed through to TextChunker.
+
+        Raises:
+            UnstructuredLibraryImportError: If `unstructured` is not installed.
+                The import is deferred, so this surfaces once the text is
+                first requested rather than when `read` is called.
+        """
+        def get_text():
+            # Imported here so that a missing library is reported as
+            # UnstructuredLibraryImportError instead of failing at module import.
+            try:
+                from unstructured.partition.auto import partition
+            except ModuleNotFoundError as error:
+                # Keep the original error as the cause for easier debugging.
+                raise UnstructuredLibraryImportError() from error
+
+            elements = partition(self.raw_data_location, content_type=self.mime_type)
+            # A new StringIO already starts at position 0, so no seek is needed.
+            in_memory_file = StringIO("\n\n".join(str(el) for el in elements))
+
+            while True:
+                text = in_memory_file.read(1024)
+
+                # Only an empty read means the end of the text was reached.
+                if not text:
+                    break
+
+                # Skip whitespace-only pieces instead of stopping on them, so a
+                # long blank run cannot cut off the rest of the document.
+                if text.strip():
+                    yield text
+
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
+
+        yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py
@@ -3,3 +3,4 @@
 from .TextDocument import TextDocument
 from .ImageDocument import ImageDocument
 from .AudioDocument import AudioDocument
+from .UnstructuredDocument import UnstructuredDocument
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
@@ -5,12 +5,22 @@
     AudioDocument,
     ImageDocument,
     TextDocument,
+    UnstructuredDocument,
 )
 from cognee.modules.data.operations.get_metadata import get_metadata
 
 EXTENSION_TO_DOCUMENT_CLASS = {
     "pdf": PdfDocument,  # Text documents
     "txt": TextDocument,
+    "docx": UnstructuredDocument,
+    "doc": UnstructuredDocument,
+    "odt": UnstructuredDocument,
+    "xls": UnstructuredDocument,
+    "xlsx": UnstructuredDocument,
+    "ppt": UnstructuredDocument,
+    "pptx": UnstructuredDocument,
+    "odp": UnstructuredDocument,
+    "ods": UnstructuredDocument,
     "png": ImageDocument,  # Image documents
     "dwg": ImageDocument,
     "xcf": ImageDocument,
@@ -48,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
             title = f"{data_item.name}.{data_item.extension}",
             raw_data_location = data_item.raw_data_location,
             name = data_item.name,
+            mime_type = data_item.mime_type,
             metadata_id = metadata.id
         )
         documents.append(document)

diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -27,7 +27,7 @@
 def test_AudioDocument():
 
     document = AudioDocument(
-        id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
+        id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
     )
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(

diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py
@@ -16,7 +16,7 @@
 def test_ImageDocument():
 
     document = ImageDocument(
-        id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
+        id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
     )
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
 

diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py
@@ -17,7 +17,8 @@ def test_PdfDocument():
         "artificial-intelligence.pdf",
     )
     document = PdfDocument(
-        id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4()
+        id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(),
+        mime_type="",
     )
 
     for ground_truth, paragraph_data in zip(

diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py
@@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size):
         input_file,
     )
     document = TextDocument(
-        id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4()
+        id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="",
     )
 
     for ground_truth, paragraph_data in zip(

diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -0,0 +1,80 @@
+import os
+import uuid
+
+from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
+
+def test_UnstructuredDocument():
+    """Check the chunks UnstructuredDocument reads from the sample files.
+
+    Covers the PPTX, DOCX, CSV and XLSX samples in the test_data directory.
+    Each file must produce at least one chunk, so the per-chunk assertions
+    can never pass vacuously on an empty read.
+    """
+    # Sample files live in "test_data", two directory levels above this file's folder.
+    test_data_directory = os.path.join(
+        os.sep,
+        *(os.path.dirname(__file__).split(os.sep)[:-2]),
+        "test_data",
+    )
+
+    def read_chunks(file_name, mime_type):
+        """Read all chunks of one sample file and require that some exist."""
+        document = UnstructuredDocument(
+            id=uuid.uuid4(),
+            name=file_name,
+            raw_data_location=os.path.join(test_data_directory, file_name),
+            metadata_id=uuid.uuid4(),
+            mime_type=mime_type,
+        )
+        chunks = list(document.read(chunk_size=1024))
+        assert chunks, f'No chunks were read from {file_name}'
+        return chunks
+
+    # Test PPTX
+    pptx_chunks = read_chunks(
+        "example.pptx",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    )
+    for paragraph_data in pptx_chunks:
+        assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
+        assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
+        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
+
+    # Test DOCX
+    docx_chunks = read_chunks(
+        "example.docx",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+    for paragraph_data in docx_chunks:
+        assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
+        assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
+        assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'
+
+    # Test CSV
+    csv_chunks = read_chunks("example.csv", "text/csv")
+    for paragraph_data in csv_chunks:
+        assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
+        assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
+            f'Read text doesn\'t match expected text: {paragraph_data.text}'
+        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
+
+    # Test XLSX
+    xlsx_chunks = read_chunks(
+        "example.xlsx",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    )
+    for paragraph_data in xlsx_chunks:
+        assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
+        assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
+        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/test_data/example.csv b/cognee/tests/test_data/example.csv
@@ -0,0 +1,3 @@
+A,A,A,A,A
+A,A,A,"A,A",A
+A,A,A,"A,A",A
diff --git a/cognee/tests/test_data/example.docx b/cognee/tests/test_data/example.docx
diff --git a/cognee/tests/test_data/example.pptx b/cognee/tests/test_data/example.pptx
diff --git a/cognee/tests/test_data/example.xlsx b/cognee/tests/test_data/example.xlsx