Skip to content

Commit

Permalink
Merge pull request #269 from topoteretes/COG-685-more-document-types
Browse files Browse the repository at this point in the history
Cog 685 more document types
  • Loading branch information
Vasilije1990 authored Dec 9, 2024
2 parents ce96431 + acf5952 commit 5ffbebd
Show file tree
Hide file tree
Showing 20 changed files with 751 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_11.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_9.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
9 changes: 9 additions & 0 deletions cognee/modules/data/exceptions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Custom exceptions for the Cognee API.
This module defines a set of exceptions for handling various data errors.
"""

from .exceptions import (
UnstructuredLibraryImportError,
)
11 changes: 11 additions & 0 deletions cognee/modules/data/exceptions/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from cognee.exceptions import CogneeApiError
from fastapi import status

class UnstructuredLibraryImportError(CogneeApiError):
    """Error signalling that the Unstructured library is not installed.

    All constructor arguments are optional; the defaults describe the
    missing-library condition and use HTTP status 422 (Unprocessable Entity).
    """

    def __init__(
        self,
        message: str = "Import error. Unstructured library is not installed.",
        name: str = "UnstructuredModuleImportError",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
    ):
        # Forwarded positionally to the base class, in the order
        # (message, name, status_code).
        super().__init__(message, name, status_code)
1 change: 1 addition & 0 deletions cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class Document(DataPoint):
name: str
raw_data_location: str
metadata_id: UUID
mime_type: str

def read(self, chunk_size: int) -> str:
pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from io import StringIO

from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from cognee.modules.data.exceptions import UnstructuredLibraryImportError


class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library."""

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks ``TextChunker`` produces from the extracted text.

        The file at ``raw_data_location`` is partitioned by ``unstructured``
        (with ``mime_type`` passed as the content type). The string forms of
        the resulting elements are joined with blank lines and handed to
        ``TextChunker`` in pieces read 1024 characters at a time.

        Args:
            chunk_size: Forwarded to ``TextChunker`` as ``chunk_size``.

        Yields:
            Whatever ``TextChunker.read()`` yields.

        Raises:
            UnstructuredLibraryImportError: If ``unstructured`` cannot be
                imported. The import happens inside the text generator, so
                the error surfaces once that generator is first consumed.
        """

        def get_text():
            # Imported lazily so this module stays importable when the
            # ``unstructured`` package is not installed.
            try:
                from unstructured.partition.auto import partition
            except ModuleNotFoundError as error:
                # Keep the original import failure attached as the cause.
                raise UnstructuredLibraryImportError() from error

            elements = partition(self.raw_data_location, content_type=self.mime_type)
            in_memory_file = StringIO("\n\n".join(str(el) for el in elements))

            # Whitespace-only reads are never yielded on their own, but they
            # must not end the stream either: only an empty read means EOF.
            # They are carried over and prepended to the next piece so no
            # separator between words is lost. Trailing whitespace at the
            # very end of the text is dropped.
            pending_whitespace = ""
            while True:
                text = in_memory_file.read(1024)

                if not text:
                    break

                if not text.strip():
                    pending_whitespace += text
                    continue

                yield pending_whitespace + text
                pending_whitespace = ""

        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)

        yield from chunker.read()
1 change: 1 addition & 0 deletions cognee/modules/data/processing/document_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .TextDocument import TextDocument
from .ImageDocument import ImageDocument
from .AudioDocument import AudioDocument
from .UnstructuredDocument import UnstructuredDocument
11 changes: 11 additions & 0 deletions cognee/tasks/documents/classify_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@
AudioDocument,
ImageDocument,
TextDocument,
UnstructuredDocument,
)
from cognee.modules.data.operations.get_metadata import get_metadata

EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
"txt": TextDocument,
"docx": UnstructuredDocument,
"doc": UnstructuredDocument,
"odt": UnstructuredDocument,
"xls": UnstructuredDocument,
"xlsx": UnstructuredDocument,
"ppt": UnstructuredDocument,
"pptx": UnstructuredDocument,
"odp": UnstructuredDocument,
"ods": UnstructuredDocument,
"png": ImageDocument, # Image documents
"dwg": ImageDocument,
"xcf": ImageDocument,
Expand Down Expand Up @@ -48,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
title = f"{data_item.name}.{data_item.extension}",
raw_data_location = data_item.raw_data_location,
name = data_item.name,
mime_type = data_item.mime_type,
metadata_id = metadata.id
)
documents.append(document)
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/AudioDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
def test_AudioDocument():

document = AudioDocument(
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/ImageDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_ImageDocument():

document = ImageDocument(
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):

Expand Down
3 changes: 2 additions & 1 deletion cognee/tests/integration/documents/PdfDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def test_PdfDocument():
"artificial-intelligence.pdf",
)
document = PdfDocument(
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(),
mime_type="",
)

for ground_truth, paragraph_data in zip(
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/TextDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size):
input_file,
)
document = TextDocument(
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="",
)

for ground_truth, paragraph_data in zip(
Expand Down
80 changes: 80 additions & 0 deletions cognee/tests/integration/documents/UnstructuredDocument_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
import uuid

from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument

def test_UnstructuredDocument():
    """Check how UnstructuredDocument chunks the PPTX, DOCX, CSV and XLSX samples.

    Every chunk read from a sample must match the expected word count, cut
    type, and either the exact text or the text length. The test also fails
    if a sample yields no chunks at all, so the per-chunk assertions cannot
    pass vacuously.
    """
    # Sample files live in the "test_data" directory two levels above the
    # directory containing this test file.
    test_data_dir = os.path.join(
        os.sep,
        *(os.path.dirname(__file__).split(os.sep)[:-2]),
        "test_data",
    )

    # One entry per sample: (file name, mime type, expectations).
    # "text_length" and "text" are alternatives; only the one present is checked.
    cases = [
        (
            "example.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            {"word_count": 19, "text_length": 104, "cut_type": "sentence_cut"},
        ),
        (
            "example.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            {"word_count": 16, "text_length": 145, "cut_type": "sentence_end"},
        ),
        (
            "example.csv",
            "text/csv",
            {
                "word_count": 15,
                "text": "A A A A A A A A A,A A A A A A,A A",
                "cut_type": "sentence_cut",
            },
        ),
        (
            "example.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            {"word_count": 36, "text_length": 171, "cut_type": "sentence_cut"},
        ),
    ]

    for file_name, mime_type, expected in cases:
        document = UnstructuredDocument(
            id=uuid.uuid4(),
            name=file_name,
            raw_data_location=os.path.join(test_data_dir, file_name),
            metadata_id=uuid.uuid4(),
            mime_type=mime_type,
        )

        # Materialize the generator so an empty result is detected instead of
        # silently skipping every assertion below.
        chunks = list(document.read(chunk_size=1024))
        assert chunks, f"{file_name}: no chunks were produced"

        for paragraph_data in chunks:
            assert expected["word_count"] == paragraph_data.word_count, (
                f"{file_name}: {expected['word_count']} != {paragraph_data.word_count = }"
            )
            if "text" in expected:
                assert expected["text"] == paragraph_data.text, (
                    f"{file_name}: Read text doesn't match expected text: {paragraph_data.text}"
                )
            if "text_length" in expected:
                assert expected["text_length"] == len(paragraph_data.text), (
                    f"{file_name}: {expected['text_length']} != {len(paragraph_data.text) = }"
                )
            assert expected["cut_type"] == paragraph_data.cut_type, (
                f"{file_name}: {expected['cut_type']} != {paragraph_data.cut_type = }"
            )
3 changes: 3 additions & 0 deletions cognee/tests/test_data/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
A,A,A,A,A
A,A,A,"A,A",A
A,A,A,"A,A",A
Binary file added cognee/tests/test_data/example.docx
Binary file not shown.
Binary file added cognee/tests/test_data/example.pptx
Binary file not shown.
Binary file added cognee/tests/test_data/example.xlsx
Binary file not shown.
Loading

0 comments on commit 5ffbebd

Please sign in to comment.