-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #269 from topoteretes/COG-685-more-document-types
Cog 685 more document types
- Loading branch information
Showing
20 changed files
with
751 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
""" | ||
Custom exceptions for the Cognee API. | ||
This module defines a set of exceptions for handling various data errors. | ||
""" | ||
|
||
from .exceptions import ( | ||
UnstructuredLibraryImportError, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from cognee.exceptions import CogneeApiError | ||
from fastapi import status | ||
|
||
class UnstructuredLibraryImportError(CogneeApiError):
    """Error signalling that the ``unstructured`` library is not installed.

    All three constructor arguments are forwarded positionally, in the order
    ``message``, ``name``, ``status_code``, to ``CogneeApiError``.
    """

    def __init__(
        self,
        message: str = "Import error. Unstructured library is not installed.",
        # NOTE(review): this default differs from the class name
        # ("...Module..." vs "...Library..."); confirm which one API clients
        # are expected to see before changing it.
        name: str = "UnstructuredModuleImportError",
        # Reported as HTTP 422 (Unprocessable Entity) by default.
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
    ):
        super().__init__(message, name, status_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
cognee/modules/data/processing/document_types/UnstructuredDocument.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from io import StringIO | ||
|
||
from cognee.modules.chunking.TextChunker import TextChunker | ||
from .Document import Document | ||
from cognee.modules.data.exceptions import UnstructuredLibraryImportError | ||
|
||
|
||
class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library."""

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks that ``TextChunker`` produces for this document.

        The file at ``self.raw_data_location`` is partitioned by
        ``unstructured`` (using ``self.mime_type`` as the content type), the
        resulting elements are joined with blank lines, and the text is handed
        to ``TextChunker`` in windows of at most 1024 characters.

        Args:
            chunk_size: Forwarded unchanged to ``TextChunker`` as ``chunk_size``.

        Yields:
            Whatever ``TextChunker.read()`` yields.

        Raises:
            UnstructuredLibraryImportError: If ``unstructured`` cannot be
                imported. The import happens inside the text generator, so
                the error surfaces when the text is first consumed.
        """

        def get_text():
            # Imported here rather than at module level so that this module
            # can be imported even when ``unstructured`` is not installed.
            try:
                from unstructured.partition.auto import partition
            except ModuleNotFoundError as error:
                # Keep the original failure attached as ``__cause__``.
                raise UnstructuredLibraryImportError() from error

            elements = partition(self.raw_data_location, content_type=self.mime_type)
            # A StringIO built from an initial value is already positioned at
            # offset 0, so no explicit seek is required before reading.
            in_memory_file = StringIO("\n\n".join(str(el) for el in elements))

            while True:
                text = in_memory_file.read(1024)

                # Only an empty read means end-of-stream.
                if not text:
                    break

                # A whitespace-only window (e.g. a long run of blank
                # separators) is skipped instead of ending the read, so the
                # text that follows it is no longer silently dropped.
                if not text.strip():
                    continue

                yield text

        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)

        yield from chunker.read()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
80 changes: 80 additions & 0 deletions
80
cognee/tests/integration/documents/UnstructuredDocument_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import os | ||
import uuid | ||
|
||
from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument | ||
|
||
def _test_data_path(file_name):
    """Return the absolute path of ``file_name`` inside ``test_data``.

    ``test_data`` is the sibling directory found two levels above the
    directory that contains this test module.
    """
    # abspath + dirname keeps this correct for a relative ``__file__`` and on
    # platforms whose absolute paths do not start with ``os.sep``.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    tests_root = os.path.dirname(os.path.dirname(module_dir))
    return os.path.join(tests_root, "test_data", file_name)


def _make_document(file_name, mime_type):
    """Build an ``UnstructuredDocument`` for a fixture file in ``test_data``."""
    return UnstructuredDocument(
        id=uuid.uuid4(),
        name=file_name,
        raw_data_location=_test_data_path(file_name),
        metadata_id=uuid.uuid4(),
        mime_type=mime_type,
    )


def _read_chunks(document, label):
    """Read ``document`` fully and fail if it produced no chunks at all.

    Without this check the per-chunk assertions below would pass vacuously
    whenever ``read`` yields nothing.
    """
    chunks = list(document.read(chunk_size=1024))
    assert chunks, f'No chunks were read from {label}'
    return chunks


def test_UnstructuredDocument():
    """Check chunk word counts, text and cut types for each fixture format."""
    # Define test documents
    pptx_document = _make_document(
        "example.pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )
    docx_document = _make_document(
        "example.docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    csv_document = _make_document("example.csv", "text/csv")
    xlsx_document = _make_document(
        "example.xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    # Test PPTX
    for paragraph_data in _read_chunks(pptx_document, "example.pptx"):
        assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
        assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

    # Test DOCX
    for paragraph_data in _read_chunks(docx_document, "example.docx"):
        assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
        assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
        assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'

    # Test CSV
    for paragraph_data in _read_chunks(csv_document, "example.csv"):
        assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
        assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
            f'Read text doesn\'t match expected text: {paragraph_data.text}'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

    # Test XLSX
    for paragraph_data in _read_chunks(xlsx_document, "example.xlsx"):
        assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
        assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
        assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
A,A,A,A,A | ||
A,A,A,"A,A",A | ||
A,A,A,"A,A",A |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.