Adjust integration tests

topoteretes · Jan 9, 2025 · abb3ea6
1 parent 6762039
commit abb3ea6
Show file tree

Hide file tree

Showing 7 changed files with 10 additions and 10 deletions.
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,7 +13,7 @@ def create_transcript(self):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the audio file
 
         text = self.create_transcript()

diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
@@ -11,5 +11,5 @@ class Document(DataPoint):
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
 
-    def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str:
+    def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -13,7 +13,7 @@ def transcribe_image(self):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the image file
         text = self.transcribe_image()
 

diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         file = PdfReader(self.raw_data_location)
 
         def get_text():

diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:

diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition

diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -68,23 +68,23 @@ def test_UnstructuredDocument():
     )
 
     # Test PPTX
-    for paragraph_data in pptx_document.read(chunk_size=1024):
+    for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert (
             "sentence_cut" == paragraph_data.cut_type
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test DOCX
-    for paragraph_data in docx_document.read(chunk_size=1024):
+    for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert (
             "sentence_end" == paragraph_data.cut_type
         ), f" sentence_end != {paragraph_data.cut_type = }"
 
     # TEST CSV
-    for paragraph_data in csv_document.read(chunk_size=1024):
+    for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
         assert (
             "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
@@ -94,7 +94,7 @@ def test_UnstructuredDocument():
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test XLSX
-    for paragraph_data in xlsx_document.read(chunk_size=1024):
+    for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert (