Skip to content

Commit

Permalink
Adjust integration tests
Browse files: browse the repository at this point in the history
  • Loading branch information
alekszievr committed Jan 9, 2025
1 parent 6762039 commit abb3ea6
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def create_transcript(self):
result = get_llm_client().create_transcript(self.raw_data_location)
return result.text

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
# Transcribe the audio file

text = self.create_transcript()
Expand Down
2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ class Document(DataPoint):
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}

def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str:
def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
pass
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def transcribe_image(self):
result = get_llm_client().transcribe_image(self.raw_data_location)
return result.choices[0].message.content

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
# Transcribe the image file
text = self.transcribe_image()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class PdfDocument(Document):
type: str = "pdf"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
file = PdfReader(self.raw_data_location)

def get_text():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class TextDocument(Document):
type: str = "text"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def get_text():
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
while True:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class UnstructuredDocument(Document):
type: str = "unstructured"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
def get_text():
try:
from unstructured.partition.auto import partition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,23 @@ def test_UnstructuredDocument():
)

# Test PPTX
for paragraph_data in pptx_document.read(chunk_size=1024):
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
assert (
"sentence_cut" == paragraph_data.cut_type
), f" sentence_cut != {paragraph_data.cut_type = }"

# Test DOCX
for paragraph_data in docx_document.read(chunk_size=1024):
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
assert (
"sentence_end" == paragraph_data.cut_type
), f" sentence_end != {paragraph_data.cut_type = }"

# TEST CSV
for paragraph_data in csv_document.read(chunk_size=1024):
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
assert (
"A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
Expand All @@ -94,7 +94,7 @@ def test_UnstructuredDocument():
), f" sentence_cut != {paragraph_data.cut_type = }"

# Test XLSX
for paragraph_data in xlsx_document.read(chunk_size=1024):
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
assert (
Expand Down

0 comments on commit abb3ea6

Please sign in to comment.