Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: increase the lib version #138

Merged
merged 1 commit into from
Sep 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strate
self.raw_data_location = raw_data_location
self.chunking_strategy = chunking_strategy

def read(self, chunk_size: int = 1024):
    """Transcribe the audio file and yield its text in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing callers that still invoke
            ``read()`` with no arguments keep working (matches the
            default used by ``source_documents_to_chunks``).

    Yields:
        Text chunks produced by TextChunker from the transcript.
    """
    # Transcribe the audio file via the configured LLM client.
    result = get_llm_client().create_transcript(self.raw_data_location)
    text = result.text

    # TextChunker pulls the text lazily through the zero-arg callable.
    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)

    yield from chunker.read()

Expand Down
2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class Document(Protocol):
title: str
raw_data_location: str

def read(self, chunk_size: int = 1024) -> str:
    """Read the document and yield text chunks of at most ``chunk_size``.

    Protocol stub — concrete document types (audio, image, PDF, text)
    provide the implementation. ``chunk_size`` defaults to 1024 so the
    protocol stays backward-compatible with callers of ``read()``.

    NOTE(review): implementations are generators yielding chunks, so the
    ``-> str`` annotation looks inaccurate — confirm against callers
    before tightening it.
    """
    pass
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self, chunk_size: int = 1024):
    """Transcribe the image file and yield its text in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing zero-argument callers keep
            working (matches ``source_documents_to_chunks``' default).

    Yields:
        Text chunks produced by TextChunker from the image transcript.
    """
    # Transcribe the image file via the configured LLM client.
    result = get_llm_client().transcribe_image(self.raw_data_location)
    # The transcript text lives in the first choice's message content.
    text = result.choices[0].message.content

    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)

    yield from chunker.read()

Expand Down
4 changes: 2 additions & 2 deletions cognee/modules/data/processing/document_types/PdfDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self, chunk_size: int = 1024):
    """Extract text from the PDF page by page and yield it in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing zero-argument callers keep
            working (matches ``source_documents_to_chunks``' default).

    Yields:
        Text chunks produced by TextChunker from the PDF's pages.

    Note: the previous ``-> PdfReader`` annotation was wrong — this is a
    generator of text chunks, not a PdfReader — so it has been removed.
    """
    file = PdfReader(self.raw_data_location)

    def get_text():
        # Stream one page at a time so large PDFs are never fully
        # materialized as a single string.
        for page in file.pages:
            page_text = page.extract_text()
            yield page_text

    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = get_text)

    yield from chunker.read()

Expand Down
4 changes: 2 additions & 2 deletions cognee/modules/data/processing/document_types/TextDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self):
def read(self, chunk_size: int):
def get_text():
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True:
Expand All @@ -24,7 +24,7 @@ def get_text():
yield text


chunker = TextChunker(self.id, get_text = get_text)
chunker = TextChunker(self.id,chunk_size = chunk_size, get_text = get_text)

yield from chunker.read()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from cognee.modules.data.processing.document_types.Document import Document


async def source_documents_to_chunks(documents: list[Document], parent_node_id: str = None):
async def source_documents_to_chunks(documents: list[Document], chunk_size: int = 1024, parent_node_id: str = None):
graph_engine = await get_graph_engine()

if parent_node_id is None:
Expand Down Expand Up @@ -40,5 +40,5 @@ async def source_documents_to_chunks(documents: list[Document], parent_node_id:
await graph_engine.add_edges(edges)

for document in documents:
for document_chunk in document.read():
for document_chunk in document.read(chunk_size = chunk_size):
yield document_chunk
2 changes: 1 addition & 1 deletion notebooks/cognee_demo_1.5.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
" Task(\n",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cognee"
version = "0.1.15"
version = "0.1.16"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = ["Vasilije Markovic", "Boris Arzentar"]
readme = "README.md"
Expand Down
Loading