From f46308cd15626e73807041a916360fc3b8887de8 Mon Sep 17 00:00:00 2001
From: Boris Arzentar
Date: Sat, 21 Sep 2024 17:33:13 +0200
Subject: [PATCH] chore: increase the lib version

---
 .../modules/data/processing/document_types/AudioDocument.py   | 4 ++--
 cognee/modules/data/processing/document_types/Document.py     | 2 +-
 .../modules/data/processing/document_types/ImageDocument.py   | 4 ++--
 cognee/modules/data/processing/document_types/PdfDocument.py  | 4 ++--
 cognee/modules/data/processing/document_types/TextDocument.py | 4 ++--
 .../source_documents_to_chunks/source_documents_to_chunks.py  | 4 ++--
 notebooks/cognee_demo_1.5.ipynb                                | 2 +-
 pyproject.toml                                                 | 2 +-
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index 1eb85f177..41f9e4c4b 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -15,12 +15,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strate
         self.raw_data_location = raw_data_location
         self.chunking_strategy = chunking_strategy
 
-    def read(self):
+    def read(self, chunk_size: int):
         # Transcribe the audio file
         result = get_llm_client().create_transcript(self.raw_data_location)
 
         text = result.text
 
-        chunker = TextChunker(self.id, get_text = lambda: text)
+        chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 78eeecc11..1e841682d 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -7,5 +7,5 @@ class Document(Protocol):
     title: str
     raw_data_location: str
 
-    def read(self) -> str:
+    def read(self, chunk_size: int) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index cd157a90b..e0308180a 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -14,12 +14,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
         self.title = title
         self.raw_data_location = raw_data_location
 
-    def read(self):
+    def read(self, chunk_size: int):
         # Transcribe the image file
         result = get_llm_client().transcribe_image(self.raw_data_location)
 
         text = result.choices[0].message.content
 
-        chunker = TextChunker(self.id, get_text = lambda: text)
+        chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index d8adb69b8..97fcb2949 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -13,7 +13,7 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
         self.title = title
         self.raw_data_location = raw_data_location
 
-    def read(self) -> PdfReader:
+    def read(self, chunk_size: int) -> PdfReader:
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -21,7 +21,7 @@
                 page_text = page.extract_text()
 
                 yield page_text
 
-        chunker = TextChunker(self.id, get_text = get_text)
+        chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = get_text)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 84678dfa4..d4dcee183 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -12,7 +12,7 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
         self.title = title
         self.raw_data_location = raw_data_location
 
-    def read(self):
+    def read(self, chunk_size: int):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -24,7 +24,7 @@ def get_text():
 
                     yield text
 
-        chunker = TextChunker(self.id, get_text = get_text)
+        chunker = TextChunker(self.id,chunk_size = chunk_size, get_text = get_text)
 
         yield from chunker.read()
diff --git a/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py b/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py
index ff2e78a5e..c3cdcb0e2 100644
--- a/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py
+++ b/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py
@@ -2,7 +2,7 @@
 from cognee.modules.data.processing.document_types.Document import Document
 
 
-async def source_documents_to_chunks(documents: list[Document], parent_node_id: str = None):
+async def source_documents_to_chunks(documents: list[Document], chunk_size: int = 1024, parent_node_id: str = None):
     graph_engine = await get_graph_engine()
 
     if parent_node_id is None:
@@ -40,5 +40,5 @@ async def source_documents_to_chunks(documents: list[Document], parent_node_id:
     await graph_engine.add_edges(edges)
 
     for document in documents:
-        for document_chunk in document.read():
+        for document_chunk in document.read(chunk_size = chunk_size):
             yield document_chunk
diff --git a/notebooks/cognee_demo_1.5.ipynb b/notebooks/cognee_demo_1.5.ipynb
index 52760d949..e7850edc8 100644
--- a/notebooks/cognee_demo_1.5.ipynb
+++ b/notebooks/cognee_demo_1.5.ipynb
@@ -391,7 +391,7 @@
     "    Task(classify_documents),\n",
     "    Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
     "    Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
-    "    Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
+    "    Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
     "    Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
     "    Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
     "    Task(\n",
diff --git a/pyproject.toml b/pyproject.toml
index 87519b664..5777d4bc0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cognee"
-version = "0.1.15"
+version = "0.1.16"
 description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
 authors = ["Vasilije Markovic", "Boris Arzentar"]
 readme = "README.md"
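
For context, a minimal sketch (not part of the patch) of how the new chunk_size parameter flows from
source_documents_to_chunks into each document's read(), assuming the Task wiring shown in the notebook
cell above; the run_chunking coroutine and its document list are hypothetical, and the 800/1024 values
come from the diff itself.

    from uuid import uuid4
    from cognee.modules.data.processing.document_types.TextDocument import TextDocument
    from cognee.tasks.source_documents_to_chunks.source_documents_to_chunks import source_documents_to_chunks

    async def run_chunking():
        # Hypothetical single-document list; TextDocument's constructor signature is taken from the diff context.
        documents = [TextDocument(id = uuid4(), title = "example", raw_data_location = "/tmp/example.txt")]

        # chunk_size is forwarded to every document.read(chunk_size = ...) call;
        # omitting it falls back to the default of 1024 introduced by this patch.
        async for chunk in source_documents_to_chunks(documents, chunk_size = 800):
            print(chunk)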