Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: increase the lib version #138

Merged
merged 1 commit into from
Sep 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strate
self.raw_data_location = raw_data_location
self.chunking_strategy = chunking_strategy

def read(self, chunk_size: int = 1024):
    """Transcribe the audio file and yield its text in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing callers that still invoke
            ``read()`` with no arguments keep working (matches the
            default used by ``source_documents_to_chunks``).

    Yields:
        Text chunks produced by TextChunker from the transcript.
    """
    # Transcribe the audio file via the configured LLM client.
    result = get_llm_client().create_transcript(self.raw_data_location)
    text = result.text

    # TextChunker pulls the text lazily through the zero-arg callable.
    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)

    yield from chunker.read()

Expand Down
2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class Document(Protocol):
title: str
raw_data_location: str

def read(self, chunk_size: int = 1024) -> str:
    """Read the document and yield text chunks of at most ``chunk_size``.

    Protocol stub — concrete document types (audio, image, PDF, text)
    provide the implementation. ``chunk_size`` defaults to 1024 so the
    protocol stays backward-compatible with callers of ``read()``.

    NOTE(review): implementations are generators yielding chunks, so the
    ``-> str`` annotation looks inaccurate — confirm against callers
    before tightening it.
    """
    pass
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self, chunk_size: int = 1024):
    """Transcribe the image file and yield its text in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing zero-argument callers keep
            working (matches ``source_documents_to_chunks``' default).

    Yields:
        Text chunks produced by TextChunker from the image transcript.
    """
    # Transcribe the image file via the configured LLM client.
    result = get_llm_client().transcribe_image(self.raw_data_location)
    # The transcript text lives in the first choice's message content.
    text = result.choices[0].message.content

    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)

    yield from chunker.read()

Expand Down
4 changes: 2 additions & 2 deletions cognee/modules/data/processing/document_types/PdfDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self, chunk_size: int = 1024):
    """Extract text from the PDF page by page and yield it in chunks.

    Args:
        chunk_size: maximum chunk size forwarded to TextChunker.
            Defaults to 1024 so existing zero-argument callers keep
            working (matches ``source_documents_to_chunks``' default).

    Yields:
        Text chunks produced by TextChunker from the PDF's pages.

    Note: the previous ``-> PdfReader`` annotation was wrong — this is a
    generator of text chunks, not a PdfReader — so it has been removed.
    """
    file = PdfReader(self.raw_data_location)

    def get_text():
        # Stream one page at a time so large PDFs are never fully
        # materialized as a single string.
        for page in file.pages:
            page_text = page.extract_text()
            yield page_text

    chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = get_text)

    yield from chunker.read()

Expand Down
4 changes: 2 additions & 2 deletions cognee/modules/data/processing/document_types/TextDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, id: UUID, title: str, raw_data_location: str):
self.title = title
self.raw_data_location = raw_data_location

def read(self):
def read(self, chunk_size: int):
def get_text():
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True:
Expand All @@ -24,7 +24,7 @@ def get_text():
yield text


chunker = TextChunker(self.id, get_text = get_text)
chunker = TextChunker(self.id,chunk_size = chunk_size, get_text = get_text)

yield from chunker.read()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from cognee.modules.data.processing.document_types.Document import Document


async def source_documents_to_chunks(documents: list[Document], parent_node_id: str = None):
async def source_documents_to_chunks(documents: list[Document], chunk_size: int = 1024, parent_node_id: str = None):
graph_engine = await get_graph_engine()

if parent_node_id is None:
Expand Down Expand Up @@ -40,5 +40,5 @@ async def source_documents_to_chunks(documents: list[Document], parent_node_id:
await graph_engine.add_edges(edges)

for document in documents:
for document_chunk in document.read():
for document_chunk in document.read(chunk_size = chunk_size):
yield document_chunk
2 changes: 1 addition & 1 deletion notebooks/cognee_demo_1.5.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
" Task(\n",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cognee"
version = "0.1.15"
version = "0.1.16"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = ["Vasilije Markovic", "Boris Arzentar"]
readme = "README.md"
Expand Down
Loading