Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code graph pipeline improvements and fixes #414

Merged
merged 27 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a8a83ff
Ingest non-code files
alekszievr Dec 20, 2024
399faf9
Fixing review findings
alekszievr Dec 20, 2024
a6dfff8
Merge branch 'dev' of https://github.com/topoteretes/cognee into dev
alekszievr Jan 6, 2025
4cee9a1
fix: add allowed extensions
lxobr Jan 6, 2025
dbc33a6
fix: adhere UnstructuredDocument.read() to Document
lxobr Jan 6, 2025
5e79dc5
feat: time code graph run and add mock support
lxobr Jan 6, 2025
4802567
Overcome ContextWindowExceededError by checking token count while chu…
alekszievr Jan 7, 2025
fbf8fc9
Merge branch 'dev' into COG-949
alekszievr Jan 7, 2025
a774191
Adjust AudioDocument and handle None token limit
alekszievr Jan 7, 2025
fb13a1b
Handle azure models as well
alekszievr Jan 7, 2025
0dec704
Merge branch 'dev' into COG-949
alekszievr Jan 8, 2025
8ffef50
Add clean logging to code graph example
alekszievr Jan 8, 2025
f4397bf
Remove setting envvars from arg
alekszievr Jan 8, 2025
34a9267
Get embedding engine instead of passing it. Get it from vector engine…
alekszievr Jan 8, 2025
97814e3
Get embedding engine instead of passing it in code chunking.
alekszievr Jan 8, 2025
d1d81eb
Merge branch 'dev' into COG-949
alekszievr Jan 9, 2025
2e37f85
ruff formatting
alekszievr Jan 9, 2025
5635da6
Adjust unit tests
alekszievr Jan 9, 2025
6762039
Merge remote-tracking branch 'origin/COG-949' into COG-949
alekszievr Jan 9, 2025
abb3ea6
Adjust integration tests
alekszievr Jan 9, 2025
cdaae16
Handle circular import
alekszievr Jan 9, 2025
626bc76
Set max_tokens in config
alekszievr Jan 9, 2025
d7b2186
Adjust SWE-bench script to code graph pipeline call
alekszievr Jan 9, 2025
18bb282
Adjust SWE-bench script to code graph pipeline call
alekszievr Jan 9, 2025
7e10349
Merge branch 'COG-949' of https://github.com/topoteretes/cognee into …
hajdul88 Jan 9, 2025
a11b914
Merge branch 'dev' into COG-949
alekszievr Jan 10, 2025
5839ab0
Merge branch 'dev' into COG-949
Vasilije1990 Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cognee/api/v1/cognify/code_graph_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents),
Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the embedding_engine shouldn't change in code and should only be changed through environment configuration I think we should remove it as a parameter to functions and instead get it dynamically with the getter when needed

Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
Task(
summarize_text,
Expand Down
23 changes: 19 additions & 4 deletions cognee/modules/chunking/TextChunker.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
from uuid import uuid5, NAMESPACE_OID
from typing import Optional
from uuid import NAMESPACE_OID, uuid5

from .models.DocumentChunk import DocumentChunk
from cognee.tasks.chunks import chunk_by_paragraph

from .models.DocumentChunk import DocumentChunk


class TextChunker():
document = None
max_chunk_size: int

chunk_index = 0
chunk_size = 0
token_count = 0

def __init__(self, document, get_text: callable, chunk_size: int = 1024):
def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
self.document = document
self.max_chunk_size = chunk_size
self.get_text = get_text
self.max_tokens = max_tokens if max_tokens else float("inf")
self.embedding_model = embedding_model

def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
return word_count_fits and token_count_fits

def read(self):
paragraph_chunks = []
for content_text in self.get_text():
for chunk_data in chunk_by_paragraph(
content_text,
self.embedding_model,
self.max_tokens,
self.max_chunk_size,
batch_paragraphs = True,
):
if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
if self.check_word_count_and_token_count(self.chunk_size, self.token_count, chunk_data):
paragraph_chunks.append(chunk_data)
self.chunk_size += chunk_data["word_count"]
self.token_count += chunk_data["token_count"]
else:
if len(paragraph_chunks) == 0:
yield DocumentChunk(
Expand Down Expand Up @@ -63,6 +77,7 @@ def read(self):
print(e)
paragraph_chunks = [chunk_data]
self.chunk_size = chunk_data["word_count"]
self.token_count = chunk_data["token_count"]

self.chunk_index += 1

Expand Down
3 changes: 2 additions & 1 deletion cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import Optional
from uuid import UUID

from cognee.infrastructure.engine import DataPoint
Expand All @@ -13,5 +14,5 @@ class Document(DataPoint):
"type": "Document"
}

def read(self, chunk_size: int, chunker = str) -> str:
def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Update default values for new parameters in read method

The new parameters embedding_model and max_tokens in the read method lack default values, which may cause issues in subclasses that do not override these parameters. Set default values to None to maintain compatibility.

Apply this diff to set default values:

     def read(
         self,
         chunk_size: int,
-        embedding_model: Optional[str],
-        max_tokens: Optional[int],
+        embedding_model: Optional[str] = None,
+        max_tokens: Optional[int] = None,
         chunker=str,
     ) -> str:
         pass
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
def read(
self,
chunk_size: int,
embedding_model: Optional[str] = None,
max_tokens: Optional[int] = None,
chunker=str
) -> str:
🧰 Tools
🪛 GitHub Actions: ruff format

[error] 308 files require formatting according to Ruff standards. Run Ruff formatter to fix formatting issues.


💡 Codebase verification

Update missing read methods in ImageDocument and AudioDocument

The following classes need to implement the read method with the updated signature read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:

  • cognee/modules/data/processing/document_types/ImageDocument.py
  • cognee/modules/data/processing/document_types/AudioDocument.py
🔗 Analysis chain

Ensure all subclasses override the updated read method

The read method signature has changed. Ensure that all subclasses of Document, such as AudioDocument, VideoDocument, etc., override the read method with the new parameters to prevent TypeError.

The pipeline failure indicates that AudioDocument.read() does not accept embedding_model as a parameter. Please update all subclasses accordingly.

Run the following script to identify classes that need updating:

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Find all subclasses of Document that need to update the `read` method.

# Find files containing classes that inherit from Document but do not have updated `read` methods
fd --type f --extension py '.*' | xargs grep -l 'class .*Document(Document):' | xargs grep -L 'def read(self, chunk_size'

# Alternatively, use ast-grep to find method definitions
ast-grep --pattern 'class $CLASS_NAME(Document): {$$$ def read(self, chunk_size: int, $$$): $$$ }' --ignore 'test_*'

Length of output: 490


Script:

#!/bin/bash
# Find Document subclasses
echo "=== Document subclasses ==="
ast-grep --pattern 'class $NAME(Document)'

echo -e "\n=== Read method implementations ==="
# Find read method implementations
ast-grep --pattern 'def read(self, $$$)'

echo -e "\n=== Backup search with ripgrep ==="
# Backup search with ripgrep
rg "class.*\(Document\)" -A 5

Length of output: 3351

🧰 Tools
🪛 GitHub Actions: ruff format

[error] 308 files require formatting according to Ruff standards. Run Ruff formatter to fix formatting issues.

pass
10 changes: 7 additions & 3 deletions cognee/modules/data/processing/document_types/ImageDocument.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from typing import Optional

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from .Document import Document

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class ImageDocument(Document):
type: str = "image"
Expand All @@ -10,11 +14,11 @@ def transcribe_image(self):
result = get_llm_client().transcribe_image(self.raw_data_location)
return(result.choices[0].message.content)

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
# Transcribe the image file
text = self.transcribe_image()

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
alekszievr marked this conversation as resolved.
Show resolved Hide resolved

yield from chunker.read()
10 changes: 7 additions & 3 deletions cognee/modules/data/processing/document_types/PdfDocument.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from typing import Optional

from pypdf import PdfReader
from .Document import Document

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class PdfDocument(Document):
type: str = "pdf"

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
file = PdfReader(self.raw_data_location)

def get_text():
Expand All @@ -14,7 +18,7 @@ def get_text():
yield page_text

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)

yield from chunker.read()

Expand Down
9 changes: 6 additions & 3 deletions cognee/modules/data/processing/document_types/TextDocument.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from .Document import Document
from typing import Optional

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class TextDocument(Document):
type: str = "text"

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
def get_text():
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True:
Expand All @@ -17,6 +20,6 @@ def get_text():

chunker_func = ChunkerConfig.get_chunker(chunker)

chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)

yield from chunker.read()
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from io import StringIO
from typing import Optional

from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from cognee.modules.data.exceptions import UnstructuredLibraryImportError

from .Document import Document


class UnstructuredDocument(Document):
type: str = "unstructured"

def read(self, chunk_size: int):
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
def get_text():
try:
from unstructured.partition.auto import partition
Expand All @@ -27,6 +29,6 @@ def get_text():

yield text

chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)

yield from chunker.read()
31 changes: 27 additions & 4 deletions cognee/tasks/chunks/chunk_by_paragraph.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
from uuid import uuid5, NAMESPACE_OID
from typing import Dict, Any, Iterator
from typing import Any, Dict, Iterator, Optional, Union
from uuid import NAMESPACE_OID, uuid5

import tiktoken

from .chunk_by_sentence import chunk_by_sentence

def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:

def chunk_by_paragraph(
data: str,
embedding_model: Optional[str],
max_tokens: Optional[Union[int, float]],
paragraph_length: int = 1024,
batch_paragraphs: bool = True
) -> Iterator[Dict[str, Any]]:
"""
Chunks text by paragraph while preserving exact text reconstruction capability.
When chunks are joined with empty string "", they reproduce the original text exactly.
Expand All @@ -12,14 +22,22 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
chunk_index = 0
paragraph_ids = []
last_cut_type = None
current_token_count = 0

for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
# Check if this sentence would exceed length limit
if current_word_count > 0 and current_word_count + word_count > paragraph_length:
if embedding_model:
tokenizer = tiktoken.encoding_for_model(embedding_model)
token_count = len(tokenizer.encode(sentence))
else:
token_count = 0

if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
# Yield current chunk
alekszievr marked this conversation as resolved.
Show resolved Hide resolved
chunk_dict = {
"text": current_chunk,
"word_count": current_word_count,
"token_count": current_token_count,
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
"paragraph_ids": paragraph_ids,
"chunk_index": chunk_index,
Expand All @@ -32,18 +50,21 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
paragraph_ids = []
current_chunk = ""
current_word_count = 0
current_token_count = 0
chunk_index += 1

paragraph_ids.append(paragraph_id)
current_chunk += sentence
current_word_count += word_count
current_token_count += token_count

# Handle end of paragraph
if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
# For non-batch mode, yield each paragraph separately
chunk_dict = {
"text": current_chunk,
"word_count": current_word_count,
"token_count": current_token_count,
"paragraph_ids": paragraph_ids,
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
"chunk_index": chunk_index,
Expand All @@ -53,6 +74,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
paragraph_ids = []
current_chunk = ""
current_word_count = 0
current_token_count = 0
chunk_index += 1

last_cut_type = end_type
Expand All @@ -62,6 +84,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
chunk_dict = {
"text": current_chunk,
"word_count": current_word_count,
"token_count": current_token_count,
"chunk_id": uuid5(NAMESPACE_OID, current_chunk),
"paragraph_ids": paragraph_ids,
"chunk_index": chunk_index,
Expand Down
12 changes: 10 additions & 2 deletions cognee/tasks/documents/extract_chunks_from_documents.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
from typing import Optional

from cognee.modules.data.processing.document_types.Document import Document


async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'):
async def extract_chunks_from_documents(
documents: list[Document],
chunk_size: int = 1024,
chunker='text_chunker',
embedding_model: Optional[str] = None,
max_tokens: Optional[int] = None,
):
for document in documents:
for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker):
for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
alekszievr marked this conversation as resolved.
Show resolved Hide resolved
yield document_chunk
21 changes: 20 additions & 1 deletion cognee/tasks/repo_processor/get_non_code_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,27 @@ async def get_non_py_files(repo_path):
'node_modules', '*.egg-info'
}

ALLOWED_EXTENSIONS = {
'.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html',
'.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini',
'.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore',
'.gitattributes', '.makefile', '.pyproject', '.requirements',
'.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf',
'.wps', '.wpd', '.odt', '.ott', '.ottx', '.txt', '.wp',
'.sdw', '.sdx', '.docm', '.dotm',
# Additional extensions for other programming languages
'.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb',
'.swift', '.pl', '.lua', '.rs', '.scala', '.kt', '.sh',
'.sql', '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs',
'.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi',
'.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim',
'.vhdl', '.verilog', '.svelte', '.html', '.css', '.scss',
'.less', '.json5', '.yaml', '.yml'
}

def should_process(path):
return not any(pattern in path for pattern in IGNORED_PATTERNS)
_, ext = os.path.splitext(path)
return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS)

non_py_files_paths = [
os.path.join(root, file)
Expand Down
62 changes: 57 additions & 5 deletions examples/python/code_graph_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,61 @@ async def main(repo_path, include_docs):
async for result in run_code_graph_pipeline(repo_path, include_docs):
print(result)

if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files")
args = parser.parse_args()
asyncio.run(main(args.repo_path, args.include_docs))
parser.add_argument(
"--repo_path",
type=str,
required=True,
help="Path to the repository"
)
parser.add_argument(
"--include_docs",
type=lambda x: x.lower() in ("true", "1"),
default=True,
help="Whether or not to process non-code files"
)
parser.add_argument(
"--mock_embedding",
type=lambda x: x.lower() in ("true", "1"),
default=True,
help="Whether or not to mock embedding and code summary"
)
parser.add_argument(
"--mock_code_summary",
type=lambda x: x.lower() in ("true", "1"),
default=True,
help="Whether or not to mock code summary"
)
parser.add_argument(
"--time",
type=lambda x: x.lower() in ("true", "1"),
default=True,
help="Whether or not to time the pipeline run"
)
return parser.parse_args()

if __name__ == "__main__":
import os

args = parse_args()

if args.mock_embedding:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This I wouldn't do. It can cause misunderstanding later, we should either use envs or args.

os.environ["MOCK_EMBEDDING"] = "true"
print("Mocking embedding.")

if args.mock_code_summary:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

os.environ["MOCK_CODE_SUMMARY"] = "true"
print("Mocking code summary.")

if args.time:
import time
start_time = time.time()
asyncio.run(main(args.repo_path, args.include_docs))
end_time = time.time()
print("\n" + "="*50)
print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds")
print("="*50 + "\n")
else:
asyncio.run(main(args.repo_path, args.include_docs))

Loading