From 682ddfc020e2f82e2d14a60ee8a989803479a77d Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:32:40 +0100 Subject: [PATCH 01/17] fix: pass the list of all CodeFiles to enrichment task --- .../tasks/repo_processor/get_repo_file_dependencies.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 221af6cf6..b54c1f152 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -71,7 +71,7 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non path = repo_path, ) - yield repo + yield [repo] with ProcessPoolExecutor(max_workers = 12) as executor: loop = asyncio.get_event_loop() @@ -90,10 +90,11 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non results = await asyncio.gather(*tasks) + code_files = [] for (file_path, metadata), dependencies in zip(py_files_dict.items(), results): source_code = metadata.get("source_code") - yield CodeFile( + code_files.append(CodeFile( id = uuid5(NAMESPACE_OID, file_path), source_code = source_code, extracted_id = file_path, @@ -106,4 +107,6 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non source_code = py_files_dict.get(dependency, {}).get("source_code"), ) for dependency in dependencies ] if dependencies else None, - ) + )) + + yield code_files From d40fc12939b73fa421c481254439d28bc1c98194 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:33:54 +0100 Subject: [PATCH 02/17] feat: introduce SourceCodeChunk, update metadata --- cognee/shared/CodeGraphEntities.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 23b8879c2..27289493d 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -1,5 +1,4 @@ from typing import List, Optional - from cognee.infrastructure.engine import DataPoint @@ -7,7 +6,7 @@ class Repository(DataPoint): __tablename__ = "Repository" path: str _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "Repository" } @@ -19,29 +18,31 @@ class CodeFile(DataPoint): depends_on: Optional[List["CodeFile"]] = None depends_directly_on: Optional[List["CodeFile"]] = None contains: Optional[List["CodePart"]] = None - _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "CodeFile" } class CodePart(DataPoint): __tablename__ = "codepart" - # part_of: Optional[CodeFile] - source_code: str - + # part_of: Optional[CodeFile] = None + source_code: Optional[str] = None _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "CodePart" } -class CodeRelationship(DataPoint): - source_id: str - target_id: str - relation: str # depends on or depends directly +class SourceCodeChunk(DataPoint): + __tablename__ = "sourcecodechunk" + code_chunk_of: Optional[CodePart] = None + source_code: Optional[str] = None + previous_chunk: Optional["SourceCodeChunk"] = None + _metadata: dict = { - "type": "CodeRelationship" + "index_fields": ["source_code"], + "type": "SourceCodeChunk" } CodeFile.model_rebuild() CodePart.model_rebuild() +SourceCodeChunk.model_rebuild() \ No newline at end of file From 
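The entity changes in PATCH 02/17 shift embedding from whole files to chunks: index_fields is emptied on Repository, CodeFile and CodePart, while SourceCodeChunk alone indexes "source_code", so vectors are produced at chunk granularity. Each chunk points back to its CodePart through code_chunk_of and to the preceding chunk through previous_chunk, forming a linked list over a part's source code. A minimal standalone sketch of that chaining, using a plain dataclass as a stand-in for cognee's DataPoint base and the deterministic uuid5 ids that PATCH 03/17 below introduces:

from dataclasses import dataclass
from typing import Optional
from uuid import NAMESPACE_OID, UUID, uuid5

@dataclass
class ChunkSketch:
    # Stand-in for SourceCodeChunk; the real class derives from DataPoint.
    id: UUID
    source_code: str
    previous_chunk: Optional["ChunkSketch"] = None

previous = None
for text in ["def f():", "    return 1"]:
    # uuid5 over the chunk text keeps ids stable across pipeline reruns,
    # so re-ingesting identical code produces the same chunk nodes.
    chunk = ChunkSketch(id=uuid5(NAMESPACE_OID, text), source_code=text, previous_chunk=previous)
    previous = chunk

assert previous.previous_chunk.source_code == "def f():"

Walking the previous_chunk pointers at retrieval time recovers the original ordering of chunks, which plain vector search over isolated chunks would otherwise lose.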
aea7382983b5250eb443c46adc6a387d2690e22e Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:34:17 +0100 Subject: [PATCH 03/17] feat: get_source_code_chunks code graph pipeline task --- .../repo_processor/get_source_code_chunks.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 cognee/tasks/repo_processor/get_source_code_chunks.py diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py new file mode 100644 index 000000000..2cd007a19 --- /dev/null +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -0,0 +1,112 @@ +from typing import AsyncGenerator, Generator +from uuid import NAMESPACE_OID, uuid5 +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk, CodeFile +import tiktoken +import parso + +from cognee.tasks.repo_processor import logger + + +def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: + return len(tokenizer.encode(source_code)) + + +def _get_subchunk_token_counts( + tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 +) -> list[tuple[str, int]]: + """Splits source code into subchunks and counts tokens for each subchunk.""" + + try: + module = parso.parse(source_code) + except Exception as e: + logger.error(f"Error parsing source code: {e}") + return [] + + if not module.children: + logger.warning("Parsed module has no children (empty or invalid source code).") + return [] + + if len(module.children) <= 2: + module = module.children[0] + + subchunk_token_counts = [] + for child in module.children: + subchunk = child.get_code() + token_count = _count_tokens(tokenizer, subchunk) + if token_count <= max_subchunk_tokens: + subchunk_token_counts.append((subchunk, token_count)) + continue + + subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + + return subchunk_token_counts + + +def _get_chunk_source_code( + code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int +) -> tuple[list[tuple[str, int]], str]: + """Generates a chunk of source code from tokenized subchunks with overlap handling.""" + current_count = 0 + cumulative_counts = [] + current_source_code = '' + + for i, (child_code, token_count) in enumerate(code_token_counts): + current_count += token_count + cumulative_counts.append(current_count) + if current_count > max_tokens: + break + current_source_code += f"\n{child_code}" + + if current_count<= max_tokens: + return [], current_source_code.strip() + + cutoff = 1 + for i, cum_count in enumerate(cumulative_counts): + if cum_count> (1 - overlap) * max_tokens: + break + cutoff = i + + return code_token_counts[cutoff:], current_source_code.strip() + + +def get_source_code_chunks_from_code_part( + code_file_part: CodePart, + max_tokens: int = 8192, + overlap: float = 0.25, + granularity: float = 0.1, + model_name: str = "text-embedding-3-large" +) -> Generator[SourceCodeChunk, None, None]: + """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" + tokenizer = tiktoken.encoding_for_model(model_name) + max_subchunk_tokens = max(1, int(granularity * max_tokens)) + subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) + + previous_chunk = None + while subchunk_token_counts: + subchunk_token_counts, chunk_source_code = 
_get_chunk_source_code(subchunk_token_counts, overlap, max_tokens) + if not chunk_source_code: + continue + current_chunk = SourceCodeChunk( + id=uuid5(NAMESPACE_OID, chunk_source_code), + code_chunk_of=code_file_part, + source_code=chunk_source_code, + previous_chunk=previous_chunk + ) + yield current_chunk + previous_chunk = current_chunk + + +async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ +AsyncGenerator[list[DataPoint], None]: + """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" + for data_point in data_points: + yield data_point + if not isinstance(data_point, CodeFile): + continue + if not data_point.contains: + continue + for code_part in data_point.contains: + yield code_part + for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + yield source_code_chunk From 1c5ca8460a9cd382eb7d3d5cd3bdd2c88c6f1bbe Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:34:51 +0100 Subject: [PATCH 04/17] feat: integrate get_source_code_chunks task, comment out summarize_code --- cognee/api/v1/cognify/code_graph_pipeline.py | 50 ++++++++++---------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index eeb10d69e..ac89f257a 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -7,34 +7,32 @@ from pathlib import Path from typing import Union -from cognee.modules.data.methods import get_datasets, get_datasets_by_name +from cognee.shared.SourceCodeGraph import SourceCodeGraph +from cognee.shared.data_models import SummarizedContent +from cognee.shared.utils import send_telemetry +from cognee.modules.data.models import Dataset, Data from cognee.modules.data.methods.get_dataset_data import get_dataset_data -from cognee.modules.data.models import Data, Dataset -from cognee.modules.pipelines import run_tasks -from cognee.modules.pipelines.models import PipelineRunStatus -from cognee.modules.pipelines.operations.get_pipeline_status import \ - get_pipeline_status -from cognee.modules.pipelines.operations.log_pipeline_status import \ - log_pipeline_status +from cognee.modules.data.methods import get_datasets, get_datasets_by_name from cognee.modules.pipelines.tasks.Task import Task -from cognee.modules.users.methods import get_default_user +from cognee.modules.pipelines import run_tasks from cognee.modules.users.models import User -from cognee.shared.SourceCodeGraph import SourceCodeGraph -from cognee.shared.utils import send_telemetry -from cognee.tasks.documents import (check_permissions_on_documents, - classify_documents, - extract_chunks_from_documents) +from cognee.modules.users.methods import get_default_user +from cognee.modules.pipelines.models import PipelineRunStatus +from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status +from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status +from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents +from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks from cognee.tasks.graph import extract_graph_from_code -from cognee.tasks.repo_processor import (enrich_dependency_graph, - expand_dependency_graph, - get_repo_file_dependencies) +from cognee.tasks.repo_processor 
import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code +from cognee.infrastructure.databases.vector.embeddings import (get_embedding_engine) logger = logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() + async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User = None): if user is None: user = await get_default_user() @@ -65,7 +63,7 @@ async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User async def run_pipeline(dataset: Dataset, user: User): '''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.''' - data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id) + data_documents: list[Data] = await get_dataset_data(dataset_id=dataset.id) document_ids_str = [str(document.id) for document in data_documents] @@ -88,10 +86,11 @@ async def run_pipeline(dataset: Dataset, user: User): try: tasks = [ Task(classify_documents), - Task(check_permissions_on_documents, user = user, permissions = ["write"]), - Task(extract_chunks_from_documents), # Extract text chunks based on the document type. - Task(add_data_points, task_config = { "batch_size": 10 }), - Task(extract_graph_from_code, graph_model = SourceCodeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks. + Task(check_permissions_on_documents, user=user, permissions=["write"]), + Task(extract_chunks_from_documents), # Extract text chunks based on the document type. + Task(add_data_points, task_config={"batch_size": 10}), + Task(extract_graph_from_code, graph_model=SourceCodeGraph, task_config={"batch_size": 10}), + # Generate knowledge graphs from the document chunks. 
] pipeline = run_tasks(tasks, data_documents, "code_graph_pipeline") @@ -135,11 +134,14 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + embedding_engine = get_embedding_engine() + tasks = [ Task(get_repo_file_dependencies), - Task(enrich_dependency_graph, task_config={"batch_size": 50}), + Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task(summarize_code, task_config={"batch_size": 50}), + Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), + # Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] From ccd5cb6ef2c9cbe0e42cce0378d71577aed5981e Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:17:52 +0100 Subject: [PATCH 05/17] Fix code summarization (#387) --- cognee/api/v1/cognify/code_graph_pipeline.py | 39 ++++++++++++-------- cognee/tasks/summarization/models.py | 6 ++- cognee/tasks/summarization/summarize_code.py | 6 +-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index ac89f257a..0a6af0f95 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -7,26 +7,34 @@ from pathlib import Path from typing import Union -from cognee.shared.SourceCodeGraph import SourceCodeGraph -from cognee.shared.data_models import SummarizedContent -from cognee.shared.utils import send_telemetry -from cognee.modules.data.models import Dataset, Data -from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.infrastructure.databases.vector.embeddings import \ + get_embedding_engine from cognee.modules.data.methods import get_datasets, get_datasets_by_name -from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.models import Data, Dataset from cognee.modules.pipelines import run_tasks -from cognee.modules.users.models import User -from cognee.modules.users.methods import get_default_user from cognee.modules.pipelines.models import PipelineRunStatus -from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status -from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status -from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents -from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks +from cognee.modules.pipelines.operations.get_pipeline_status import \ + get_pipeline_status +from cognee.modules.pipelines.operations.log_pipeline_status import \ + log_pipeline_status +from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.models import User +from cognee.shared.data_models import SummarizedContent +from cognee.shared.SourceCodeGraph import SourceCodeGraph +from cognee.shared.utils import send_telemetry +from cognee.tasks.documents import (check_permissions_on_documents, + classify_documents, + extract_chunks_from_documents) from cognee.tasks.graph import extract_graph_from_code -from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph +from 
cognee.tasks.repo_processor import (enrich_dependency_graph, + expand_dependency_graph, + get_repo_file_dependencies) +from cognee.tasks.repo_processor.get_source_code_chunks import \ + get_source_code_chunks from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code -from cognee.infrastructure.databases.vector.embeddings import (get_embedding_engine) logger = logging.getLogger("code_graph_pipeline") @@ -121,6 +129,7 @@ def generate_dataset_name(dataset_name: str) -> str: async def run_code_graph_pipeline(repo_path): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -141,7 +150,7 @@ async def run_code_graph_pipeline(repo_path): Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), - # Task(summarize_code, task_config={"batch_size": 50}), + Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index add448155..178267ce6 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,6 +1,8 @@ +from typing import Union + from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk -from cognee.shared.CodeGraphEntities import CodeFile +from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk class TextSummary(DataPoint): @@ -17,7 +19,7 @@ class TextSummary(DataPoint): class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str - made_from: CodeFile + made_from: Union[CodeFile, CodePart, SourceCodeChunk] _metadata: dict = { "index_fields": ["text"], diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index b116e57a9..3b393a0d5 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -1,10 +1,10 @@ import asyncio from typing import AsyncGenerator, Union from uuid import uuid5 -from typing import Type from cognee.infrastructure.engine import DataPoint from cognee.modules.data.extraction.extract_summary import extract_code_summary + from .models import CodeSummary @@ -21,7 +21,7 @@ async def summarize_code( ) file_summaries_map = { - code_data_point.extracted_id: str(file_summary) + code_data_point.id: str(file_summary) for code_data_point, file_summary in zip(code_data_points, file_summaries) } @@ -36,5 +36,5 @@ async def summarize_code( yield CodeSummary( id=uuid5(node.id, "CodeSummary"), made_from=node, - text=file_summaries_map[node.extracted_id], + text=file_summaries_map[node.id], ) From 5deb83ef35da0bf53fe6e8104fb3ebf9bf76658f Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:19:12 +0100 Subject: [PATCH 06/17] feat: update data models --- cognee/shared/data_models.py | 1 - cognee/tasks/summarization/models.py | 2 +- cognee/tasks/summarization/summarize_code.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index dec53cfcb..2a8bc8c91 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -210,7 +210,6 @@ class SummarizedClass(BaseModel): decorators: Optional[List[str]] = None class SummarizedCode(BaseModel): - 
file_name: str high_level_summary: str key_features: List[str] imports: List[str] = [] diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 178267ce6..5b0345015 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,7 +19,7 @@ class TextSummary(DataPoint): class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str - made_from: Union[CodeFile, CodePart, SourceCodeChunk] + summarizes: Union[CodeFile, CodePart, SourceCodeChunk] _metadata: dict = { "index_fields": ["text"], diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index 3b393a0d5..9efc5b6ca 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -35,6 +35,6 @@ async def summarize_code( yield CodeSummary( id=uuid5(node.id, "CodeSummary"), - made_from=node, + summarizes=node, text=file_summaries_map[node.id], ) From ace45ef378acf5a0168de88f70f522e4f9121cde Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:15:43 +0100 Subject: [PATCH 07/17] feat: naive parse long strings in source code --- .../repo_processor/get_source_code_chunks.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 2cd007a19..dfbc9c2b6 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -12,6 +12,26 @@ def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: return len(tokenizer.encode(source_code)) +def _get_naive_subchunk_token_counts( + tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 +) -> list[tuple[str, int]]: + """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens.""" + + token_ids = tokenizer.encode(source_code) + subchunk_token_counts = [] + + for start_idx in range(0, len(token_ids), max_subchunk_tokens): + subchunk_token_ids = token_ids[start_idx: start_idx + max_subchunk_tokens] + token_count = len(subchunk_token_ids) + subchunk = ''.join( + tokenizer.decode_single_token_bytes(token_id).decode('utf-8', errors='replace') + for token_id in subchunk_token_ids + ) + subchunk_token_counts.append((subchunk, token_count)) + + return subchunk_token_counts + + def _get_subchunk_token_counts( tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 ) -> list[tuple[str, int]]: """Splits source code into subchunks and counts tokens for each subchunk.""" @@ -34,10 +54,18 @@ def _get_subchunk_token_counts( for child in module.children: subchunk = child.get_code() token_count = _count_tokens(tokenizer, subchunk) + + if token_count == 0: + continue + if token_count <= max_subchunk_tokens: subchunk_token_counts.append((subchunk, token_count)) continue + if child.type == 'string': + subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + continue + subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) return subchunk_token_counts @@ -58,12 +86,12 @@ def _get_chunk_source_code( break current_source_code += f"\n{child_code}" - if current_count<= max_tokens: + if current_count <= max_tokens: return [], current_source_code.strip() cutoff = 1 for i, cum_count in enumerate(cumulative_counts): - if cum_count> (1 - overlap) * max_tokens: + if cum_count > (1 - overlap) * 
max_tokens: break cutoff = i @@ -98,7 +126,7 @@ def get_source_code_chunks_from_code_part( async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ -AsyncGenerator[list[DataPoint], None]: + AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" for data_point in data_points: yield data_point From c1539f6f2c60e70009e72354d18123ddddb1512f Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:03:38 +0100 Subject: [PATCH 08/17] fix: get_non_py_files instead of get_non_code_files --- cognee/api/v1/cognify/code_graph_pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 4d0f8501b..99032255f 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -17,7 +17,7 @@ from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, get_data_list_for_user, - get_non_code_files, + get_non_py_files, get_repo_file_dependencies) from cognee.tasks.repo_processor.get_source_code_chunks import \ get_source_code_chunks @@ -67,7 +67,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): if include_docs: non_code_tasks = [ - Task(get_non_code_files, task_config={"batch_size": 50}), + Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), @@ -79,7 +79,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): task_config={"batch_size": 50} ), ] - if include_docs: async for result in run_tasks(non_code_tasks, repo_path): yield result From d2911c1c591b95f144a38cdb16a484118a524096 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:15:31 +0100 Subject: [PATCH 09/17] fix: limit recursion, add comment --- .../tasks/repo_processor/get_source_code_chunks.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index dfbc9c2b6..5aa0c35f2 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -33,9 +33,15 @@ def _get_naive_subchunk_token_counts( def _get_subchunk_token_counts( - tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 + tokenizer: tiktoken.Encoding, + source_code: str, + max_subchunk_tokens: int = 8000, + depth: int = 0, + max_depth: int = 100 ) -> list[tuple[str, int]]: """Splits source code into subchunks and counts tokens for each subchunk.""" + if depth > max_depth: + return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens) try: module = parso.parse(source_code) @@ -47,6 +53,7 @@ def _get_subchunk_token_counts( logger.warning("Parsed module has no children (empty or invalid source code).") return [] + # Handle cases with only one real child and an EndMarker to prevent infinite recursion. 
if len(module.children) <= 2: module = module.children[0] @@ -66,7 +73,9 @@ def _get_subchunk_token_counts( subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) continue - subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + subchunk_token_counts.extend( + _get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth) + ) return subchunk_token_counts From 762df1124914a7c734e30cb42e0d5a6305930889 Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Mon, 23 Dec 2024 11:39:57 +0100 Subject: [PATCH 10/17] handle embedding empty input error (#398) --- cognee/api/v1/cognify/code_graph_pipeline.py | 3 ++- cognee/tasks/storage/index_data_points.py | 8 +++++++- examples/python/code_graph_example.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 99032255f..3d31b4000 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,9 +2,9 @@ import logging from pathlib import Path +from cognee.base_config import get_base_config from cognee.infrastructure.databases.vector.embeddings import \ get_embedding_engine -from cognee.base_config import get_base_config from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -79,6 +79,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): task_config={"batch_size": 50} ), ] + if include_docs: async for result in run_tasks(non_code_tasks, repo_path): yield result diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 857e4d777..058d4e8e5 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,3 +1,6 @@ +from litellm.exceptions import BadRequestError +from litellm.llms.OpenAI.openai import OpenAIError + from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint @@ -30,7 +33,10 @@ async def index_data_points(data_points: list[DataPoint]): for index_name, indexable_points in index_points.items(): index_name, field_name = index_name.split(".") - await vector_engine.index_data_points(index_name, field_name, indexable_points) + try: + await vector_engine.index_data_points(index_name, field_name, indexable_points) + except (OpenAIError, BadRequestError) as e: + print(f"Failed to index data points for {index_name}.{field_name}: {e}") return data_points diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index c0b91972b..44ab33aad 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -11,6 +11,6 @@ async def main(repo_path, include_docs): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") - parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files") + parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") args = parser.parse_args() asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file From 
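The code_graph_example.py change in PATCH 10/17 above works around a classic Python pitfall rather than anything cognee-specific: with type=bool, argparse applies bool() to the raw argument string, and any non-empty string, including "False", is truthy, so --include_docs False would still enable document processing. A small self-contained demonstration of the failure mode and the lambda-based fix:

import argparse

parser = argparse.ArgumentParser()
# Buggy: bool("False") is True, because bool() on any non-empty string is True.
parser.add_argument("--buggy", type=bool, default=True)
# Fixed: parse the string explicitly, as the patch does for --include_docs.
parser.add_argument("--fixed", type=lambda x: x.lower() in ("true", "1"), default=True)

args = parser.parse_args(["--buggy", "False", "--fixed", "False"])
print(args.buggy)  # True  -- the naive conversion ignores the string's content
print(args.fixed)  # False -- the lambda compares the lowered string to "true"/"1"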
ce6f7302f02d43c5337807e4777357a3f6f814dd Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:10:07 +0100 Subject: [PATCH 11/17] feat: robustly handle CodeFile source code --- cognee/tasks/repo_processor/get_source_code_chunks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 5aa0c35f2..1d69ac3b5 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -115,6 +115,10 @@ def get_source_code_chunks_from_code_part( model_name: str = "text-embedding-3-large" ) -> Generator[SourceCodeChunk, None, None]: """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" + if not code_file_part.source_code: + logger.error(f"No source code in CodeFile {code_file_part.id}") + return + tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * max_tokens)) subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) From c50d0c7ee65ab354dd3b2ddd3ca2e7b57b526a99 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:13:25 +0100 Subject: [PATCH 12/17] refactor: sort imports --- cognee/tasks/repo_processor/get_source_code_chunks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 1d69ac3b5..4ce83cfc5 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -1,9 +1,11 @@ from typing import AsyncGenerator, Generator from uuid import NAMESPACE_OID, uuid5 -from cognee.infrastructure.engine import DataPoint -from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk, CodeFile -import tiktoken + import parso +import tiktoken + +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk from cognee.tasks.repo_processor import logger From 07dcf73974dfd8d9e1aad74eb4e0a47990ca1854 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:15:51 +0100 Subject: [PATCH 13/17] todo: add support for other embedding models --- cognee/tasks/repo_processor/get_source_code_chunks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 4ce83cfc5..854c50681 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -143,6 +143,7 @@ def get_source_code_chunks_from_code_part( async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" + # TODO: Add support for other embedding models, with max_token mapping for data_point in data_points: yield data_point if not isinstance(data_point, CodeFile): From 35071b5c100d98c1fbfdc86c60253041ca03089d Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:17:38 +0100 Subject: [PATCH 14/17] feat: add custom logger --- 
cognee/tasks/repo_processor/get_source_code_chunks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 854c50681..b12496775 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -1,3 +1,4 @@ +import logging from typing import AsyncGenerator, Generator from uuid import NAMESPACE_OID, uuid5 @@ -7,7 +8,7 @@ from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk -from cognee.tasks.repo_processor import logger +logger = logging.getLogger("task:get_source_code_chunks") def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: From 68a9d278f768e21d8ecc9e600bfd97a20f604ec4 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:26:57 +0100 Subject: [PATCH 15/17] feat: add robustness to get_source_code_chunks Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../repo_processor/get_source_code_chunks.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index b12496775..4d0ce3200 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -146,12 +146,19 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model=" """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" # TODO: Add support for other embedding models, with max_token mapping for data_point in data_points: - yield data_point - if not isinstance(data_point, CodeFile): - continue - if not data_point.contains: - continue - for code_part in data_point.contains: - yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): - yield source_code_chunk + try: + yield data_point + if not isinstance(data_point, CodeFile): + continue + if not data_point.contains: + logger.warning(f"CodeFile {data_point.id} contains no code parts") + continue + for code_part in data_point.contains: + try: + yield code_part + for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + yield source_code_chunk + except Exception as e: + logger.error(f"Error processing code part: {e}") + except Exception as e: + logger.error(f"Error processing data point: {e}") From cf63dbcd0d87482cccc3bb57b0ab0395f4fa6475 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 14:17:10 +0100 Subject: [PATCH 16/17] feat: improve embedding exceptions --- .../exceptions/embedding_exception.py | 3 ++ .../embeddings/LiteLLMEmbeddingEngine.py | 33 +++++++++++-------- cognee/tasks/storage/index_data_points.py | 9 +++-- 3 files changed, 27 insertions(+), 18 deletions(-) create mode 100644 cognee/infrastructure/databases/exceptions/embedding_exception.py diff --git a/cognee/infrastructure/databases/exceptions/embedding_exception.py b/cognee/infrastructure/databases/exceptions/embedding_exception.py new file mode 100644 index 000000000..ba7c70d80 --- /dev/null +++ b/cognee/infrastructure/databases/exceptions/embedding_exception.py @@ -0,0 +1,3 @@ +class EmbeddingException(Exception): + """Custom exception 
for handling embedding-related errors.""" + pass \ No newline at end of file diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index dce12b318..e1699fcc9 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -5,25 +5,27 @@ import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine +from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException litellm.set_verbose = False logger = logging.getLogger("LiteLLMEmbeddingEngine") + class LiteLLMEmbeddingEngine(EmbeddingEngine): api_key: str endpoint: str api_version: str model: str dimensions: int - mock:bool + mock: bool def __init__( - self, - model: Optional[str] = "text-embedding-3-large", - dimensions: Optional[int] = 3072, - api_key: str = None, - endpoint: str = None, - api_version: str = None, + self, + model: Optional[str] = "text-embedding-3-large", + dimensions: Optional[int] = 3072, + api_key: str = None, + endpoint: str = None, + api_version: str = None, ): self.api_key = api_key self.endpoint = endpoint @@ -33,7 +35,7 @@ def __init__( enable_mocking = os.getenv("MOCK_EMBEDDING", "false") if isinstance(enable_mocking, bool): - enable_mocking= str(enable_mocking).lower() + enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") MAX_RETRIES = 5 @@ -43,7 +45,7 @@ async def embed_text(self, text: List[str]) -> List[List[float]]: async def exponential_backoff(attempt): wait_time = min(10 * (2 ** attempt), 60) # Max 60 seconds await asyncio.sleep(wait_time) - + try: if self.mock: response = { @@ -56,10 +58,10 @@ async def exponential_backoff(attempt): else: response = await litellm.aembedding( self.model, - input = text, - api_key = self.api_key, - api_base = self.endpoint, - api_version = self.api_version + input=text, + api_key=self.api_key, + api_base=self.endpoint, + api_version=self.api_version ) self.retry_count = 0 @@ -71,7 +73,7 @@ async def exponential_backoff(attempt): if len(text) == 1: parts = [text] else: - parts = [text[0:math.ceil(len(text)/2)], text[math.ceil(len(text)/2):]] + parts = [text[0:math.ceil(len(text) / 2)], text[math.ceil(len(text) / 2):]] parts_futures = [self.embed_text(part) for part in parts] embeddings = await asyncio.gather(*parts_futures) @@ -95,6 +97,9 @@ async def exponential_backoff(attempt): return await self.embed_text(text) + except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError): + raise EmbeddingException("Failed to index data points.") + except Exception as error: logger.error("Error embedding text: %s", str(error)) raise error diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 058d4e8e5..3cd3a29af 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,9 +1,10 @@ -from litellm.exceptions import BadRequestError -from litellm.llms.OpenAI.openai import OpenAIError +import logging +from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint +logger = logging.getLogger("index_data_points") async def index_data_points(data_points: list[DataPoint]): 
created_indexes = {} @@ -35,8 +36,8 @@ async def index_data_points(data_points: list[DataPoint]): index_name, field_name = index_name.split(".") try: await vector_engine.index_data_points(index_name, field_name, indexable_points) - except (OpenAIError, BadRequestError) as e: - print(f"Failed to index data points for {index_name}.{field_name}: {e}") + except EmbeddingException as e: + logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}") return data_points From f5fa3ec4beec26b6e624ead38b005527fec5bec3 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 26 Dec 2024 13:39:47 +0100 Subject: [PATCH 17/17] refactor: format indents, rename module --- ...mbedding_exception.py => EmbeddingException.py} | 0 .../vector/embeddings/LiteLLMEmbeddingEngine.py | 14 +++++++------- cognee/tasks/storage/index_data_points.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) rename cognee/infrastructure/databases/exceptions/{embedding_exception.py => EmbeddingException.py} (100%) diff --git a/cognee/infrastructure/databases/exceptions/embedding_exception.py b/cognee/infrastructure/databases/exceptions/EmbeddingException.py similarity index 100% rename from cognee/infrastructure/databases/exceptions/embedding_exception.py rename to cognee/infrastructure/databases/exceptions/EmbeddingException.py diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index e1699fcc9..93f59cc77 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -5,7 +5,7 @@ import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine -from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException +from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException litellm.set_verbose = False logger = logging.getLogger("LiteLLMEmbeddingEngine") @@ -20,12 +20,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): mock: bool def __init__( - self, - model: Optional[str] = "text-embedding-3-large", - dimensions: Optional[int] = 3072, - api_key: str = None, - endpoint: str = None, - api_version: str = None, + self, + model: Optional[str] = "text-embedding-3-large", + dimensions: Optional[int] = 3072, + api_key: str = None, + endpoint: str = None, + api_version: str = None, ): self.api_key = api_key self.endpoint = endpoint diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 3cd3a29af..12af2d2ef 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,6 +1,6 @@ import logging -from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException +from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint
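Taken together, PATCH 16/17 and PATCH 17/17 settle on an error-translation pattern at the embedding boundary: LiteLLMEmbeddingEngine converts provider-specific litellm errors into a single domain-level EmbeddingException, and index_data_points catches only that exception and downgrades it to a warning, so one failed index does not abort the whole pipeline run. A self-contained sketch of the pattern, with stand-in classes rather than the real cognee and litellm types:

import asyncio
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("index_data_points")

class ProviderError(Exception):
    """Stand-in for a provider error such as litellm's OpenAIError."""

class EmbeddingException(Exception):
    """Domain-level error, mirroring the new embedding_exception module."""

async def embed_text(texts):
    try:
        raise ProviderError("input is empty")  # simulate the provider failing
    except ProviderError as error:
        # Translate at the engine boundary, as LiteLLMEmbeddingEngine does,
        # so callers never have to import provider exception types.
        raise EmbeddingException("Failed to index data points.") from error

async def index_data_points(texts):
    try:
        await embed_text(texts)
    except EmbeddingException as error:
        # Indexing is best-effort: log a warning and let the pipeline continue.
        logger.warning("Failed to index data points: %s", error)

asyncio.run(index_data_points([""]))

One consequence of this design is that the vector store can end up partially populated after a run; the warning log is then the only signal that an index is missing.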