From 682ddfc020e2f82e2d14a60ee8a989803479a77d Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:32:40 +0100 Subject: [PATCH 01/17] fix: pass the list of all CodeFiles to enrichment task --- .../tasks/repo_processor/get_repo_file_dependencies.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 221af6cf6..b54c1f152 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -71,7 +71,7 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non path = repo_path, ) - yield repo + yield [repo] with ProcessPoolExecutor(max_workers = 12) as executor: loop = asyncio.get_event_loop() @@ -90,10 +90,11 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non results = await asyncio.gather(*tasks) + code_files = [] for (file_path, metadata), dependencies in zip(py_files_dict.items(), results): source_code = metadata.get("source_code") - yield CodeFile( + code_files.append(CodeFile( id = uuid5(NAMESPACE_OID, file_path), source_code = source_code, extracted_id = file_path, @@ -106,4 +107,6 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non source_code = py_files_dict.get(dependency, {}).get("source_code"), ) for dependency in dependencies ] if dependencies else None, - ) + )) + + yield code_files From d40fc12939b73fa421c481254439d28bc1c98194 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:33:54 +0100 Subject: [PATCH 02/17] feat: introduce SourceCodeChunk, update metadata --- cognee/shared/CodeGraphEntities.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 23b8879c2..27289493d 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -1,5 +1,4 @@ from typing import List, Optional - from cognee.infrastructure.engine import DataPoint @@ -7,7 +6,7 @@ class Repository(DataPoint): __tablename__ = "Repository" path: str _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "Repository" } @@ -19,29 +18,31 @@ class CodeFile(DataPoint): depends_on: Optional[List["CodeFile"]] = None depends_directly_on: Optional[List["CodeFile"]] = None contains: Optional[List["CodePart"]] = None - _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "CodeFile" } class CodePart(DataPoint): __tablename__ = "codepart" - # part_of: Optional[CodeFile] - source_code: str - + # part_of: Optional[CodeFile] = None + source_code: Optional[str] = None _metadata: dict = { - "index_fields": ["source_code"], + "index_fields": [], "type": "CodePart" } -class CodeRelationship(DataPoint): - source_id: str - target_id: str - relation: str # depends on or depends directly +class SourceCodeChunk(DataPoint): + __tablename__ = "sourcecodechunk" + code_chunk_of: Optional[CodePart] = None + source_code: Optional[str] = None + previous_chunk: Optional["SourceCodeChunk"] = None + _metadata: dict = { - "type": "CodeRelationship" + "index_fields": ["source_code"], + "type": "SourceCodeChunk" } CodeFile.model_rebuild() CodePart.model_rebuild() +SourceCodeChunk.model_rebuild() \ No newline at end of file From 
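The entity changes in PATCH 02/17 shift embedding from whole files to chunks: index_fields is emptied on Repository, CodeFile and CodePart, while SourceCodeChunk alone indexes "source_code", so vectors are produced at chunk granularity. Each chunk points back to its CodePart through code_chunk_of and to the preceding chunk through previous_chunk, forming a linked list over a part's source code. A minimal standalone sketch of that chaining, using a plain dataclass as a stand-in for cognee's DataPoint base and the deterministic uuid5 ids that PATCH 03/17 below introduces:

from dataclasses import dataclass
from typing import Optional
from uuid import NAMESPACE_OID, UUID, uuid5

@dataclass
class ChunkSketch:
    # Stand-in for SourceCodeChunk; the real class derives from DataPoint.
    id: UUID
    source_code: str
    previous_chunk: Optional["ChunkSketch"] = None

previous = None
for text in ["def f():", "    return 1"]:
    # uuid5 over the chunk text keeps ids stable across pipeline reruns,
    # so re-ingesting identical code produces the same chunk nodes.
    chunk = ChunkSketch(id=uuid5(NAMESPACE_OID, text), source_code=text, previous_chunk=previous)
    previous = chunk

assert previous.previous_chunk.source_code == "def f():"

Walking the previous_chunk pointers at retrieval time recovers the original ordering of chunks, which plain vector search over isolated chunks would otherwise lose.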
aea7382983b5250eb443c46adc6a387d2690e22e Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:34:17 +0100 Subject: [PATCH 03/17] feat: get_source_code_chunks code graph pipeline task --- .../repo_processor/get_source_code_chunks.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 cognee/tasks/repo_processor/get_source_code_chunks.py diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py new file mode 100644 index 000000000..2cd007a19 --- /dev/null +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -0,0 +1,112 @@ +from typing import AsyncGenerator, Generator +from uuid import NAMESPACE_OID, uuid5 +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk, CodeFile +import tiktoken +import parso + +from cognee.tasks.repo_processor import logger + + +def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: + return len(tokenizer.encode(source_code)) + + +def _get_subchunk_token_counts( + tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 +) -> list[tuple[str, int]]: + """Splits source code into subchunks and counts tokens for each subchunk.""" + + try: + module = parso.parse(source_code) + except Exception as e: + logger.error(f"Error parsing source code: {e}") + return [] + + if not module.children: + logger.warning("Parsed module has no children (empty or invalid source code).") + return [] + + if len(module.children) <= 2: + module = module.children[0] + + subchunk_token_counts = [] + for child in module.children: + subchunk = child.get_code() + token_count = _count_tokens(tokenizer, subchunk) + if token_count <= max_subchunk_tokens: + subchunk_token_counts.append((subchunk, token_count)) + continue + + subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + + return subchunk_token_counts + + +def _get_chunk_source_code( + code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int +) -> tuple[list[tuple[str, int]], str]: + """Generates a chunk of source code from tokenized subchunks with overlap handling.""" + current_count = 0 + cumulative_counts = [] + current_source_code = '' + + for i, (child_code, token_count) in enumerate(code_token_counts): + current_count += token_count + cumulative_counts.append(current_count) + if current_count > max_tokens: + break + current_source_code += f"\n{child_code}" + + if current_count<= max_tokens: + return [], current_source_code.strip() + + cutoff = 1 + for i, cum_count in enumerate(cumulative_counts): + if cum_count> (1 - overlap) * max_tokens: + break + cutoff = i + + return code_token_counts[cutoff:], current_source_code.strip() + + +def get_source_code_chunks_from_code_part( + code_file_part: CodePart, + max_tokens: int = 8192, + overlap: float = 0.25, + granularity: float = 0.1, + model_name: str = "text-embedding-3-large" +) -> Generator[SourceCodeChunk, None, None]: + """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" + tokenizer = tiktoken.encoding_for_model(model_name) + max_subchunk_tokens = max(1, int(granularity * max_tokens)) + subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) + + previous_chunk = None + while subchunk_token_counts: + subchunk_token_counts, chunk_source_code = 
_get_chunk_source_code(subchunk_token_counts, overlap, max_tokens) + if not chunk_source_code: + continue + current_chunk = SourceCodeChunk( + id=uuid5(NAMESPACE_OID, chunk_source_code), + code_chunk_of=code_file_part, + source_code=chunk_source_code, + previous_chunk=previous_chunk + ) + yield current_chunk + previous_chunk = current_chunk + + +async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ +AsyncGenerator[list[DataPoint], None]: + """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" + for data_point in data_points: + yield data_point + if not isinstance(data_point, CodeFile): + continue + if not data_point.contains: + continue + for code_part in data_point.contains: + yield code_part + for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + yield source_code_chunk From 1c5ca8460a9cd382eb7d3d5cd3bdd2c88c6f1bbe Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:34:51 +0100 Subject: [PATCH 04/17] feat: integrate get_source_code_chunks task, comment out summarize_code --- cognee/api/v1/cognify/code_graph_pipeline.py | 50 ++++++++++---------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index eeb10d69e..ac89f257a 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -7,34 +7,32 @@ from pathlib import Path from typing import Union -from cognee.modules.data.methods import get_datasets, get_datasets_by_name +from cognee.shared.SourceCodeGraph import SourceCodeGraph +from cognee.shared.data_models import SummarizedContent +from cognee.shared.utils import send_telemetry +from cognee.modules.data.models import Dataset, Data from cognee.modules.data.methods.get_dataset_data import get_dataset_data -from cognee.modules.data.models import Data, Dataset -from cognee.modules.pipelines import run_tasks -from cognee.modules.pipelines.models import PipelineRunStatus -from cognee.modules.pipelines.operations.get_pipeline_status import \ - get_pipeline_status -from cognee.modules.pipelines.operations.log_pipeline_status import \ - log_pipeline_status +from cognee.modules.data.methods import get_datasets, get_datasets_by_name from cognee.modules.pipelines.tasks.Task import Task -from cognee.modules.users.methods import get_default_user +from cognee.modules.pipelines import run_tasks from cognee.modules.users.models import User -from cognee.shared.SourceCodeGraph import SourceCodeGraph -from cognee.shared.utils import send_telemetry -from cognee.tasks.documents import (check_permissions_on_documents, - classify_documents, - extract_chunks_from_documents) +from cognee.modules.users.methods import get_default_user +from cognee.modules.pipelines.models import PipelineRunStatus +from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status +from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status +from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents +from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks from cognee.tasks.graph import extract_graph_from_code -from cognee.tasks.repo_processor import (enrich_dependency_graph, - expand_dependency_graph, - get_repo_file_dependencies) +from cognee.tasks.repo_processor 
import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code +from cognee.infrastructure.databases.vector.embeddings import (get_embedding_engine) logger = logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() + async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User = None): if user is None: user = await get_default_user() @@ -65,7 +63,7 @@ async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User async def run_pipeline(dataset: Dataset, user: User): '''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.''' - data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id) + data_documents: list[Data] = await get_dataset_data(dataset_id=dataset.id) document_ids_str = [str(document.id) for document in data_documents] @@ -88,10 +86,11 @@ async def run_pipeline(dataset: Dataset, user: User): try: tasks = [ Task(classify_documents), - Task(check_permissions_on_documents, user = user, permissions = ["write"]), - Task(extract_chunks_from_documents), # Extract text chunks based on the document type. - Task(add_data_points, task_config = { "batch_size": 10 }), - Task(extract_graph_from_code, graph_model = SourceCodeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks. + Task(check_permissions_on_documents, user=user, permissions=["write"]), + Task(extract_chunks_from_documents), # Extract text chunks based on the document type. + Task(add_data_points, task_config={"batch_size": 10}), + Task(extract_graph_from_code, graph_model=SourceCodeGraph, task_config={"batch_size": 10}), + # Generate knowledge graphs from the document chunks. 
] pipeline = run_tasks(tasks, data_documents, "code_graph_pipeline") @@ -135,11 +134,14 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + embedding_engine = get_embedding_engine() + tasks = [ Task(get_repo_file_dependencies), - Task(enrich_dependency_graph, task_config={"batch_size": 50}), + Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task(summarize_code, task_config={"batch_size": 50}), + Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), + # Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] From ccd5cb6ef2c9cbe0e42cce0378d71577aed5981e Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:17:52 +0100 Subject: [PATCH 05/17] Fix code summarization (#387) --- cognee/api/v1/cognify/code_graph_pipeline.py | 39 ++++++++++++-------- cognee/tasks/summarization/models.py | 6 ++- cognee/tasks/summarization/summarize_code.py | 6 +-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index ac89f257a..0a6af0f95 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -7,26 +7,34 @@ from pathlib import Path from typing import Union -from cognee.shared.SourceCodeGraph import SourceCodeGraph -from cognee.shared.data_models import SummarizedContent -from cognee.shared.utils import send_telemetry -from cognee.modules.data.models import Dataset, Data -from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.infrastructure.databases.vector.embeddings import \ + get_embedding_engine from cognee.modules.data.methods import get_datasets, get_datasets_by_name -from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.models import Data, Dataset from cognee.modules.pipelines import run_tasks -from cognee.modules.users.models import User -from cognee.modules.users.methods import get_default_user from cognee.modules.pipelines.models import PipelineRunStatus -from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status -from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status -from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents -from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks +from cognee.modules.pipelines.operations.get_pipeline_status import \ + get_pipeline_status +from cognee.modules.pipelines.operations.log_pipeline_status import \ + log_pipeline_status +from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.models import User +from cognee.shared.data_models import SummarizedContent +from cognee.shared.SourceCodeGraph import SourceCodeGraph +from cognee.shared.utils import send_telemetry +from cognee.tasks.documents import (check_permissions_on_documents, + classify_documents, + extract_chunks_from_documents) from cognee.tasks.graph import extract_graph_from_code -from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph +from 
cognee.tasks.repo_processor import (enrich_dependency_graph, + expand_dependency_graph, + get_repo_file_dependencies) +from cognee.tasks.repo_processor.get_source_code_chunks import \ + get_source_code_chunks from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code -from cognee.infrastructure.databases.vector.embeddings import (get_embedding_engine) logger = logging.getLogger("code_graph_pipeline") @@ -121,6 +129,7 @@ def generate_dataset_name(dataset_name: str) -> str: async def run_code_graph_pipeline(repo_path): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -141,7 +150,7 @@ async def run_code_graph_pipeline(repo_path): Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), - # Task(summarize_code, task_config={"batch_size": 50}), + Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index add448155..178267ce6 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,6 +1,8 @@ +from typing import Union + from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models import DocumentChunk -from cognee.shared.CodeGraphEntities import CodeFile +from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk class TextSummary(DataPoint): @@ -17,7 +19,7 @@ class TextSummary(DataPoint): class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str - made_from: CodeFile + made_from: Union[CodeFile, CodePart, SourceCodeChunk] _metadata: dict = { "index_fields": ["text"], diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index b116e57a9..3b393a0d5 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -1,10 +1,10 @@ import asyncio from typing import AsyncGenerator, Union from uuid import uuid5 -from typing import Type from cognee.infrastructure.engine import DataPoint from cognee.modules.data.extraction.extract_summary import extract_code_summary + from .models import CodeSummary @@ -21,7 +21,7 @@ async def summarize_code( ) file_summaries_map = { - code_data_point.extracted_id: str(file_summary) + code_data_point.id: str(file_summary) for code_data_point, file_summary in zip(code_data_points, file_summaries) } @@ -36,5 +36,5 @@ async def summarize_code( yield CodeSummary( id=uuid5(node.id, "CodeSummary"), made_from=node, - text=file_summaries_map[node.extracted_id], + text=file_summaries_map[node.id], ) From 5deb83ef35da0bf53fe6e8104fb3ebf9bf76658f Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:19:12 +0100 Subject: [PATCH 06/17] feat: update data models --- cognee/shared/data_models.py | 1 - cognee/tasks/summarization/models.py | 2 +- cognee/tasks/summarization/summarize_code.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index dec53cfcb..2a8bc8c91 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -210,7 +210,6 @@ class SummarizedClass(BaseModel): decorators: Optional[List[str]] = None class SummarizedCode(BaseModel): - 
file_name: str high_level_summary: str key_features: List[str] imports: List[str] = [] diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 178267ce6..5b0345015 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -19,7 +19,7 @@ class TextSummary(DataPoint): class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str - made_from: Union[CodeFile, CodePart, SourceCodeChunk] + summarizes: Union[CodeFile, CodePart, SourceCodeChunk] _metadata: dict = { "index_fields": ["text"], diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index 3b393a0d5..9efc5b6ca 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -35,6 +35,6 @@ async def summarize_code( yield CodeSummary( id=uuid5(node.id, "CodeSummary"), - made_from=node, + summarizes=node, text=file_summaries_map[node.id], ) From ace45ef378acf5a0168de88f70f522e4f9121cde Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:15:43 +0100 Subject: [PATCH 07/17] feat: naive parse long strings in source code --- .../repo_processor/get_source_code_chunks.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 2cd007a19..dfbc9c2b6 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -12,6 +12,26 @@ def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: return len(tokenizer.encode(source_code)) +def _get_naive_subchunk_token_counts( + tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 +) -> list[tuple[str, int]]: + """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens.""" + + token_ids = tokenizer.encode(source_code) + subchunk_token_counts = [] + + for start_idx in range(0, len(token_ids), max_subchunk_tokens): + subchunk_token_ids = token_ids[start_idx: start_idx + max_subchunk_tokens] + token_count = len(subchunk_token_ids) + subchunk = ''.join( + tokenizer.decode_single_token_bytes(token_id).decode('utf-8', errors='replace') + for token_id in subchunk_token_ids + ) + subchunk_token_counts.append((subchunk, token_count)) + + return subchunk_token_counts + + def _get_subchunk_token_counts( tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 ) -> list[tuple[str, int]]: """Splits source code into subchunks and counts tokens for each subchunk.""" @@ -34,10 +54,18 @@ def _get_subchunk_token_counts( for child in module.children: subchunk = child.get_code() token_count = _count_tokens(tokenizer, subchunk) + + if token_count == 0: + continue + if token_count <= max_subchunk_tokens: subchunk_token_counts.append((subchunk, token_count)) continue + if child.type == 'string': + subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + continue + subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) return subchunk_token_counts @@ -58,12 +86,12 @@ def _get_chunk_source_code( break current_source_code += f"\n{child_code}" - if current_count<= max_tokens: + if current_count <= max_tokens: return [], current_source_code.strip() cutoff = 1 for i, cum_count in enumerate(cumulative_counts): - if cum_count> (1 - overlap) * max_tokens: + if cum_count > (1 - overlap) * 
max_tokens: break cutoff = i @@ -98,7 +126,7 @@ def get_source_code_chunks_from_code_part( async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ -AsyncGenerator[list[DataPoint], None]: + AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" for data_point in data_points: yield data_point From c1539f6f2c60e70009e72354d18123ddddb1512f Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:03:38 +0100 Subject: [PATCH 08/17] fix: get_non_py_files instead of get_non_code_files --- cognee/api/v1/cognify/code_graph_pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 4d0f8501b..99032255f 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -17,7 +17,7 @@ from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, get_data_list_for_user, - get_non_code_files, + get_non_py_files, get_repo_file_dependencies) from cognee.tasks.repo_processor.get_source_code_chunks import \ get_source_code_chunks @@ -67,7 +67,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): if include_docs: non_code_tasks = [ - Task(get_non_code_files, task_config={"batch_size": 50}), + Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), @@ -79,7 +79,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): task_config={"batch_size": 50} ), ] - if include_docs: async for result in run_tasks(non_code_tasks, repo_path): yield result From d2911c1c591b95f144a38cdb16a484118a524096 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:15:31 +0100 Subject: [PATCH 09/17] fix: limit recursion, add comment --- .../tasks/repo_processor/get_source_code_chunks.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index dfbc9c2b6..5aa0c35f2 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -33,9 +33,15 @@ def _get_naive_subchunk_token_counts( def _get_subchunk_token_counts( - tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 + tokenizer: tiktoken.Encoding, + source_code: str, + max_subchunk_tokens: int = 8000, + depth: int = 0, + max_depth: int = 100 ) -> list[tuple[str, int]]: """Splits source code into subchunks and counts tokens for each subchunk.""" + if depth > max_depth: + return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens) try: module = parso.parse(source_code) @@ -47,6 +53,7 @@ def _get_subchunk_token_counts( logger.warning("Parsed module has no children (empty or invalid source code).") return [] + # Handle cases with only one real child and an EndMarker to prevent infinite recursion. 
if len(module.children) <= 2: module = module.children[0] @@ -66,7 +73,9 @@ def _get_subchunk_token_counts( subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) continue - subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)) + subchunk_token_counts.extend( + _get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth) + ) return subchunk_token_counts From 762df1124914a7c734e30cb42e0d5a6305930889 Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Mon, 23 Dec 2024 11:39:57 +0100 Subject: [PATCH 10/17] handle embedding empty input error (#398) --- cognee/api/v1/cognify/code_graph_pipeline.py | 3 ++- cognee/tasks/storage/index_data_points.py | 8 +++++++- examples/python/code_graph_example.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 99032255f..3d31b4000 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,9 +2,9 @@ import logging from pathlib import Path +from cognee.base_config import get_base_config from cognee.infrastructure.databases.vector.embeddings import \ get_embedding_engine -from cognee.base_config import get_base_config from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -79,6 +79,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): task_config={"batch_size": 50} ), ] + if include_docs: async for result in run_tasks(non_code_tasks, repo_path): yield result diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 857e4d777..058d4e8e5 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,3 +1,6 @@ +from litellm.exceptions import BadRequestError +from litellm.llms.OpenAI.openai import OpenAIError + from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint @@ -30,7 +33,10 @@ async def index_data_points(data_points: list[DataPoint]): for index_name, indexable_points in index_points.items(): index_name, field_name = index_name.split(".") - await vector_engine.index_data_points(index_name, field_name, indexable_points) + try: + await vector_engine.index_data_points(index_name, field_name, indexable_points) + except (OpenAIError, BadRequestError) as e: + print(f"Failed to index data points for {index_name}.{field_name}: {e}") return data_points diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index c0b91972b..44ab33aad 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -11,6 +11,6 @@ async def main(repo_path, include_docs): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") - parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files") + parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") args = parser.parse_args() asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file From 
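The code_graph_example.py change in PATCH 10/17 above works around a classic Python pitfall rather than anything cognee-specific: with type=bool, argparse applies bool() to the raw argument string, and any non-empty string, including "False", is truthy, so --include_docs False would still enable document processing. A small self-contained demonstration of the failure mode and the lambda-based fix:

import argparse

parser = argparse.ArgumentParser()
# Buggy: bool("False") is True, because bool() on any non-empty string is True.
parser.add_argument("--buggy", type=bool, default=True)
# Fixed: parse the string explicitly, as the patch does for --include_docs.
parser.add_argument("--fixed", type=lambda x: x.lower() in ("true", "1"), default=True)

args = parser.parse_args(["--buggy", "False", "--fixed", "False"])
print(args.buggy)  # True  -- the naive conversion ignores the string's content
print(args.fixed)  # False -- the lambda compares the lowered string to "true"/"1"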
ce6f7302f02d43c5337807e4777357a3f6f814dd Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:10:07 +0100 Subject: [PATCH 11/17] feat: robustly handle CodeFile source code --- cognee/tasks/repo_processor/get_source_code_chunks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 5aa0c35f2..1d69ac3b5 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -115,6 +115,10 @@ def get_source_code_chunks_from_code_part( model_name: str = "text-embedding-3-large" ) -> Generator[SourceCodeChunk, None, None]: """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" + if not code_file_part.source_code: + logger.error(f"No source code in CodeFile {code_file_part.id}") + return + tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * max_tokens)) subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) From c50d0c7ee65ab354dd3b2ddd3ca2e7b57b526a99 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:13:25 +0100 Subject: [PATCH 12/17] refactor: sort imports --- cognee/tasks/repo_processor/get_source_code_chunks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 1d69ac3b5..4ce83cfc5 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -1,9 +1,11 @@ from typing import AsyncGenerator, Generator from uuid import NAMESPACE_OID, uuid5 -from cognee.infrastructure.engine import DataPoint -from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk, CodeFile -import tiktoken + import parso +import tiktoken + +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk from cognee.tasks.repo_processor import logger From 07dcf73974dfd8d9e1aad74eb4e0a47990ca1854 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:15:51 +0100 Subject: [PATCH 13/17] todo: add support for other embedding models --- cognee/tasks/repo_processor/get_source_code_chunks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 4ce83cfc5..854c50681 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -143,6 +143,7 @@ def get_source_code_chunks_from_code_part( async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" + # TODO: Add support for other embedding models, with max_token mapping for data_point in data_points: yield data_point if not isinstance(data_point, CodeFile): From 35071b5c100d98c1fbfdc86c60253041ca03089d Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:17:38 +0100 Subject: [PATCH 14/17] feat: add custom logger --- 
cognee/tasks/repo_processor/get_source_code_chunks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 854c50681..b12496775 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -1,3 +1,4 @@ +import logging from typing import AsyncGenerator, Generator from uuid import NAMESPACE_OID, uuid5 @@ -7,7 +8,7 @@ from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk -from cognee.tasks.repo_processor import logger +logger = logging.getLogger("task:get_source_code_chunks") def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: From 68a9d278f768e21d8ecc9e600bfd97a20f604ec4 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:26:57 +0100 Subject: [PATCH 15/17] feat: add robustness to get_source_code_chunks Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../repo_processor/get_source_code_chunks.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index b12496775..4d0ce3200 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -146,12 +146,19 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model=" """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" # TODO: Add support for other embedding models, with max_token mapping for data_point in data_points: - yield data_point - if not isinstance(data_point, CodeFile): - continue - if not data_point.contains: - continue - for code_part in data_point.contains: - yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): - yield source_code_chunk + try: + yield data_point + if not isinstance(data_point, CodeFile): + continue + if not data_point.contains: + logger.warning(f"CodeFile {data_point.id} contains no code parts") + continue + for code_part in data_point.contains: + try: + yield code_part + for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + yield source_code_chunk + except Exception as e: + logger.error(f"Error processing code part: {e}") + except Exception as e: + logger.error(f"Error processing data point: {e}") From cf63dbcd0d87482cccc3bb57b0ab0395f4fa6475 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 23 Dec 2024 14:17:10 +0100 Subject: [PATCH 16/17] feat: improve embedding exceptions --- .../exceptions/embedding_exception.py | 3 ++ .../embeddings/LiteLLMEmbeddingEngine.py | 33 +++++++++++-------- cognee/tasks/storage/index_data_points.py | 9 +++-- 3 files changed, 27 insertions(+), 18 deletions(-) create mode 100644 cognee/infrastructure/databases/exceptions/embedding_exception.py diff --git a/cognee/infrastructure/databases/exceptions/embedding_exception.py b/cognee/infrastructure/databases/exceptions/embedding_exception.py new file mode 100644 index 000000000..ba7c70d80 --- /dev/null +++ b/cognee/infrastructure/databases/exceptions/embedding_exception.py @@ -0,0 +1,3 @@ +class EmbeddingException(Exception): + """Custom exception 
for handling embedding-related errors.""" + pass \ No newline at end of file diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index dce12b318..e1699fcc9 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -5,25 +5,27 @@ import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine +from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException litellm.set_verbose = False logger = logging.getLogger("LiteLLMEmbeddingEngine") + class LiteLLMEmbeddingEngine(EmbeddingEngine): api_key: str endpoint: str api_version: str model: str dimensions: int - mock:bool + mock: bool def __init__( - self, - model: Optional[str] = "text-embedding-3-large", - dimensions: Optional[int] = 3072, - api_key: str = None, - endpoint: str = None, - api_version: str = None, + self, + model: Optional[str] = "text-embedding-3-large", + dimensions: Optional[int] = 3072, + api_key: str = None, + endpoint: str = None, + api_version: str = None, ): self.api_key = api_key self.endpoint = endpoint @@ -33,7 +35,7 @@ def __init__( enable_mocking = os.getenv("MOCK_EMBEDDING", "false") if isinstance(enable_mocking, bool): - enable_mocking= str(enable_mocking).lower() + enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") MAX_RETRIES = 5 @@ -43,7 +45,7 @@ async def embed_text(self, text: List[str]) -> List[List[float]]: async def exponential_backoff(attempt): wait_time = min(10 * (2 ** attempt), 60) # Max 60 seconds await asyncio.sleep(wait_time) - + try: if self.mock: response = { @@ -56,10 +58,10 @@ async def exponential_backoff(attempt): else: response = await litellm.aembedding( self.model, - input = text, - api_key = self.api_key, - api_base = self.endpoint, - api_version = self.api_version + input=text, + api_key=self.api_key, + api_base=self.endpoint, + api_version=self.api_version ) self.retry_count = 0 @@ -71,7 +73,7 @@ async def exponential_backoff(attempt): if len(text) == 1: parts = [text] else: - parts = [text[0:math.ceil(len(text)/2)], text[math.ceil(len(text)/2):]] + parts = [text[0:math.ceil(len(text) / 2)], text[math.ceil(len(text) / 2):]] parts_futures = [self.embed_text(part) for part in parts] embeddings = await asyncio.gather(*parts_futures) @@ -95,6 +97,9 @@ async def exponential_backoff(attempt): return await self.embed_text(text) + except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError): + raise EmbeddingException("Failed to index data points.") + except Exception as error: logger.error("Error embedding text: %s", str(error)) raise error diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 058d4e8e5..3cd3a29af 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,9 +1,10 @@ -from litellm.exceptions import BadRequestError -from litellm.llms.OpenAI.openai import OpenAIError +import logging +from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint +logger = logging.getLogger("index_data_points") async def index_data_points(data_points: list[DataPoint]): 
created_indexes = {} @@ -35,8 +36,8 @@ async def index_data_points(data_points: list[DataPoint]): index_name, field_name = index_name.split(".") try: await vector_engine.index_data_points(index_name, field_name, indexable_points) - except (OpenAIError, BadRequestError) as e: - print(f"Failed to index data points for {index_name}.{field_name}: {e}") + except EmbeddingException as e: + logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}") return data_points From f5fa3ec4beec26b6e624ead38b005527fec5bec3 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 26 Dec 2024 13:39:47 +0100 Subject: [PATCH 17/17] refactor: format indents, rename module --- ...mbedding_exception.py => EmbeddingException.py} | 0 .../vector/embeddings/LiteLLMEmbeddingEngine.py | 14 +++++++------- cognee/tasks/storage/index_data_points.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) rename cognee/infrastructure/databases/exceptions/{embedding_exception.py => EmbeddingException.py} (100%) diff --git a/cognee/infrastructure/databases/exceptions/embedding_exception.py b/cognee/infrastructure/databases/exceptions/EmbeddingException.py similarity index 100% rename from cognee/infrastructure/databases/exceptions/embedding_exception.py rename to cognee/infrastructure/databases/exceptions/EmbeddingException.py diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index e1699fcc9..93f59cc77 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -5,7 +5,7 @@ import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine -from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException +from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException litellm.set_verbose = False logger = logging.getLogger("LiteLLMEmbeddingEngine") @@ -20,12 +20,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): mock: bool def __init__( - self, - model: Optional[str] = "text-embedding-3-large", - dimensions: Optional[int] = 3072, - api_key: str = None, - endpoint: str = None, - api_version: str = None, + self, + model: Optional[str] = "text-embedding-3-large", + dimensions: Optional[int] = 3072, + api_key: str = None, + endpoint: str = None, + api_version: str = None, ): self.api_key = api_key self.endpoint = endpoint diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 3cd3a29af..12af2d2ef 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,6 +1,6 @@ import logging -from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException +from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint
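Taken together, PATCH 16/17 and PATCH 17/17 settle on an error-translation pattern at the embedding boundary: LiteLLMEmbeddingEngine converts provider-specific litellm errors into a single domain-level EmbeddingException, and index_data_points catches only that exception and downgrades it to a warning, so one failed index does not abort the whole pipeline run. A self-contained sketch of the pattern, with stand-in classes rather than the real cognee and litellm types:

import asyncio
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("index_data_points")

class ProviderError(Exception):
    """Stand-in for a provider error such as litellm's OpenAIError."""

class EmbeddingException(Exception):
    """Domain-level error, mirroring the new embedding_exception module."""

async def embed_text(texts):
    try:
        raise ProviderError("input is empty")  # simulate the provider failing
    except ProviderError as error:
        # Translate at the engine boundary, as LiteLLMEmbeddingEngine does,
        # so callers never have to import provider exception types.
        raise EmbeddingException("Failed to index data points.") from error

async def index_data_points(texts):
    try:
        await embed_text(texts)
    except EmbeddingException as error:
        # Indexing is best-effort: log a warning and let the pipeline continue.
        logger.warning("Failed to index data points: %s", error)

asyncio.run(index_data_points([""]))

One consequence of this design is that the vector store can end up partially populated after a run; the warning log is then the only signal that an index is missing.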