Cog 813 source code chunks #383

Merged: 19 commits, merged Dec 26, 2024
16 changes: 12 additions & 4 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -3,6 +3,8 @@
from pathlib import Path

from cognee.base_config import get_base_config
from cognee.infrastructure.databases.vector.embeddings import \
get_embedding_engine
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.tasks.Task import Task
@@ -15,8 +17,10 @@
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_data_list_for_user,
get_non_code_files,
get_non_py_files,
get_repo_file_dependencies)
from cognee.tasks.repo_processor.get_source_code_chunks import \
get_source_code_chunks
from cognee.tasks.storage import add_data_points

monitoring = get_base_config().monitoring_tool
@@ -28,6 +32,7 @@
logger = logging.getLogger("code_graph_pipeline")
update_status_lock = asyncio.Lock()


@observe
async def run_code_graph_pipeline(repo_path, include_docs=True):
import os
@@ -46,20 +51,23 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()

embedding_engine = get_embedding_engine()

cognee_config = get_cognify_config()
user = await get_default_user()

tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
Task(enrich_dependency_graph),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}),
Task(summarize_code, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]

if include_docs:
non_code_tasks = [
Task(get_non_code_files, task_config={"batch_size": 50}),
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Expand All @@ -71,7 +79,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
task_config={"batch_size": 50}
),
]

if include_docs:
async for result in run_tasks(non_code_tasks, repo_path):
yield result
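For context, the pipeline entry point changed above is an async generator, so a caller drives it by iterating over its results. A minimal usage sketch (the repository path is hypothetical and cognee is assumed to be configured):

```python
import asyncio

from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

async def main():
    # run_code_graph_pipeline is an async generator: with include_docs=True it
    # yields the results of the non-code tasks first, then the code tasks.
    async for result in run_code_graph_pipeline("/path/to/some/repo", include_docs=True):
        print(result)

asyncio.run(main())
```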
3 changes: 3 additions & 0 deletions cognee/infrastructure/databases/exceptions/embedding_exception.py
@@ -0,0 +1,3 @@
class EmbeddingException(Exception):
"""Custom exception for handling embedding-related errors."""
pass
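A brief sketch of how a caller of the embedding engine might handle this new exception; the wrapper below is illustrative and not part of the PR:

```python
from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException

async def embed_or_skip(engine, texts):
    # Illustrative helper: report embedding-specific failures without
    # letting them crash the surrounding pipeline task.
    try:
        return await engine.embed_text(texts)
    except EmbeddingException as error:
        print(f"Skipping batch, embedding failed: {error}")
        return None
```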
cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -5,25 +5,27 @@
import litellm
import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions.embedding_exception import EmbeddingException

litellm.set_verbose = False
logger = logging.getLogger("LiteLLMEmbeddingEngine")


class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
endpoint: str
api_version: str
model: str
dimensions: int
mock:bool
mock: bool

def __init__(
self,
model: Optional[str] = "text-embedding-3-large",
dimensions: Optional[int] = 3072,
api_key: str = None,
endpoint: str = None,
api_version: str = None,
self,
model: Optional[str] = "text-embedding-3-large",
dimensions: Optional[int] = 3072,
api_key: str = None,
endpoint: str = None,
api_version: str = None,
):
self.api_key = api_key
self.endpoint = endpoint
@@ -33,7 +35,7 @@ def __init__(

enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
if isinstance(enable_mocking, bool):
enable_mocking= str(enable_mocking).lower()
enable_mocking = str(enable_mocking).lower()
self.mock = enable_mocking in ("true", "1", "yes")

MAX_RETRIES = 5
@@ -43,7 +45,7 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:
async def exponential_backoff(attempt):
wait_time = min(10 * (2 ** attempt), 60) # Max 60 seconds
await asyncio.sleep(wait_time)

try:
if self.mock:
response = {
@@ -56,10 +58,10 @@ async def exponential_backoff(attempt):
else:
response = await litellm.aembedding(
self.model,
input = text,
api_key = self.api_key,
api_base = self.endpoint,
api_version = self.api_version
input=text,
api_key=self.api_key,
api_base=self.endpoint,
api_version=self.api_version
)

self.retry_count = 0
Expand All @@ -71,7 +73,7 @@ async def exponential_backoff(attempt):
if len(text) == 1:
parts = [text]
else:
parts = [text[0:math.ceil(len(text)/2)], text[math.ceil(len(text)/2):]]
parts = [text[0:math.ceil(len(text) / 2)], text[math.ceil(len(text) / 2):]]

parts_futures = [self.embed_text(part) for part in parts]
embeddings = await asyncio.gather(*parts_futures)
@@ -95,6 +97,9 @@ async def exponential_backoff(attempt):

return await self.embed_text(text)

except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
raise EmbeddingException("Failed to index data points.")

except Exception as error:
logger.error("Error embedding text: %s", str(error))
raise error
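The retry path above halves the input batch and recursively embeds the halves when the provider rejects an oversized request. A standalone sketch of that pattern, independent of litellm and using a hypothetical embed_batch coroutine, could look like this:

```python
import asyncio
import math

async def embed_with_split(embed_batch, texts):
    """Sketch of the split-and-retry pattern used by embed_text above.

    embed_batch is a hypothetical coroutine that embeds a list of texts and
    raises ValueError when the batch exceeds the provider's limits.
    """
    try:
        return await embed_batch(texts)
    except ValueError:
        if len(texts) == 1:
            # A single text that still fails cannot be split further.
            raise
        # Split the batch in half, mirroring math.ceil(len(text) / 2) above,
        # and embed the two halves concurrently.
        middle = math.ceil(len(texts) / 2)
        halves = [texts[:middle], texts[middle:]]
        results = await asyncio.gather(
            *(embed_with_split(embed_batch, half) for half in halves)
        )
        # Flatten the per-half embedding lists back into one list.
        return [embedding for half in results for embedding in half]
```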
27 changes: 14 additions & 13 deletions cognee/shared/CodeGraphEntities.py
@@ -1,13 +1,12 @@
from typing import List, Optional

from cognee.infrastructure.engine import DataPoint


class Repository(DataPoint):
__tablename__ = "Repository"
path: str
_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "Repository"
}

@@ -19,29 +18,31 @@ class CodeFile(DataPoint):
depends_on: Optional[List["CodeFile"]] = None
depends_directly_on: Optional[List["CodeFile"]] = None
contains: Optional[List["CodePart"]] = None

_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "CodeFile"
}

class CodePart(DataPoint):
__tablename__ = "codepart"
# part_of: Optional[CodeFile]
source_code: str

# part_of: Optional[CodeFile] = None
source_code: Optional[str] = None
_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "CodePart"
}

class CodeRelationship(DataPoint):
source_id: str
target_id: str
relation: str # depends on or depends directly
class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
previous_chunk: Optional["SourceCodeChunk"] = None

_metadata: dict = {
"type": "CodeRelationship"
"index_fields": ["source_code"],
"type": "SourceCodeChunk"
}

CodeFile.model_rebuild()
CodePart.model_rebuild()
SourceCodeChunk.model_rebuild()
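The new SourceCodeChunk entity links each chunk to its CodePart and to the previous chunk, forming a linked list over a file's source. A construction sketch (the ids and source strings are made up, and it assumes only the fields shown in this diff are required):

```python
from uuid import NAMESPACE_OID, uuid5

from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk

part = CodePart(
    id=uuid5(NAMESPACE_OID, "example.py:part_0"),
    source_code="def add(a, b):\n    return a + b\n",
)

previous = None
chunks = []
for index, text in enumerate(["def add(a, b):", "    return a + b"]):
    chunk = SourceCodeChunk(
        id=uuid5(NAMESPACE_OID, f"example.py:part_0:chunk_{index}"),
        code_chunk_of=part,
        source_code=text,
        previous_chunk=previous,  # None for the first chunk of the part
    )
    chunks.append(chunk)
    previous = chunk
```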
1 change: 0 additions & 1 deletion cognee/shared/data_models.py
@@ -210,7 +210,6 @@ class SummarizedClass(BaseModel):
decorators: Optional[List[str]] = None

class SummarizedCode(BaseModel):
file_name: str
high_level_summary: str
key_features: List[str]
imports: List[str] = []
9 changes: 6 additions & 3 deletions cognee/tasks/repo_processor/get_repo_file_dependencies.py
@@ -71,7 +71,7 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, None]:
path = repo_path,
)

yield repo
yield [repo]

with ProcessPoolExecutor(max_workers = 12) as executor:
loop = asyncio.get_event_loop()
@@ -90,10 +90,11 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, None]:

results = await asyncio.gather(*tasks)

code_files = []
for (file_path, metadata), dependencies in zip(py_files_dict.items(), results):
source_code = metadata.get("source_code")

yield CodeFile(
code_files.append(CodeFile(
id = uuid5(NAMESPACE_OID, file_path),
source_code = source_code,
extracted_id = file_path,
Expand All @@ -106,4 +107,6 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non
source_code = py_files_dict.get(dependency, {}).get("source_code"),
) for dependency in dependencies
] if dependencies else None,
)
))

yield code_files
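With this change the generator yields batches instead of single items: first a one-element list containing the Repository, then a list of all CodeFile objects. A minimal consumer sketch (the repository path is hypothetical):

```python
import asyncio

from cognee.tasks.repo_processor.get_repo_file_dependencies import (
    get_repo_file_dependencies,
)

async def main():
    async for batch in get_repo_file_dependencies("/path/to/some/repo"):
        # First batch: [Repository]; second batch: a list of CodeFile objects.
        print(len(batch), [type(item).__name__ for item in batch])

asyncio.run(main())
```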