COG 870 Remove duplicate edges from the code graph #293

Merged
merged 13 commits on Dec 17, 2024
Changes from 9 commits
31 changes: 31 additions & 0 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -1,8 +1,10 @@
import asyncio
import logging
from pathlib import Path
from typing import Union

from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.shared.data_models import SummarizedContent
from cognee.shared.utils import send_telemetry
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@@ -16,7 +18,9 @@
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_code
from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code

logger = logging.getLogger("code_graph_pipeline")

@@ -103,3 +107,30 @@ async def run_pipeline(dataset: Dataset, user: User):

def generate_dataset_name(dataset_name: str) -> str:
return dataset_name.replace(".", "_").replace(" ", "_")


async def run_code_graph_pipeline(repo_path):
import os
import pathlib
import cognee
from cognee.infrastructure.databases.relational import create_db_and_tables

file_path = Path(__file__).parent
data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
cognee.config.data_root_directory(data_directory_path)
cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
cognee.config.system_root_directory(cognee_directory_path)

await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()

tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]

return run_tasks(tasks, repo_path, "cognify_code_pipeline")
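
For context, the new run_code_graph_pipeline helper scopes cognee's data and system root directories under the module's own .data_storage/code_graph and .cognee_system/code_graph folders, prunes existing data, creates the relational tables, and then returns the async generator produced by run_tasks. A minimal consumption sketch (the repository path is a placeholder, and note that the helper wipes previously stored data):

import asyncio
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

async def build_code_graph(repo_path: str):
    # The helper awaits its own setup (config, prune, DB creation) and
    # hands back an async generator of pipeline run statuses.
    pipeline = await run_code_graph_pipeline(repo_path)
    async for status in pipeline:
        print(status)

asyncio.run(build_code_graph("/path/to/local/repo"))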
38 changes: 21 additions & 17 deletions cognee/tasks/summarization/summarize_code.py
@@ -1,39 +1,43 @@
import asyncio
from typing import Type
from uuid import uuid5
from typing import Type

from pydantic import BaseModel

from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.tasks.storage import add_data_points

from .models import CodeSummary


async def summarize_code(
code_files: list[DataPoint],
code_graph_nodes: list[DataPoint],
summarization_model: Type[BaseModel],
) -> list[DataPoint]:
if len(code_files) == 0:
return code_files
if len(code_graph_nodes) == 0:
return

code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)]
code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]

file_summaries = await asyncio.gather(
*[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
)

summaries = [
CodeSummary(
id = uuid5(file.id, "CodeSummary"),
made_from = file,
text = file_summaries[file_index].summary,
)
for (file_index, file) in enumerate(code_files_data_points)
]
file_summaries_map = {
code_file_data_point.extracted_id: file_summary.summary
for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
}

for node in code_graph_nodes:
if not isinstance(node, DataPoint):
continue
yield node

await add_data_points(summaries)
if not isinstance(node, CodeFile):
continue

return code_files
yield CodeSummary(
id=uuid5(node.id, "CodeSummary"),
made_from=node,
text=file_summaries_map[node.extracted_id],
)
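
The rewritten summarize_code is now an async generator: it re-yields every incoming DataPoint unchanged and additionally yields one CodeSummary per CodeFile, instead of calling add_data_points itself. The summary ID comes from uuid5(node.id, "CodeSummary"), which is deterministic, so re-running the pipeline over the same file resolves to the same summary node rather than creating duplicate nodes and edges. A small self-contained sketch of that ID behaviour (the path used to seed the file ID here is purely illustrative):

from uuid import NAMESPACE_OID, uuid5

# uuid5 is deterministic: the same (namespace, name) pair always yields the same UUID.
code_file_id = uuid5(NAMESPACE_OID, "cognee/tasks/summarization/summarize_code.py")
first_run = uuid5(code_file_id, "CodeSummary")
second_run = uuid5(code_file_id, "CodeSummary")
assert first_run == second_run  # the same CodeSummary ID on every run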
47 changes: 7 additions & 40 deletions evals/eval_swe_bench.py
@@ -7,19 +7,13 @@
from swebench.harness.utils import load_swebench_dataset
from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE

from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
from cognee.api.v1.search import SearchType
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.modules.pipelines import Task, run_tasks
from cognee.modules.retrieval.brute_force_triplet_search import \
brute_force_triplet_search
# from cognee.shared.data_models import SummarizedContent
from cognee.shared.utils import render_graph
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
# from cognee.tasks.summarization import summarize_code
from evals.eval_utils import download_github_repo, retrieved_edges_to_string


@@ -42,48 +36,22 @@ def check_install_package(package_name):


async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
import os
import pathlib
import cognee
from cognee.infrastructure.databases.relational import create_db_and_tables

file_path = Path(__file__).parent
data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
cognee.config.data_root_directory(data_directory_path)
cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
cognee.config.system_root_directory(cognee_directory_path)

await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata = True)

await create_db_and_tables()

# repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')

repo_path = '/Users/borisarzentar/Projects/graphrag'

tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
Task(add_data_points, task_config = { "batch_size": 50 }),
# Task(summarize_code, summarization_model = SummarizedContent),
]

pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
pipeline = await run_code_graph_pipeline(repo_path)

async for result in pipeline:
print(result)

print('Here we have the repo under the repo_path')

await render_graph(None, include_labels = True, include_nodes = True)
await render_graph(None, include_labels=True, include_nodes=True)

problem_statement = instance['problem_statement']
instructions = read_query_prompt("patch_gen_kg_instructions.txt")

retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3, collections = ["data_point_source_code", "data_point_text"])

retrieved_edges = await brute_force_triplet_search(problem_statement, top_k=3,
collections=["data_point_source_code", "data_point_text"])

retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)

prompt = "\n".join([
@@ -171,7 +139,6 @@ async def main():
with open(predictions_path, "w") as file:
json.dump(preds, file)


subprocess.run(
[
"python",
15 changes: 15 additions & 0 deletions examples/python/code_graph_example.py
@@ -0,0 +1,15 @@
import argparse
import asyncio
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


async def main(repo_path):
async for result in await run_code_graph_pipeline(repo_path):
print(result)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
args = parser.parse_args()
asyncio.run(main(args.repo_path))
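
Assuming the example is invoked directly as a script, a typical run looks like the following (the repository path is a placeholder):

python examples/python/code_graph_example.py --repo-path /path/to/local/repo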
