Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COG 870 Remove duplicate edges from the code graph #293

Merged
merged 13 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions cognee/tasks/summarization/summarize_code.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,42 @@
import asyncio
from typing import Type
from uuid import uuid5
from typing import Type
lxobr marked this conversation as resolved.
Show resolved Hide resolved

from pydantic import BaseModel

from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.tasks.storage import add_data_points

from .models import CodeSummary


# NOTE(review): this span is a diff rendering of summarize_code. Lines from the
# previous and the revised implementation appear together with no +/- markers,
# review-UI text is mixed in, and the original indentation was lost. The
# comments below describe only what the visible lines show.
async def summarize_code(
code_files: list[DataPoint],
summarization_model: Type[BaseModel],
code_graph_nodes: list[DataPoint],
lxobr marked this conversation as resolved.
Show resolved Hide resolved
summarization_model: Type[BaseModel],
# NOTE(review): the body below contains `yield`; inside `async def` that makes
# the function an async generator, so the `list[DataPoint]` return annotation
# looks stale for that variant - confirm against the merged file.
) -> list[DataPoint]:
lxobr marked this conversation as resolved.
Show resolved Hide resolved
# Empty-input guards: one variant returns the input list, the other uses a bare return.
if len(code_files) == 0:
return code_files
if len(code_graph_nodes) == 0:
return

# Keep only CodeFile instances; only those are passed to extract_summary below.
code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)]
code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]

# Summarize every CodeFile concurrently. asyncio.gather returns results in the
# order of the awaitables, i.e. aligned with code_files_data_points.
file_summaries = await asyncio.gather(
*[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
)

# List-building variant: one CodeSummary per CodeFile, looked up by position in file_summaries.
summaries = [
CodeSummary(
id = uuid5(file.id, "CodeSummary"),
made_from = file,
text = file_summaries[file_index].summary,
# Mapping variant: extracted_id -> summary text, paired via zip (relies on the ordering above).
file_summaries_map = {
code_file_data_point.extracted_id: file_summary.summary
for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
}

# Generator variant: non-DataPoint entries are skipped; every DataPoint node is
# yielded unchanged, and each CodeFile node is followed by its CodeSummary.
for node in code_graph_nodes:
if not isinstance(node, DataPoint):
continue
yield node

if not isinstance(node, CodeFile):
continue
yield CodeSummary(
lxobr marked this conversation as resolved.
Show resolved Hide resolved
# uuid5 is name-based, so the same node id always produces the same summary id.
id=uuid5(node.id, "CodeSummary"),
made_from=node,
text=file_summaries_map[node.extracted_id],
)
for (file_index, file) in enumerate(code_files_data_points)
]

# List-building variant stores the summaries itself and returns the input list.
await add_data_points(summaries)

return code_files
36 changes: 18 additions & 18 deletions evals/eval_swe_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
from cognee.modules.pipelines import Task, run_tasks
from cognee.modules.retrieval.brute_force_triplet_search import \
brute_force_triplet_search
# from cognee.shared.data_models import SummarizedContent
from cognee.shared.data_models import SummarizedContent
from cognee.shared.utils import render_graph
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
# from cognee.tasks.summarization import summarize_code
from cognee.tasks.summarization import summarize_code
from evals.eval_utils import download_github_repo, retrieved_edges_to_string


Expand All @@ -41,7 +41,7 @@ def check_install_package(package_name):
return False


async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
async def run_code_graph_pipeline(repo_path):
lxobr marked this conversation as resolved.
Show resolved Hide resolved
import os
import pathlib
import cognee
Expand All @@ -54,36 +54,37 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
cognee.config.system_root_directory(cognee_directory_path)

await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata = True)

await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()

# repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')

repo_path = '/Users/borisarzentar/Projects/graphrag'

tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
Task(add_data_points, task_config = { "batch_size": 50 }),
# Task(summarize_code, summarization_model = SummarizedContent),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]

pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
return run_tasks(tasks, repo_path, "cognify_code_pipeline")


async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
pipeline = await run_code_graph_pipeline(repo_path)

async for result in pipeline:
print(result)

print('Here we have the repo under the repo_path')

await render_graph(None, include_labels = True, include_nodes = True)
await render_graph(None, include_labels=True, include_nodes=True)

problem_statement = instance['problem_statement']
instructions = read_query_prompt("patch_gen_kg_instructions.txt")

retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3, collections = ["data_point_source_code", "data_point_text"])

retrieved_edges = await brute_force_triplet_search(problem_statement, top_k=3,
collections=["data_point_source_code", "data_point_text"])

retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)

prompt = "\n".join([
Expand Down Expand Up @@ -171,7 +172,6 @@ async def main():
with open(predictions_path, "w") as file:
json.dump(preds, file)


subprocess.run(
[
"python",
Expand Down
13 changes: 13 additions & 0 deletions examples/python/code_graph_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import argparse
import asyncio
from evals.eval_swe_bench import run_code_graph_pipeline

# Print every result produced by the code-graph pipeline for repo_path.
# run_code_graph_pipeline is awaited first; what it returns is then consumed with `async for`.
# NOTE(review): indentation was lost in this rendering and the two lines of
# review-UI text below are not part of the program.
async def main(repo_path):
async for result in await run_code_graph_pipeline(repo_path):
lxobr marked this conversation as resolved.
Show resolved Hide resolved
print(result)

# Script entry point: parses one required positional argument, repo_path,
# and drives main() to completion with asyncio.run.
# NOTE(review): the two lines of review-UI text below are not part of the program.
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("repo_path", type=str, help="Path to the repository")
lxobr marked this conversation as resolved.
Show resolved Hide resolved
args = parser.parse_args()
asyncio.run(main(args.repo_path))
Loading