Feat/cog 184 add falkordb #192

Merged
merged 26 commits on Nov 11, 2024
Changes from 15 commits
Commits (26)
c901fa8
feat: add falkordb adapter
borisarzentar Oct 24, 2024
62022a8
Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falk…
borisarzentar Oct 29, 2024
14e2c7e
feat: add FalkorDB integration
borisarzentar Nov 7, 2024
758698a
Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falk…
borisarzentar Nov 7, 2024
897bbac
fix: serialize UUID in pgvector data point payload
borisarzentar Nov 7, 2024
f569088
fix: add summaries to the graph
borisarzentar Nov 7, 2024
cf5b337
Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falk…
borisarzentar Nov 7, 2024
c890636
fix: remove unused import
borisarzentar Nov 7, 2024
9e10c61
fix: resolves pg asyncpg UUID to UUID
hajdul88 Nov 7, 2024
19d62f2
fix: add code graph generation pipeline
borisarzentar Nov 8, 2024
9579cc7
Merge remote-tracking branch 'origin/feat/COG-184-add-falkordb' into …
borisarzentar Nov 8, 2024
51a8305
Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falk…
borisarzentar Nov 8, 2024
f20c838
Merge remote-tracking branch 'origin/feat/COG-184-add-falkordb' into …
borisarzentar Nov 11, 2024
e7e6107
fix: check "updated_at" in edge instead of node
borisarzentar Nov 11, 2024
d733bfd
fix: convert qdrant search results to ScoredPoint
borisarzentar Nov 11, 2024
9c4da23
fix: fix single data point addition to weaviate
borisarzentar Nov 11, 2024
4c19999
fix: convert UUID to str for neo4j query
borisarzentar Nov 11, 2024
08a8442
Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falk…
borisarzentar Nov 11, 2024
39bc8d6
fix: change weaviate batch update to use dynamic batch
borisarzentar Nov 11, 2024
d2d819e
fix: unwrap connections in PGVectorAdapter
borisarzentar Nov 11, 2024
f8e35b3
fix: update poetry.lock
borisarzentar Nov 11, 2024
30edd2d
fix: add postgres extras to dependencies
borisarzentar Nov 11, 2024
44954c1
fix: update entities collection name in cognee_demo notebook
borisarzentar Nov 11, 2024
a3b3667
fix: rerun github workflow checks on push
borisarzentar Nov 11, 2024
ed9036a
fix: change entity collection name
borisarzentar Nov 11, 2024
fd6398d
fix: cognee_demo notebook search
borisarzentar Nov 11, 2024
24 changes: 17 additions & 7 deletions README.md
@@ -109,24 +109,34 @@ import asyncio
 from cognee.api.v1.search import SearchType
 
 async def main():
-    await cognee.prune.prune_data() # Reset cognee data
-    await cognee.prune.prune_system(metadata=True) # Reset cognee system state
+    # Reset cognee data
+    await cognee.prune.prune_data()
+    # Reset cognee system state
+    await cognee.prune.prune_system(metadata=True)
 
     text = """
     Natural language processing (NLP) is an interdisciplinary
     subfield of computer science and information retrieval.
     """
 
-    await cognee.add(text) # Add text to cognee
-    await cognee.cognify() # Use LLMs and cognee to create knowledge graph
+    # Add text to cognee
+    await cognee.add(text)
 
-    search_results = await cognee.search( # Search cognee for insights
+    # Use LLMs and cognee to create knowledge graph
+    await cognee.cognify()
+
+    # Search cognee for insights
+    search_results = await cognee.search(
         SearchType.INSIGHTS,
-        {'query': 'Tell me about NLP'}
+        "Tell me about NLP",
     )
 
-    for result_text in search_results: # Display results
+    # Display results
+    for result_text in search_results:
         print(result_text)
+        # natural_language_processing is_a field
+        # natural_language_processing is_subfield_of computer_science
+        # natural_language_processing is_subfield_of information_retrieval
 
 asyncio.run(main())
 ```
110 changes: 110 additions & 0 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -0,0 +1,110 @@
import asyncio
import logging
from typing import Union

from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.shared.utils import send_telemetry
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.pipelines import run_tasks
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.models import PipelineRunStatus
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_code
from cognee.tasks.storage import add_data_points

logger = logging.getLogger("code_graph_pipeline")

update_status_lock = asyncio.Lock()

class PermissionDeniedException(Exception):
    def __init__(self, message: str):
        self.message = message
        super().__init__(self.message)

async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User = None):
    if user is None:
        user = await get_default_user()

    existing_datasets = await get_datasets(user.id)

    if datasets is None or len(datasets) == 0:
        # If no datasets are provided, cognify all existing datasets.
        datasets = existing_datasets

    if type(datasets[0]) == str:
        datasets = await get_datasets_by_name(datasets, user.id)
Comment on lines +36 to +41
⚠️ Potential issue

Ensure consistent handling of datasets and use isinstance() for type checking

The current implementation may not handle the datasets parameter correctly when it is a string. When datasets is a string, datasets[0] will be the first character of the string, which is likely not the intended behavior. Additionally, it's recommended to use isinstance() for type checking instead of comparing types directly.

Apply this diff to fix the issues:

     if datasets is None or len(datasets) == 0:
         # If no datasets are provided, cognify all existing datasets.
         datasets = existing_datasets

+    if isinstance(datasets, str):
+        datasets = [datasets]
+
-    if type(datasets[0]) == str:
+    if isinstance(datasets[0], str):
         datasets = await get_datasets_by_name(datasets, user.id)
🧰 Tools: 🪛 Ruff 40-40: Use is and is not for type comparisons, or isinstance() for isinstance checks (E721)
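As a quick illustration of the pitfall described above (not part of the PR; the dataset name below is made up):

```python
# Illustrative only: indexing a string yields its first character, so the
# unguarded datasets[0] check misbehaves when a bare string is passed in.
datasets = "my_dataset"          # hypothetical dataset name
print(datasets[0])               # prints "m", not a dataset name

# Suggested normalization: wrap a bare string in a list first, then use
# isinstance() for the element check.
if isinstance(datasets, str):
    datasets = [datasets]

if isinstance(datasets[0], str):
    print("resolving datasets by name:", datasets)
```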


    existing_datasets_map = {
        generate_dataset_name(dataset.name): True for dataset in existing_datasets
    }

    awaitables = []

    for dataset in datasets:
        dataset_name = generate_dataset_name(dataset.name)

        if dataset_name in existing_datasets_map:
            awaitables.append(run_pipeline(dataset, user))

    return await asyncio.gather(*awaitables)


async def run_pipeline(dataset: Dataset, user: User):
    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)

    document_ids_str = [str(document.id) for document in data_documents]

    dataset_id = dataset.id
    dataset_name = generate_dataset_name(dataset.name)

    send_telemetry("code_graph_pipeline EXECUTION STARTED", user.id)

    async with update_status_lock:
        task_status = await get_pipeline_status([dataset_id])

        if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
            logger.info("Dataset %s is already being processed.", dataset_name)
            return

        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
    try:
        tasks = [
            Task(classify_documents),
            Task(check_permissions_on_documents, user = user, permissions = ["write"]),
            Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
            Task(add_data_points, task_config = { "batch_size": 10 }),
            Task(extract_graph_from_code, graph_model = SourceCodeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks.
        ]

        pipeline = run_tasks(tasks, data_documents, "code_graph_pipeline")

        async for result in pipeline:
            print(result)

        send_telemetry("code_graph_pipeline EXECUTION COMPLETED", user.id)

        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_COMPLETED, {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
    except Exception as error:
        send_telemetry("code_graph_pipeline EXECUTION ERRORED", user.id)

        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_ERRORED, {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
        raise error
⚠️ Potential issue

Use raise without arguments to preserve traceback

When re-raising an exception inside an except block, use raise without specifying the exception to preserve the original traceback.

Apply this diff to fix the issue:

         await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_ERRORED, {
             "dataset_name": dataset_name,
             "files": document_ids_str,
         })
-        raise error
+        raise

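A small self-contained sketch of the re-raise idiom the comment recommends (illustrative only; the helper functions here are made up):

```python
import traceback

def risky():
    raise ValueError("boom")

def log_and_reraise():
    try:
        risky()
    except ValueError as error:
        print(f"logging before re-raise: {error}")
        raise  # bare raise re-raises the active exception as-is

try:
    log_and_reraise()
except ValueError:
    traceback.print_exc()  # the printed traceback still reaches back into risky()
```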



def generate_dataset_name(dataset_name: str) -> str:
    return dataset_name.replace(".", "_").replace(" ", "_")
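For orientation, a minimal sketch of how the new pipeline might be invoked once this file is merged; the dataset name is a hypothetical placeholder and assumes matching data was already added for the default user:

```python
import asyncio

from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline

async def main():
    # "source_code" is a made-up dataset name; it must refer to a dataset that
    # already exists for the user, otherwise the existing_datasets_map check
    # above silently skips it.
    await code_graph_pipeline(datasets = ["source_code"])

asyncio.run(main())
```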
42 changes: 11 additions & 31 deletions cognee/api/v1/cognify/cognify_v2.py
@@ -9,21 +9,15 @@
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.methods import get_datasets, get_datasets_by_name
 from cognee.modules.pipelines.tasks.Task import Task
-from cognee.modules.pipelines import run_tasks, run_tasks_parallel
+from cognee.modules.pipelines import run_tasks
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.models import PipelineRunStatus
 from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
 from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
-from cognee.tasks import chunk_naive_llm_classifier, \
-    chunk_remove_disconnected, \
-    infer_data_ontology, \
-    save_chunks_to_store, \
-    chunk_update_check, \
-    chunks_into_graph, \
-    source_documents_to_chunks, \
-    check_permissions_on_documents, \
-    classify_documents
+from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
+from cognee.tasks.graph import extract_graph_from_data
+from cognee.tasks.storage import add_data_points
+from cognee.tasks.summarization import summarize_text
 
 logger = logging.getLogger("cognify.v2")
@@ -87,31 +81,17 @@ async def run_cognify_pipeline(dataset: Dataset, user: User):
     try:
         cognee_config = get_cognify_config()
 
-        root_node_id = None
-
         tasks = [
             Task(classify_documents),
             Task(check_permissions_on_documents, user = user, permissions = ["write"]),
-            Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
-            Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
-            Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
-            Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
+            Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
+            Task(add_data_points, task_config = { "batch_size": 10 }),
+            Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks.
             Task(
-                save_chunks_to_store,
-                collection_name = "chunks",
-            ), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
-            run_tasks_parallel([
-                Task(
-                    summarize_text,
-                    summarization_model = cognee_config.summarization_model,
-                    collection_name = "summaries",
-                ),
-                Task(
-                    chunk_naive_llm_classifier,
-                    classification_model = cognee_config.classification_model,
-                ),
-            ]),
-            Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
+                summarize_text,
+                summarization_model = cognee_config.summarization_model,
+                task_config = { "batch_size": 10 }
+            ),
         ]
 
         pipeline = run_tasks(tasks, data_documents, "cognify_pipeline")
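For context, this reworked task list appears to be what a plain cognee.cognify() call runs; a minimal end-to-end sketch adapted from the README example earlier in this PR (the sample text is illustrative):

```python
import asyncio

import cognee
from cognee.api.v1.search import SearchType

async def main():
    # Illustrative sample text; any added document flows through the
    # classify -> chunk -> add_data_points -> extract_graph -> summarize tasks above.
    await cognee.add("FalkorDB is a graph database built on sparse matrices.")
    await cognee.cognify()

    search_results = await cognee.search(SearchType.INSIGHTS, "Tell me about FalkorDB")

    for result_text in search_results:
        print(result_text)

asyncio.run(main())
```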
2 changes: 1 addition & 1 deletion cognee/api/v1/search/search_v2.py
@@ -5,7 +5,7 @@
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_document_ids_for_user
-from cognee.tasks.chunking import query_chunks
+from cognee.tasks.chunks import query_chunks
 from cognee.tasks.graph import query_graph_connections
 from cognee.tasks.summarization import query_summaries
 