From c901fa8b8acc23197aa46e885ce34a77c156b6e7 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 24 Oct 2024 12:37:06 +0200 Subject: [PATCH 01/19] feat: add falkordb adapter --- .../databases/graph/neo4j_driver/adapter.py | 1 - .../vector/falkordb/FalkorDBAdapter.py | 96 +++++++++++++++---- .../processing/document_types/__init__.py | 1 + examples/python/GraphModel.py | 62 ++++++++++++ poetry.lock | 41 ++++++-- pyproject.toml | 1 + 6 files changed, 173 insertions(+), 29 deletions(-) create mode 100644 examples/python/GraphModel.py diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index f072d60f..0b8925ce 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -7,7 +7,6 @@ from neo4j import AsyncSession from neo4j import AsyncGraphDatabase from neo4j.exceptions import Neo4jError -from networkx import predecessor from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface logger = logging.getLogger("Neo4jAdapter") diff --git a/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py b/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py index 563219fe..744d79f5 100644 --- a/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py +++ b/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py @@ -1,57 +1,113 @@ - -from typing import List, Dict, Optional, Any - +import asyncio from falkordb import FalkorDB -from qdrant_client import AsyncQdrantClient, models -from ..vector_db_interface import VectorDBInterface from ..models.DataPoint import DataPoint +from ..vector_db_interface import VectorDBInterface from ..embeddings.EmbeddingEngine import EmbeddingEngine - - class FalcorDBAdapter(VectorDBInterface): def __init__( self, graph_database_url: str, - graph_database_username: str, - graph_database_password: str, 
graph_database_port: int, - driver: Optional[Any] = None, embedding_engine = EmbeddingEngine, - graph_name: str = "DefaultGraph", ): self.driver = FalkorDB( host = graph_database_url, port = graph_database_port) - self.graph_name = graph_name self.embedding_engine = embedding_engine - async def embed_data(self, data: list[str]) -> list[list[float]]: return await self.embedding_engine.embed_text(data) + async def has_collection(self, collection_name: str) -> bool: + collections = self.driver.list_graphs() + + return collection_name in collections async def create_collection(self, collection_name: str, payload_schema = None): - pass + self.driver.select_graph(collection_name) + + async def create_data_points(self, collection_name: str, data_points: list[DataPoint]): + graph = self.driver.select_graph(collection_name) + def stringify_properties(properties: dict) -> str: + return ",".join(f"{key}:'{value}'" for key, value in properties.items()) + + def create_data_point_query(data_point: DataPoint): + node_label = type(data_point.payload).__name__ + node_properties = stringify_properties(data_point.payload.dict()) + + return f"""CREATE (:{node_label} {{{node_properties}}})""" - async def create_data_points(self, collection_name: str, data_points: List[DataPoint]): - pass + query = " ".join([create_data_point_query(data_point) for data_point in data_points]) + + graph.query(query) async def retrieve(self, collection_name: str, data_point_ids: list[str]): - pass + graph = self.driver.select_graph(collection_name) + + return graph.query( + f"MATCH (node) WHERE node.id IN $node_ids RETURN node", + { + "node_ids": data_point_ids, + }, + ) async def search( self, collection_name: str, query_text: str = None, - query_vector: List[float] = None, + query_vector: list[float] = None, limit: int = 10, with_vector: bool = False, ): - pass + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and 
not query_vector: + query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + + graph = self.driver.select_graph(collection_name) + + query = f""" + CALL db.idx.vector.queryNodes( + null, + 'text', + {limit}, + {query_vector} + ) YIELD node, score + """ + + result = graph.query(query) + + return result + + async def batch_search( + self, + collection_name: str, + query_texts: list[str], + limit: int = None, + with_vectors: bool = False, + ): + query_vectors = await self.embedding_engine.embed_text(query_texts) + + return await asyncio.gather( + *[self.search( + collection_name = collection_name, + query_vector = query_vector, + limit = limit, + with_vector = with_vectors, + ) for query_vector in query_vectors] + ) async def delete_data_points(self, collection_name: str, data_point_ids: list[str]): - pass + graph = self.driver.select_graph(collection_name) + + return graph.query( + f"MATCH (node) WHERE node.id IN $node_ids DETACH DELETE node", + { + "node_ids": data_point_ids, + }, + ) diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py index d751366b..9682cc10 100644 --- a/cognee/modules/data/processing/document_types/__init__.py +++ b/cognee/modules/data/processing/document_types/__init__.py @@ -1,3 +1,4 @@ +from .Document import Document from .PdfDocument import PdfDocument from .TextDocument import TextDocument from .ImageDocument import ImageDocument diff --git a/examples/python/GraphModel.py b/examples/python/GraphModel.py new file mode 100644 index 00000000..01251fc2 --- /dev/null +++ b/examples/python/GraphModel.py @@ -0,0 +1,62 @@ + +from typing import Optional +from uuid import UUID +from datetime import datetime +from pydantic import BaseModel + + +async def add_data_points(collection_name: str, data_points: list): + pass + + + +class Summary(BaseModel): + id: UUID + text: str + chunk: "Chunk" + created_at: datetime + updated_at: Optional[datetime] + + 
vector_index = ["text"] + +class Chunk(BaseModel): + id: UUID + text: str + summary: Summary + document: "Document" + created_at: datetime + updated_at: Optional[datetime] + word_count: int + chunk_index: int + cut_type: str + + vector_index = ["text"] + +class Document(BaseModel): + id: UUID + chunks: list[Chunk] + created_at: datetime + updated_at: Optional[datetime] + +class EntityType(BaseModel): + id: UUID + name: str + description: str + created_at: datetime + updated_at: Optional[datetime] + + vector_index = ["name"] + +class Entity(BaseModel): + id: UUID + name: str + type: EntityType + description: str + chunks: list[Chunk] + created_at: datetime + updated_at: Optional[datetime] + + vector_index = ["name"] + +class OntologyModel(BaseModel): + chunks: list[Chunk] diff --git a/poetry.lock b/poetry.lock index acd56e02..b8ff95c1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -1490,6 +1490,19 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +[[package]] +name = "falkordb" +version = "1.0.9" +description = "Python client for interacting with FalkorDB database" +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "falkordb-1.0.9.tar.gz", hash = "sha256:177008e63c7e4d9ebbdfeb8cad24b0e49175bb0f6e96cac9b4ffb641c0eff0f1"}, +] + +[package.dependencies] +redis = ">=5.0.1,<6.0.0" + [[package]] name = "fastapi" version = "0.109.2" @@ -3685,7 +3698,6 @@ optional = false python-versions = ">=3.6" files = [ {file = "mkdocs-redirects-1.2.1.tar.gz", hash = "sha256:9420066d70e2a6bb357adf86e67023dcdca1857f97f07c7fe450f8f1fb42f861"}, - {file = "mkdocs_redirects-1.2.1-py3-none-any.whl", hash = "sha256:497089f9e0219e7389304cffefccdfa1cac5ff9509f2cb706f4c9b221726dffb"}, ] [package.dependencies] @@ -5771,6 +5783,24 @@ files = [ [package.extras] test = ["pytest (>=3.0)", "pytest-asyncio"] +[[package]] +name = "redis" +version = "5.1.1" +description = "Python client for Redis database and key-value store" +optional = false +python-versions = ">=3.8" +files = [ + {file = "redis-5.1.1-py3-none-any.whl", hash = "sha256:f8ea06b7482a668c6475ae202ed8d9bcaa409f6e87fb77ed1043d912afd62e24"}, + {file = "redis-5.1.1.tar.gz", hash = "sha256:f6c997521fedbae53387307c5d0bf784d9acc28d9f1d058abeac566ec4dbed72"}, +] + +[package.dependencies] +async-timeout = {version = ">=4.0.3", markers = "python_full_version < \"3.11.3\""} + +[package.extras] +hiredis = ["hiredis (>=3.0.0)"] +ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] + [[package]] name = "referencing" version = "0.35.1" @@ -6292,11 +6322,6 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = 
"scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7766,4 +7791,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "70a0072dce8de95d64b862f9a9df48aaec84c8d8515ae018fce4426a0dcacf88" +content-hash = "fef56656ead761cab7d5c3d0bf1fa5a54608db73b14616d08e5fb152dba91236" diff --git a/pyproject.toml b/pyproject.toml index 22074959..65d54978 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,7 @@ asyncpg = "^0.29.0" 
alembic = "^1.13.3" pgvector = "^0.3.5" psycopg2 = {version = "^2.9.10", optional = true} +falkordb = "^1.0.9" [tool.poetry.extras] filesystem = ["s3fs", "botocore"] From 14e2c7efbe950a3943375eebe305304a1e7f7f5f Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 7 Nov 2024 11:17:01 +0100 Subject: [PATCH 02/19] feat: add FalkorDB integration --- cognee/api/v1/cognify/cognify_v2.py | 42 +- cognee/api/v1/search/search_v2.py | 2 +- .../databases/graph/falkordb/__init__.py | 0 .../databases/graph/falkordb/adapter.py | 198 -- .../databases/graph/get_graph_engine.py | 23 +- .../databases/graph/graph_db_interface.py | 2 +- .../databases/graph/neo4j_driver/adapter.py | 115 +- .../databases/graph/networkx/adapter.py | 67 +- .../hybrid/falkordb/FalkorDBAdapter.py | 237 +++ .../databases/vector/__init__.py | 1 - .../infrastructure/databases/vector/config.py | 2 + .../databases/vector/create_vector_engine.py | 19 +- .../vector/falkordb/FalkorDBAdapter.py | 113 -- .../vector/lancedb/LanceDBAdapter.py | 75 +- .../databases/vector/models/DataPoint.py | 13 - .../databases/vector/models/ScoredResult.py | 3 +- .../vector/pgvector/PGVectorAdapter.py | 40 +- .../databases/vector/qdrant/QDrantAdapter.py | 46 +- .../databases/vector/vector_db_interface.py | 2 +- .../vector/weaviate_db/WeaviateAdapter.py | 35 +- cognee/infrastructure/engine/__init__.py | 1 + .../__tests__/model_to_graph_to_model.test.py | 72 + .../infrastructure/engine/models/DataPoint.py | 24 + cognee/modules/chunking/TextChunker.py | 54 +- cognee/modules/chunking/__init__.py | 2 - .../modules/chunking/models/DocumentChunk.py | 13 +- cognee/modules/data/extraction/__init__.py | 1 + .../extraction/knowledge_graph/__init__.py | 1 + .../data/operations/detect_language.py} | 30 +- .../modules/data/operations/translate_text.py | 41 + .../document_types/AudioDocument.py | 21 +- .../processing/document_types/Document.py | 8 +- .../document_types/ImageDocument.py | 20 +- .../processing/document_types/PdfDocument.py | 20 +- 
.../processing/document_types/TextDocument.py | 20 +- cognee/modules/engine/models/Entity.py | 12 + cognee/modules/engine/models/EntityType.py | 11 + cognee/modules/engine/models/__init__.py | 2 + cognee/modules/engine/utils/__init__.py | 2 + .../modules/engine/utils/generate_node_id.py | 4 + .../engine/utils/generate_node_name.py | 2 + cognee/modules/graph/utils.py | 5 - cognee/modules/graph/utils/__init__.py | 2 + .../graph/utils/get_graph_from_model.py | 81 + .../utils/get_model_instance_from_graph.py | 29 + cognee/modules/search/CogneeSearch.py | 33 - cognee/modules/search/__init__.py | 0 cognee/modules/search/graph/__init__.py | 0 .../modules/search/graph/search_adjacent.py | 43 - cognee/modules/search/graph/search_cypher.py | 15 - .../modules/search/graph/search_similarity.py | 27 - cognee/modules/search/graph/search_summary.py | 17 - cognee/modules/search/llm/__init__.py | 0 .../modules/search/llm/extraction/__init__.py | 0 .../categorize_relevant_category.py | 16 - .../extraction/categorize_relevant_summary.py | 15 - .../search/llm/get_relevant_summary.py | 17 - cognee/modules/search/vector/__init__.py | 0 cognee/modules/search/vector/bm25.py | 1 - cognee/modules/search/vector/fusion.py | 1 - .../modules/search/vector/search_traverse.py | 36 - cognee/modules/storage/utils/__init__.py | 46 + cognee/shared/utils.py | 58 +- cognee/tasks/__init__.py | 10 - .../chunk_naive_llm_classifier.py | 6 +- .../chunk_remove_disconnected/__init__.py | 0 cognee/tasks/chunk_translate/__init__.py | 0 .../tasks/chunk_translate/translate_chunk.py | 39 - cognee/tasks/chunk_update_check/__init__.py | 0 .../chunk_update_check/chunk_update_check.py | 26 - cognee/tasks/{chunking => chunks}/__init__.py | 1 + .../__tests__/chunk_by_paragraph.test.py | 2 +- .../chunk_by_paragraph.py | 0 .../{chunking => chunks}/chunk_by_sentence.py | 0 .../{chunking => chunks}/chunk_by_word.py | 0 .../{chunking => chunks}/query_chunks.py | 2 +- .../remove_disconnected_chunks.py} | 4 +- 
.../classify_documents/classify_documents.py | 13 - .../document_language_detection/__init__.py | 0 cognee/tasks/documents/__init__.py | 3 + .../check_permissions_on_documents.py | 0 cognee/tasks/documents/classify_documents.py | 13 + .../extract_chunks_from_documents.py | 7 + cognee/tasks/graph/__init__.py | 2 +- cognee/tasks/graph/chunks_into_graph.py | 213 -- cognee/tasks/graph/extract_graph_from_data.py | 121 ++ .../infer_data_ontology.py | 5 +- cognee/tasks/graph/query_graph_connections.py | 4 +- cognee/tasks/infer_data_ontology/__init__.py | 0 .../infer_data_ontology/models/models.py | 31 - cognee/tasks/save_chunks_to_store/__init__.py | 0 .../save_chunks_to_store.py | 96 - .../source_documents_to_chunks/__init__.py | 0 .../source_documents_to_chunks.py | 44 - cognee/tasks/storage/__init__.py | 2 + cognee/tasks/storage/add_data_points.py | 24 + cognee/tasks/storage/index_data_points.py | 81 + .../tasks/storage/save_to_vector_storage.py | 42 - .../tasks/summarization/models/TextSummary.py | 13 +- cognee/tasks/summarization/query_summaries.py | 2 +- cognee/tasks/summarization/summarize_text.py | 34 +- cognee/tests/test_library.py | 2 +- cognee/tests/test_neo4j.py | 2 +- cognee/tests/test_pgvector.py | 2 +- cognee/tests/test_qdrant.py | 2 +- cognee/tests/test_weaviate.py | 2 +- examples/python/GraphModel.py | 62 - notebooks/cognee_demo.ipynb | 1791 ++++++++--------- poetry.lock | 90 +- pyproject.toml | 2 +- tools/daily_twitter_stats.py | 4 +- 111 files changed, 2136 insertions(+), 2501 deletions(-) delete mode 100644 cognee/infrastructure/databases/graph/falkordb/__init__.py delete mode 100644 cognee/infrastructure/databases/graph/falkordb/adapter.py create mode 100644 cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py delete mode 100644 cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py delete mode 100644 cognee/infrastructure/databases/vector/models/DataPoint.py create mode 100644 cognee/infrastructure/engine/__init__.py create 
mode 100644 cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py create mode 100644 cognee/infrastructure/engine/models/DataPoint.py delete mode 100644 cognee/modules/chunking/__init__.py rename cognee/{tasks/document_language_detection/document_language_detection.py => modules/data/operations/detect_language.py} (54%) create mode 100644 cognee/modules/data/operations/translate_text.py create mode 100644 cognee/modules/engine/models/Entity.py create mode 100644 cognee/modules/engine/models/EntityType.py create mode 100644 cognee/modules/engine/models/__init__.py create mode 100644 cognee/modules/engine/utils/__init__.py create mode 100644 cognee/modules/engine/utils/generate_node_id.py create mode 100644 cognee/modules/engine/utils/generate_node_name.py delete mode 100644 cognee/modules/graph/utils.py create mode 100644 cognee/modules/graph/utils/__init__.py create mode 100644 cognee/modules/graph/utils/get_graph_from_model.py create mode 100644 cognee/modules/graph/utils/get_model_instance_from_graph.py delete mode 100644 cognee/modules/search/CogneeSearch.py delete mode 100644 cognee/modules/search/__init__.py delete mode 100644 cognee/modules/search/graph/__init__.py delete mode 100644 cognee/modules/search/graph/search_adjacent.py delete mode 100644 cognee/modules/search/graph/search_cypher.py delete mode 100644 cognee/modules/search/graph/search_similarity.py delete mode 100644 cognee/modules/search/graph/search_summary.py delete mode 100644 cognee/modules/search/llm/__init__.py delete mode 100644 cognee/modules/search/llm/extraction/__init__.py delete mode 100644 cognee/modules/search/llm/extraction/categorize_relevant_category.py delete mode 100644 cognee/modules/search/llm/extraction/categorize_relevant_summary.py delete mode 100644 cognee/modules/search/llm/get_relevant_summary.py delete mode 100644 cognee/modules/search/vector/__init__.py delete mode 100644 cognee/modules/search/vector/bm25.py delete mode 100644 
cognee/modules/search/vector/fusion.py delete mode 100644 cognee/modules/search/vector/search_traverse.py create mode 100644 cognee/modules/storage/utils/__init__.py delete mode 100644 cognee/tasks/__init__.py delete mode 100644 cognee/tasks/chunk_remove_disconnected/__init__.py delete mode 100644 cognee/tasks/chunk_translate/__init__.py delete mode 100644 cognee/tasks/chunk_translate/translate_chunk.py delete mode 100644 cognee/tasks/chunk_update_check/__init__.py delete mode 100644 cognee/tasks/chunk_update_check/chunk_update_check.py rename cognee/tasks/{chunking => chunks}/__init__.py (72%) rename cognee/tasks/{chunking => chunks}/__tests__/chunk_by_paragraph.test.py (97%) rename cognee/tasks/{chunking => chunks}/chunk_by_paragraph.py (100%) rename cognee/tasks/{chunking => chunks}/chunk_by_sentence.py (100%) rename cognee/tasks/{chunking => chunks}/chunk_by_word.py (100%) rename cognee/tasks/{chunking => chunks}/query_chunks.py (83%) rename cognee/tasks/{chunk_remove_disconnected/chunk_remove_disconnected.py => chunks/remove_disconnected_chunks.py} (84%) delete mode 100644 cognee/tasks/classify_documents/classify_documents.py delete mode 100644 cognee/tasks/document_language_detection/__init__.py create mode 100644 cognee/tasks/documents/__init__.py rename cognee/tasks/{check_permissions_on_documents => documents}/check_permissions_on_documents.py (100%) create mode 100644 cognee/tasks/documents/classify_documents.py create mode 100644 cognee/tasks/documents/extract_chunks_from_documents.py delete mode 100644 cognee/tasks/graph/chunks_into_graph.py create mode 100644 cognee/tasks/graph/extract_graph_from_data.py rename cognee/tasks/{infer_data_ontology => graph}/infer_data_ontology.py (95%) delete mode 100644 cognee/tasks/infer_data_ontology/__init__.py delete mode 100644 cognee/tasks/infer_data_ontology/models/models.py delete mode 100644 cognee/tasks/save_chunks_to_store/__init__.py delete mode 100644 cognee/tasks/save_chunks_to_store/save_chunks_to_store.py 
delete mode 100644 cognee/tasks/source_documents_to_chunks/__init__.py delete mode 100644 cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py create mode 100644 cognee/tasks/storage/__init__.py create mode 100644 cognee/tasks/storage/add_data_points.py create mode 100644 cognee/tasks/storage/index_data_points.py delete mode 100644 cognee/tasks/storage/save_to_vector_storage.py delete mode 100644 examples/python/GraphModel.py diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py index 26134a4f..be9ecd1c 100644 --- a/cognee/api/v1/cognify/cognify_v2.py +++ b/cognee/api/v1/cognify/cognify_v2.py @@ -9,21 +9,15 @@ from cognee.modules.data.methods.get_dataset_data import get_dataset_data from cognee.modules.data.methods import get_datasets, get_datasets_by_name from cognee.modules.pipelines.tasks.Task import Task -from cognee.modules.pipelines import run_tasks, run_tasks_parallel +from cognee.modules.pipelines import run_tasks from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.pipelines.models import PipelineRunStatus from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status -from cognee.tasks import chunk_naive_llm_classifier, \ - chunk_remove_disconnected, \ - infer_data_ontology, \ - save_chunks_to_store, \ - chunk_update_check, \ - chunks_into_graph, \ - source_documents_to_chunks, \ - check_permissions_on_documents, \ - classify_documents +from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents +from cognee.tasks.graph import extract_graph_from_data +from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_text logger = logging.getLogger("cognify.v2") @@ -87,31 +81,17 @@ async def run_cognify_pipeline(dataset: Dataset, user: 
User): try: cognee_config = get_cognify_config() - root_node_id = None - tasks = [ Task(classify_documents), Task(check_permissions_on_documents, user = user, permissions = ["write"]), - Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph), - Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type - Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes - Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks + Task(extract_chunks_from_documents), # Extract text chunks based on the document type. + Task(add_data_points, task_config = { "batch_size": 10 }), + Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks. Task( - save_chunks_to_store, - collection_name = "chunks", - ), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other) - run_tasks_parallel([ - Task( - summarize_text, - summarization_model = cognee_config.summarization_model, - collection_name = "summaries", - ), - Task( - chunk_naive_llm_classifier, - classification_model = cognee_config.classification_model, - ), - ]), - Task(chunk_remove_disconnected), # Remove the obsolete document chunks. 
+ summarize_text, + summarization_model = cognee_config.summarization_model, + task_config = { "batch_size": 10 } + ), ] pipeline = run_tasks(tasks, data_documents, "cognify_pipeline") diff --git a/cognee/api/v1/search/search_v2.py b/cognee/api/v1/search/search_v2.py index b3d45d71..a82f1421 100644 --- a/cognee/api/v1/search/search_v2.py +++ b/cognee/api/v1/search/search_v2.py @@ -5,7 +5,7 @@ from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.users.permissions.methods import get_document_ids_for_user -from cognee.tasks.chunking import query_chunks +from cognee.tasks.chunks import query_chunks from cognee.tasks.graph import query_graph_connections from cognee.tasks.summarization import query_summaries diff --git a/cognee/infrastructure/databases/graph/falkordb/__init__.py b/cognee/infrastructure/databases/graph/falkordb/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/infrastructure/databases/graph/falkordb/adapter.py b/cognee/infrastructure/databases/graph/falkordb/adapter.py deleted file mode 100644 index 2c9dbbea..00000000 --- a/cognee/infrastructure/databases/graph/falkordb/adapter.py +++ /dev/null @@ -1,198 +0,0 @@ -""" FalcorDB Adapter for Graph Database""" -import json -import logging -from typing import Optional, Any, List, Dict -from contextlib import asynccontextmanager - - -from falkordb.asyncio import FalkorDB -from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface - -logger = logging.getLogger("FalcorDBAdapter") - -class FalcorDBAdapter(GraphDBInterface): - def __init__( - self, - graph_database_url: str, - graph_database_username: str, - graph_database_password: str, - graph_database_port: int, - driver: Optional[Any] = None, - graph_name: str = "DefaultGraph", - ): - self.driver = FalkorDB( - host = graph_database_url, - port = graph_database_port) - self.graph_name = graph_name - - - - async def query( - self, - 
query: str, - params: Optional[Dict[str, Any]] = None, - ) -> List[Dict[str, Any]]: - try: - selected_graph = self.driver.select_graph(self.graph_name) - - result = await selected_graph.query(query) - return result.result_set - - except Exception as error: - logger.error("Falkor query error: %s", error, exc_info = True) - raise error - - async def graph(self): - return self.driver - - async def add_node(self, node_id: str, node_properties: Dict[str, Any] = None): - node_id = node_id.replace(":", "_") - - serialized_properties = self.serialize_properties(node_properties) - - if "name" not in serialized_properties: - serialized_properties["name"] = node_id - - # serialized_properties["created_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - # serialized_properties["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # properties = ", ".join(f"{property_name}: ${property_name}" for property_name in serialized_properties.keys()) - - query = f"""MERGE (node:`{node_id}` {{id: $node_id}}) - ON CREATE SET node += $properties - RETURN ID(node) AS internal_id, node.id AS nodeId""" - - params = { - "node_id": node_id, - "properties": serialized_properties, - } - - return await self.query(query, params) - - async def add_nodes(self, nodes: list[tuple[str, dict[str, Any]]]) -> None: - for node in nodes: - node_id, node_properties = node - node_id = node_id.replace(":", "_") - - await self.add_node( - node_id = node_id, - node_properties = node_properties, - ) - - - - async def extract_node_description(self, node_id: str): - query = """MATCH (n)-[r]->(m) - WHERE n.id = $node_id - AND NOT m.id CONTAINS 'DefaultGraphModel' - RETURN m - """ - - result = await self.query(query, dict(node_id = node_id)) - - descriptions = [] - - for node in result: - # Assuming 'm' is a consistent key in your data structure - attributes = node.get("m", {}) - - # Ensure all required attributes are present - if all(key in attributes for key in ["id", "layer_id", "description"]): - 
descriptions.append({ - "id": attributes["id"], - "layer_id": attributes["layer_id"], - "description": attributes["description"], - }) - - return descriptions - - async def get_layer_nodes(self): - query = """MATCH (node) WHERE node.layer_id IS NOT NULL - RETURN node""" - - return [result["node"] for result in (await self.query(query))] - - async def extract_node(self, node_id: str): - results = self.extract_nodes([node_id]) - - return results[0] if len(results) > 0 else None - - async def extract_nodes(self, node_ids: List[str]): - query = """ - UNWIND $node_ids AS id - MATCH (node {id: id}) - RETURN node""" - - params = { - "node_ids": node_ids - } - - results = await self.query(query, params) - - return results - - async def delete_node(self, node_id: str): - node_id = id.replace(":", "_") - - query = f"MATCH (node:`{node_id}` {{id: $node_id}}) DETACH DELETE n" - params = { "node_id": node_id } - - return await self.query(query, params) - - async def add_edge(self, from_node: str, to_node: str, relationship_name: str, edge_properties: Optional[Dict[str, Any]] = {}): - serialized_properties = self.serialize_properties(edge_properties) - from_node = from_node.replace(":", "_") - to_node = to_node.replace(":", "_") - - query = f"""MATCH (from_node:`{from_node}` {{id: $from_node}}), (to_node:`{to_node}` {{id: $to_node}}) - MERGE (from_node)-[r:`{relationship_name}`]->(to_node) - SET r += $properties - RETURN r""" - - params = { - "from_node": from_node, - "to_node": to_node, - "properties": serialized_properties - } - - return await self.query(query, params) - - - async def add_edges(self, edges: list[tuple[str, str, str, dict[str, Any]]]) -> None: - # edges_data = [] - - for edge in edges: - from_node, to_node, relationship_name, edge_properties = edge - from_node = from_node.replace(":", "_") - to_node = to_node.replace(":", "_") - - await self.add_edge( - from_node = from_node, - to_node = to_node, - relationship_name = relationship_name, - edge_properties = 
edge_properties - ) - - - - async def filter_nodes(self, search_criteria): - query = f"""MATCH (node) - WHERE node.id CONTAINS '{search_criteria}' - RETURN node""" - - - return await self.query(query) - - - async def delete_graph(self): - query = """MATCH (node) - DETACH DELETE node;""" - - return await self.query(query) - - def serialize_properties(self, properties = dict()): - return { - property_key: json.dumps(property_value) - if isinstance(property_value, (dict, list)) - else property_value for property_key, property_value in properties.items() - } diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 465b09b6..038e878c 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -2,7 +2,6 @@ from .config import get_graph_config from .graph_db_interface import GraphDBInterface -from .networkx.adapter import NetworkXAdapter async def get_graph_engine() -> GraphDBInterface : @@ -21,19 +20,19 @@ async def get_graph_engine() -> GraphDBInterface : except: pass - elif config.graph_database_provider == "falkorb": - try: - from .falkordb.adapter import FalcorDBAdapter + elif config.graph_database_provider == "falkordb": + from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine + from cognee.infrastructure.databases.hybrid.falkordb.FalkorDBAdapter import FalkorDBAdapter - return FalcorDBAdapter( - graph_database_url = config.graph_database_url, - graph_database_username = config.graph_database_username, - graph_database_password = config.graph_database_password, - graph_database_port = config.graph_database_port - ) - except: - pass + embedding_engine = get_embedding_engine() + + return FalkorDBAdapter( + database_url = config.graph_database_url, + database_port = config.graph_database_port, + embedding_engine = embedding_engine, + ) + from .networkx.adapter import NetworkXAdapter 
graph_client = NetworkXAdapter(filename = config.graph_file_path) if graph_client.graph is None: diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 37aae5c9..3b9e55ff 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -3,7 +3,7 @@ class GraphDBInterface(Protocol): @abstractmethod - async def graph(self): + async def query(self, query: str, params: dict): raise NotImplementedError @abstractmethod diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 0b8925ce..f0d62c78 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1,12 +1,13 @@ """ Neo4j Adapter for Graph Database""" -import json import logging import asyncio from typing import Optional, Any, List, Dict from contextlib import asynccontextmanager +from uuid import UUID from neo4j import AsyncSession from neo4j import AsyncGraphDatabase from neo4j.exceptions import Neo4jError +from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface logger = logging.getLogger("Neo4jAdapter") @@ -40,7 +41,7 @@ async def query( ) -> List[Dict[str, Any]]: try: async with self.get_session() as session: - result = await session.run(query, parameters=params) + result = await session.run(query, parameters = params) data = await result.data() await self.close() return data @@ -48,9 +49,6 @@ async def query( logger.error("Neo4j query error: %s", error, exc_info = True) raise error - async def graph(self): - return await self.get_session() - async def has_node(self, node_id: str) -> bool: results = self.query( """ @@ -62,73 +60,42 @@ async def has_node(self, node_id: str) -> bool: ) return 
results[0]["node_exists"] if len(results) > 0 else False - async def add_node(self, node_id: str, node_properties: Dict[str, Any] = None): - node_id = node_id.replace(":", "_") - - serialized_properties = self.serialize_properties(node_properties) - - if "name" not in serialized_properties: - serialized_properties["name"] = node_id + async def add_node(self, node: DataPoint): + serialized_properties = self.serialize_properties(node.model_dump()) - query = f"""MERGE (node:`{node_id}` {{id: $node_id}}) + query = """MERGE (node {id: $node_id}) ON CREATE SET node += $properties + ON MATCH SET node += $properties + ON MATCH SET node.updated_at = timestamp() RETURN ID(node) AS internal_id, node.id AS nodeId""" params = { - "node_id": node_id, + "node_id": str(node.id), "properties": serialized_properties, } return await self.query(query, params) - async def add_nodes(self, nodes: list[tuple[str, dict[str, Any]]]) -> None: + async def add_nodes(self, nodes: list[DataPoint]) -> None: query = """ UNWIND $nodes AS node MERGE (n {id: node.node_id}) ON CREATE SET n += node.properties + ON MATCH SET n += node.properties + ON MATCH SET n.updated_at = timestamp() WITH n, node.node_id AS label CALL apoc.create.addLabels(n, [label]) YIELD node AS labeledNode RETURN ID(labeledNode) AS internal_id, labeledNode.id AS nodeId """ nodes = [{ - "node_id": node_id, - "properties": self.serialize_properties(node_properties), - } for (node_id, node_properties) in nodes] + "node_id": str(node.id), + "properties": self.serialize_properties(node.model_dump()), + } for node in nodes] results = await self.query(query, dict(nodes = nodes)) return results - async def extract_node_description(self, node_id: str): - query = """MATCH (n)-[r]->(m) - WHERE n.id = $node_id - AND NOT m.id CONTAINS 'DefaultGraphModel' - RETURN m - """ - - result = await self.query(query, dict(node_id = node_id)) - - descriptions = [] - - for node in result: - # Assuming 'm' is a consistent key in your data structure - 
attributes = node.get("m", {}) - - # Ensure all required attributes are present - if all(key in attributes for key in ["id", "layer_id", "description"]): - descriptions.append({ - "id": attributes["id"], - "layer_id": attributes["layer_id"], - "description": attributes["description"], - }) - - return descriptions - - async def get_layer_nodes(self): - query = """MATCH (node) WHERE node.layer_id IS NOT NULL - RETURN node""" - - return [result["node"] for result in (await self.query(query))] async def extract_node(self, node_id: str): results = await self.extract_nodes([node_id]) @@ -169,9 +136,9 @@ async def delete_nodes(self, node_ids: list[str]) -> None: return await self.query(query, params) - async def has_edge(self, from_node: str, to_node: str, edge_label: str) -> bool: + async def has_edge(self, from_node: UUID, to_node: UUID, edge_label: str) -> bool: query = f""" - MATCH (from_node:`{from_node}`)-[relationship:`{edge_label}`]->(to_node:`{to_node}`) + MATCH (from_node:`{str(from_node)}`)-[relationship:`{edge_label}`]->(to_node:`{str(to_node)}`) RETURN COUNT(relationship) > 0 AS edge_exists """ @@ -189,8 +156,8 @@ async def has_edges(self, edges): try: params = { "edges": [{ - "from_node": edge[0], - "to_node": edge[1], + "from_node": str(edge[0]), + "to_node": str(edge[1]), "relationship_name": edge[2], } for edge in edges], } @@ -207,16 +174,17 @@ async def add_edge(self, from_node: str, to_node: str, relationship_name: str, e from_node = from_node.replace(":", "_") to_node = to_node.replace(":", "_") - query = f"""MATCH (from_node:`{from_node}` + query = f"""MATCH (from_node:`{str(from_node)}` {{id: $from_node}}), - (to_node:`{to_node}` {{id: $to_node}}) + (to_node:`{str(to_node)}` {{id: $to_node}}) MERGE (from_node)-[r:`{relationship_name}`]->(to_node) - SET r += $properties + ON CREATE SET r += $properties, r.updated_at = timestamp() + ON MATCH SET r += $properties, r.updated_at = timestamp() RETURN r""" params = { - "from_node": from_node, - "to_node": 
to_node, + "from_node": str(from_node), + "to_node": str(to_node), "properties": serialized_properties } @@ -233,13 +201,13 @@ async def add_edges(self, edges: list[tuple[str, str, str, dict[str, Any]]]) -> """ edges = [{ - "from_node": edge[0], - "to_node": edge[1], + "from_node": str(edge[0]), + "to_node": str(edge[1]), "relationship_name": edge[2], "properties": { **(edge[3] if edge[3] else {}), - "source_node_id": edge[0], - "target_node_id": edge[1], + "source_node_id": str(edge[0]), + "target_node_id": str(edge[1]), }, } for edge in edges] @@ -299,14 +267,6 @@ async def get_disconnected_nodes(self) -> list[str]: return results[0]["ids"] if len(results) > 0 else [] - async def filter_nodes(self, search_criteria): - query = f"""MATCH (node) - WHERE node.id CONTAINS '{search_criteria}' - RETURN node""" - - return await self.query(query) - - async def get_predecessors(self, node_id: str, edge_label: str = None) -> list[str]: if edge_label is not None: query = """ @@ -437,15 +397,22 @@ async def delete_graph(self): return await self.query(query) def serialize_properties(self, properties = dict()): - return { - property_key: json.dumps(property_value) - if isinstance(property_value, (dict, list)) - else property_value for property_key, property_value in properties.items() - } + serialized_properties = {} + + for property_key, property_value in properties.items(): + if isinstance(property_value, UUID): + serialized_properties[property_key] = str(property_value) + continue + + serialized_properties[property_key] = property_value + + return serialized_properties async def get_graph_data(self): query = "MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties" + result = await self.query(query) + nodes = [( record["properties"]["id"], record["properties"], diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index 19bd5005..aac8c0c3 100644 --- 
a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -1,14 +1,18 @@ """Adapter for NetworkX graph database.""" +from datetime import datetime, timezone import os import json import asyncio import logging +from re import A from typing import Dict, Any, List import aiofiles import aiofiles.os as aiofiles_os import networkx as nx from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface +from cognee.infrastructure.engine import DataPoint +from cognee.modules.storage.utils import JSONEncoder logger = logging.getLogger("NetworkXAdapter") @@ -25,29 +29,34 @@ def __new__(cls, filename): def __init__(self, filename = "cognee_graph.pkl"): self.filename = filename + async def query(self, query: str, params: dict): + pass async def has_node(self, node_id: str) -> bool: return self.graph.has_node(node_id) async def add_node( self, - node_id: str, - node_properties, + node: DataPoint, ) -> None: - if not self.graph.has_node(id): - self.graph.add_node(node_id, **node_properties) - await self.save_graph_to_file(self.filename) + self.graph.add_node(node.id, **node.model_dump()) + + await self.save_graph_to_file(self.filename) async def add_nodes( self, - nodes: List[tuple[str, dict]], + nodes: list[DataPoint], ) -> None: + nodes = [(node.id, node.model_dump()) for node in nodes] + self.graph.add_nodes_from(nodes) await self.save_graph_to_file(self.filename) + async def get_graph(self): return self.graph + async def has_edge(self, from_node: str, to_node: str, edge_label: str) -> bool: return self.graph.has_edge(from_node, to_node, key = edge_label) @@ -55,18 +64,20 @@ async def has_edges(self, edges): result = [] for (from_node, to_node, edge_label) in edges: - if await self.has_edge(from_node, to_node, edge_label): + if self.graph.has_edge(from_node, to_node, edge_label): result.append((from_node, to_node, edge_label)) return result + async def add_edge( self, from_node: str, 
to_node: str, relationship_name: str, - edge_properties: Dict[str, Any] = None, + edge_properties: Dict[str, Any] = {}, ) -> None: + edge_properties["updated_at"] = datetime.now(timezone.utc) self.graph.add_edge(from_node, to_node, key = relationship_name, **(edge_properties if edge_properties else {})) await self.save_graph_to_file(self.filename) @@ -74,22 +85,29 @@ async def add_edges( self, edges: tuple[str, str, str, dict], ) -> None: + edges = [(edge[0], edge[1], edge[2], { + **(edge[3] if len(edge) == 4 else {}), + "updated_at": datetime.now(timezone.utc), + }) for edge in edges] + self.graph.add_edges_from(edges) await self.save_graph_to_file(self.filename) async def get_edges(self, node_id: str): return list(self.graph.in_edges(node_id, data = True)) + list(self.graph.out_edges(node_id, data = True)) + async def delete_node(self, node_id: str) -> None: """Asynchronously delete a node from the graph if it exists.""" - if self.graph.has_node(id): - self.graph.remove_node(id) + if self.graph.has_node(node_id): + self.graph.remove_node(node_id) await self.save_graph_to_file(self.filename) async def delete_nodes(self, node_ids: List[str]) -> None: self.graph.remove_nodes_from(node_ids) await self.save_graph_to_file(self.filename) + async def get_disconnected_nodes(self) -> List[str]: connected_components = list(nx.weakly_connected_components(self.graph)) @@ -102,33 +120,6 @@ async def get_disconnected_nodes(self) -> List[str]: return disconnected_nodes - async def extract_node_description(self, node_id: str) -> Dict[str, Any]: - descriptions = [] - - if self.graph.has_node(node_id): - # Get the attributes of the node - for neighbor in self.graph.neighbors(node_id): - # Get the attributes of the neighboring node - attributes = self.graph.nodes[neighbor] - - # Ensure all required attributes are present before extracting description - if all(key in attributes for key in ["id", "layer_id", "description"]): - descriptions.append({ - "id": attributes["id"], - 
"layer_id": attributes["layer_id"], - "description": attributes["description"], - }) - - return descriptions - - async def get_layer_nodes(self): - layer_nodes = [] - - for _, data in self.graph.nodes(data = True): - if "layer_id" in data: - layer_nodes.append(data) - - return layer_nodes async def extract_node(self, node_id: str) -> dict: if self.graph.has_node(node_id): @@ -240,7 +231,7 @@ async def save_graph_to_file(self, file_path: str=None) -> None: graph_data = nx.readwrite.json_graph.node_link_data(self.graph) async with aiofiles.open(file_path, "w") as file: - await file.write(json.dumps(graph_data)) + await file.write(json.dumps(graph_data, cls = JSONEncoder)) async def load_graph_from_file(self, file_path: str = None): diff --git a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py new file mode 100644 index 00000000..effe9e68 --- /dev/null +++ b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py @@ -0,0 +1,237 @@ +import asyncio +from textwrap import dedent +from typing import Any +from falkordb import FalkorDB + +from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface +from cognee.infrastructure.databases.vector.embeddings import EmbeddingEngine +from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface + +class IndexSchema(DataPoint): + text: str + + _metadata: dict = { + "index_fields": ["text"] + } + +class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): + def __init__( + self, + database_url: str, + database_port: int, + embedding_engine = EmbeddingEngine, + ): + self.driver = FalkorDB( + host = database_url, + port = database_port, + ) + self.embedding_engine = embedding_engine + self.graph_name = "cognee_graph" + + def query(self, query: str, params: dict = {}): + graph = self.driver.select_graph(self.graph_name) + + try: + result 
= graph.query(query, params) + return result + except Exception as e: + print(f"Error executing query: {e}") + raise e + + async def embed_data(self, data: list[str]) -> list[list[float]]: + return await self.embedding_engine.embed_text(data) + + async def stringify_properties(self, properties: dict, vectorize_fields = []) -> str: + async def get_value(key, value): + return f"'{value}'" if key not in vectorize_fields else await self.get_vectorized_value(value) + + return ",".join([f"{key}:{await get_value(key, value)}" for key, value in properties.items()]) + + async def get_vectorized_value(self, value: Any) -> str: + vector = (await self.embed_data([value]))[0] + return f"vecf32({vector})" + + async def create_data_point_query(self, data_point: DataPoint): + node_label = type(data_point).__name__ + node_properties = await self.stringify_properties( + data_point.model_dump(), + data_point._metadata["index_fields"], + # data_point._metadata["index_fields"] if hasattr(data_point, "_metadata") else [], + ) + + return dedent(f""" + MERGE (node:{node_label} {{id: '{str(data_point.id)}'}}) + ON CREATE SET node += ({{{node_properties}}}) + ON CREATE SET node.updated_at = timestamp() + ON MATCH SET node += ({{{node_properties}}}) + ON MATCH SET node.updated_at = timestamp() + """).strip() + + async def create_edge_query(self, edge: tuple[str, str, str, dict]) -> str: + properties = await self.stringify_properties(edge[3]) + properties = f"{{{properties}}}" + + return dedent(f""" + MERGE (source {{id:'{edge[0]}'}}) + MERGE (target {{id: '{edge[1]}'}}) + MERGE (source)-[edge:{edge[2]} {properties}]->(target) + ON MATCH SET edge.updated_at = timestamp() + ON CREATE SET edge.updated_at = timestamp() + """).strip() + + async def create_collection(self, collection_name: str): + pass + + async def has_collection(self, collection_name: str) -> bool: + collections = self.driver.list_graphs() + + return collection_name in collections + + async def create_data_points(self, 
data_points: list[DataPoint]): + queries = [await self.create_data_point_query(data_point) for data_point in data_points] + for query in queries: + self.query(query) + + async def create_vector_index(self, index_name: str, index_property_name: str): + graph = self.driver.select_graph(self.graph_name) + + if not self.has_vector_index(graph, index_name, index_property_name): + graph.create_node_vector_index(index_name, index_property_name, dim = self.embedding_engine.get_vector_size()) + + def has_vector_index(self, graph, index_name: str, index_property_name: str) -> bool: + try: + indices = graph.list_indices() + + return any([(index[0] == index_name and index_property_name in index[1]) for index in indices.result_set]) + except: + return False + + async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): + pass + + async def add_node(self, node: DataPoint): + await self.create_data_points([node]) + + async def add_nodes(self, nodes: list[DataPoint]): + await self.create_data_points(nodes) + + async def add_edge(self, edge: tuple[str, str, str, dict]): + query = await self.create_edge_query(edge) + + self.query(query) + + async def add_edges(self, edges: list[tuple[str, str, str, dict]]): + queries = [await self.create_edge_query(edge) for edge in edges] + + for query in queries: + self.query(query) + + async def has_edges(self, edges): + query = dedent(""" + UNWIND $edges AS edge + MATCH (a)-[r]->(b) + WHERE id(a) = edge.from_node AND id(b) = edge.to_node AND type(r) = edge.relationship_name + RETURN edge.from_node AS from_node, edge.to_node AS to_node, edge.relationship_name AS relationship_name, count(r) > 0 AS edge_exists + """).strip() + + params = { + "edges": [{ + "from_node": str(edge[0]), + "to_node": str(edge[1]), + "relationship_name": edge[2], + } for edge in edges], + } + + results = self.query(query, params).result_set + + return [result["edge_exists"] for result in results] + + async def 
retrieve(self, data_point_ids: list[str]): + return self.query( + f"MATCH (node) WHERE node.id IN $node_ids RETURN node", + { + "node_ids": data_point_ids, + }, + ) + + async def extract_node(self, data_point_id: str): + return await self.retrieve([data_point_id]) + + async def extract_nodes(self, data_point_ids: list[str]): + return await self.retrieve(data_point_ids) + + async def search( + self, + collection_name: str, + query_text: str = None, + query_vector: list[float] = None, + limit: int = 10, + with_vector: bool = False, + ): + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and not query_vector: + query_vector = (await self.embed_data([query_text]))[0] + + query = dedent(f""" + CALL db.idx.vector.queryNodes( + {collection_name}, + 'text', + {limit}, + vecf32({query_vector}) + ) YIELD node, score + """).strip() + + result = self.query(query) + + return result + + async def batch_search( + self, + collection_name: str, + query_texts: list[str], + limit: int = None, + with_vectors: bool = False, + ): + query_vectors = await self.embedding_engine.embed_text(query_texts) + + return await asyncio.gather( + *[self.search( + collection_name = collection_name, + query_vector = query_vector, + limit = limit, + with_vector = with_vectors, + ) for query_vector in query_vectors] + ) + + async def delete_data_points(self, collection_name: str, data_point_ids: list[str]): + return self.query( + f"MATCH (node) WHERE node.id IN $node_ids DETACH DELETE node", + { + "node_ids": data_point_ids, + }, + ) + + async def delete_node(self, collection_name: str, data_point_id: str): + return await self.delete_data_points([data_point_id]) + + async def delete_nodes(self, collection_name: str, data_point_ids: list[str]): + self.delete_data_points(data_point_ids) + + async def delete_graph(self): + try: + graph = self.driver.select_graph(self.graph_name) + + indices = graph.list_indices() + 
for index in indices.result_set: + for field in index[1]: + graph.drop_node_vector_index(index[0], field) + + graph.delete() + except Exception as e: + print(f"Error deleting graph: {e}") + + async def prune(self): + self.delete_graph() diff --git a/cognee/infrastructure/databases/vector/__init__.py b/cognee/infrastructure/databases/vector/__init__.py index 604170f1..0a6e3c1f 100644 --- a/cognee/infrastructure/databases/vector/__init__.py +++ b/cognee/infrastructure/databases/vector/__init__.py @@ -1,4 +1,3 @@ -from .models.DataPoint import DataPoint from .models.VectorConfig import VectorConfig from .models.CollectionConfig import CollectionConfig from .vector_db_interface import VectorDBInterface diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index 1d79b3cb..846bc584 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -8,6 +8,7 @@ class VectorConfig(BaseSettings): os.path.join(get_absolute_path(".cognee_system"), "databases"), "cognee.lancedb" ) + vector_db_port: int = 1234 vector_db_key: str = "" vector_db_provider: str = "lancedb" @@ -16,6 +17,7 @@ class VectorConfig(BaseSettings): def to_dict(self) -> dict: return { "vector_db_url": self.vector_db_url, + "vector_db_port": self.vector_db_port, "vector_db_key": self.vector_db_key, "vector_db_provider": self.vector_db_provider, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index f0cbfcd5..db5ef312 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -1,9 +1,8 @@ from typing import Dict -from ..relational.config import get_relational_config - class VectorConfig(Dict): vector_db_url: str + vector_db_port: str vector_db_key: str vector_db_provider: str @@ -29,6 +28,7 @@ def 
create_vector_engine(config: VectorConfig, embedding_engine): embedding_engine = embedding_engine ) elif config["vector_db_provider"] == "pgvector": + from cognee.infrastructure.databases.relational import get_relational_config from .pgvector.PGVectorAdapter import PGVectorAdapter # Get configuration for postgres database @@ -43,9 +43,18 @@ def create_vector_engine(config: VectorConfig, embedding_engine): f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}" ) - return PGVectorAdapter(connection_string, - config["vector_db_key"], - embedding_engine + return PGVectorAdapter( + connection_string, + config["vector_db_key"], + embedding_engine, + ) + elif config["vector_db_provider"] == "falkordb": + from ..hybrid.falkordb.FalkorDBAdapter import FalkorDBAdapter + + return FalkorDBAdapter( + database_url = config["vector_db_url"], + database_port = config["vector_db_port"], + embedding_engine = embedding_engine, ) else: from .lancedb.LanceDBAdapter import LanceDBAdapter diff --git a/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py b/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py deleted file mode 100644 index 744d79f5..00000000 --- a/cognee/infrastructure/databases/vector/falkordb/FalkorDBAdapter.py +++ /dev/null @@ -1,113 +0,0 @@ -import asyncio -from falkordb import FalkorDB -from ..models.DataPoint import DataPoint -from ..vector_db_interface import VectorDBInterface -from ..embeddings.EmbeddingEngine import EmbeddingEngine - - -class FalcorDBAdapter(VectorDBInterface): - def __init__( - self, - graph_database_url: str, - graph_database_port: int, - embedding_engine = EmbeddingEngine, - ): - self.driver = FalkorDB( - host = graph_database_url, - port = graph_database_port) - self.embedding_engine = embedding_engine - - - async def embed_data(self, data: list[str]) -> list[list[float]]: - return await self.embedding_engine.embed_text(data) - - async def has_collection(self, collection_name: str) -> 
bool: - collections = self.driver.list_graphs() - - return collection_name in collections - - async def create_collection(self, collection_name: str, payload_schema = None): - self.driver.select_graph(collection_name) - - async def create_data_points(self, collection_name: str, data_points: list[DataPoint]): - graph = self.driver.select_graph(collection_name) - - def stringify_properties(properties: dict) -> str: - return ",".join(f"{key}:'{value}'" for key, value in properties.items()) - - def create_data_point_query(data_point: DataPoint): - node_label = type(data_point.payload).__name__ - node_properties = stringify_properties(data_point.payload.dict()) - - return f"""CREATE (:{node_label} {{{node_properties}}})""" - - query = " ".join([create_data_point_query(data_point) for data_point in data_points]) - - graph.query(query) - - async def retrieve(self, collection_name: str, data_point_ids: list[str]): - graph = self.driver.select_graph(collection_name) - - return graph.query( - f"MATCH (node) WHERE node.id IN $node_ids RETURN node", - { - "node_ids": data_point_ids, - }, - ) - - async def search( - self, - collection_name: str, - query_text: str = None, - query_vector: list[float] = None, - limit: int = 10, - with_vector: bool = False, - ): - if query_text is None and query_vector is None: - raise ValueError("One of query_text or query_vector must be provided!") - - if query_text and not query_vector: - query_vector = (await self.embedding_engine.embed_text([query_text]))[0] - - graph = self.driver.select_graph(collection_name) - - query = f""" - CALL db.idx.vector.queryNodes( - null, - 'text', - {limit}, - {query_vector} - ) YIELD node, score - """ - - result = graph.query(query) - - return result - - async def batch_search( - self, - collection_name: str, - query_texts: list[str], - limit: int = None, - with_vectors: bool = False, - ): - query_vectors = await self.embedding_engine.embed_text(query_texts) - - return await asyncio.gather( - *[self.search( - 
collection_name = collection_name, - query_vector = query_vector, - limit = limit, - with_vector = with_vectors, - ) for query_vector in query_vectors] - ) - - async def delete_data_points(self, collection_name: str, data_point_ids: list[str]): - graph = self.driver.select_graph(collection_name) - - return graph.query( - f"MATCH (node) WHERE node.id IN $node_ids DETACH DELETE node", - { - "node_ids": data_point_ids, - }, - ) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 40463448..f3e193ff 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -1,12 +1,25 @@ +import inspect from typing import List, Optional, get_type_hints, Generic, TypeVar import asyncio +from uuid import UUID import lancedb +from pydantic import BaseModel from lancedb.pydantic import Vector, LanceModel +from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.files.storage import LocalStorage +from cognee.modules.storage.utils import copy_model, get_own_properties from ..models.ScoredResult import ScoredResult -from ..vector_db_interface import VectorDBInterface, DataPoint +from ..vector_db_interface import VectorDBInterface from ..embeddings.EmbeddingEngine import EmbeddingEngine +class IndexSchema(DataPoint): + id: str + text: str + + _metadata: dict = { + "index_fields": ["text"] + } + class LanceDBAdapter(VectorDBInterface): name = "LanceDB" url: str @@ -38,10 +51,12 @@ async def has_collection(self, collection_name: str) -> bool: collection_names = await connection.table_names() return collection_name in collection_names - async def create_collection(self, collection_name: str, payload_schema = None): - data_point_types = get_type_hints(DataPoint) + async def create_collection(self, collection_name: str, payload_schema: BaseModel): vector_size = 
self.embedding_engine.get_vector_size() + payload_schema = self.get_data_point_schema(payload_schema) + data_point_types = get_type_hints(payload_schema) + class LanceDataPoint(LanceModel): id: data_point_types["id"] vector: Vector(vector_size) @@ -55,13 +70,16 @@ class LanceDataPoint(LanceModel): exist_ok = True, ) - async def create_data_points(self, collection_name: str, data_points: List[DataPoint]): + async def create_data_points(self, collection_name: str, data_points: list[DataPoint]): connection = await self.get_connection() + payload_schema = type(data_points[0]) + payload_schema = self.get_data_point_schema(payload_schema) + if not await self.has_collection(collection_name): await self.create_collection( collection_name, - payload_schema = type(data_points[0].payload), + payload_schema, ) collection = await connection.open_table(collection_name) @@ -79,15 +97,26 @@ class LanceDataPoint(LanceModel, Generic[IdType, PayloadSchema]): vector: Vector(vector_size) payload: PayloadSchema + def create_lance_data_point(data_point: DataPoint, vector: list[float]) -> LanceDataPoint: + properties = get_own_properties(data_point) + properties["id"] = str(properties["id"]) + + return LanceDataPoint[str, self.get_data_point_schema(type(data_point))]( + id = str(data_point.id), + vector = vector, + payload = properties, + ) + lance_data_points = [ - LanceDataPoint[type(data_point.id), type(data_point.payload)]( - id = data_point.id, - vector = data_vectors[data_index], - payload = data_point.payload, - ) for (data_index, data_point) in enumerate(data_points) + create_lance_data_point(data_point, data_vectors[data_point_index]) + for (data_point_index, data_point) in enumerate(data_points) ] - await collection.add(lance_data_points) + await collection.merge_insert("id") \ + .when_matched_update_all() \ + .when_not_matched_insert_all() \ + .execute(lance_data_points) + async def retrieve(self, collection_name: str, data_point_ids: list[str]): connection = await 
self.get_connection() @@ -99,7 +128,7 @@ async def retrieve(self, collection_name: str, data_point_ids: list[str]): results = await collection.query().where(f"id IN {tuple(data_point_ids)}").to_pandas() return [ScoredResult( - id = result["id"], + id = UUID(result["id"]), payload = result["payload"], score = 0, ) for result in results.to_dict("index").values()] @@ -138,7 +167,7 @@ async def search( normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in result_values] return [ScoredResult( - id = str(result["id"]), + id = UUID(result["id"]), payload = result["payload"], score = normalized_values[value_index], ) for value_index, result in enumerate(result_values)] @@ -167,7 +196,27 @@ async def delete_data_points(self, collection_name: str, data_point_ids: list[st results = await collection.delete(f"id IN {tuple(data_point_ids)}") return results + async def create_vector_index(self, index_name: str, index_property_name: str): + await self.create_collection(f"{index_name}_{index_property_name}", payload_schema = IndexSchema) + + async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): + await self.create_data_points(f"{index_name}_{index_property_name}", [ + IndexSchema( + id = str(data_point.id), + text = getattr(data_point, data_point._metadata["index_fields"][0]), + ) for data_point in data_points + ]) + async def prune(self): # Clean up the database if it was set up as temporary if self.url.startswith("/"): LocalStorage.remove_all(self.url) # Remove the temporary directory and files inside + + def get_data_point_schema(self, model_type): + return copy_model( + model_type, + include_fields = { + "id": (str, ...), + }, + exclude_fields = ["_metadata"], + ) \ No newline at end of file diff --git a/cognee/infrastructure/databases/vector/models/DataPoint.py b/cognee/infrastructure/databases/vector/models/DataPoint.py deleted file mode 100644 index 5ad870b6..00000000 --- 
a/cognee/infrastructure/databases/vector/models/DataPoint.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Generic, TypeVar -from pydantic import BaseModel - -PayloadSchema = TypeVar("PayloadSchema", bound = BaseModel) - -class DataPoint(BaseModel, Generic[PayloadSchema]): - id: str - payload: PayloadSchema - embed_field: str = "value" - - def get_embeddable_data(self): - if hasattr(self.payload, self.embed_field): - return getattr(self.payload, self.embed_field) diff --git a/cognee/infrastructure/databases/vector/models/ScoredResult.py b/cognee/infrastructure/databases/vector/models/ScoredResult.py index fcecbbe7..f9d8bec7 100644 --- a/cognee/infrastructure/databases/vector/models/ScoredResult.py +++ b/cognee/infrastructure/databases/vector/models/ScoredResult.py @@ -1,7 +1,8 @@ from typing import Any, Dict +from uuid import UUID from pydantic import BaseModel class ScoredResult(BaseModel): - id: str + id: UUID score: float # Lower score is better payload: Dict[str, Any] diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index b13346cf..70359913 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -1,17 +1,26 @@ import asyncio +from uuid import UUID from pgvector.sqlalchemy import Vector from typing import List, Optional, get_type_hints from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import JSON, Column, Table, select, delete from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker +from cognee.infrastructure.engine import DataPoint + from .serialize_datetime import serialize_datetime from ..models.ScoredResult import ScoredResult -from ..vector_db_interface import VectorDBInterface, DataPoint +from ..vector_db_interface import VectorDBInterface from ..embeddings.EmbeddingEngine import EmbeddingEngine from 
...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter from ...relational.ModelBase import Base +class IndexSchema(DataPoint): + text: str + + _metadata: dict = { + "index_fields": ["text"] + } class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): @@ -76,7 +85,7 @@ async def create_data_points( if not await self.has_collection(collection_name): await self.create_collection( collection_name=collection_name, - payload_schema=type(data_points[0].payload), + payload_schema=type(data_points[0]), ) data_vectors = await self.embed_data( @@ -105,7 +114,7 @@ def __init__(self, id, payload, vector): PGVectorDataPoint( id=data_point.id, vector=data_vectors[data_index], - payload=serialize_datetime(data_point.payload.dict()), + payload=serialize_datetime(data_point.model_dump()), ) for (data_index, data_point) in enumerate(data_points) ] @@ -113,6 +122,17 @@ def __init__(self, id, payload, vector): session.add_all(pgvector_data_points) await session.commit() + async def create_vector_index(self, index_name: str, index_property_name: str): + await self.create_collection(f"{index_name}_{index_property_name}") + + async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): + await self.create_data_points(f"{index_name}_{index_property_name}", [ + IndexSchema( + id = data_point.id, + text = getattr(data_point, data_point._metadata["index_fields"][0]), + ) for data_point in data_points + ]) + async def get_table(self, collection_name: str) -> Table: """ Dynamically loads a table using the given collection name @@ -137,8 +157,11 @@ async def retrieve(self, collection_name: str, data_point_ids: List[str]): results = results.all() return [ - ScoredResult(id=result.id, payload=result.payload, score=0) - for result in results + ScoredResult( + id = UUID(result.id), + payload = result.payload, + score = 0 + ) for result in results ] async def search( @@ -181,9 +204,10 @@ async def search( # Create and return ScoredResult 
objects return [ ScoredResult( - id=str(row.id), payload=row.payload, score=row.similarity - ) - for row in vector_list + id = UUID(row.id), + payload = row.payload, + score = row.similarity + ) for row in vector_list ] async def batch_search( diff --git a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py index cc6d80b2..87d673a0 100644 --- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py @@ -1,12 +1,20 @@ import logging from typing import List, Dict, Optional from qdrant_client import AsyncQdrantClient, models + +from cognee.infrastructure.engine import DataPoint from ..vector_db_interface import VectorDBInterface -from ..models.DataPoint import DataPoint from ..embeddings.EmbeddingEngine import EmbeddingEngine logger = logging.getLogger("QDrantAdapter") +class IndexSchema(DataPoint): + text: str + + _metadata: dict = { + "index_fields": ["text"] + } + # class CollectionConfig(BaseModel, extra = "forbid"): # vector_config: Dict[str, models.VectorParams] = Field(..., description="Vectors configuration" ) # hnsw_config: Optional[models.HnswConfig] = Field(default = None, description="HNSW vector index configuration") @@ -75,20 +83,19 @@ async def create_collection( ): client = self.get_qdrant_client() - result = await client.create_collection( - collection_name = collection_name, - vectors_config = { - "text": models.VectorParams( - size = self.embedding_engine.get_vector_size(), - distance = "Cosine" - ) - } - ) + if not await client.collection_exists(collection_name): + await client.create_collection( + collection_name = collection_name, + vectors_config = { + "text": models.VectorParams( + size = self.embedding_engine.get_vector_size(), + distance = "Cosine" + ) + } + ) await client.close() - return result - async def create_data_points(self, collection_name: str, data_points: List[DataPoint]): client = 
self.get_qdrant_client() @@ -96,8 +103,8 @@ async def create_data_points(self, collection_name: str, data_points: List[DataP def convert_to_qdrant_point(data_point: DataPoint): return models.PointStruct( - id = data_point.id, - payload = data_point.payload.dict(), + id = str(data_point.id), + payload = data_point.model_dump(), vector = { "text": data_vectors[data_points.index(data_point)] } @@ -116,6 +123,17 @@ def convert_to_qdrant_point(data_point: DataPoint): finally: await client.close() + async def create_vector_index(self, index_name: str, index_property_name: str): + await self.create_collection(f"{index_name}_{index_property_name}") + + async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): + await self.create_data_points(f"{index_name}_{index_property_name}", [ + IndexSchema( + id = data_point.id, + text = getattr(data_point, data_point._metadata["index_fields"][0]), + ) for data_point in data_points + ]) + async def retrieve(self, collection_name: str, data_point_ids: list[str]): client = self.get_qdrant_client() results = await client.retrieve(collection_name, data_point_ids, with_payload = True) diff --git a/cognee/infrastructure/databases/vector/vector_db_interface.py b/cognee/infrastructure/databases/vector/vector_db_interface.py index 10e268f1..457b92f0 100644 --- a/cognee/infrastructure/databases/vector/vector_db_interface.py +++ b/cognee/infrastructure/databases/vector/vector_db_interface.py @@ -1,6 +1,6 @@ from typing import List, Protocol, Optional from abc import abstractmethod -from .models.DataPoint import DataPoint +from cognee.infrastructure.engine import DataPoint from .models.PayloadSchema import PayloadSchema class VectorDBInterface(Protocol): diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index 8aae831a..a1b986ef 100644 --- 
a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -1,13 +1,23 @@ import asyncio import logging from typing import List, Optional +from uuid import UUID + +from cognee.infrastructure.engine import DataPoint from ..vector_db_interface import VectorDBInterface -from ..models.DataPoint import DataPoint from ..models.ScoredResult import ScoredResult from ..embeddings.EmbeddingEngine import EmbeddingEngine logger = logging.getLogger("WeaviateAdapter") +class IndexSchema(DataPoint): + uuid: str + text: str + + _metadata: dict = { + "index_fields": ["text"] + } + class WeaviateAdapter(VectorDBInterface): name = "Weaviate" url: str @@ -74,9 +84,13 @@ async def create_data_points(self, collection_name: str, data_points: List[DataP def convert_to_weaviate_data_points(data_point: DataPoint): vector = data_vectors[data_points.index(data_point)] + properties = data_point.model_dump() + properties["uuid"] = properties["id"] + del properties["id"] + return DataObject( uuid = data_point.id, - properties = data_point.payload.dict(), + properties = properties, vector = vector ) @@ -100,6 +114,17 @@ def convert_to_weaviate_data_points(data_point: DataPoint): logger.error("Error creating data points: %s", str(error)) raise error + async def create_vector_index(self, index_name: str, index_property_name: str): + await self.create_collection(f"{index_name}_{index_property_name}") + + async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): + await self.create_data_points(f"{index_name}_{index_property_name}", [ + IndexSchema( + uuid = str(data_point.id), + text = getattr(data_point, data_point._metadata["index_fields"][0]), + ) for data_point in data_points + ]) + async def retrieve(self, collection_name: str, data_point_ids: list[str]): from weaviate.classes.query import Filter future = asyncio.Future() @@ -143,9 +168,9 @@ async def 
search( return [ ScoredResult( - id=str(result.uuid), - payload=result.properties, - score=float(result.metadata.score) + id = UUID(result.id), + payload = result.properties, + score = float(result.metadata.score) ) for result in search_result.objects ] diff --git a/cognee/infrastructure/engine/__init__.py b/cognee/infrastructure/engine/__init__.py new file mode 100644 index 00000000..26f567da --- /dev/null +++ b/cognee/infrastructure/engine/__init__.py @@ -0,0 +1 @@ +from .models.DataPoint import DataPoint diff --git a/cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py b/cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py new file mode 100644 index 00000000..5d3908fa --- /dev/null +++ b/cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py @@ -0,0 +1,72 @@ +from enum import Enum +from typing import Optional +from cognee.infrastructure.engine import DataPoint +from cognee.modules.graph.utils import get_graph_from_model, get_model_instance_from_graph + + +if __name__ == "__main__": + + class CarTypeName(Enum): + Pickup = "Pickup" + Sedan = "Sedan" + SUV = "SUV" + Coupe = "Coupe" + Convertible = "Convertible" + Hatchback = "Hatchback" + Wagon = "Wagon" + Minivan = "Minivan" + Van = "Van" + + class CarType(DataPoint): + id: str + name: CarTypeName + _metadata: dict = dict(index_fields = ["name"]) + + class Car(DataPoint): + id: str + brand: str + model: str + year: int + color: str + is_type: CarType + + class Person(DataPoint): + id: str + name: str + age: int + owns_car: list[Car] + driving_licence: Optional[dict] + _metadata: dict = dict(index_fields = ["name"]) + + boris = Person( + id = "boris", + name = "Boris", + age = 30, + owns_car = [ + Car( + id = "car1", + brand = "Toyota", + model = "Camry", + year = 2020, + color = "Blue", + is_type = CarType(id = "sedan", name = CarTypeName.Sedan), + ), + ], + driving_licence = { + "issued_by": "PU Vrsac", + "issued_on": "2025-11-06", + "number": "1234567890", + 
"expires_on": "2025-11-06", + }, + ) + + nodes, edges = get_graph_from_model(boris) + + print(nodes) + print(edges) + + person_data = nodes[len(nodes) - 1] + + parsed_person = get_model_instance_from_graph(nodes, edges, 'boris') + + print(parsed_person) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py new file mode 100644 index 00000000..222b11ad --- /dev/null +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -0,0 +1,24 @@ +from typing_extensions import TypedDict +from uuid import UUID, uuid4 +from typing import Optional +from datetime import datetime, timezone +from pydantic import BaseModel, Field + +class MetaData(TypedDict): + index_fields: list[str] + +class DataPoint(BaseModel): + id: UUID = Field(default_factory = uuid4) + updated_at: Optional[datetime] = datetime.now(timezone.utc) + _metadata: Optional[MetaData] = { + "index_fields": [] + } + + # class Config: + # underscore_attrs_are_private = True + + def get_embeddable_data(self): + if self._metadata and len(self._metadata["index_fields"]) > 0 \ + and hasattr(self, self._metadata["index_fields"][0]): + + return getattr(self, self._metadata["index_fields"][0]) diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index a8dc3478..4717d108 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -1,18 +1,18 @@ -from uuid import UUID, uuid5, NAMESPACE_OID +from uuid import uuid5, NAMESPACE_OID -from cognee.modules.chunking import DocumentChunk -from cognee.tasks.chunking import chunk_by_paragraph +from .models.DocumentChunk import DocumentChunk +from cognee.tasks.chunks import chunk_by_paragraph class TextChunker(): - id: UUID + document = None max_chunk_size: int chunk_index = 0 chunk_size = 0 paragraph_chunks = [] - def __init__(self, id: UUID, get_text: callable, chunk_size: int = 1024): - self.id = id + def __init__(self, document, get_text: callable, 
chunk_size: int = 1024): + self.document = document self.max_chunk_size = chunk_size self.get_text = get_text @@ -29,10 +29,10 @@ def read(self): else: if len(self.paragraph_chunks) == 0: yield DocumentChunk( + id = str(chunk_data["chunk_id"]), text = chunk_data["text"], word_count = chunk_data["word_count"], - document_id = str(self.id), - chunk_id = str(chunk_data["chunk_id"]), + is_part_of = self.document, chunk_index = self.chunk_index, cut_type = chunk_data["cut_type"], ) @@ -40,25 +40,31 @@ def read(self): self.chunk_size = 0 else: chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks) - yield DocumentChunk( - text = chunk_text, - word_count = self.chunk_size, - document_id = str(self.id), - chunk_id = str(uuid5(NAMESPACE_OID, f"{str(self.id)}-{self.chunk_index}")), - chunk_index = self.chunk_index, - cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], - ) + try: + yield DocumentChunk( + id = str(uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}")), + text = chunk_text, + word_count = self.chunk_size, + is_part_of = self.document, + chunk_index = self.chunk_index, + cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], + ) + except Exception as e: + print(e) self.paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] self.chunk_index += 1 if len(self.paragraph_chunks) > 0: - yield DocumentChunk( - text = " ".join(chunk["text"] for chunk in self.paragraph_chunks), - word_count = self.chunk_size, - document_id = str(self.id), - chunk_id = str(uuid5(NAMESPACE_OID, f"{str(self.id)}-{self.chunk_index}")), - chunk_index = self.chunk_index, - cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], - ) + try: + yield DocumentChunk( + id = str(uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}")), + text = " ".join(chunk["text"] for chunk in self.paragraph_chunks), + word_count = self.chunk_size, + is_part_of = self.document, + 
chunk_index = self.chunk_index, + cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], + ) + except Exception as e: + print(e) diff --git a/cognee/modules/chunking/__init__.py b/cognee/modules/chunking/__init__.py deleted file mode 100644 index 4e1d87a8..00000000 --- a/cognee/modules/chunking/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .models.DocumentChunk import DocumentChunk -from .TextChunker import TextChunker diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index b6a924a9..975edb27 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,9 +1,14 @@ -from pydantic import BaseModel +from typing import Optional +from cognee.infrastructure.engine import DataPoint +from cognee.modules.data.processing.document_types import Document -class DocumentChunk(BaseModel): +class DocumentChunk(DataPoint): text: str word_count: int - document_id: str - chunk_id: str chunk_index: int cut_type: str + is_part_of: Document + + _metadata: Optional[dict] = { + "index_fields": ["text"], + } diff --git a/cognee/modules/data/extraction/__init__.py b/cognee/modules/data/extraction/__init__.py index e69de29b..b6419282 100644 --- a/cognee/modules/data/extraction/__init__.py +++ b/cognee/modules/data/extraction/__init__.py @@ -0,0 +1 @@ +from .knowledge_graph.extract_content_graph import extract_content_graph diff --git a/cognee/modules/data/extraction/knowledge_graph/__init__.py b/cognee/modules/data/extraction/knowledge_graph/__init__.py index e69de29b..0939b2b3 100644 --- a/cognee/modules/data/extraction/knowledge_graph/__init__.py +++ b/cognee/modules/data/extraction/knowledge_graph/__init__.py @@ -0,0 +1 @@ +from .extract_content_graph import extract_content_graph diff --git a/cognee/tasks/document_language_detection/document_language_detection.py b/cognee/modules/data/operations/detect_language.py similarity index 54% rename from 
cognee/tasks/document_language_detection/document_language_detection.py rename to cognee/modules/data/operations/detect_language.py index e2e8fdb6..e8267573 100644 --- a/cognee/tasks/document_language_detection/document_language_detection.py +++ b/cognee/modules/data/operations/detect_language.py @@ -1,36 +1,36 @@ - import logging +logger = logging.getLogger(__name__) - -async def detect_language(data:str): +async def detect_language(text: str): """ Detect the language of the given text and return its ISO 639-1 language code. - If the detected language is Croatian ('hr'), it maps to Serbian ('sr'). + If the detected language is Croatian ("hr"), it maps to Serbian ("sr"). The text is trimmed to the first 100 characters for efficient processing. Parameters: text (str): The text for language detection. Returns: - str: The ISO 639-1 language code of the detected language, or 'None' in case of an error. + str: The ISO 639-1 language code of the detected language, or "None" in case of an error. 
""" - # Trim the text to the first 100 characters from langdetect import detect, LangDetectException - trimmed_text = data[:100] + # Trim the text to the first 100 characters + trimmed_text = text[:100] try: # Detect the language using langdetect detected_lang_iso639_1 = detect(trimmed_text) - logging.info(f"Detected ISO 639-1 code: {detected_lang_iso639_1}") - # Special case: map 'hr' (Croatian) to 'sr' (Serbian ISO 639-2) - if detected_lang_iso639_1 == 'hr': - yield 'sr' - yield detected_lang_iso639_1 + # Special case: map "hr" (Croatian) to "sr" (Serbian ISO 639-2) + if detected_lang_iso639_1 == "hr": + return "sr" + + return detected_lang_iso639_1 except LangDetectException as e: - logging.error(f"Language detection error: {e}") + logger.error(f"Language detection error: {e}") + except Exception as e: - logging.error(f"Unexpected error: {e}") + logger.error(f"Unexpected error: {e}") - yield None \ No newline at end of file + return None diff --git a/cognee/modules/data/operations/translate_text.py b/cognee/modules/data/operations/translate_text.py new file mode 100644 index 00000000..41171264 --- /dev/null +++ b/cognee/modules/data/operations/translate_text.py @@ -0,0 +1,41 @@ +import logging + +logger = logging.getLogger(__name__) + +async def translate_text(text, source_language: str = "sr", target_language: str = "en", region_name = "eu-west-1"): + """ + Translate text from source language to target language using AWS Translate. + Parameters: + text (str): The text to be translated. + source_language (str): The source language code (e.g., "sr" for Serbian). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php + target_language (str): The target language code (e.g., "en" for English). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php + region_name (str): AWS region name. + Returns: + str: Translated text or an error message. 
+ """ + + import boto3 + from botocore.exceptions import BotoCoreError, ClientError + + if not text: + raise ValueError("No text to translate.") + + if not source_language or not target_language: + raise ValueError("Source and target language codes are required.") + + try: + translate = boto3.client(service_name = "translate", region_name = region_name, use_ssl = True) + result = translate.translate_text( + Text = text, + SourceLanguageCode = source_language, + TargetLanguageCode = target_language, + ) + yield result.get("TranslatedText", "No translation found.") + + except BotoCoreError as e: + logger.error(f"BotoCoreError occurred: {e}") + yield None + + except ClientError as e: + logger.error(f"ClientError occurred: {e}") + yield None diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index a794b361..d3ae0974 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,34 +1,15 @@ -from uuid import UUID, uuid5, NAMESPACE_OID from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document class AudioDocument(Document): type: str = "audio" - title: str - raw_data_location: str - chunking_strategy: str - - def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strategy:str="paragraph"): - self.id = id or uuid5(NAMESPACE_OID, title) - self.title = title - self.raw_data_location = raw_data_location - self.chunking_strategy = chunking_strategy def read(self, chunk_size: int): # Transcribe the audio file result = get_llm_client().create_transcript(self.raw_data_location) text = result.text - chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) yield from chunker.read() - - - 
def to_dict(self) -> dict: - return dict( - id=str(self.id), - type=self.type, - title=self.title, - raw_data_location=self.raw_data_location, - ) diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 1e841682..7d5545cf 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,10 +1,8 @@ -from uuid import UUID -from typing import Protocol +from cognee.infrastructure.engine import DataPoint -class Document(Protocol): - id: UUID +class Document(DataPoint): type: str - title: str + name: str raw_data_location: str def read(self, chunk_size: int) -> str: diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index e12b3cd1..5571b3bd 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,33 +1,15 @@ -from uuid import UUID, uuid5, NAMESPACE_OID from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document - class ImageDocument(Document): type: str = "image" - title: str - raw_data_location: str - - def __init__(self, id: UUID, title: str, raw_data_location: str): - self.id = id or uuid5(NAMESPACE_OID, title) - self.title = title - self.raw_data_location = raw_data_location def read(self, chunk_size: int): # Transcribe the image file result = get_llm_client().transcribe_image(self.raw_data_location) text = result.choices[0].message.content - chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) yield from chunker.read() - - - def to_dict(self) -> dict: - return dict( - id=str(self.id), - type=self.type, - title=self.title, - 
raw_data_location=self.raw_data_location, - ) diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 5bbff085..2d194199 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,19 +1,11 @@ -from uuid import UUID, uuid5, NAMESPACE_OID from pypdf import PdfReader from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document class PdfDocument(Document): type: str = "pdf" - title: str - raw_data_location: str - def __init__(self, id: UUID, title: str, raw_data_location: str): - self.id = id or uuid5(NAMESPACE_OID, title) - self.title = title - self.raw_data_location = raw_data_location - - def read(self, chunk_size: int) -> PdfReader: + def read(self, chunk_size: int): file = PdfReader(self.raw_data_location) def get_text(): @@ -21,16 +13,8 @@ def get_text(): page_text = page.extract_text() yield page_text - chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = get_text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) yield from chunker.read() file.stream.close() - - def to_dict(self) -> dict: - return dict( - id = str(self.id), - type = self.type, - title = self.title, - raw_data_location = self.raw_data_location, - ) diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 774d1f05..32d3416b 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,16 +1,8 @@ -from uuid import UUID, uuid5, NAMESPACE_OID from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document class TextDocument(Document): type: str = "text" - title: str - raw_data_location: str - - def __init__(self, id: UUID, title: str, raw_data_location: str): - 
self.id = id or uuid5(NAMESPACE_OID, title) - self.title = title - self.raw_data_location = raw_data_location def read(self, chunk_size: int): def get_text(): @@ -23,16 +15,6 @@ def get_text(): yield text - - chunker = TextChunker(self.id,chunk_size = chunk_size, get_text = get_text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) yield from chunker.read() - - - def to_dict(self) -> dict: - return dict( - id = str(self.id), - type = self.type, - title = self.title, - raw_data_location = self.raw_data_location, - ) diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py new file mode 100644 index 00000000..c43774e3 --- /dev/null +++ b/cognee/modules/engine/models/Entity.py @@ -0,0 +1,12 @@ +from cognee.infrastructure.engine import DataPoint +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from .EntityType import EntityType + +class Entity(DataPoint): + name: str + is_a: EntityType + description: str + mentioned_in: DocumentChunk + _metadata: dict = { + "index_fields": ["name"], + } diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py new file mode 100644 index 00000000..b4f49585 --- /dev/null +++ b/cognee/modules/engine/models/EntityType.py @@ -0,0 +1,11 @@ +from cognee.infrastructure.engine import DataPoint +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk + +class EntityType(DataPoint): + name: str + type: str + description: str + exists_in: DocumentChunk + _metadata: dict = { + "index_fields": ["name"], + } diff --git a/cognee/modules/engine/models/__init__.py b/cognee/modules/engine/models/__init__.py new file mode 100644 index 00000000..24abb8b1 --- /dev/null +++ b/cognee/modules/engine/models/__init__.py @@ -0,0 +1,2 @@ +from .Entity import Entity +from .EntityType import EntityType diff --git a/cognee/modules/engine/utils/__init__.py b/cognee/modules/engine/utils/__init__.py new file mode 100644 index 
00000000..9cc2bc57 --- /dev/null +++ b/cognee/modules/engine/utils/__init__.py @@ -0,0 +1,2 @@ +from .generate_node_id import generate_node_id +from .generate_node_name import generate_node_name diff --git a/cognee/modules/engine/utils/generate_node_id.py b/cognee/modules/engine/utils/generate_node_id.py new file mode 100644 index 00000000..db086a19 --- /dev/null +++ b/cognee/modules/engine/utils/generate_node_id.py @@ -0,0 +1,4 @@ +from uuid import NAMESPACE_OID, uuid5 + +def generate_node_id(node_id: str) -> str: + return uuid5(NAMESPACE_OID, node_id.lower().replace(" ", "_").replace("'", "")) diff --git a/cognee/modules/engine/utils/generate_node_name.py b/cognee/modules/engine/utils/generate_node_name.py new file mode 100644 index 00000000..84b26619 --- /dev/null +++ b/cognee/modules/engine/utils/generate_node_name.py @@ -0,0 +1,2 @@ +def generate_node_name(name: str) -> str: + return name.lower().replace(" ", "_").replace("'", "") diff --git a/cognee/modules/graph/utils.py b/cognee/modules/graph/utils.py deleted file mode 100644 index 55a048cd..00000000 --- a/cognee/modules/graph/utils.py +++ /dev/null @@ -1,5 +0,0 @@ -def generate_node_name(name: str) -> str: - return name.lower().replace(" ", "_").replace("'", "") - -def generate_node_id(node_id: str) -> str: - return node_id.lower().replace(" ", "_").replace("'", "") diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py new file mode 100644 index 00000000..18e7ac29 --- /dev/null +++ b/cognee/modules/graph/utils/__init__.py @@ -0,0 +1,2 @@ +from .get_graph_from_model import get_graph_from_model +from .get_model_instance_from_graph import get_model_instance_from_graph diff --git a/cognee/modules/graph/utils/get_graph_from_model.py b/cognee/modules/graph/utils/get_graph_from_model.py new file mode 100644 index 00000000..ef402e4d --- /dev/null +++ b/cognee/modules/graph/utils/get_graph_from_model.py @@ -0,0 +1,81 @@ +from datetime import datetime, timezone +from 
cognee.infrastructure.engine import DataPoint +from cognee.modules import data +from cognee.modules.storage.utils import copy_model + +def get_graph_from_model(data_point: DataPoint, include_root = True): + nodes = [] + edges = [] + + data_point_properties = {} + excluded_properties = set() + + for field_name, field_value in data_point: + if field_name == "_metadata": + continue + + if isinstance(field_value, DataPoint): + excluded_properties.add(field_name) + + property_nodes, property_edges = get_graph_from_model(field_value, True) + nodes[:0] = property_nodes + edges[:0] = property_edges + + for property_node in get_own_properties(property_nodes, property_edges): + edges.append((data_point.id, property_node.id, field_name, { + "source_node_id": data_point.id, + "target_node_id": property_node.id, + "relationship_name": field_name, + "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + })) + continue + + if isinstance(field_value, list): + if isinstance(field_value[0], DataPoint): + excluded_properties.add(field_name) + + for item in field_value: + property_nodes, property_edges = get_graph_from_model(item, True) + nodes[:0] = property_nodes + edges[:0] = property_edges + + for property_node in get_own_properties(property_nodes, property_edges): + edges.append((data_point.id, property_node.id, field_name, { + "source_node_id": data_point.id, + "target_node_id": property_node.id, + "relationship_name": field_name, + "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + "metadata": { + "type": "list" + }, + })) + continue + + data_point_properties[field_name] = field_value + + SimpleDataPointModel = copy_model( + type(data_point), + include_fields = { + "_metadata": (dict, data_point._metadata), + }, + exclude_fields = excluded_properties, + ) + + if include_root: + nodes.append(SimpleDataPointModel(**data_point_properties)) + + return nodes, edges + + +def get_own_properties(property_nodes, property_edges): + own_properties 
= [] + + destination_nodes = [str(property_edge[1]) for property_edge in property_edges] + + for node in property_nodes: + if str(node.id) in destination_nodes: + continue + + own_properties.append(node) + + return own_properties diff --git a/cognee/modules/graph/utils/get_model_instance_from_graph.py b/cognee/modules/graph/utils/get_model_instance_from_graph.py new file mode 100644 index 00000000..82cdfa15 --- /dev/null +++ b/cognee/modules/graph/utils/get_model_instance_from_graph.py @@ -0,0 +1,29 @@ +from pydantic_core import PydanticUndefined +from cognee.infrastructure.engine import DataPoint +from cognee.modules.storage.utils import copy_model + + +def get_model_instance_from_graph(nodes: list[DataPoint], edges: list, entity_id: str): + node_map = {} + + for node in nodes: + node_map[node.id] = node + + for edge in edges: + source_node = node_map[edge[0]] + target_node = node_map[edge[1]] + edge_label = edge[2] + edge_properties = edge[3] if len(edge) == 4 else {} + edge_metadata = edge_properties.get("metadata", {}) + edge_type = edge_metadata.get("type") + + if edge_type == "list": + NewModel = copy_model(type(source_node), { edge_label: (list[type(target_node)], PydanticUndefined) }) + + node_map[edge[0]] = NewModel(**source_node.model_dump(), **{ edge_label: [target_node] }) + else: + NewModel = copy_model(type(source_node), { edge_label: (type(target_node), PydanticUndefined) }) + + node_map[edge[0]] = NewModel(**source_node.model_dump(), **{ edge_label: target_node }) + + return node_map[entity_id] diff --git a/cognee/modules/search/CogneeSearch.py b/cognee/modules/search/CogneeSearch.py deleted file mode 100644 index 8c9245f6..00000000 --- a/cognee/modules/search/CogneeSearch.py +++ /dev/null @@ -1,33 +0,0 @@ -import asyncio -import nest_asyncio -import dspy -from cognee.modules.search.vector.search_similarity import search_similarity - -nest_asyncio.apply() - -class AnswerFromContext(dspy.Signature): - question: str = dspy.InputField() - context: str 
= dspy.InputField(desc = "Context to use for answer generation.") - answer: str = dspy.OutputField() - -question_answer_llm = dspy.OpenAI(model = "gpt-3.5-turbo-instruct") - -class CogneeSearch(dspy.Module): - def __init__(self, ): - super().__init__() - self.generate_answer = dspy.TypedChainOfThought(AnswerFromContext) - - def forward(self, question): - context = asyncio.run(search_similarity(question)) - - context_text = "\n".join(context) - print(f"Context: {context_text}") - - with dspy.context(lm = question_answer_llm): - answer_prediction = self.generate_answer(context = context_text, question = question) - answer = answer_prediction.answer - - print(f"Question: {question}") - print(f"Answer: {answer}") - - return dspy.Prediction(context = context_text, answer = answer) diff --git a/cognee/modules/search/__init__.py b/cognee/modules/search/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/modules/search/graph/__init__.py b/cognee/modules/search/graph/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/modules/search/graph/search_adjacent.py b/cognee/modules/search/graph/search_adjacent.py deleted file mode 100644 index 7295ebe7..00000000 --- a/cognee/modules/search/graph/search_adjacent.py +++ /dev/null @@ -1,43 +0,0 @@ -import asyncio -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine - -async def search_adjacent(query: str) -> list[(str, str)]: - """ - Find the neighbours of a given node in the graph and return their ids and descriptions. - - Parameters: - - query (str): The query string to filter nodes by. - - Returns: - - list[(str, str)]: A list containing the unique identifiers and names of the neighbours of the given node. 
- """ - node_id = query - - if node_id is None: - return {} - - graph_engine = await get_graph_engine() - - exact_node = await graph_engine.extract_node(node_id) - - if exact_node is not None and "uuid" in exact_node: - neighbours = await graph_engine.get_neighbours(exact_node["uuid"]) - else: - vector_engine = get_vector_engine() - results = await asyncio.gather( - vector_engine.search("entities", query_text = query, limit = 10), - vector_engine.search("classification", query_text = query, limit = 10), - ) - results = [*results[0], *results[1]] - relevant_results = [result for result in results if result.score < 0.5][:5] - - if len(relevant_results) == 0: - return [] - - node_neighbours = await asyncio.gather(*[graph_engine.get_neighbours(result.id) for result in relevant_results]) - neighbours = [] - for neighbour_ids in node_neighbours: - neighbours.extend(neighbour_ids) - - return neighbours diff --git a/cognee/modules/search/graph/search_cypher.py b/cognee/modules/search/graph/search_cypher.py deleted file mode 100644 index 39a09542..00000000 --- a/cognee/modules/search/graph/search_cypher.py +++ /dev/null @@ -1,15 +0,0 @@ - -from cognee.infrastructure.databases.graph import get_graph_engine, get_graph_config - -async def search_cypher(query: str): - """ - Use a Cypher query to search the graph and return the results. 
- """ - graph_config = get_graph_config() - - if graph_config.graph_database_provider == "neo4j": - graph_engine = await get_graph_engine() - result = await graph_engine.graph().run(query) - return result - else: - raise ValueError("Unsupported search type for the used graph engine.") diff --git a/cognee/modules/search/graph/search_similarity.py b/cognee/modules/search/graph/search_similarity.py deleted file mode 100644 index dd48ce38..00000000 --- a/cognee/modules/search/graph/search_similarity.py +++ /dev/null @@ -1,27 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine - -async def search_similarity(query: str) -> list[str, str]: - """ - Parameters: - - query (str): The query string to filter nodes by. - - Returns: - - list(chunk): A list of objects providing information about the chunks related to query. - """ - vector_engine = get_vector_engine() - - similar_results = await vector_engine.search("chunks", query, limit = 5) - - results = [ - parse_payload(result.payload) for result in similar_results - ] - - return results - - -def parse_payload(payload: dict) -> dict: - return { - "text": payload["text"], - "chunk_id": payload["chunk_id"], - "document_id": payload["document_id"], - } diff --git a/cognee/modules/search/graph/search_summary.py b/cognee/modules/search/graph/search_summary.py deleted file mode 100644 index 79ca4ee1..00000000 --- a/cognee/modules/search/graph/search_summary.py +++ /dev/null @@ -1,17 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine - -async def search_summary(query: str) -> list: - """ - Parameters: - - query (str): The query string to filter summaries by. - - Returns: - - list[str, UUID]: A list of objects providing information about the summaries related to query. 
- """ - vector_engine = get_vector_engine() - - summaries_results = await vector_engine.search("summaries", query, limit = 5) - - summaries = [summary.payload for summary in summaries_results] - - return summaries diff --git a/cognee/modules/search/llm/__init__.py b/cognee/modules/search/llm/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/modules/search/llm/extraction/__init__.py b/cognee/modules/search/llm/extraction/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/modules/search/llm/extraction/categorize_relevant_category.py b/cognee/modules/search/llm/extraction/categorize_relevant_category.py deleted file mode 100644 index 2134780e..00000000 --- a/cognee/modules/search/llm/extraction/categorize_relevant_category.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Type -from pydantic import BaseModel -from cognee.infrastructure.llm.prompts import render_prompt -from cognee.infrastructure.llm.get_llm_client import get_llm_client - -async def categorize_relevant_category(query: str, summary, response_model: Type[BaseModel]): - llm_client = get_llm_client() - - enriched_query= render_prompt("categorize_categories.txt", {"query": query, "categories": summary}) - - - system_prompt = " Choose the relevant categories and return appropriate output based on the model" - - llm_output = await llm_client.acreate_structured_output(enriched_query, system_prompt, response_model) - - return llm_output.model_dump() diff --git a/cognee/modules/search/llm/extraction/categorize_relevant_summary.py b/cognee/modules/search/llm/extraction/categorize_relevant_summary.py deleted file mode 100644 index 1ed09e69..00000000 --- a/cognee/modules/search/llm/extraction/categorize_relevant_summary.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Type -from pydantic import BaseModel -from cognee.infrastructure.llm.prompts import render_prompt -from cognee.infrastructure.llm.get_llm_client import get_llm_client - -async def 
categorize_relevant_summary(query: str, summaries, response_model: Type[BaseModel]): - llm_client = get_llm_client() - - enriched_query= render_prompt("categorize_summary.txt", {"query": query, "summaries": summaries}) - - system_prompt = "Choose the relevant summaries and return appropriate output based on the model" - - llm_output = await llm_client.acreate_structured_output(enriched_query, system_prompt, response_model) - - return llm_output diff --git a/cognee/modules/search/llm/get_relevant_summary.py b/cognee/modules/search/llm/get_relevant_summary.py deleted file mode 100644 index f5a3c8ef..00000000 --- a/cognee/modules/search/llm/get_relevant_summary.py +++ /dev/null @@ -1,17 +0,0 @@ -import logging -from typing import List, Dict -from cognee.modules.cognify.config import get_cognify_config -from .extraction.categorize_relevant_summary import categorize_relevant_summary - -logger = logging.getLogger(__name__) -async def get_cognitive_layers(content: str, categories: List[Dict]): - try: - cognify_config = get_cognify_config() - return (await categorize_relevant_summary( - content, - categories[0], - cognify_config.summarization_model, - )).cognitive_layers - except Exception as error: - logger.error("Error extracting cognitive layers from content: %s", error, exc_info = True) - raise error diff --git a/cognee/modules/search/vector/__init__.py b/cognee/modules/search/vector/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/modules/search/vector/bm25.py b/cognee/modules/search/vector/bm25.py deleted file mode 100644 index 134feb81..00000000 --- a/cognee/modules/search/vector/bm25.py +++ /dev/null @@ -1 +0,0 @@ -""" Placeholder for BM25 implementation""" \ No newline at end of file diff --git a/cognee/modules/search/vector/fusion.py b/cognee/modules/search/vector/fusion.py deleted file mode 100644 index 48ecb7ed..00000000 --- a/cognee/modules/search/vector/fusion.py +++ /dev/null @@ -1 +0,0 @@ -"""Placeholder for fusions search 
implementation""" \ No newline at end of file diff --git a/cognee/modules/search/vector/search_traverse.py b/cognee/modules/search/vector/search_traverse.py deleted file mode 100644 index 5c1d0792..00000000 --- a/cognee/modules/search/vector/search_traverse.py +++ /dev/null @@ -1,36 +0,0 @@ -import asyncio -from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine -from cognee.infrastructure.databases.vector import get_vector_engine - -async def search_traverse(query: str): - node_id = query - rules = set() - - graph_engine = await get_graph_engine() - vector_engine = get_vector_engine() - - exact_node = await graph_engine.extract_node(node_id) - - if exact_node is not None and "uuid" in exact_node: - edges = await graph_engine.get_edges(exact_node["uuid"]) - - for edge in edges: - rules.add(f"{edge[0]} {edge[2]['relationship_name']} {edge[1]}") - else: - results = await asyncio.gather( - vector_engine.search("entities", query_text = query, limit = 10), - vector_engine.search("classification", query_text = query, limit = 10), - ) - results = [*results[0], *results[1]] - relevant_results = [result for result in results if result.score < 0.5][:5] - - if len(relevant_results) > 0: - for result in relevant_results: - graph_node_id = result.id - - edges = await graph_engine.get_edges(graph_node_id) - - for edge in edges: - rules.add(f"{edge[0]} {edge[2]['relationship_name']} {edge[1]}") - - return list(rules) diff --git a/cognee/modules/storage/utils/__init__.py b/cognee/modules/storage/utils/__init__.py new file mode 100644 index 00000000..7073e647 --- /dev/null +++ b/cognee/modules/storage/utils/__init__.py @@ -0,0 +1,46 @@ +import json +from uuid import UUID +from datetime import datetime +from pydantic_core import PydanticUndefined + +from cognee.infrastructure.engine import DataPoint + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime): + return obj.isoformat() # Convert datetime to ISO 8601 
string + elif isinstance(obj, UUID): + # if the obj is uuid, we simply return the value of uuid + return str(obj) + return json.JSONEncoder.default(self, obj) + + +from pydantic import create_model + +def copy_model(model: DataPoint, include_fields: dict = {}, exclude_fields: list = []): + fields = { + name: (field.annotation, field.default if field.default is not None else PydanticUndefined) + for name, field in model.model_fields.items() + if name not in exclude_fields + } + + final_fields = { + **fields, + **include_fields + } + + return create_model(model.__name__, **final_fields) + +def get_own_properties(data_point: DataPoint): + properties = {} + + for field_name, field_value in data_point: + if field_name == "_metadata" \ + or isinstance(field_value, dict) \ + or isinstance(field_value, DataPoint) \ + or (isinstance(field_value, list) and isinstance(field_value[0], DataPoint)): + continue + + properties[field_name] = field_value + + return properties diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py index f6b75f4e..f3272357 100644 --- a/cognee/shared/utils.py +++ b/cognee/shared/utils.py @@ -1,6 +1,6 @@ """ This module contains utility functions for the cognee. 
""" import os -import datetime +from datetime import datetime, timezone import graphistry import networkx as nx import numpy as np @@ -25,7 +25,7 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}): host = "https://eu.i.posthog.com" ) - current_time = datetime.datetime.now() + current_time = datetime.now(timezone.utc) properties = { "time": current_time.strftime("%m/%d/%Y"), **additional_properties, @@ -86,30 +86,36 @@ async def register_graphistry(): graphistry.register(api = 3, username = config.graphistry_username, password = config.graphistry_password) -def prepare_edges(graph): - return nx.to_pandas_edgelist(graph) +def prepare_edges(graph, source, target, edge_key): + edge_list = [{ + source: str(edge[0]), + target: str(edge[1]), + edge_key: str(edge[2]), + } for edge in graph.edges] + + return pd.DataFrame(edge_list) def prepare_nodes(graph, include_size=False): nodes_data = [] for node in graph.nodes: node_info = graph.nodes[node] - description = node_info.get("layer_description", {}).get("layer", "Default Layer") if isinstance( - node_info.get("layer_description"), dict) else node_info.get("layer_description", "Default Layer") - # description = node_info['layer_description']['layer'] if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info['layer_description'] else node_info.get('layer_description', node) - # if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info.get('layer_description'): - # description = node_info['layer_description']['layer'] - # # Use 'layer_description' directly if it's not a dictionary, otherwise default to node ID - # else: - # description = node_info.get('layer_description', node) - - node_data = {"id": node, "layer_description": description} + + if not node_info: + continue + + node_data = { + "id": str(node), + "name": node_info["name"] if "name" in node_info else str(node), + } + if include_size: default_size = 10 # Default node size larger_size = 20 
# Size for nodes with specific keywords in their ID - keywords = ["DOCUMENT", "User", "LAYER"] + keywords = ["DOCUMENT", "User"] node_size = larger_size if any(keyword in str(node) for keyword in keywords) else default_size node_data["size"] = node_size + nodes_data.append(node_data) return pd.DataFrame(nodes_data) @@ -129,28 +135,28 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_ graph = networkx_graph - edges = prepare_edges(graph) - plotter = graphistry.edges(edges, "source", "target") + edges = prepare_edges(graph, "source_node", "target_node", "relationship_name") + plotter = graphistry.edges(edges, "source_node", "target_node") + plotter = plotter.bind(edge_label = "relationship_name") if include_nodes: - nodes = prepare_nodes(graph, include_size=include_size) + nodes = prepare_nodes(graph, include_size = include_size) plotter = plotter.nodes(nodes, "id") - if include_size: - plotter = plotter.bind(point_size="size") + plotter = plotter.bind(point_size = "size") if include_color: - unique_layers = nodes["layer_description"].unique() - color_palette = generate_color_palette(unique_layers) - plotter = plotter.encode_point_color("layer_description", categorical_mapping=color_palette, - default_mapping="silver") + pass + # unique_layers = nodes["layer_description"].unique() + # color_palette = generate_color_palette(unique_layers) + # plotter = plotter.encode_point_color("layer_description", categorical_mapping=color_palette, + # default_mapping="silver") if include_labels: - plotter = plotter.bind(point_label = "layer_description") - + plotter = plotter.bind(point_label = "name") # Visualization diff --git a/cognee/tasks/__init__.py b/cognee/tasks/__init__.py deleted file mode 100644 index e19b49d8..00000000 --- a/cognee/tasks/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .summarization.summarize_text import summarize_text -from .chunk_naive_llm_classifier.chunk_naive_llm_classifier import chunk_naive_llm_classifier -from 
.chunk_remove_disconnected.chunk_remove_disconnected import chunk_remove_disconnected -from .chunk_update_check.chunk_update_check import chunk_update_check -from .save_chunks_to_store.save_chunks_to_store import save_chunks_to_store -from .source_documents_to_chunks.source_documents_to_chunks import source_documents_to_chunks -from .infer_data_ontology.infer_data_ontology import infer_data_ontology -from .check_permissions_on_documents.check_permissions_on_documents import check_permissions_on_documents -from .classify_documents.classify_documents import classify_documents -from .graph.chunks_into_graph import chunks_into_graph diff --git a/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py b/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py index 83b49545..3a9d957d 100644 --- a/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +++ b/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py @@ -5,7 +5,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.vector import get_vector_engine, DataPoint from cognee.modules.data.extraction.extract_categories import extract_categories -from cognee.modules.chunking import DocumentChunk +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]): @@ -66,7 +66,7 @@ class Keyword(BaseModel): "chunk_id": str(data_chunk.chunk_id), "document_id": str(data_chunk.document_id), }), - embed_field="text", + index_fields=["text"], ) ) @@ -105,7 +105,7 @@ class Keyword(BaseModel): "chunk_id": str(data_chunk.chunk_id), "document_id": str(data_chunk.document_id), }), - embed_field="text", + index_fields=["text"], ) ) diff --git a/cognee/tasks/chunk_remove_disconnected/__init__.py b/cognee/tasks/chunk_remove_disconnected/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/cognee/tasks/chunk_translate/__init__.py b/cognee/tasks/chunk_translate/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/chunk_translate/translate_chunk.py b/cognee/tasks/chunk_translate/translate_chunk.py deleted file mode 100644 index 7a7f1580..00000000 --- a/cognee/tasks/chunk_translate/translate_chunk.py +++ /dev/null @@ -1,39 +0,0 @@ - -import logging - -from cognee.base_config import get_base_config - -BaseConfig = get_base_config() - -async def translate_text(data, source_language:str='sr', target_language:str='en', region_name='eu-west-1'): - """ - Translate text from source language to target language using AWS Translate. - Parameters: - data (str): The text to be translated. - source_language (str): The source language code (e.g., 'sr' for Serbian). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php - target_language (str): The target language code (e.g., 'en' for English). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php - region_name (str): AWS region name. - Returns: - str: Translated text or an error message. - """ - import boto3 - from botocore.exceptions import BotoCoreError, ClientError - - if not data: - yield "No text provided for translation." - - if not source_language or not target_language: - yield "Both source and target language codes are required." - - try: - translate = boto3.client(service_name='translate', region_name=region_name, use_ssl=True) - result = translate.translate_text(Text=data, SourceLanguageCode=source_language, TargetLanguageCode=target_language) - yield result.get('TranslatedText', 'No translation found.') - - except BotoCoreError as e: - logging.info(f"BotoCoreError occurred: {e}") - yield "Error with AWS Translate service configuration or request." - - except ClientError as e: - logging.info(f"ClientError occurred: {e}") - yield "Error with AWS client or network issue." 
\ No newline at end of file diff --git a/cognee/tasks/chunk_update_check/__init__.py b/cognee/tasks/chunk_update_check/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/chunk_update_check/chunk_update_check.py b/cognee/tasks/chunk_update_check/chunk_update_check.py deleted file mode 100644 index 1c1a534d..00000000 --- a/cognee/tasks/chunk_update_check/chunk_update_check.py +++ /dev/null @@ -1,26 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.modules.chunking import DocumentChunk - - -async def chunk_update_check(data_chunks: list[DocumentChunk], collection_name: str) -> list[DocumentChunk]: - vector_engine = get_vector_engine() - - if not await vector_engine.has_collection(collection_name): - # If collection doesn't exist, all data_chunks are new - return data_chunks - - existing_chunks = await vector_engine.retrieve( - collection_name, - [str(chunk.chunk_id) for chunk in data_chunks], - ) - - existing_chunks_map = {str(chunk.id): chunk.payload for chunk in existing_chunks} - - affected_data_chunks = [] - - for chunk in data_chunks: - if chunk.chunk_id not in existing_chunks_map or \ - chunk.text != existing_chunks_map[chunk.chunk_id]["text"]: - affected_data_chunks.append(chunk) - - return affected_data_chunks diff --git a/cognee/tasks/chunking/__init__.py b/cognee/tasks/chunks/__init__.py similarity index 72% rename from cognee/tasks/chunking/__init__.py rename to cognee/tasks/chunks/__init__.py index 6c6728d5..e9265856 100644 --- a/cognee/tasks/chunking/__init__.py +++ b/cognee/tasks/chunks/__init__.py @@ -2,3 +2,4 @@ from .chunk_by_word import chunk_by_word from .chunk_by_sentence import chunk_by_sentence from .chunk_by_paragraph import chunk_by_paragraph +from .remove_disconnected_chunks import remove_disconnected_chunks diff --git a/cognee/tasks/chunking/__tests__/chunk_by_paragraph.test.py b/cognee/tasks/chunks/__tests__/chunk_by_paragraph.test.py similarity index 97% rename 
from cognee/tasks/chunking/__tests__/chunk_by_paragraph.test.py rename to cognee/tasks/chunks/__tests__/chunk_by_paragraph.test.py index cecea481..b63be0eb 100644 --- a/cognee/tasks/chunking/__tests__/chunk_by_paragraph.test.py +++ b/cognee/tasks/chunks/__tests__/chunk_by_paragraph.test.py @@ -1,4 +1,4 @@ -from cognee.tasks.chunking import chunk_by_paragraph +from cognee.tasks.chunks import chunk_by_paragraph if __name__ == "__main__": def test_chunking_on_whole_text(): diff --git a/cognee/tasks/chunking/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py similarity index 100% rename from cognee/tasks/chunking/chunk_by_paragraph.py rename to cognee/tasks/chunks/chunk_by_paragraph.py diff --git a/cognee/tasks/chunking/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py similarity index 100% rename from cognee/tasks/chunking/chunk_by_sentence.py rename to cognee/tasks/chunks/chunk_by_sentence.py diff --git a/cognee/tasks/chunking/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py similarity index 100% rename from cognee/tasks/chunking/chunk_by_word.py rename to cognee/tasks/chunks/chunk_by_word.py diff --git a/cognee/tasks/chunking/query_chunks.py b/cognee/tasks/chunks/query_chunks.py similarity index 83% rename from cognee/tasks/chunking/query_chunks.py rename to cognee/tasks/chunks/query_chunks.py index b19a560c..93f32a64 100644 --- a/cognee/tasks/chunking/query_chunks.py +++ b/cognee/tasks/chunks/query_chunks.py @@ -10,7 +10,7 @@ async def query_chunks(query: str) -> list[dict]: """ vector_engine = get_vector_engine() - found_chunks = await vector_engine.search("chunks", query, limit = 5) + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit = 5) chunks = [result.payload for result in found_chunks] diff --git a/cognee/tasks/chunk_remove_disconnected/chunk_remove_disconnected.py b/cognee/tasks/chunks/remove_disconnected_chunks.py similarity index 84% rename from 
cognee/tasks/chunk_remove_disconnected/chunk_remove_disconnected.py rename to cognee/tasks/chunks/remove_disconnected_chunks.py index 0c39ed5d..4a36a33e 100644 --- a/cognee/tasks/chunk_remove_disconnected/chunk_remove_disconnected.py +++ b/cognee/tasks/chunks/remove_disconnected_chunks.py @@ -1,7 +1,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.modules.chunking import DocumentChunk +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk -async def chunk_remove_disconnected(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]: +async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]: graph_engine = await get_graph_engine() document_ids = set((data_chunk.document_id for data_chunk in data_chunks)) diff --git a/cognee/tasks/classify_documents/classify_documents.py b/cognee/tasks/classify_documents/classify_documents.py deleted file mode 100644 index 0c71ecc8..00000000 --- a/cognee/tasks/classify_documents/classify_documents.py +++ /dev/null @@ -1,13 +0,0 @@ -from cognee.modules.data.models import Data -from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument - -def classify_documents(data_documents: list[Data]) -> list[Document]: - documents = [ - PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "pdf" else - AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "audio" else - ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "image" else - TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) - for data_item in data_documents - ] 
- - return documents diff --git a/cognee/tasks/document_language_detection/__init__.py b/cognee/tasks/document_language_detection/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/documents/__init__.py b/cognee/tasks/documents/__init__.py new file mode 100644 index 00000000..248bb04d --- /dev/null +++ b/cognee/tasks/documents/__init__.py @@ -0,0 +1,3 @@ +from .classify_documents import classify_documents +from .extract_chunks_from_documents import extract_chunks_from_documents +from .check_permissions_on_documents import check_permissions_on_documents diff --git a/cognee/tasks/check_permissions_on_documents/check_permissions_on_documents.py b/cognee/tasks/documents/check_permissions_on_documents.py similarity index 100% rename from cognee/tasks/check_permissions_on_documents/check_permissions_on_documents.py rename to cognee/tasks/documents/check_permissions_on_documents.py diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py new file mode 100644 index 00000000..64ed808d --- /dev/null +++ b/cognee/tasks/documents/classify_documents.py @@ -0,0 +1,13 @@ +from cognee.modules.data.models import Data +from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument + +def classify_documents(data_documents: list[Data]) -> list[Document]: + documents = [ + PdfDocument(id = data_item.id, name=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "pdf" else + AudioDocument(id = data_item.id, name=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "audio" else + ImageDocument(id = data_item.id, name=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "image" else + TextDocument(id = data_item.id, 
name=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) + for data_item in data_documents + ] + + return documents diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py new file mode 100644 index 00000000..ec19a786 --- /dev/null +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -0,0 +1,7 @@ +from cognee.modules.data.processing.document_types.Document import Document + + +async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024): + for document in documents: + for document_chunk in document.read(chunk_size = chunk_size): + yield document_chunk diff --git a/cognee/tasks/graph/__init__.py b/cognee/tasks/graph/__init__.py index f9c39e4c..94dc82f2 100644 --- a/cognee/tasks/graph/__init__.py +++ b/cognee/tasks/graph/__init__.py @@ -1,2 +1,2 @@ -from .chunks_into_graph import chunks_into_graph +from .extract_graph_from_data import extract_graph_from_data from .query_graph_connections import query_graph_connections diff --git a/cognee/tasks/graph/chunks_into_graph.py b/cognee/tasks/graph/chunks_into_graph.py deleted file mode 100644 index 7ba22e84..00000000 --- a/cognee/tasks/graph/chunks_into_graph.py +++ /dev/null @@ -1,213 +0,0 @@ -import json -import asyncio -from uuid import uuid5, NAMESPACE_OID -from datetime import datetime, timezone -from typing import Type -from pydantic import BaseModel -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine -from cognee.modules.data.extraction.knowledge_graph.extract_content_graph import extract_content_graph -from cognee.modules.chunking import DocumentChunk -from cognee.modules.graph.utils import generate_node_id, generate_node_name - - -class EntityNode(BaseModel): - uuid: str - name: str - type: str - description: str - created_at: datetime - updated_at: datetime - -async def 
chunks_into_graph(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel], collection_name: str): - chunk_graphs = await asyncio.gather( - *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] - ) - - vector_engine = get_vector_engine() - graph_engine = await get_graph_engine() - - has_collection = await vector_engine.has_collection(collection_name) - - if not has_collection: - await vector_engine.create_collection(collection_name, payload_schema = EntityNode) - - processed_nodes = {} - type_node_edges = [] - entity_node_edges = [] - type_entity_edges = [] - - for (chunk_index, chunk) in enumerate(data_chunks): - chunk_graph = chunk_graphs[chunk_index] - for node in chunk_graph.nodes: - type_node_id = generate_node_id(node.type) - entity_node_id = generate_node_id(node.id) - - if type_node_id not in processed_nodes: - type_node_edges.append((str(chunk.chunk_id), type_node_id, "contains_entity_type")) - processed_nodes[type_node_id] = True - - if entity_node_id not in processed_nodes: - entity_node_edges.append((str(chunk.chunk_id), entity_node_id, "contains_entity")) - type_entity_edges.append((entity_node_id, type_node_id, "is_entity_type")) - processed_nodes[entity_node_id] = True - - graph_node_edges = [ - (edge.target_node_id, edge.source_node_id, edge.relationship_name) \ - for edge in chunk_graph.edges - ] - - existing_edges = await graph_engine.has_edges([ - *type_node_edges, - *entity_node_edges, - *type_entity_edges, - *graph_node_edges, - ]) - - existing_edges_map = {} - existing_nodes_map = {} - - for edge in existing_edges: - existing_edges_map[edge[0] + edge[1] + edge[2]] = True - existing_nodes_map[edge[0]] = True - - graph_nodes = [] - graph_edges = [] - data_points = [] - - for (chunk_index, chunk) in enumerate(data_chunks): - graph = chunk_graphs[chunk_index] - if graph is None: - continue - - for node in graph.nodes: - node_id = generate_node_id(node.id) - node_name = generate_node_name(node.name) - - type_node_id = 
generate_node_id(node.type) - type_node_name = generate_node_name(node.type) - - if node_id not in existing_nodes_map: - node_data = dict( - uuid = node_id, - name = node_name, - type = node_name, - description = node.description, - created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), - updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), - ) - - graph_nodes.append(( - node_id, - dict( - **node_data, - properties = json.dumps(node.properties), - ) - )) - - data_points.append(DataPoint[EntityNode]( - id = str(uuid5(NAMESPACE_OID, node_id)), - payload = node_data, - embed_field = "name", - )) - - existing_nodes_map[node_id] = True - - edge_key = str(chunk.chunk_id) + node_id + "contains_entity" - - if edge_key not in existing_edges_map: - graph_edges.append(( - str(chunk.chunk_id), - node_id, - "contains_entity", - dict( - relationship_name = "contains_entity", - source_node_id = str(chunk.chunk_id), - target_node_id = node_id, - ), - )) - - # Add relationship between entity type and entity itself: "Jake is Person" - graph_edges.append(( - node_id, - type_node_id, - "is_entity_type", - dict( - relationship_name = "is_entity_type", - source_node_id = type_node_id, - target_node_id = node_id, - ), - )) - - existing_edges_map[edge_key] = True - - if type_node_id not in existing_nodes_map: - type_node_data = dict( - uuid = type_node_id, - name = type_node_name, - type = type_node_id, - description = type_node_name, - created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), - updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), - ) - - graph_nodes.append((type_node_id, dict( - **type_node_data, - properties = json.dumps(node.properties) - ))) - - data_points.append(DataPoint[EntityNode]( - id = str(uuid5(NAMESPACE_OID, type_node_id)), - payload = type_node_data, - embed_field = "name", - )) - - existing_nodes_map[type_node_id] = True - - edge_key = str(chunk.chunk_id) + type_node_id + 
"contains_entity_type" - - if edge_key not in existing_edges_map: - graph_edges.append(( - str(chunk.chunk_id), - type_node_id, - "contains_entity_type", - dict( - relationship_name = "contains_entity_type", - source_node_id = str(chunk.chunk_id), - target_node_id = type_node_id, - ), - )) - - existing_edges_map[edge_key] = True - - # Add relationship that came from graphs. - for edge in graph.edges: - source_node_id = generate_node_id(edge.source_node_id) - target_node_id = generate_node_id(edge.target_node_id) - relationship_name = generate_node_name(edge.relationship_name) - edge_key = source_node_id + target_node_id + relationship_name - - if edge_key not in existing_edges_map: - graph_edges.append(( - generate_node_id(edge.source_node_id), - generate_node_id(edge.target_node_id), - edge.relationship_name, - dict( - relationship_name = generate_node_name(edge.relationship_name), - source_node_id = generate_node_id(edge.source_node_id), - target_node_id = generate_node_id(edge.target_node_id), - properties = json.dumps(edge.properties), - ), - )) - existing_edges_map[edge_key] = True - - if len(data_points) > 0: - await vector_engine.create_data_points(collection_name, data_points) - - if len(graph_nodes) > 0: - await graph_engine.add_nodes(graph_nodes) - - if len(graph_edges) > 0: - await graph_engine.add_edges(graph_edges) - - return data_chunks diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py new file mode 100644 index 00000000..36cc3e2f --- /dev/null +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -0,0 +1,121 @@ +import asyncio +from typing import Type +from pydantic import BaseModel +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.modules.data.extraction.knowledge_graph import extract_content_graph +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from cognee.modules.engine.models import EntityType, Entity +from cognee.modules.engine.utils 
import generate_node_id, generate_node_name +from cognee.tasks.storage import add_data_points + +async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): + chunk_graphs = await asyncio.gather( + *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] + ) + + processed_nodes = {} + type_node_edges = [] + entity_node_edges = [] + type_entity_edges = [] + + for (chunk_index, chunk) in enumerate(data_chunks): + chunk_graph = chunk_graphs[chunk_index] + for node in chunk_graph.nodes: + type_node_id = generate_node_id(node.type) + entity_node_id = generate_node_id(node.id) + + if str(type_node_id) not in processed_nodes: + type_node_edges.append((str(chunk.id), str(type_node_id), "exists_in")) + processed_nodes[str(type_node_id)] = True + + if str(entity_node_id) not in processed_nodes: + entity_node_edges.append((str(chunk.id), entity_node_id, "mentioned_in")) + type_entity_edges.append((str(entity_node_id), str(type_node_id), "is_a")) + processed_nodes[str(entity_node_id)] = True + + graph_node_edges = [ + (edge.target_node_id, edge.source_node_id, edge.relationship_name) \ + for edge in chunk_graph.edges + ] + + graph_engine = await get_graph_engine() + + existing_edges = await graph_engine.has_edges([ + *type_node_edges, + *entity_node_edges, + *type_entity_edges, + *graph_node_edges, + ]) + + existing_edges_map = {} + + for edge in existing_edges: + existing_edges_map[edge[0] + edge[1] + edge[2]] = True + + added_nodes_map = {} + graph_edges = [] + data_points = [] + + for (chunk_index, chunk) in enumerate(data_chunks): + graph = chunk_graphs[chunk_index] + if graph is None: + continue + + for node in graph.nodes: + node_id = generate_node_id(node.id) + node_name = generate_node_name(node.name) + + type_node_id = generate_node_id(node.type) + type_node_name = generate_node_name(node.type) + + if f"{str(type_node_id)}_type" not in added_nodes_map: + type_node = EntityType( + id = type_node_id, + name = 
type_node_name, + type = type_node_name, + description = type_node_name, + exists_in = chunk, + ) + added_nodes_map[f"{str(type_node_id)}_type"] = type_node + else: + type_node = added_nodes_map[f"{str(type_node_id)}_type"] + + if f"{str(node_id)}_entity" not in added_nodes_map: + entity_node = Entity( + id = node_id, + name = node_name, + is_a = type_node, + description = node.description, + mentioned_in = chunk, + ) + data_points.append(entity_node) + added_nodes_map[f"{str(node_id)}_entity"] = entity_node + + # Add relationship that came from graphs. + for edge in graph.edges: + source_node_id = generate_node_id(edge.source_node_id) + target_node_id = generate_node_id(edge.target_node_id) + relationship_name = generate_node_name(edge.relationship_name) + + edge_key = str(source_node_id) + str(target_node_id) + relationship_name + + if edge_key not in existing_edges_map: + graph_edges.append(( + source_node_id, + target_node_id, + edge.relationship_name, + dict( + relationship_name = generate_node_name(edge.relationship_name), + source_node_id = source_node_id, + target_node_id = target_node_id, + ), + )) + existing_edges_map[edge_key] = True + + if len(data_points) > 0: + await add_data_points(data_points) + + if len(graph_edges) > 0: + await graph_engine.add_edges(graph_edges) + + return data_chunks diff --git a/cognee/tasks/infer_data_ontology/infer_data_ontology.py b/cognee/tasks/graph/infer_data_ontology.py similarity index 95% rename from cognee/tasks/infer_data_ontology/infer_data_ontology.py rename to cognee/tasks/graph/infer_data_ontology.py index 6415eb00..58fddce8 100644 --- a/cognee/tasks/infer_data_ontology/infer_data_ontology.py +++ b/cognee/tasks/graph/infer_data_ontology.py @@ -20,7 +20,7 @@ from cognee.modules.data.extraction.knowledge_graph.add_model_class_to_graph import add_model_class_to_graph from cognee.tasks.infer_data_ontology.models.models import NodeModel, GraphOntology from cognee.shared.data_models import KnowledgeGraph -from 
cognee.modules.graph.utils import generate_node_id, generate_node_name +from cognee.modules.engine.utils import generate_node_id, generate_node_name logger = logging.getLogger("task:infer_data_ontology") @@ -116,7 +116,6 @@ async def add_graph_ontology(self, file_path: str = None, documents: list = None name = generate_node_name(node.name), type = generate_node_id(node.id), description = node.description, - created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), )) for node in ontology.nodes]) @@ -128,7 +127,6 @@ async def add_graph_ontology(self, file_path: str = None, documents: list = None source_node_id = generate_node_id(edge.source_id), target_node_id = generate_node_id(edge.target_id), relationship_name = edge.relationship_type, - created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), ), ) for edge in ontology.edges) @@ -160,7 +158,6 @@ async def add_graph_ontology(self, file_path: str = None, documents: list = None "source_node_id": row["relationship_source"], "target_node_id": row["relationship_target"], "relationship_name": row["relationship_type"], - "created_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), }, ) diff --git a/cognee/tasks/graph/query_graph_connections.py b/cognee/tasks/graph/query_graph_connections.py index 3f1b5226..36d53514 100644 --- a/cognee/tasks/graph/query_graph_connections.py +++ b/cognee/tasks/graph/query_graph_connections.py @@ -27,8 +27,8 @@ async def query_graph_connections(query: str, exploration_levels = 1) -> list[(s else: vector_engine = get_vector_engine() results = await asyncio.gather( - vector_engine.search("entities", query_text = query, limit = 5), - vector_engine.search("classification", query_text = query, limit = 5), + vector_engine.search("Entity_text", 
query_text = query, limit = 5), + vector_engine.search("EntityType_text", query_text = query, limit = 5), ) results = [*results[0], *results[1]] relevant_results = [result for result in results if result.score < 0.5][:5] diff --git a/cognee/tasks/infer_data_ontology/__init__.py b/cognee/tasks/infer_data_ontology/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/infer_data_ontology/models/models.py b/cognee/tasks/infer_data_ontology/models/models.py deleted file mode 100644 index b62bf3ac..00000000 --- a/cognee/tasks/infer_data_ontology/models/models.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Any, Dict, List, Optional, Union -from pydantic import BaseModel, Field - -class RelationshipModel(BaseModel): - type: str - source: str - target: str - -class NodeModel(BaseModel): - node_id: str - name: str - default_relationship: Optional[RelationshipModel] = None - children: List[Union[Dict[str, Any], "NodeModel"]] = Field(default_factory=list) - -NodeModel.update_forward_refs() - - -class OntologyNode(BaseModel): - id: str = Field(..., description = "Unique identifier made from node name.") - name: str - description: str - -class OntologyEdge(BaseModel): - id: str - source_id: str - target_id: str - relationship_type: str - -class GraphOntology(BaseModel): - nodes: list[OntologyNode] - edges: list[OntologyEdge] diff --git a/cognee/tasks/save_chunks_to_store/__init__.py b/cognee/tasks/save_chunks_to_store/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/save_chunks_to_store/save_chunks_to_store.py b/cognee/tasks/save_chunks_to_store/save_chunks_to_store.py deleted file mode 100644 index 435fd020..00000000 --- a/cognee/tasks/save_chunks_to_store/save_chunks_to_store.py +++ /dev/null @@ -1,96 +0,0 @@ -from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.modules.chunking import 
DocumentChunk - -async def save_chunks_to_store(data_chunks: list[DocumentChunk], collection_name: str): - if len(data_chunks) == 0: - return data_chunks - - vector_engine = get_vector_engine() - graph_engine = await get_graph_engine() - - # Remove and unlink existing chunks - if await vector_engine.has_collection(collection_name): - existing_chunks = [DocumentChunk.parse_obj(chunk.payload) for chunk in (await vector_engine.retrieve( - collection_name, - [str(chunk.chunk_id) for chunk in data_chunks], - ))] - - if len(existing_chunks) > 0: - await vector_engine.delete_data_points(collection_name, [str(chunk.chunk_id) for chunk in existing_chunks]) - - await graph_engine.remove_connection_to_successors_of([chunk.chunk_id for chunk in existing_chunks], "next_chunk") - await graph_engine.remove_connection_to_predecessors_of([chunk.chunk_id for chunk in existing_chunks], "has_chunk") - else: - await vector_engine.create_collection(collection_name, payload_schema = DocumentChunk) - - # Add to vector storage - await vector_engine.create_data_points( - collection_name, - [ - DataPoint[DocumentChunk]( - id = str(chunk.chunk_id), - payload = chunk, - embed_field = "text", - ) for chunk in data_chunks - ], - ) - - # Add to graph storage - chunk_nodes = [] - chunk_edges = [] - - for chunk in data_chunks: - chunk_nodes.append(( - str(chunk.chunk_id), - dict( - uuid = str(chunk.chunk_id), - chunk_id = str(chunk.chunk_id), - document_id = str(chunk.document_id), - word_count = chunk.word_count, - chunk_index = chunk.chunk_index, - cut_type = chunk.cut_type, - ) - )) - - chunk_edges.append(( - str(chunk.document_id), - str(chunk.chunk_id), - "has_chunk", - dict( - relationship_name = "has_chunk", - source_node_id = str(chunk.document_id), - target_node_id = str(chunk.chunk_id), - ), - )) - - previous_chunk_id = get_previous_chunk_id(data_chunks, chunk) - - if previous_chunk_id is not None: - chunk_edges.append(( - str(previous_chunk_id), - str(chunk.chunk_id), - "next_chunk", - 
dict( - relationship_name = "next_chunk", - source_node_id = str(previous_chunk_id), - target_node_id = str(chunk.chunk_id), - ), - )) - - await graph_engine.add_nodes(chunk_nodes) - await graph_engine.add_edges(chunk_edges) - - return data_chunks - - -def get_previous_chunk_id(document_chunks: list[DocumentChunk], current_chunk: DocumentChunk) -> DocumentChunk: - if current_chunk.chunk_index == 0: - return current_chunk.document_id - - for chunk in document_chunks: - if str(chunk.document_id) == str(current_chunk.document_id) \ - and chunk.chunk_index == current_chunk.chunk_index - 1: - return chunk.chunk_id - - return None diff --git a/cognee/tasks/source_documents_to_chunks/__init__.py b/cognee/tasks/source_documents_to_chunks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py b/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py deleted file mode 100644 index c3cdcb0e..00000000 --- a/cognee/tasks/source_documents_to_chunks/source_documents_to_chunks.py +++ /dev/null @@ -1,44 +0,0 @@ -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.modules.data.processing.document_types.Document import Document - - -async def source_documents_to_chunks(documents: list[Document], chunk_size: int = 1024, parent_node_id: str = None): - graph_engine = await get_graph_engine() - - if parent_node_id is None: - documents, parent_node_id = documents - - - nodes = [] - edges = [] - - if parent_node_id and await graph_engine.extract_node(parent_node_id) is None: - nodes.append((parent_node_id, {})) - - document_nodes = await graph_engine.extract_nodes([str(document.id) for document in documents]) - - for (document_index, document) in enumerate(documents): - document_node = document_nodes[document_index] if document_index in document_nodes else None - - if document_node is None: - nodes.append((str(document.id), document.to_dict())) - - if 
parent_node_id: - edges.append(( - parent_node_id, - str(document.id), - "has_document", - dict( - relationship_name = "has_document", - source_node_id = parent_node_id, - target_node_id = str(document.id), - ), - )) - - if len(nodes) > 0: - await graph_engine.add_nodes(nodes) - await graph_engine.add_edges(edges) - - for document in documents: - for document_chunk in document.read(chunk_size = chunk_size): - yield document_chunk diff --git a/cognee/tasks/storage/__init__.py b/cognee/tasks/storage/__init__.py new file mode 100644 index 00000000..156ae696 --- /dev/null +++ b/cognee/tasks/storage/__init__.py @@ -0,0 +1,2 @@ +from .add_data_points import add_data_points +from .index_data_points import index_data_points diff --git a/cognee/tasks/storage/add_data_points.py b/cognee/tasks/storage/add_data_points.py new file mode 100644 index 00000000..b803c9df --- /dev/null +++ b/cognee/tasks/storage/add_data_points.py @@ -0,0 +1,24 @@ +from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.modules.graph.utils import get_graph_from_model +from .index_data_points import index_data_points + + +async def add_data_points(data_points: list[DataPoint]): + nodes = [] + edges = [] + + for data_point in data_points: + property_nodes, property_edges = get_graph_from_model(data_point) + + nodes.extend(property_nodes) + edges.extend(property_edges) + + graph_engine = await get_graph_engine() + + await index_data_points(data_points) + + await graph_engine.add_nodes(nodes) + await graph_engine.add_edges(edges) + + return data_points diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py new file mode 100644 index 00000000..a28335e2 --- /dev/null +++ b/cognee/tasks/storage/index_data_points.py @@ -0,0 +1,81 @@ +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.engine import DataPoint + +async def 
index_data_points(data_points: list[DataPoint]): + created_indexes = {} + index_points = {} + + vector_engine = get_vector_engine() + + flat_data_points: list[DataPoint] = [] + + for data_point in data_points: + flat_data_points.extend(get_data_points_from_model(data_point)) + + for data_point in flat_data_points: + data_point_type = type(data_point) + + for field_name in data_point._metadata["index_fields"]: + index_name = f"{data_point_type.__name__}.{field_name}" + + if index_name not in created_indexes: + await vector_engine.create_vector_index(data_point_type.__name__, field_name) + created_indexes[index_name] = True + + if index_name not in index_points: + index_points[index_name] = [] + + indexed_data_point = data_point.model_copy() + indexed_data_point._metadata["index_fields"] = [field_name] + index_points[index_name].append(indexed_data_point) + + for index_name, indexable_points in index_points.items(): + index_name, field_name = index_name.split(".") + await vector_engine.index_data_points(index_name, field_name, indexable_points) + + return data_points + +def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> list[DataPoint]: + data_points = [] + + for field_name, field_value in data_point: + if isinstance(field_value, DataPoint): + new_data_points = get_data_points_from_model(field_value, added_data_points) + + for new_point in new_data_points: + if str(new_point.id) not in added_data_points: + added_data_points[str(new_point.id)] = True + data_points.append(new_point) + + if isinstance(field_value, list) and isinstance(field_value[0], DataPoint): + for field_value_item in field_value: + new_data_points = get_data_points_from_model(field_value_item, added_data_points) + + for new_point in new_data_points: + if str(new_point.id) not in added_data_points: + added_data_points[str(new_point.id)] = True + data_points.append(new_point) + + data_points.append(data_point) + + return data_points + + +if __name__ == "__main__": + class 
Car(DataPoint): + model: str + color: str + + class Person(DataPoint): + name: str + age: int + owns_car: list[Car] + + car1 = Car(model = "Tesla Model S", color = "Blue") + car2 = Car(model = "Toyota Camry", color = "Red") + person = Person(name = "John", age = 30, owns_car = [car1, car2]) + + data_points = get_data_points_from_model(person) + + print(data_points) + \ No newline at end of file diff --git a/cognee/tasks/storage/save_to_vector_storage.py b/cognee/tasks/storage/save_to_vector_storage.py deleted file mode 100644 index e77ae02c..00000000 --- a/cognee/tasks/storage/save_to_vector_storage.py +++ /dev/null @@ -1,42 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine, DataPoint - -async def save_to_vector_storage(data_chunks: list, collection_name: str, embed_field: str): - if len(data_chunks) == 0: - return data_chunks - - if not all(isinstance(chunk, type(data_chunks[0])) for chunk in data_chunks): - raise ValueError("All data chunks must be of the same type.") - - vector_engine = get_vector_engine() - - PayloadSchema = type(data_chunks[0]) - - await vector_engine.create_collection(collection_name, payload_schema = PayloadSchema) - - await vector_engine.create_data_points( - collection_name, - [ - DataPoint[PayloadSchema]( - id = str(chunk.id), - payload = parse_data(chunk, chunk_index), - embed_field = embed_field, - ) for (chunk_index, chunk) in enumerate(data_chunks) - ], - ) - - return data_chunks - -def parse_data(chunk, chunk_index: int) -> dict: - from uuid import UUID - - data = { - "chunk_index": chunk_index, - } - - for key, value in vars(chunk).items(): - if isinstance(value, UUID): - data[key] = str(value) - else: - data[key] = value - - return data diff --git a/cognee/tasks/summarization/models/TextSummary.py b/cognee/tasks/summarization/models/TextSummary.py index ed447183..5e724cd6 100644 --- a/cognee/tasks/summarization/models/TextSummary.py +++ b/cognee/tasks/summarization/models/TextSummary.py @@ -1,5 +1,12 @@ 
-from pydantic import BaseModel +from cognee.infrastructure.engine import DataPoint +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from cognee.modules.data.processing.document_types import Document -class TextSummary(BaseModel): +class TextSummary(DataPoint): text: str - chunk_id: str + chunk: DocumentChunk + + _metadata: dict = { + "index_fields": ["text"], + } + diff --git a/cognee/tasks/summarization/query_summaries.py b/cognee/tasks/summarization/query_summaries.py index 871e1b31..89683914 100644 --- a/cognee/tasks/summarization/query_summaries.py +++ b/cognee/tasks/summarization/query_summaries.py @@ -10,7 +10,7 @@ async def query_summaries(query: str) -> list: """ vector_engine = get_vector_engine() - summaries_results = await vector_engine.search("summaries", query, limit = 5) + summaries_results = await vector_engine.search("TextSummary_text", query, limit = 5) summaries = [summary.payload for summary in summaries_results] diff --git a/cognee/tasks/summarization/summarize_text.py b/cognee/tasks/summarization/summarize_text.py index b52f0735..a1abaccc 100644 --- a/cognee/tasks/summarization/summarize_text.py +++ b/cognee/tasks/summarization/summarize_text.py @@ -1,14 +1,13 @@ - import asyncio from typing import Type +from uuid import uuid5 from pydantic import BaseModel -from cognee.infrastructure.databases.vector import get_vector_engine, DataPoint from cognee.modules.data.extraction.extract_summary import extract_summary -from cognee.modules.chunking import DocumentChunk +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from cognee.tasks.storage import add_data_points from .models.TextSummary import TextSummary - -async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel], collection_name: str = "summaries"): +async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel]): if len(data_chunks) == 0: return data_chunks @@ -16,23 +15,14 @@ 
async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: *[extract_summary(chunk.text, summarization_model) for chunk in data_chunks] ) - vector_engine = get_vector_engine() + summaries = [ + TextSummary( + id = uuid5(chunk.id, "summary"), + chunk = chunk, + text = chunk_summaries[chunk_index].summary, + ) for (chunk_index, chunk) in enumerate(data_chunks) + ] - await vector_engine.create_collection(collection_name, payload_schema=TextSummary) - - await vector_engine.create_data_points( - collection_name, - [ - DataPoint[TextSummary]( - id = str(chunk.chunk_id), - payload = dict( - chunk_id = str(chunk.chunk_id), - document_id = str(chunk.document_id), - text = chunk_summaries[chunk_index].summary, - ), - embed_field = "text", - ) for (chunk_index, chunk) in enumerate(data_chunks) - ], - ) + add_data_points(summaries) return data_chunks diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 49533939..f20080a5 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -32,7 +32,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entities", "AI"))[0] + random_node = (await vector_engine.search("Entity", "AI"))[0] random_node_name = random_node.payload["name"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index feff647c..31bff65f 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -36,7 +36,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entities", "AI"))[0] + random_node = (await vector_engine.search("Entity", "AI"))[0] random_node_name = random_node.payload["name"] search_results = await cognee.search(SearchType.INSIGHTS, query = 
random_node_name) diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 02d292d6..b58b8751 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -65,7 +65,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entities", "AI"))[0] + random_node = (await vector_engine.search("Entity", "AI"))[0] random_node_name = random_node.payload["name"] search_results = await cognee.search(SearchType.INSIGHTS, query=random_node_name) diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index 2ea011eb..9766938e 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -37,7 +37,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entities", "AI"))[0] + random_node = (await vector_engine.search("Entity", "AI"))[0] random_node_name = random_node.payload["name"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index 7ad29a9a..175c9ecd 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -35,7 +35,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entities", "AI"))[0] + random_node = (await vector_engine.search("Entity", "AI"))[0] random_node_name = random_node.payload["name"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) diff --git a/examples/python/GraphModel.py b/examples/python/GraphModel.py deleted file mode 100644 index 01251fc2..00000000 --- a/examples/python/GraphModel.py +++ /dev/null @@ -1,62 +0,0 @@ - -from typing import Optional -from uuid import UUID 
-from datetime import datetime -from pydantic import BaseModel - - -async def add_data_points(collection_name: str, data_points: list): - pass - - - -class Summary(BaseModel): - id: UUID - text: str - chunk: "Chunk" - created_at: datetime - updated_at: Optional[datetime] - - vector_index = ["text"] - -class Chunk(BaseModel): - id: UUID - text: str - summary: Summary - document: "Document" - created_at: datetime - updated_at: Optional[datetime] - word_count: int - chunk_index: int - cut_type: str - - vector_index = ["text"] - -class Document(BaseModel): - id: UUID - chunks: list[Chunk] - created_at: datetime - updated_at: Optional[datetime] - -class EntityType(BaseModel): - id: UUID - name: str - description: str - created_at: datetime - updated_at: Optional[datetime] - - vector_index = ["name"] - -class Entity(BaseModel): - id: UUID - name: str - type: EntityType - description: str - chunks: list[Chunk] - created_at: datetime - updated_at: Optional[datetime] - - vector_index = ["name"] - -class OntologyModel(BaseModel): - chunks: list[Chunk] diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index c2c24953..396d7b98 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -1,908 +1,889 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d35ac8ce-0f92-46f5-9ba4-a46970f0ce19", - "metadata": {}, - "source": [ - "# Cognee - Get Started" - ] - }, - { - "cell_type": "markdown", - "id": "bd981778-0c84-4542-8e6f-1a7712184873", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Let's talk about the problem first\n", - "\n", - "### Large Language Models (LLMs) have become powerful tools for generating text and answering questions, but they still have several limitations and challenges. Below is an overview of some of the biggest problems with the results they produce:\n", - "\n", - "### 1. 
Hallucinations and Misinformation\n", - "- Hallucinations: LLMs sometimes produce outputs that are factually incorrect or entirely fabricated. This phenomenon is known as \"hallucination.\" Even if an LLM seems confident, the information it provides might not be reliable.\n", - "- Misinformation: Misinformation can be subtle or glaring, ranging from minor inaccuracies to entirely fictitious events, sources, or data.\n", - "\n", - "### 2. Lack of Contextual Understanding\n", - "- LLMs can recognize and replicate patterns in language but don’t have true comprehension. This can lead to responses that are coherent but miss nuanced context or deeper meaning.\n", - "- They can misinterpret multi-turn conversations, leading to confusion in maintaining context over a long dialogue.\n", - "\n", - "### 3. Inconsistent Reliability\n", - "- Depending on the prompt, LLMs might produce inconsistent responses to similar questions or tasks. For example, the same query might result in conflicting answers when asked in slightly different ways.\n", - "- This inconsistency can undermine trust in the model's outputs, especially in professional or academic settings.\n", - "\n", - "### 4. Inability to Access Real-Time Information\n", - "- Most LLMs are trained on data up to a specific point and cannot access or generate information on current events or emerging trends unless updated. This can make them unsuitable for inquiries requiring up-to-date information.\n", - "- Real-time browsing capabilities can help, but they are not universally available.\n", - "\n", - "### 5. Lack of Personalization and Adaptability\n", - "- LLMs do not naturally adapt to individual preferences or learning styles unless explicitly programmed to do so. This limits their usefulness in providing personalized recommendations or support.\n", - "\n", - "### 6. 
Difficulty with Highly Technical or Niche Domains\n", - "- LLMs may struggle with highly specialized or technical topics where domain-specific knowledge is required.\n", - "- They can produce technically plausible but inaccurate or incomplete information, which can be misleading in areas like law, medicine, or scientific research.\n", - "\n", - "### 7. Ambiguity in Response Generation\n", - "- LLMs might not always specify their level of certainty, making it hard to gauge when they are speculating or providing less confident answers.\n", - "- They lack a mechanism to say “I don’t know,” which can lead to responses that are less useful or potentially misleading." - ] - }, - { - "cell_type": "markdown", - "id": "d8e606b1-94d3-43ce-bb4b-dbadff7f4ca6", - "metadata": {}, - "source": [ - "## The next solution was RAGs \n", - "\n", - "#### RAGs (Retrieval Augmented Generation) are systems that connect to a vector store and search for similar data so they can enrich LLM response." - ] - }, - { - "attachments": { - "df72c97a-cb3b-4e3c-bd68-d7bc986353c6.png": { - "image/png": "" - } - }, - "cell_type": "markdown", - "id": "23e74f22-f43c-4f03-afe0-b423cbaa412a", - "metadata": {}, - "source": [ - "![1_Jq9bEbitg1Pv4oASwEQwJg.png](attachment:df72c97a-cb3b-4e3c-bd68-d7bc986353c6.png)\n" - ] - }, - { - "cell_type": "markdown", - "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9", - "metadata": {}, - "source": [ - "#### The problem lies in the nature of the search. If you just find some keywords, and return one or many documents from vectorstore this way, you will have an issue with the the way you would use to organise and prioritise documents. 
\n" - ] - }, - { - "cell_type": "markdown", - "id": "5029110f", - "metadata": {}, - "source": [ - "![rag_problem_v2_white.drawio.png]()" - ] - }, - { - "cell_type": "markdown", - "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9", - "metadata": {}, - "source": [ - "## Semantic similarity search is not magic\n", - "#### The most similar result isn't the most relevant one. \n", - "#### If you search for documents in which the sentiment expressed is \"I like apples.\", one of the closest results you get are documents in which the sentiment expressed is \"I don't like apples.\"\n", - "#### Wouldn't it be nice to have a semantic model LLMs could use?\n" - ] - }, - { - "cell_type": "markdown", - "id": "b900f830-8e9e-4272-b198-594606da4457", - "metadata": {}, - "source": [ - "# That is where Cognee comes in" - ] - }, - { - "cell_type": "markdown", - "id": "d3ae099a-1bbb-4f13-9bcb-c0f778d50e91", - "metadata": {}, - "source": [ - "#### Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy.\n", - "\n", - "#### Cognee lets you create tasks and contextual pipelines of tasks that enable composable GraphRAG, where you have full control of all the elements of the pipeline from ingestion until graph creation. 
" - ] - }, - { - "cell_type": "markdown", - "id": "785383b0-87b5-4a0a-be3f-e809aa284e30", - "metadata": {}, - "source": [ - "# Core Concepts" - ] - }, - { - "cell_type": "markdown", - "id": "3540ce30-2b22-4ece-8516-8d5ff2a405fe", - "metadata": {}, - "source": [ - "## Concept 1: Data Pipelines" - ] - }, - { - "cell_type": "markdown", - "id": "7e47bae4-d27d-4430-a134-e1b381378f5c", - "metadata": {}, - "source": [ - "### Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, jsons to semi-structured data, and logs that we input into the system could be considered unstructured. To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly." - ] - }, - { - "cell_type": "markdown", - "id": "2f9c9376-8c68-4397-9081-d260cddcbd25", - "metadata": {}, - "source": [ - "![image.png]()" - ] - }, - { - "cell_type": "markdown", - "id": "7c87c5cf", - "metadata": {}, - "source": [ - "#### In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database. " - ] - }, - { - "cell_type": "markdown", - "id": "bd435d1d", - "metadata": {}, - "source": [ - "## Concept 2: Data Enrichment with LLMs" - ] - }, - { - "cell_type": "markdown", - "id": "836d35ef", - "metadata": {}, - "source": [ - "#### LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data. 
" - ] - }, - { - "cell_type": "markdown", - "id": "5bc1681c", - "metadata": {}, - "source": [ - "![image.png]()" - ] - }, - { - "cell_type": "markdown", - "id": "c6f428a8", - "metadata": {}, - "source": [ - "#### We decompose the loaded content into graphs, allowing us to more precisely map out the relationships between entities and concepts." - ] - }, - { - "cell_type": "markdown", - "id": "34c2227f", - "metadata": {}, - "source": [ - "## Concept 3: Graphs" - ] - }, - { - "cell_type": "markdown", - "id": "7ec176f5", - "metadata": {}, - "source": [ - "#### Knowledge graphs simply map out knowledge, linking specific facts and their connections. When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature. Clearly defined relationships enhance their accuracy. This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas." - ] - }, - { - "cell_type": "markdown", - "id": "ff454731", - "metadata": {}, - "source": [ - "![Untitled-2024-10-08-1656(2).png]()" - ] - }, - { - "cell_type": "markdown", - "id": "5b3b58d3", - "metadata": {}, - "source": [ - "## Concept 4: Vector and Graph Retrieval" - ] - }, - { - "cell_type": "markdown", - "id": "3555db8b", - "metadata": {}, - "source": [ - "#### Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information." - ] - }, - { - "cell_type": "markdown", - "id": "d2d5e844", - "metadata": {}, - "source": [ - "## Concept 5: Auto-Optimizing Pipelines" - ] - }, - { - "cell_type": "markdown", - "id": "6979a010", - "metadata": {}, - "source": [ - "#### Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to. 
This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time. This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback." - ] - }, - { - "cell_type": "markdown", - "id": "074f0ea8-c659-4736-be26-be4b0e5ac665", - "metadata": {}, - "source": [ - "# Demo time" - ] - }, - { - "cell_type": "markdown", - "id": "0587d91d", - "metadata": {}, - "source": [ - "#### First let's define some data that we will cognify and perform a search on" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "df16431d0f48b006", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:48.519686Z", - "start_time": "2024-09-20T14:02:48.515589Z" - } - }, - "outputs": [], - "source": [ - "job_position = \"\"\"Senior Data Scientist (Machine Learning)\n", - "\n", - "Company: TechNova Solutions\n", - "Location: San Francisco, CA\n", - "\n", - "Job Description:\n", - "\n", - "TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n", - "\n", - "Responsibilities:\n", - "\n", - "Develop and implement advanced machine learning algorithms and models.\n", - "Analyze large, complex datasets to extract meaningful patterns and insights.\n", - "Collaborate with cross-functional teams to integrate predictive models into products.\n", - "Stay updated with the latest advancements in machine learning and data science.\n", - "Mentor junior data scientists and provide technical guidance.\n", - "Qualifications:\n", - "\n", - "Master’s or Ph.D. 
in Data Science, Computer Science, Statistics, or a related field.\n", - "5+ years of experience in data science and machine learning.\n", - "Proficient in Python, R, and SQL.\n", - "Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n", - "Strong problem-solving skills and attention to detail.\n", - "Candidate CVs\n", - "\"\"\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "9086abf3af077ab4", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:49.120838Z", - "start_time": "2024-09-20T14:02:49.118294Z" - } - }, - "outputs": [], - "source": [ - "job_1 = \"\"\"\n", - "CV 1: Relevant\n", - "Name: Dr. Emily Carter\n", - "Contact Information:\n", - "\n", - "Email: emily.carter@example.com\n", - "Phone: (555) 123-4567\n", - "Summary:\n", - "\n", - "Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n", - "\n", - "Education:\n", - "\n", - "Ph.D. in Computer Science, Stanford University (2014)\n", - "B.S. 
in Mathematics, University of California, Berkeley (2010)\n", - "Experience:\n", - "\n", - "Senior Data Scientist, InnovateAI Labs (2016 – Present)\n", - "Led a team in developing machine learning models for natural language processing applications.\n", - "Implemented deep learning algorithms that improved prediction accuracy by 25%.\n", - "Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n", - "Data Scientist, DataWave Analytics (2014 – 2016)\n", - "Developed predictive models for customer segmentation and churn analysis.\n", - "Analyzed large datasets using Hadoop and Spark frameworks.\n", - "Skills:\n", - "\n", - "Programming Languages: Python, R, SQL\n", - "Machine Learning: TensorFlow, Keras, Scikit-Learn\n", - "Big Data Technologies: Hadoop, Spark\n", - "Data Visualization: Tableau, Matplotlib\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a9de0cc07f798b7f", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:49.675003Z", - "start_time": "2024-09-20T14:02:49.671615Z" - } - }, - "outputs": [], - "source": [ - "job_2 = \"\"\"\n", - "CV 2: Relevant\n", - "Name: Michael Rodriguez\n", - "Contact Information:\n", - "\n", - "Email: michael.rodriguez@example.com\n", - "Phone: (555) 234-5678\n", - "Summary:\n", - "\n", - "Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n", - "\n", - "Education:\n", - "\n", - "M.S. in Data Science, Carnegie Mellon University (2013)\n", - "B.S. 
in Computer Science, University of Michigan (2011)\n", - "Experience:\n", - "\n", - "Senior Data Scientist, Alpha Analytics (2017 – Present)\n", - "Developed machine learning models to optimize marketing strategies.\n", - "Reduced customer acquisition cost by 15% through predictive modeling.\n", - "Data Scientist, TechInsights (2013 – 2017)\n", - "Analyzed user behavior data to improve product features.\n", - "Implemented A/B testing frameworks to evaluate product changes.\n", - "Skills:\n", - "\n", - "Programming Languages: Python, Java, SQL\n", - "Machine Learning: Scikit-Learn, XGBoost\n", - "Data Visualization: Seaborn, Plotly\n", - "Databases: MySQL, MongoDB\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "185ff1c102d06111", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:50.286828Z", - "start_time": "2024-09-20T14:02:50.284369Z" - } - }, - "outputs": [], - "source": [ - "job_3 = \"\"\"\n", - "CV 3: Relevant\n", - "Name: Sarah Nguyen\n", - "Contact Information:\n", - "\n", - "Email: sarah.nguyen@example.com\n", - "Phone: (555) 345-6789\n", - "Summary:\n", - "\n", - "Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n", - "\n", - "Education:\n", - "\n", - "M.S. in Statistics, University of Washington (2014)\n", - "B.S. 
in Applied Mathematics, University of Texas at Austin (2012)\n", - "Experience:\n", - "\n", - "Data Scientist, QuantumTech (2016 – Present)\n", - "Designed and implemented machine learning algorithms for financial forecasting.\n", - "Improved model efficiency by 20% through algorithm optimization.\n", - "Junior Data Scientist, DataCore Solutions (2014 – 2016)\n", - "Assisted in developing predictive models for supply chain optimization.\n", - "Conducted data cleaning and preprocessing on large datasets.\n", - "Skills:\n", - "\n", - "Programming Languages: Python, R\n", - "Machine Learning Frameworks: PyTorch, Scikit-Learn\n", - "Statistical Analysis: SAS, SPSS\n", - "Cloud Platforms: AWS, Azure\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d55ce4c58f8efb67", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:50.950343Z", - "start_time": "2024-09-20T14:02:50.946378Z" - } - }, - "outputs": [], - "source": [ - "job_4 = \"\"\"\n", - "CV 4: Not Relevant\n", - "Name: David Thompson\n", - "Contact Information:\n", - "\n", - "Email: david.thompson@example.com\n", - "Phone: (555) 456-7890\n", - "Summary:\n", - "\n", - "Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n", - "\n", - "Education:\n", - "\n", - "B.F.A. 
in Graphic Design, Rhode Island School of Design (2012)\n", - "Experience:\n", - "\n", - "Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n", - "Led design projects for clients in various industries.\n", - "Created branding materials that increased client engagement by 30%.\n", - "Graphic Designer, Visual Innovations (2012 – 2015)\n", - "Designed marketing collateral, including brochures, logos, and websites.\n", - "Collaborated with the marketing team to develop cohesive brand strategies.\n", - "Skills:\n", - "\n", - "Design Software: Adobe Photoshop, Illustrator, InDesign\n", - "Web Design: HTML, CSS\n", - "Specialties: Branding and Identity, Typography\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "ca4ecc32721ad332", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:51.548191Z", - "start_time": "2024-09-20T14:02:51.545520Z" - } - }, - "outputs": [], - "source": [ - "job_5 = \"\"\"\n", - "CV 5: Not Relevant\n", - "Name: Jessica Miller\n", - "Contact Information:\n", - "\n", - "Email: jessica.miller@example.com\n", - "Phone: (555) 567-8901\n", - "Summary:\n", - "\n", - "Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n", - "\n", - "Education:\n", - "\n", - "B.A. in Business Administration, University of Southern California (2010)\n", - "Experience:\n", - "\n", - "Sales Manager, Global Enterprises (2015 – Present)\n", - "Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n", - "Developed sales strategies that expanded customer base by 25%.\n", - "Sales Representative, Market Leaders Inc. 
(2010 – 2015)\n", - "Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n", - "Skills:\n", - "\n", - "Sales Strategy and Planning\n", - "Team Leadership and Development\n", - "CRM Software: Salesforce, Zoho\n", - "Negotiation and Relationship Building\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "4415446a", - "metadata": {}, - "source": [ - "#### Please add the necessary environment information bellow:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bce39dc6", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# # Setting environment variables\n", - "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n", - " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", - "\n", - "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n", - " os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n", - "\n", - "if \"LLM_API_KEY\" not in os.environ:\n", - " os.environ[\"LLM_API_KEY\"] = \"\"\n", - "\n", - "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n", - "# Not needed if using networkx\n", - "#GRAPH_DATABASE_URL=\"\"\n", - "#GRAPH_DATABASE_USERNAME=\"\"\n", - "#GRAPH_DATABASE_PASSWORD=\"\"\n", - "\n", - "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n", - "# Not needed if using \"lancedb\"\n", - "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", - "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", - "\n", - "# Database provider\n", - "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", - "\n", - "# Database name\n", - "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", - "\n", - "# Postgres specific parameters (Only if Postgres is run)\n", - "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", - "# os.environ[\"DB_PORT\"]=\"5432\"\n", - "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", - "# os.environ[\"DB_PASSWORD\"]=\"cognee\"" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "9f1a1dbd", - "metadata": {}, - "outputs": [], - 
"source": [ - "# Reset the cognee system with the following command:\n", - "\n", - "import cognee\n", - "\n", - "await cognee.prune.prune_data()\n", - "await cognee.prune.prune_system(metadata=True)" - ] - }, - { - "cell_type": "markdown", - "id": "383d6971", - "metadata": {}, - "source": [ - "#### After we have defined and gathered our data let's add it to cognee " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "904df61ba484a8e5", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:54.243987Z", - "start_time": "2024-09-20T14:02:52.498195Z" - } - }, - "outputs": [], - "source": [ - "import cognee\n", - "\n", - "await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")" - ] - }, - { - "cell_type": "markdown", - "id": "0f15c5b1", - "metadata": {}, - "source": [ - "#### All good, let's cognify it." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "7c431fdef4921ae0", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:57.925667Z", - "start_time": "2024-09-20T14:02:57.922353Z" - } - }, - "outputs": [], - "source": [ - "from cognee.shared.data_models import KnowledgeGraph\n", - "from cognee.modules.data.models import Dataset, Data\n", - "from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n", - "from cognee.modules.cognify.config import get_cognify_config\n", - "from cognee.modules.pipelines.tasks.Task import Task\n", - "from cognee.modules.pipelines import run_tasks\n", - "from cognee.modules.users.models import User\n", - "from cognee.tasks import chunk_remove_disconnected, \\\n", - " infer_data_ontology, \\\n", - " save_chunks_to_store, \\\n", - " chunk_update_check, \\\n", - " chunks_into_graph, \\\n", - " source_documents_to_chunks, \\\n", - " check_permissions_on_documents, \\\n", - " classify_documents, \\\n", - " chunk_naive_llm_classifier\n", - "from cognee.tasks.summarization import summarize_text\n", - "\n", - "async def 
run_cognify_pipeline(dataset: Dataset, user: User = None):\n", - " data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n", - "\n", - " try:\n", - "\n", - " root_node_id = None\n", - "\n", - " cognee_config = get_cognify_config()\n", - "\n", - " tasks = [\n", - " Task(classify_documents),\n", - " Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n", - " Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n", - " Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n", - " Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n", - " Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n", - " Task(\n", - " save_chunks_to_store,\n", - " collection_name = \"chunks\",\n", - " ), \n", - " Task(\n", - " summarize_text,\n", - " summarization_model = cognee_config.summarization_model,\n", - " collection_name = \"summaries\",\n", - " ),\n", - " Task(\n", - " chunk_naive_llm_classifier,\n", - " classification_model = cognee_config.classification_model,\n", - " ),\n", - " Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n", - " ]\n", - "\n", - " pipeline = run_tasks(tasks, data_documents)\n", - "\n", - " async for result in pipeline:\n", - " print(result)\n", - " except Exception as error:\n", - " raise error" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0a91b99c6215e09", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-20T14:02:58.905774Z", - "start_time": "2024-09-20T14:02:58.625915Z" - } - }, - "outputs": [], - "source": [ - "from cognee.modules.users.methods import 
get_default_user\n", - "from cognee.modules.data.methods import get_datasets_by_name\n", - "\n", - "user = await get_default_user()\n", - "\n", - "datasets = await get_datasets_by_name([\"example\"], user.id)\n", - "\n", - "await run_cognify_pipeline(datasets[0], user)" - ] - }, - { - "cell_type": "markdown", - "id": "219a6d41", - "metadata": {}, - "source": [ - "#### We get the url to the graph on graphistry in the notebook cell bellow, showing nodes and connections made by the cognify process." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "080389e5", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from cognee.shared.utils import render_graph\n", - "from cognee.infrastructure.databases.graph import get_graph_engine\n", - "import graphistry\n", - "\n", - "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n", - "\n", - "graph_engine = await get_graph_engine()\n", - "\n", - "graph_url = await render_graph(graph_engine.graph)\n", - "print(graph_url)" - ] - }, - { - "cell_type": "markdown", - "id": "59e6c3c3", - "metadata": {}, - "source": [ - "#### We can also do a search on the data to explore the knowledge." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e7dfc8", - "metadata": {}, - "outputs": [], - "source": [ - "async def search(\n", - " vector_engine,\n", - " collection_name: str,\n", - " query_text: str = None,\n", - "):\n", - " query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n", - "\n", - " connection = await vector_engine.get_connection()\n", - " collection = await connection.open_table(collection_name)\n", - "\n", - " results = await collection.vector_search(query_vector).limit(10).to_pandas()\n", - "\n", - " result_values = list(results.to_dict(\"index\").values())\n", - "\n", - " return [dict(\n", - " id = str(result[\"id\"]),\n", - " payload = result[\"payload\"],\n", - " score = result[\"_distance\"],\n", - " ) for result in result_values]\n", - "\n", - "\n", - "from cognee.infrastructure.databases.vector import get_vector_engine\n", - "\n", - "vector_engine = get_vector_engine()\n", - "results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n", - "for result in results:\n", - " print(result)" - ] - }, - { - "cell_type": "markdown", - "id": "81fa2b00", - "metadata": {}, - "source": [ - "#### We normalize search output scores so the lower the score of the search result is the higher the chance that it's what you're looking for. 
In the example above we have searched for node entities in the knowledge graph related to \"sarah.nguyen@example.com\"" - ] - }, - { - "cell_type": "markdown", - "id": "1b94ff96", - "metadata": {}, - "source": [ - "#### In the example bellow we'll use cognee search to summarize information regarding the node most related to \"sarah.nguyen@example.com\" in the knowledge graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21a3e9a6", - "metadata": {}, - "outputs": [], - "source": [ - "from cognee.api.v1.search import SearchType\n", - "\n", - "node = (await vector_engine.search(\"entities\", \"sarah.nguyen@example.com\"))[0]\n", - "node_name = node.payload[\"name\"]\n", - "\n", - "search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n", - "print(\"\\n\\Extracted summaries are:\\n\")\n", - "for result in search_results:\n", - " print(f\"{result}\\n\")" - ] - }, - { - "cell_type": "markdown", - "id": "fd6e5fe2", - "metadata": {}, - "source": [ - "#### In this example we'll use cognee search to find chunks in which the node most related to \"sarah.nguyen@example.com\" is a part of" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7a8abff", - "metadata": {}, - "outputs": [], - "source": [ - "search_results = await cognee.search(SearchType.CHUNKS, query = node_name)\n", - "print(\"\\n\\nExtracted chunks are:\\n\")\n", - "for result in search_results:\n", - " print(f\"{result}\\n\")" - ] - }, - { - "cell_type": "markdown", - "id": "47f0112f", - "metadata": {}, - "source": [ - "#### In this example we'll use cognee search to give us insights from the knowledge graph related to the node most related to \"sarah.nguyen@example.com\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "706a3954", - "metadata": {}, - "outputs": [], - "source": [ - "search_results = await cognee.search(SearchType.INSIGHTS, query = node_name)\n", - "print(\"\\n\\nExtracted sentences are:\\n\")\n", - "for 
result in search_results:\n", - " print(f\"{result}\\n\")" - ] - }, - { - "cell_type": "markdown", - "id": "2ab3d84a", - "metadata": {}, - "source": [ - "#### Bellow is a diagram of the cognee process for the data used in this example notebook" - ] - }, - { - "cell_type": "markdown", - "id": "31412c52", - "metadata": {}, - "source": [ - "![cognee_final.drawio.png]()" - ] - }, - { - "cell_type": "markdown", - "id": "288ab570", - "metadata": {}, - "source": [ - "# Give us a star if you like it!\n", - "https://github.com/topoteretes/cognee" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } + "cells": [ + { + "cell_type": "markdown", + "id": "d35ac8ce-0f92-46f5-9ba4-a46970f0ce19", + "metadata": {}, + "source": [ + "# Cognee - Get Started" + ] + }, + { + "cell_type": "markdown", + "id": "bd981778-0c84-4542-8e6f-1a7712184873", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" }, - "nbformat": 4, - "nbformat_minor": 5 + "tags": [] + }, + "source": [ + "## Let's talk about the problem first\n", + "\n", + "### Large Language Models (LLMs) have become powerful tools for generating text and answering questions, but they still have several limitations and challenges. Below is an overview of some of the biggest problems with the results they produce:\n", + "\n", + "### 1. Hallucinations and Misinformation\n", + "- Hallucinations: LLMs sometimes produce outputs that are factually incorrect or entirely fabricated. 
This phenomenon is known as \"hallucination.\" Even if an LLM seems confident, the information it provides might not be reliable.\n", + "- Misinformation: Misinformation can be subtle or glaring, ranging from minor inaccuracies to entirely fictitious events, sources, or data.\n", + "\n", + "### 2. Lack of Contextual Understanding\n", + "- LLMs can recognize and replicate patterns in language but don’t have true comprehension. This can lead to responses that are coherent but miss nuanced context or deeper meaning.\n", + "- They can misinterpret multi-turn conversations, leading to confusion in maintaining context over a long dialogue.\n", + "\n", + "### 3. Inconsistent Reliability\n", + "- Depending on the prompt, LLMs might produce inconsistent responses to similar questions or tasks. For example, the same query might result in conflicting answers when asked in slightly different ways.\n", + "- This inconsistency can undermine trust in the model's outputs, especially in professional or academic settings.\n", + "\n", + "### 4. Inability to Access Real-Time Information\n", + "- Most LLMs are trained on data up to a specific point and cannot access or generate information on current events or emerging trends unless updated. This can make them unsuitable for inquiries requiring up-to-date information.\n", + "- Real-time browsing capabilities can help, but they are not universally available.\n", + "\n", + "### 5. Lack of Personalization and Adaptability\n", + "- LLMs do not naturally adapt to individual preferences or learning styles unless explicitly programmed to do so. This limits their usefulness in providing personalized recommendations or support.\n", + "\n", + "### 6. 
Difficulty with Highly Technical or Niche Domains\n", + "- LLMs may struggle with highly specialized or technical topics where domain-specific knowledge is required.\n", + "- They can produce technically plausible but inaccurate or incomplete information, which can be misleading in areas like law, medicine, or scientific research.\n", + "\n", + "### 7. Ambiguity in Response Generation\n", + "- LLMs might not always specify their level of certainty, making it hard to gauge when they are speculating or providing less confident answers.\n", + "- They lack a mechanism to say “I don’t know,” which can lead to responses that are less useful or potentially misleading." + ] + }, + { + "cell_type": "markdown", + "id": "d8e606b1-94d3-43ce-bb4b-dbadff7f4ca6", + "metadata": {}, + "source": [ + "## The next solution was RAGs \n", + "\n", + "#### RAGs (Retrieval Augmented Generation) are systems that connect to a vector store and search for similar data so they can enrich LLM response." + ] + }, + { + "attachments": { + "df72c97a-cb3b-4e3c-bd68-d7bc986353c6.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "23e74f22-f43c-4f03-afe0-b423cbaa412a", + "metadata": {}, + "source": [ + "![1_Jq9bEbitg1Pv4oASwEQwJg.png](attachment:df72c97a-cb3b-4e3c-bd68-d7bc986353c6.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9", + "metadata": {}, + "source": [ + "#### The problem lies in the nature of the search. If you just find some keywords, and return one or many documents from vectorstore this way, you will have an issue with the way you would use to organise and prioritise documents. 
\n" + ] + }, + { + "cell_type": "markdown", + "id": "5029110f", + "metadata": {}, + "source": [ + "![rag_problem_v2_white.drawio.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9", + "metadata": {}, + "source": [ + "## Semantic similarity search is not magic\n", + "#### The most similar result isn't the most relevant one. \n", + "#### If you search for documents in which the sentiment expressed is \"I like apples.\", some of the closest results you get are documents in which the sentiment expressed is \"I don't like apples.\"\n", + "#### Wouldn't it be nice to have a semantic model LLMs could use?\n" + ] + }, + { + "cell_type": "markdown", + "id": "b900f830-8e9e-4272-b198-594606da4457", + "metadata": {}, + "source": [ + "# That is where Cognee comes in" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae099a-1bbb-4f13-9bcb-c0f778d50e91", + "metadata": {}, + "source": [ + "#### Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy.\n", + "\n", + "#### Cognee lets you create tasks and contextual pipelines of tasks that enable composable GraphRAG, where you have full control of all the elements of the pipeline from ingestion until graph creation. 
" + ] + }, + { + "cell_type": "markdown", + "id": "785383b0-87b5-4a0a-be3f-e809aa284e30", + "metadata": {}, + "source": [ + "# Core Concepts" + ] + }, + { + "cell_type": "markdown", + "id": "3540ce30-2b22-4ece-8516-8d5ff2a405fe", + "metadata": {}, + "source": [ + "## Concept 1: Data Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "7e47bae4-d27d-4430-a134-e1b381378f5c", + "metadata": {}, + "source": [ + "### Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, jsons to semi-structured data, and logs that we input into the system could be considered unstructured. To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly." + ] + }, + { + "cell_type": "markdown", + "id": "2f9c9376-8c68-4397-9081-d260cddcbd25", + "metadata": {}, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "7c87c5cf", + "metadata": {}, + "source": [ + "#### In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database. " + ] + }, + { + "cell_type": "markdown", + "id": "bd435d1d", + "metadata": {}, + "source": [ + "## Concept 2: Data Enrichment with LLMs" + ] + }, + { + "cell_type": "markdown", + "id": "836d35ef", + "metadata": {}, + "source": [ + "#### LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data. 
" + ] + }, + { + "cell_type": "markdown", + "id": "5bc1681c", + "metadata": {}, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "c6f428a8", + "metadata": {}, + "source": [ + "#### We decompose the loaded content into graphs, allowing us to more precisely map out the relationships between entities and concepts." + ] + }, + { + "cell_type": "markdown", + "id": "34c2227f", + "metadata": {}, + "source": [ + "## Concept 3: Graphs" + ] + }, + { + "cell_type": "markdown", + "id": "7ec176f5", + "metadata": {}, + "source": [ + "#### Knowledge graphs simply map out knowledge, linking specific facts and their connections. When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature. Clearly defined relationships enhance their accuracy. This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas." + ] + }, + { + "cell_type": "markdown", + "id": "ff454731", + "metadata": {}, + "source": [ + "![Untitled-2024-10-08-1656(2).png]()" + ] + }, + { + "cell_type": "markdown", + "id": "5b3b58d3", + "metadata": {}, + "source": [ + "## Concept 4: Vector and Graph Retrieval" + ] + }, + { + "cell_type": "markdown", + "id": "3555db8b", + "metadata": {}, + "source": [ + "#### Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information." + ] + }, + { + "cell_type": "markdown", + "id": "d2d5e844", + "metadata": {}, + "source": [ + "## Concept 5: Auto-Optimizing Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "6979a010", + "metadata": {}, + "source": [ + "#### Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to. 
This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time. This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback." + ] + }, + { + "cell_type": "markdown", + "id": "074f0ea8-c659-4736-be26-be4b0e5ac665", + "metadata": {}, + "source": [ + "# Demo time" + ] + }, + { + "cell_type": "markdown", + "id": "0587d91d", + "metadata": {}, + "source": [ + "#### First let's define some data that we will cognify and perform a search on" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "df16431d0f48b006", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:48.519686Z", + "start_time": "2024-09-20T14:02:48.515589Z" + } + }, + "outputs": [], + "source": [ + "job_position = \"\"\"Senior Data Scientist (Machine Learning)\n", + "\n", + "Company: TechNova Solutions\n", + "Location: San Francisco, CA\n", + "\n", + "Job Description:\n", + "\n", + "TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n", + "\n", + "Responsibilities:\n", + "\n", + "Develop and implement advanced machine learning algorithms and models.\n", + "Analyze large, complex datasets to extract meaningful patterns and insights.\n", + "Collaborate with cross-functional teams to integrate predictive models into products.\n", + "Stay updated with the latest advancements in machine learning and data science.\n", + "Mentor junior data scientists and provide technical guidance.\n", + "Qualifications:\n", + "\n", + "Master’s or Ph.D. 
in Data Science, Computer Science, Statistics, or a related field.\n", + "5+ years of experience in data science and machine learning.\n", + "Proficient in Python, R, and SQL.\n", + "Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n", + "Strong problem-solving skills and attention to detail.\n", + "Candidate CVs\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9086abf3af077ab4", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:49.120838Z", + "start_time": "2024-09-20T14:02:49.118294Z" + } + }, + "outputs": [], + "source": [ + "job_1 = \"\"\"\n", + "CV 1: Relevant\n", + "Name: Dr. Emily Carter\n", + "Contact Information:\n", + "\n", + "Email: emily.carter@example.com\n", + "Phone: (555) 123-4567\n", + "Summary:\n", + "\n", + "Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n", + "\n", + "Education:\n", + "\n", + "Ph.D. in Computer Science, Stanford University (2014)\n", + "B.S. 
in Mathematics, University of California, Berkeley (2010)\n", + "Experience:\n", + "\n", + "Senior Data Scientist, InnovateAI Labs (2016 – Present)\n", + "Led a team in developing machine learning models for natural language processing applications.\n", + "Implemented deep learning algorithms that improved prediction accuracy by 25%.\n", + "Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n", + "Data Scientist, DataWave Analytics (2014 – 2016)\n", + "Developed predictive models for customer segmentation and churn analysis.\n", + "Analyzed large datasets using Hadoop and Spark frameworks.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, R, SQL\n", + "Machine Learning: TensorFlow, Keras, Scikit-Learn\n", + "Big Data Technologies: Hadoop, Spark\n", + "Data Visualization: Tableau, Matplotlib\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9de0cc07f798b7f", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:49.675003Z", + "start_time": "2024-09-20T14:02:49.671615Z" + } + }, + "outputs": [], + "source": [ + "job_2 = \"\"\"\n", + "CV 2: Relevant\n", + "Name: Michael Rodriguez\n", + "Contact Information:\n", + "\n", + "Email: michael.rodriguez@example.com\n", + "Phone: (555) 234-5678\n", + "Summary:\n", + "\n", + "Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n", + "\n", + "Education:\n", + "\n", + "M.S. in Data Science, Carnegie Mellon University (2013)\n", + "B.S. 
in Computer Science, University of Michigan (2011)\n", + "Experience:\n", + "\n", + "Senior Data Scientist, Alpha Analytics (2017 – Present)\n", + "Developed machine learning models to optimize marketing strategies.\n", + "Reduced customer acquisition cost by 15% through predictive modeling.\n", + "Data Scientist, TechInsights (2013 – 2017)\n", + "Analyzed user behavior data to improve product features.\n", + "Implemented A/B testing frameworks to evaluate product changes.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, Java, SQL\n", + "Machine Learning: Scikit-Learn, XGBoost\n", + "Data Visualization: Seaborn, Plotly\n", + "Databases: MySQL, MongoDB\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "185ff1c102d06111", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:50.286828Z", + "start_time": "2024-09-20T14:02:50.284369Z" + } + }, + "outputs": [], + "source": [ + "job_3 = \"\"\"\n", + "CV 3: Relevant\n", + "Name: Sarah Nguyen\n", + "Contact Information:\n", + "\n", + "Email: sarah.nguyen@example.com\n", + "Phone: (555) 345-6789\n", + "Summary:\n", + "\n", + "Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n", + "\n", + "Education:\n", + "\n", + "M.S. in Statistics, University of Washington (2014)\n", + "B.S. 
in Applied Mathematics, University of Texas at Austin (2012)\n", + "Experience:\n", + "\n", + "Data Scientist, QuantumTech (2016 – Present)\n", + "Designed and implemented machine learning algorithms for financial forecasting.\n", + "Improved model efficiency by 20% through algorithm optimization.\n", + "Junior Data Scientist, DataCore Solutions (2014 – 2016)\n", + "Assisted in developing predictive models for supply chain optimization.\n", + "Conducted data cleaning and preprocessing on large datasets.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, R\n", + "Machine Learning Frameworks: PyTorch, Scikit-Learn\n", + "Statistical Analysis: SAS, SPSS\n", + "Cloud Platforms: AWS, Azure\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d55ce4c58f8efb67", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:50.950343Z", + "start_time": "2024-09-20T14:02:50.946378Z" + } + }, + "outputs": [], + "source": [ + "job_4 = \"\"\"\n", + "CV 4: Not Relevant\n", + "Name: David Thompson\n", + "Contact Information:\n", + "\n", + "Email: david.thompson@example.com\n", + "Phone: (555) 456-7890\n", + "Summary:\n", + "\n", + "Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n", + "\n", + "Education:\n", + "\n", + "B.F.A. 
in Graphic Design, Rhode Island School of Design (2012)\n", + "Experience:\n", + "\n", + "Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n", + "Led design projects for clients in various industries.\n", + "Created branding materials that increased client engagement by 30%.\n", + "Graphic Designer, Visual Innovations (2012 – 2015)\n", + "Designed marketing collateral, including brochures, logos, and websites.\n", + "Collaborated with the marketing team to develop cohesive brand strategies.\n", + "Skills:\n", + "\n", + "Design Software: Adobe Photoshop, Illustrator, InDesign\n", + "Web Design: HTML, CSS\n", + "Specialties: Branding and Identity, Typography\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ca4ecc32721ad332", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:51.548191Z", + "start_time": "2024-09-20T14:02:51.545520Z" + } + }, + "outputs": [], + "source": [ + "job_5 = \"\"\"\n", + "CV 5: Not Relevant\n", + "Name: Jessica Miller\n", + "Contact Information:\n", + "\n", + "Email: jessica.miller@example.com\n", + "Phone: (555) 567-8901\n", + "Summary:\n", + "\n", + "Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n", + "\n", + "Education:\n", + "\n", + "B.A. in Business Administration, University of Southern California (2010)\n", + "Experience:\n", + "\n", + "Sales Manager, Global Enterprises (2015 – Present)\n", + "Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n", + "Developed sales strategies that expanded customer base by 25%.\n", + "Sales Representative, Market Leaders Inc. 
(2010 – 2015)\n", + "Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n", + "Skills:\n", + "\n", + "Sales Strategy and Planning\n", + "Team Leadership and Development\n", + "CRM Software: Salesforce, Zoho\n", + "Negotiation and Relationship Building\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "4415446a", + "metadata": {}, + "source": [ + "#### Please add the necessary environment information below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bce39dc6", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# # Setting environment variables\n", + "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n", + " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", + "\n", + "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n", + " os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n", + "\n", + "if \"LLM_API_KEY\" not in os.environ:\n", + " os.environ[\"LLM_API_KEY\"] = \"\"\n", + "\n", + "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n", + "# Not needed if using networkx\n", + "#GRAPH_DATABASE_URL=\"\"\n", + "#GRAPH_DATABASE_USERNAME=\"\"\n", + "#GRAPH_DATABASE_PASSWORD=\"\"\n", + "\n", + "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "# Not needed if using \"lancedb\"\n", + "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", + "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", + "\n", + "# Database provider\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", + "\n", + "# Database name\n", + "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", + "\n", + "# Postgres specific parameters (Only if Postgres is run)\n", + "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", + "# os.environ[\"DB_PORT\"]=\"5432\"\n", + "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", + "# os.environ[\"DB_PASSWORD\"]=\"cognee\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9f1a1dbd", + "metadata": {}, + "outputs": [], +
"source": [ + "# Reset the cognee system with the following command:\n", + "\n", + "import cognee\n", + "\n", + "await cognee.prune.prune_data()\n", + "await cognee.prune.prune_system(metadata=True)" + ] + }, + { + "cell_type": "markdown", + "id": "383d6971", + "metadata": {}, + "source": [ + "#### After we have defined and gathered our data let's add it to cognee " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904df61ba484a8e5", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:54.243987Z", + "start_time": "2024-09-20T14:02:52.498195Z" + } + }, + "outputs": [], + "source": [ + "import cognee\n", + "\n", + "await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")" + ] + }, + { + "cell_type": "markdown", + "id": "0f15c5b1", + "metadata": {}, + "source": [ + "#### All good, let's cognify it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c431fdef4921ae0", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:57.925667Z", + "start_time": "2024-09-20T14:02:57.922353Z" + } + }, + "outputs": [], + "source": [ + "from cognee.shared.data_models import KnowledgeGraph\n", + "from cognee.modules.data.models import Dataset, Data\n", + "from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n", + "from cognee.modules.cognify.config import get_cognify_config\n", + "from cognee.modules.pipelines.tasks.Task import Task\n", + "from cognee.modules.pipelines import run_tasks\n", + "from cognee.modules.users.models import User\n", + "from cognee.tasks.documents import check_permissions_on_documents, classify_documents, extract_chunks_from_documents\n", + "from cognee.tasks.graph import extract_graph_from_data\n", + "from cognee.tasks.storage import add_data_points\n", + "from cognee.tasks.summarization import summarize_text\n", + "\n", + "async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n", + " data_documents: list[Data] = await 
get_dataset_data(dataset_id = dataset.id)\n", + "\n", + " try:\n", + " cognee_config = get_cognify_config()\n", + "\n", + " tasks = [\n", + " Task(classify_documents),\n", + " Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n", + " Task(extract_chunks_from_documents), # Extract text chunks based on the document type.\n", + " Task(add_data_points, task_config = { \"batch_size\": 10 }),\n", + " Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks.\n", + " Task(\n", + " summarize_text,\n", + " summarization_model = cognee_config.summarization_model,\n", + " task_config = { \"batch_size\": 10 }\n", + " ),\n", + " ]\n", + "\n", + " pipeline = run_tasks(tasks, data_documents)\n", + "\n", + " async for result in pipeline:\n", + " print(result)\n", + " except Exception as error:\n", + " raise error\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0a91b99c6215e09", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:58.905774Z", + "start_time": "2024-09-20T14:02:58.625915Z" + } + }, + "outputs": [], + "source": [ + "from cognee.modules.users.methods import get_default_user\n", + "from cognee.modules.data.methods import get_datasets_by_name\n", + "\n", + "user = await get_default_user()\n", + "\n", + "datasets = await get_datasets_by_name([\"example\"], user.id)\n", + "\n", + "await run_cognify_pipeline(datasets[0], user)" + ] + }, + { + "cell_type": "markdown", + "id": "219a6d41", + "metadata": {}, + "source": [ + "#### We get the URL to the graph on Graphistry in the notebook cell below, showing nodes and connections made by the cognify process."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "080389e5", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from cognee.shared.utils import render_graph\n", + "from cognee.infrastructure.databases.graph import get_graph_engine\n", + "import graphistry\n", + "\n", + "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n", + "\n", + "graph_engine = await get_graph_engine()\n", + "\n", + "graph_url = await render_graph(graph_engine.graph)\n", + "print(graph_url)" + ] + }, + { + "cell_type": "markdown", + "id": "59e6c3c3", + "metadata": {}, + "source": [ + "#### We can also do a search on the data to explore the knowledge." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e7dfc8", + "metadata": {}, + "outputs": [], + "source": [ + "async def search(\n", + " vector_engine,\n", + " collection_name: str,\n", + " query_text: str = None,\n", + "):\n", + " query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n", + "\n", + " connection = await vector_engine.get_connection()\n", + " collection = await connection.open_table(collection_name)\n", + "\n", + " results = await collection.vector_search(query_vector).limit(10).to_pandas()\n", + "\n", + " result_values = list(results.to_dict(\"index\").values())\n", + "\n", + " return [dict(\n", + " id = str(result[\"id\"]),\n", + " payload = result[\"payload\"],\n", + " score = result[\"_distance\"],\n", + " ) for result in result_values]\n", + "\n", + "\n", + "from cognee.infrastructure.databases.vector import get_vector_engine\n", + "\n", + "vector_engine = get_vector_engine()\n", + "results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n", + "for result in results:\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "81fa2b00", + "metadata": {}, + "source": [ + "#### We normalize search output scores so the lower the score of the 
search result, the higher the chance that it's what you're looking for. In the example above we have searched for node entities in the knowledge graph related to \"sarah.nguyen@example.com\"" + ] + }, + { + "cell_type": "markdown", + "id": "1b94ff96", + "metadata": {}, + "source": [ + "#### In the example below we'll use cognee search to summarize information regarding the node most related to \"sarah.nguyen@example.com\" in the knowledge graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21a3e9a6", + "metadata": {}, + "outputs": [], + "source": [ + "from cognee.api.v1.search import SearchType\n", + "\n", + "node = (await vector_engine.search(\"entities\", \"sarah.nguyen@example.com\"))[0]\n", + "node_name = node.payload[\"name\"]\n", + "\n", + "search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n", + "print(\"\\n\\Extracted summaries are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "fd6e5fe2", + "metadata": {}, + "source": [ + "#### In this example we'll use cognee search to find the chunks that the node most related to \"sarah.nguyen@example.com\" is a part of" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7a8abff", + "metadata": {}, + "outputs": [], + "source": [ + "search_results = await cognee.search(SearchType.CHUNKS, query = node_name)\n", + "print(\"\\n\\nExtracted chunks are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "47f0112f", + "metadata": {}, + "source": [ + "#### In this example we'll use cognee search to give us insights from the knowledge graph related to the node most related to \"sarah.nguyen@example.com\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "706a3954", + "metadata": {}, + "outputs": [], + "source": [ + "search_results = await cognee.search(SearchType.INSIGHTS, query =
node_name)\n", + "print(\"\\n\\nExtracted sentences are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ab3d84a", + "metadata": {}, + "source": [ + "#### Bellow is a diagram of the cognee process for the data used in this example notebook" + ] + }, + { + "cell_type": "markdown", + "id": "31412c52", + "metadata": {}, + "source": [ + "![cognee_final.drawio.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "288ab570", + "metadata": {}, + "source": [ + "# Give us a star if you like it!\n", + "https://github.com/topoteretes/cognee" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/poetry.lock b/poetry.lock index dbbd4687..46ba5042 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3059,17 +3059,17 @@ files = [ [[package]] name = "lancedb" -version = "0.8.0" +version = "0.15.0" description = "lancedb" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "lancedb-0.8.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:60b86d7e976ba3900d84687252f6234b7ed5d32e13f012ecd2d85a7994d7bcdb"}, - {file = "lancedb-0.8.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b268ee0b70c845999f0c42e2906857e5da9c39b50c978d922a36b8aed9c4a163"}, - {file = "lancedb-0.8.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab3e01ee064187d77556d75d6bd90940bcc4d65c854adc858be52fba204ded47"}, - {file = "lancedb-0.8.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:627834390660ad3e0a4350dcb6eca169139d46bb9a678b509c31445cd011e733"}, - {file = 
"lancedb-0.8.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:126e9891936be83690ddd8e3d8bf5f947b08dbe47a31ec41dfc8999335ada135"}, - {file = "lancedb-0.8.0-cp38-abi3-win_amd64.whl", hash = "sha256:ae32fadae2310a5bd95123cf7df07a614c9de06530c5c12f342d31ac9964fa10"}, + {file = "lancedb-0.15.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:3eacc9c6766594874a7d54e822550c7991d64b14571251f1e4b43985cc4f3cdb"}, + {file = "lancedb-0.15.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:48c28571f79805e11a3bca09486fd1c8d6c3f7762f7692cca1c5e0cdea6bfa20"}, + {file = "lancedb-0.15.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e349a1671943b75a536d2589b5a12f685c129149b0cad266e12555f9501f8ccd"}, + {file = "lancedb-0.15.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:c567866b08222457e1aca51df9abeb871aad8fed0db58c004365629c05f8ecbb"}, + {file = "lancedb-0.15.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:223cd77fa84a1317301ad4771de58ac5685d58cee03f0a20ba4bc95517b5c79f"}, + {file = "lancedb-0.15.0-cp38-abi3-win_amd64.whl", hash = "sha256:66d251f22709c72f819aace9e665127f1040845d88b25c1f088c4beb36087f7e"}, ] [package.dependencies] @@ -3077,12 +3077,10 @@ attrs = ">=21.3.0" cachetools = "*" deprecation = "*" overrides = ">=0.7" +packaging = "*" pydantic = ">=1.10" -pylance = "0.11.0" -ratelimiter = ">=1.0,<2.0" +pylance = "0.19.1" requests = ">=2.31.0" -retry = ">=0.9.2" -semver = "*" tqdm = ">=4.27.0" [package.extras] @@ -3090,8 +3088,8 @@ azure = ["adlfs (>=2024.2.0)"] clip = ["open-clip", "pillow", "torch"] dev = ["pre-commit", "ruff"] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] -tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars 
(>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "ibm-watsonx-ai (>=1.1.2)", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19,<=1.3.0)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] [[package]] name = "langfuse" @@ -4955,17 +4953,6 @@ bcrypt = {version = "4.1.2", optional = true, markers = "extra == \"bcrypt\""} argon2 = ["argon2-cffi (==23.1.0)"] bcrypt = ["bcrypt (==4.1.2)"] -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -files = [ - {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, - {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, -] - [[package]] name = "pyarrow" version = "15.0.0" @@ -5201,28 +5188,30 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pylance" -version = "0.11.0" +version = "0.19.1" description = "python wrapper for Lance columnar format" optional = false python-versions = ">=3.9" files = [ - {file = "pylance-0.11.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:3405e217fcb3a75662957605621f7eebb34f35b10c49a00ea1ddef478e0567db"}, - {file = "pylance-0.11.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c50bd63eaca846fb812109b6ea62a81873797901b4aff808fc8a96aa1994bf52"}, - {file = "pylance-0.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf1d3badbfb111d1f193363422940bcfcae45755b60f14fe2153614e65e63d13"}, - {file = 
"pylance-0.11.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:c40252ff325e401116dec3c2010a8011ab5c15915237bee11b97697197b1d0b8"}, - {file = "pylance-0.11.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:ab0893b77e6ae5d9bb2bf21c0f6e199c81071034d5dbfb42787abdff8bfe8ef7"}, - {file = "pylance-0.11.0-cp39-abi3-win_amd64.whl", hash = "sha256:fe5ede7168f5afc67232818eddc57e086cb7579151e9a34b52c3d9fabc7575aa"}, + {file = "pylance-0.19.1-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:a254d09690a5e09cadc5fecc7b43b2bfc20b477e0f0ba31497e1d6abb36b524a"}, + {file = "pylance-0.19.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:9859c372b2d7fe443b6218f62e9d77caf94961cac73b274c85b724f20dd6b690"}, + {file = "pylance-0.19.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8315152f57329e7668ff5c82c252591ea0e3d2aed702dd19a42d645945e7a07e"}, + {file = "pylance-0.19.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:7c2e0e00b40214edae576075dbfa432cadaf5ba21354b0c46f307daf4e77403f"}, + {file = "pylance-0.19.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:e26ce273840912c45dd2b8f6f8fb9082c1c788d696e11b78ddad3949e3892d50"}, + {file = "pylance-0.19.1-cp39-abi3-win_amd64.whl", hash = "sha256:b341e547c995b5d6b32eb63e1e015d31b608de49a9ad03f8981453f4c667e8e1"}, ] [package.dependencies] -numpy = ">=1.22" -pyarrow = ">=12,<15.0.1" +numpy = ">=1.22,<2" +pyarrow = ">=12" [package.extras] benchmarks = ["pytest-benchmark"] +cuvs-cu11 = ["cuvs-cu11", "pylibraft-cu11"] +cuvs-cu12 = ["cuvs-cu12", "pylibraft-cu12"] dev = ["ruff (==0.4.1)"] ray = ["ray[data]"] -tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] +tests = ["boto3", "datasets", "duckdb", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] torch = ["torch"] [[package]] @@ -5710,20 +5699,6 @@ urllib3 = ">=1.26.14,<3" fastembed = ["fastembed (==0.3.6)"] 
fastembed-gpu = ["fastembed-gpu (==0.3.6)"] -[[package]] -name = "ratelimiter" -version = "1.2.0.post0" -description = "Simple python rate limiting object" -optional = false -python-versions = "*" -files = [ - {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"}, - {file = "ratelimiter-1.2.0.post0.tar.gz", hash = "sha256:5c395dcabdbbde2e5178ef3f89b568a3066454a6ddc223b76473dac22f89b4f7"}, -] - -[package.extras] -test = ["pytest (>=3.0)", "pytest-asyncio"] - [[package]] name = "redis" version = "5.2.0" @@ -5896,21 +5871,6 @@ files = [ packaging = ">=23.2" types-setuptools = ">=69.1.0" -[[package]] -name = "retry" -version = "0.9.2" -description = "Easy to use retry decorator." -optional = false -python-versions = "*" -files = [ - {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, - {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"}, -] - -[package.dependencies] -decorator = ">=3.4.2" -py = ">=1.4.26,<2.0.0" - [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -7711,4 +7671,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "c6bb6ae960663c9dacec79ce67ccb4867014f9e1d23b7fe40191ecf09e8beefc" +content-hash = "bb70798562fee44c6daa2f5c7fa4d17165fb76016618c1fc8fd0782c5aa4a6de" diff --git a/pyproject.toml b/pyproject.toml index 79bb449f..9da60eaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ matplotlib = "^3.8.3" structlog = "^24.1.0" tiktoken = "0.7.0" posthog = "^3.5.0" -lancedb = "0.8.0" +lancedb = "0.15.0" litellm = "1.38.10" groq = "0.8.0" tantivy = "^0.22.0" diff --git a/tools/daily_twitter_stats.py b/tools/daily_twitter_stats.py index 43bedda7..d66f052d 100644 --- a/tools/daily_twitter_stats.py +++ b/tools/daily_twitter_stats.py @@ -1,7 +1,7 @@ import tweepy 
import requests import json -from datetime import datetime +from datetime import datetime, timezone # Twitter API credentials from GitHub Secrets API_KEY = '${{ secrets.TWITTER_API_KEY }}' @@ -30,7 +30,7 @@ def get_follower_count(username): def send_data_to_segment(username, follower_count): - current_time = datetime.now().isoformat() + current_time = datetime.now(timezone.utc).isoformat() data = { 'userId': username, From 897bbac699a69e068171018b5663b3b15c266f5a Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 7 Nov 2024 11:36:31 +0100 Subject: [PATCH 03/19] fix: serialize UUID in pgvector data point payload --- .../databases/vector/pgvector/PGVectorAdapter.py | 4 ++-- .../{serialize_datetime.py => serialize_data.py} | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) rename cognee/infrastructure/databases/vector/pgvector/{serialize_datetime.py => serialize_data.py} (57%) diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 32131804..235cc774 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -8,7 +8,7 @@ from cognee.infrastructure.engine import DataPoint -from .serialize_datetime import serialize_datetime +from .serialize_data import serialize_data from ..models.ScoredResult import ScoredResult from ..vector_db_interface import VectorDBInterface from ..embeddings.EmbeddingEngine import EmbeddingEngine @@ -113,7 +113,7 @@ def __init__(self, id, payload, vector): PGVectorDataPoint( id=data_point.id, vector=data_vectors[data_index], - payload=serialize_datetime(data_point.model_dump()), + payload=serialize_data(data_point.model_dump()), ) for (data_index, data_point) in enumerate(data_points) ] diff --git a/cognee/infrastructure/databases/vector/pgvector/serialize_datetime.py b/cognee/infrastructure/databases/vector/pgvector/serialize_data.py 
similarity index 57% rename from cognee/infrastructure/databases/vector/pgvector/serialize_datetime.py rename to cognee/infrastructure/databases/vector/pgvector/serialize_data.py index 9cb979e2..cdba1e92 100644 --- a/cognee/infrastructure/databases/vector/pgvector/serialize_datetime.py +++ b/cognee/infrastructure/databases/vector/pgvector/serialize_data.py @@ -1,12 +1,15 @@ from datetime import datetime +from uuid import UUID -def serialize_datetime(data): +def serialize_data(data): """Recursively convert datetime objects in dictionaries/lists to ISO format.""" if isinstance(data, dict): - return {key: serialize_datetime(value) for key, value in data.items()} + return {key: serialize_data(value) for key, value in data.items()} elif isinstance(data, list): - return [serialize_datetime(item) for item in data] + return [serialize_data(item) for item in data] elif isinstance(data, datetime): return data.isoformat() # Convert datetime to ISO 8601 string + elif isinstance(data, UUID): + return str(data) else: return data \ No newline at end of file From f569088a2e4d35c4fa396313a1d918deccb87400 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 7 Nov 2024 15:38:03 +0100 Subject: [PATCH 04/19] fix: add summaries to the graph --- .../databases/graph/neo4j_driver/adapter.py | 2 +- .../databases/graph/networkx/adapter.py | 28 +++++--- .../hybrid/falkordb/FalkorDBAdapter.py | 30 +++++++++ .../vector/weaviate_db/WeaviateAdapter.py | 2 +- cognee/modules/chunking/TextChunker.py | 6 +- .../graph/utils/get_graph_from_model.py | 66 +++++++++++++------ .../modules/pipelines/operations/run_tasks.py | 4 +- cognee/tasks/graph/query_graph_connections.py | 14 ++-- cognee/tasks/storage/index_data_points.py | 3 +- .../tasks/summarization/models/TextSummary.py | 3 +- cognee/tasks/summarization/summarize_text.py | 7 +- cognee/tests/test_library.py | 4 +- cognee/tests/test_neo4j.py | 4 +- cognee/tests/test_pgvector.py | 4 +- cognee/tests/test_qdrant.py | 4 +- 
cognee/tests/test_weaviate.py | 4 +- 16 files changed, 127 insertions(+), 58 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index f0d62c78..8e79b201 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -338,7 +338,7 @@ async def get_neighbours(self, node_id: str) -> List[Dict[str, Any]]: return predecessors + successors - async def get_connections(self, node_id: str) -> list: + async def get_connections(self, node_id: UUID) -> list: predecessors_query = """ MATCH (node)<-[relation]-(neighbour) WHERE node.id = $node_id diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index aac8c0c3..b106e9fe 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -7,6 +7,7 @@ import logging from re import A from typing import Dict, Any, List +from uuid import UUID import aiofiles import aiofiles.os as aiofiles_os import networkx as nx @@ -130,7 +131,7 @@ async def extract_node(self, node_id: str) -> dict: async def extract_nodes(self, node_ids: List[str]) -> List[dict]: return [self.graph.nodes[node_id] for node_id in node_ids if self.graph.has_node(node_id)] - async def get_predecessors(self, node_id: str, edge_label: str = None) -> list: + async def get_predecessors(self, node_id: UUID, edge_label: str = None) -> list: if self.graph.has_node(node_id): if edge_label is None: return [ @@ -146,7 +147,7 @@ async def get_predecessors(self, node_id: str, edge_label: str = None) -> list: return nodes - async def get_successors(self, node_id: str, edge_label: str = None) -> list: + async def get_successors(self, node_id: UUID, edge_label: str = None) -> list: if self.graph.has_node(node_id): if edge_label is None: return [ @@ -175,13 
+176,13 @@ async def get_neighbours(self, node_id: str) -> list: return neighbours - async def get_connections(self, node_id: str) -> list: + async def get_connections(self, node_id: UUID) -> list: if not self.graph.has_node(node_id): return [] node = self.graph.nodes[node_id] - if "uuid" not in node: + if "id" not in node: return [] predecessors, successors = await asyncio.gather( @@ -192,14 +193,14 @@ async def get_connections(self, node_id: str) -> list: connections = [] for neighbor in predecessors: - if "uuid" in neighbor: - edge_data = self.graph.get_edge_data(neighbor["uuid"], node["uuid"]) + if "id" in neighbor: + edge_data = self.graph.get_edge_data(neighbor["id"], node["id"]) for edge_properties in edge_data.values(): connections.append((neighbor, edge_properties, node)) for neighbor in successors: - if "uuid" in neighbor: - edge_data = self.graph.get_edge_data(node["uuid"], neighbor["uuid"]) + if "id" in neighbor: + edge_data = self.graph.get_edge_data(node["id"], neighbor["id"]) for edge_properties in edge_data.values(): connections.append((node, edge_properties, neighbor)) @@ -245,6 +246,17 @@ async def load_graph_from_file(self, file_path: str = None): if os.path.exists(file_path): async with aiofiles.open(file_path, "r") as file: graph_data = json.loads(await file.read()) + for node in graph_data["nodes"]: + node["id"] = UUID(node["id"]) + node["updated_at"] = datetime.strptime(node["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + + for edge in graph_data["links"]: + edge["source"] = UUID(edge["source"]) + edge["target"] = UUID(edge["target"]) + edge["source_node_id"] = UUID(edge["source_node_id"]) + edge["target_node_id"] = UUID(edge["target_node_id"]) + edge["updated_at"] = datetime.strptime(edge["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + self.graph = nx.readwrite.json_graph.node_link_graph(graph_data) else: # Log that the file does not exist and an empty graph is initialized diff --git a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py 
b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py index effe9e68..ea5a7508 100644 --- a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +++ b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py @@ -1,6 +1,7 @@ import asyncio from textwrap import dedent from typing import Any +from uuid import UUID from falkordb import FalkorDB from cognee.infrastructure.engine import DataPoint @@ -161,6 +162,35 @@ async def extract_node(self, data_point_id: str): async def extract_nodes(self, data_point_ids: list[str]): return await self.retrieve(data_point_ids) + async def get_connections(self, node_id: UUID) -> list: + predecessors_query = """ + MATCH (node)<-[relation]-(neighbour) + WHERE node.id = $node_id + RETURN neighbour, relation, node + """ + successors_query = """ + MATCH (node)-[relation]->(neighbour) + WHERE node.id = $node_id + RETURN node, relation, neighbour + """ + + predecessors, successors = await asyncio.gather( + self.query(predecessors_query, dict(node_id = node_id)), + self.query(successors_query, dict(node_id = node_id)), + ) + + connections = [] + + for neighbour in predecessors: + neighbour = neighbour["relation"] + connections.append((neighbour[0], { "relationship_name": neighbour[1] }, neighbour[2])) + + for neighbour in successors: + neighbour = neighbour["relation"] + connections.append((neighbour[0], { "relationship_name": neighbour[1] }, neighbour[2])) + + return connections + async def search( self, collection_name: str, diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index a1b986ef..b5cabc56 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -168,7 +168,7 @@ async def search( return [ ScoredResult( - id = UUID(result.id), + id = UUID(result.uuid), payload = result.properties, score 
= float(result.metadata.score) ) for result in search_result.objects diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 4717d108..71438380 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -29,7 +29,7 @@ def read(self): else: if len(self.paragraph_chunks) == 0: yield DocumentChunk( - id = str(chunk_data["chunk_id"]), + id = chunk_data["chunk_id"], text = chunk_data["text"], word_count = chunk_data["word_count"], is_part_of = self.document, @@ -42,7 +42,7 @@ def read(self): chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks) try: yield DocumentChunk( - id = str(uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}")), + id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"), text = chunk_text, word_count = self.chunk_size, is_part_of = self.document, @@ -59,7 +59,7 @@ def read(self): if len(self.paragraph_chunks) > 0: try: yield DocumentChunk( - id = str(uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}")), + id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"), text = " ".join(chunk["text"] for chunk in self.paragraph_chunks), word_count = self.chunk_size, is_part_of = self.document, diff --git a/cognee/modules/graph/utils/get_graph_from_model.py b/cognee/modules/graph/utils/get_graph_from_model.py index ef402e4d..35e00fb5 100644 --- a/cognee/modules/graph/utils/get_graph_from_model.py +++ b/cognee/modules/graph/utils/get_graph_from_model.py @@ -1,9 +1,8 @@ from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint -from cognee.modules import data from cognee.modules.storage.utils import copy_model -def get_graph_from_model(data_point: DataPoint, include_root = True): +def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes = {}, added_edges = {}): nodes = [] edges = [] @@ -17,29 +16,55 @@ def get_graph_from_model(data_point: 
DataPoint, include_root = True): if isinstance(field_value, DataPoint): excluded_properties.add(field_name) - property_nodes, property_edges = get_graph_from_model(field_value, True) - nodes[:0] = property_nodes - edges[:0] = property_edges + property_nodes, property_edges = get_graph_from_model(field_value, True, added_nodes, added_edges) + + for node in property_nodes: + if str(node.id) not in added_nodes: + nodes.append(node) + added_nodes[str(node.id)] = True + + for edge in property_edges: + edge_key = str(edge[0]) + str(edge[1]) + edge[2] + + if str(edge_key) not in added_edges: + edges.append(edge) + added_edges[str(edge_key)] = True for property_node in get_own_properties(property_nodes, property_edges): - edges.append((data_point.id, property_node.id, field_name, { - "source_node_id": data_point.id, - "target_node_id": property_node.id, - "relationship_name": field_name, - "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), - })) + edge_key = str(data_point.id) + str(property_node.id) + field_name + + if str(edge_key) not in added_edges: + edges.append((data_point.id, property_node.id, field_name, { + "source_node_id": data_point.id, + "target_node_id": property_node.id, + "relationship_name": field_name, + "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + })) + added_edges[str(edge_key)] = True continue - if isinstance(field_value, list): - if isinstance(field_value[0], DataPoint): - excluded_properties.add(field_name) + if isinstance(field_value, list) and isinstance(field_value[0], DataPoint): + excluded_properties.add(field_name) + + for item in field_value: + property_nodes, property_edges = get_graph_from_model(item, True, added_nodes, added_edges) - for item in field_value: - property_nodes, property_edges = get_graph_from_model(item, True) - nodes[:0] = property_nodes - edges[:0] = property_edges + for node in property_nodes: + if str(node.id) not in added_nodes: + nodes.append(node) + 
added_nodes[str(node.id)] = True - for property_node in get_own_properties(property_nodes, property_edges): + for edge in property_edges: + edge_key = str(edge[0]) + str(edge[1]) + edge[2] + + if str(edge_key) not in added_edges: + edges.append(edge) + added_edges[edge_key] = True + + for property_node in get_own_properties(property_nodes, property_edges): + edge_key = str(data_point.id) + str(property_node.id) + field_name + + if str(edge_key) not in added_edges: edges.append((data_point.id, property_node.id, field_name, { "source_node_id": data_point.id, "target_node_id": property_node.id, @@ -49,7 +74,8 @@ def get_graph_from_model(data_point: DataPoint, include_root = True): "type": "list" }, })) - continue + added_edges[edge_key] = True + continue data_point_properties[field_name] = field_value diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index 5f15aae8..7058bdb6 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -7,7 +7,7 @@ logger = logging.getLogger("run_tasks(tasks: [Task], data)") -async def run_tasks_base(tasks: [Task], data = None, user: User = None): +async def run_tasks_base(tasks: list[Task], data = None, user: User = None): if len(tasks) == 0: yield data return @@ -16,7 +16,7 @@ async def run_tasks_base(tasks: [Task], data = None, user: User = None): running_task = tasks[0] leftover_tasks = tasks[1:] - next_task = leftover_tasks[0] if len(leftover_tasks) > 1 else None + next_task = leftover_tasks[0] if len(leftover_tasks) > 0 else None next_task_batch_size = next_task.task_config["batch_size"] if next_task else 1 if inspect.isasyncgenfunction(running_task.executable): diff --git a/cognee/tasks/graph/query_graph_connections.py b/cognee/tasks/graph/query_graph_connections.py index 36d53514..cd4d76a5 100644 --- a/cognee/tasks/graph/query_graph_connections.py +++ b/cognee/tasks/graph/query_graph_connections.py @@ -22,13 
+22,13 @@ async def query_graph_connections(query: str, exploration_levels = 1) -> list[(s exact_node = await graph_engine.extract_node(node_id) - if exact_node is not None and "uuid" in exact_node: - node_connections = await graph_engine.get_connections(str(exact_node["uuid"])) + if exact_node is not None and "id" in exact_node: + node_connections = await graph_engine.get_connections(str(exact_node["id"])) else: vector_engine = get_vector_engine() results = await asyncio.gather( - vector_engine.search("Entity_text", query_text = query, limit = 5), - vector_engine.search("EntityType_text", query_text = query, limit = 5), + vector_engine.search("Entity_name", query_text = query, limit = 5), + vector_engine.search("EntityType_name", query_text = query, limit = 5), ) results = [*results[0], *results[1]] relevant_results = [result for result in results if result.score < 0.5][:5] @@ -37,7 +37,7 @@ async def query_graph_connections(query: str, exploration_levels = 1) -> list[(s return [] node_connections_results = await asyncio.gather( - *[graph_engine.get_connections(str(result.payload["uuid"])) for result in relevant_results] + *[graph_engine.get_connections(result.id) for result in relevant_results] ) node_connections = [] @@ -48,10 +48,10 @@ async def query_graph_connections(query: str, exploration_levels = 1) -> list[(s unique_node_connections_map = {} unique_node_connections = [] for node_connection in node_connections: - if "uuid" not in node_connection[0] or "uuid" not in node_connection[2]: + if "id" not in node_connection[0] or "id" not in node_connection[2]: continue - unique_id = f"{node_connection[0]['uuid']} {node_connection[1]['relationship_name']} {node_connection[2]['uuid']}" + unique_id = f"{node_connection[0]['id']} {node_connection[1]['relationship_name']} {node_connection[2]['id']}" if unique_id not in unique_node_connections_map: unique_node_connections_map[unique_id] = True diff --git a/cognee/tasks/storage/index_data_points.py 
b/cognee/tasks/storage/index_data_points.py index a28335e2..681fbaa1 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -56,7 +56,8 @@ def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> added_data_points[str(new_point.id)] = True data_points.append(new_point) - data_points.append(data_point) + if (str(data_point.id) not in added_data_points): + data_points.append(data_point) return data_points diff --git a/cognee/tasks/summarization/models/TextSummary.py b/cognee/tasks/summarization/models/TextSummary.py index 5e724cd6..c6a932b3 100644 --- a/cognee/tasks/summarization/models/TextSummary.py +++ b/cognee/tasks/summarization/models/TextSummary.py @@ -4,9 +4,8 @@ class TextSummary(DataPoint): text: str - chunk: DocumentChunk + made_from: DocumentChunk _metadata: dict = { "index_fields": ["text"], } - diff --git a/cognee/tasks/summarization/summarize_text.py b/cognee/tasks/summarization/summarize_text.py index a1abaccc..756f65e3 100644 --- a/cognee/tasks/summarization/summarize_text.py +++ b/cognee/tasks/summarization/summarize_text.py @@ -5,6 +5,7 @@ from cognee.modules.data.extraction.extract_summary import extract_summary from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.tasks.storage import add_data_points +from cognee.tasks.storage.index_data_points import get_data_points_from_model from .models.TextSummary import TextSummary async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel]): @@ -17,12 +18,12 @@ async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: summaries = [ TextSummary( - id = uuid5(chunk.id, "summary"), - chunk = chunk, + id = uuid5(chunk.id, "TextSummary"), + made_from = chunk, text = chunk_summaries[chunk_index].summary, ) for (chunk_index, chunk) in enumerate(data_chunks) ] - add_data_points(summaries) + await add_data_points(summaries) return data_chunks diff --git 
a/cognee/tests/test_library.py b/cognee/tests/test_library.py index d7e7e5fe..2e707b64 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -32,8 +32,8 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity", "AI"))[0] - random_node_name = random_node.payload["name"] + random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) assert len(search_results) != 0, "The search results list is empty." diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 2f9abf12..0783e973 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -36,8 +36,8 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity", "AI"))[0] - random_node_name = random_node.payload["name"] + random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) assert len(search_results) != 0, "The search results list is empty." 
diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index b58b8751..802aa3fc 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -65,8 +65,8 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity", "AI"))[0] - random_node_name = random_node.payload["name"] + random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query=random_node_name) assert len(search_results) != 0, "The search results list is empty." diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index 84fac6a2..faa2cbcf 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -37,8 +37,8 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity", "AI"))[0] - random_node_name = random_node.payload["name"] + random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) assert len(search_results) != 0, "The search results list is empty." 
diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index e943e1ec..121c1749 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -35,8 +35,8 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity", "AI"))[0] - random_node_name = random_node.payload["name"] + random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) assert len(search_results) != 0, "The search results list is empty." From c89063602e52a53f4bb76a8ff963ff1b1e812e6a Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 7 Nov 2024 15:41:11 +0100 Subject: [PATCH 05/19] fix: remove unused import --- cognee/tasks/summarization/summarize_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cognee/tasks/summarization/summarize_text.py b/cognee/tasks/summarization/summarize_text.py index 756f65e3..47d6946b 100644 --- a/cognee/tasks/summarization/summarize_text.py +++ b/cognee/tasks/summarization/summarize_text.py @@ -5,7 +5,6 @@ from cognee.modules.data.extraction.extract_summary import extract_summary from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.tasks.storage import add_data_points -from cognee.tasks.storage.index_data_points import get_data_points_from_model from .models.TextSummary import TextSummary async def summarize_text(data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel]): From 9e10c611bcdb44cf637314b29379285485566be5 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:19:38 +0100 Subject: [PATCH 06/19] fix: resolves pg asyncpg UUID to UUID --- .../infrastructure/databases/vector/pgvector/PGVectorAdapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 235cc774..2e9a3764 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -203,7 +203,7 @@ async def search( # Create and return ScoredResult objects return [ ScoredResult( - id = UUID(row.id), + id = UUID(str(row.id)), payload = row.payload, score = row.similarity ) for row in vector_list From 19d62f2c84ea78046abac18dfe8d1ee1ff6dbac7 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Fri, 8 Nov 2024 15:31:02 +0100 Subject: [PATCH 07/19] fix: add code graph generation pipeline --- cognee/api/v1/cognify/code_graph_pipeline.py | 110 ++++++++++++++++++ .../databases/graph/networkx/adapter.py | 34 ++++-- .../vector/lancedb/LanceDBAdapter.py | 11 +- .../graph/utils/get_graph_from_model.py | 2 +- cognee/shared/SourceCodeGraph.py | 93 ++++++++------- cognee/shared/utils.py | 2 +- cognee/tasks/graph/__init__.py | 1 + cognee/tasks/graph/extract_graph_from_code.py | 17 +++ cognee/tasks/storage/index_data_points.py | 2 +- cognee/tests/test_code_generation.py | 38 ++++++ cognee/tests/test_data/code.txt | 70 +++++++++++ 11 files changed, 326 insertions(+), 54 deletions(-) create mode 100644 cognee/api/v1/cognify/code_graph_pipeline.py create mode 100644 cognee/tasks/graph/extract_graph_from_code.py create mode 100755 cognee/tests/test_code_generation.py create mode 100644 cognee/tests/test_data/code.txt diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py new file mode 100644 index 00000000..2cbb606c --- /dev/null +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -0,0 +1,110 @@ +import asyncio +import logging +from typing import Union + +from cognee.shared.SourceCodeGraph import SourceCodeGraph +from cognee.shared.utils import send_telemetry +from cognee.modules.data.models import 
Dataset, Data +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.methods import get_datasets, get_datasets_by_name +from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.pipelines import run_tasks +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.pipelines.models import PipelineRunStatus +from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status +from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status +from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents +from cognee.tasks.graph import extract_graph_from_code +from cognee.tasks.storage import add_data_points + +logger = logging.getLogger("code_graph_pipeline") + +update_status_lock = asyncio.Lock() + +class PermissionDeniedException(Exception): + def __init__(self, message: str): + self.message = message + super().__init__(self.message) + +async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User = None): + if user is None: + user = await get_default_user() + + existing_datasets = await get_datasets(user.id) + + if datasets is None or len(datasets) == 0: + # If no datasets are provided, cognify all existing datasets. 
+ datasets = existing_datasets + + if type(datasets[0]) == str: + datasets = await get_datasets_by_name(datasets, user.id) + + existing_datasets_map = { + generate_dataset_name(dataset.name): True for dataset in existing_datasets + } + + awaitables = [] + + for dataset in datasets: + dataset_name = generate_dataset_name(dataset.name) + + if dataset_name in existing_datasets_map: + awaitables.append(run_pipeline(dataset, user)) + + return await asyncio.gather(*awaitables) + + +async def run_pipeline(dataset: Dataset, user: User): + data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id) + + document_ids_str = [str(document.id) for document in data_documents] + + dataset_id = dataset.id + dataset_name = generate_dataset_name(dataset.name) + + send_telemetry("code_graph_pipeline EXECUTION STARTED", user.id) + + async with update_status_lock: + task_status = await get_pipeline_status([dataset_id]) + + if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED: + logger.info("Dataset %s is already being processed.", dataset_name) + return + + await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, { + "dataset_name": dataset_name, + "files": document_ids_str, + }) + try: + tasks = [ + Task(classify_documents), + Task(check_permissions_on_documents, user = user, permissions = ["write"]), + Task(extract_chunks_from_documents), # Extract text chunks based on the document type. + Task(add_data_points, task_config = { "batch_size": 10 }), + Task(extract_graph_from_code, graph_model = SourceCodeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks. 
+ ] + + pipeline = run_tasks(tasks, data_documents, "code_graph_pipeline") + + async for result in pipeline: + print(result) + + send_telemetry("code_graph_pipeline EXECUTION COMPLETED", user.id) + + await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_COMPLETED, { + "dataset_name": dataset_name, + "files": document_ids_str, + }) + except Exception as error: + send_telemetry("code_graph_pipeline EXECUTION ERRORED", user.id) + + await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_ERRORED, { + "dataset_name": dataset_name, + "files": document_ids_str, + }) + raise error + + +def generate_dataset_name(dataset_name: str) -> str: + return dataset_name.replace(".", "_").replace(" ", "_") diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index b106e9fe..6c7abd49 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -30,6 +30,10 @@ def __new__(cls, filename): def __init__(self, filename = "cognee_graph.pkl"): self.filename = filename + async def get_graph_data(self): + await self.load_graph_from_file() + return (list(self.graph.nodes(data = True)), list(self.graph.edges(data = True, keys = True))) + async def query(self, query: str, params: dict): pass @@ -247,15 +251,27 @@ async def load_graph_from_file(self, file_path: str = None): async with aiofiles.open(file_path, "r") as file: graph_data = json.loads(await file.read()) for node in graph_data["nodes"]: - node["id"] = UUID(node["id"]) - node["updated_at"] = datetime.strptime(node["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + try: + node["id"] = UUID(node["id"]) + except: + pass + if "updated_at" in node: + node["updated_at"] = datetime.strptime(node["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") for edge in graph_data["links"]: - edge["source"] = UUID(edge["source"]) - edge["target"] = UUID(edge["target"]) - 
edge["source_node_id"] = UUID(edge["source_node_id"]) - edge["target_node_id"] = UUID(edge["target_node_id"]) - edge["updated_at"] = datetime.strptime(edge["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + try: + source_id = UUID(edge["source"]) + target_id = UUID(edge["target"]) + + edge["source"] = source_id + edge["target"] = target_id + edge["source_node_id"] = source_id + edge["target_node_id"] = target_id + except: + pass + + if "updated_at" in node: + edge["updated_at"] = datetime.strptime(edge["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") self.graph = nx.readwrite.json_graph.node_link_graph(graph_data) else: @@ -268,8 +284,8 @@ async def load_graph_from_file(self, file_path: str = None): os.makedirs(file_dir, exist_ok = True) await self.save_graph_to_file(file_path) - except Exception: - logger.error("Failed to load graph from file: %s", file_path) + except Exception as e: + logger.error("Failed to load graph from file: %s \n %s", file_path, str(e)) # Initialize an empty graph in case of error self.graph = nx.MultiDiGraph() diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 39e43189..d883a29e 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -164,7 +164,16 @@ async def search( if value < min_value: min_value = value - normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in result_values] + normalized_values = [] + min_value = min(result["_distance"] for result in result_values) + max_value = max(result["_distance"] for result in result_values) + + if max_value == min_value: + # Avoid division by zero: Assign all normalized values to 0 (or any constant value like 1) + normalized_values = [0 for _ in result_values] + else: + normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in + result_values] 
return [ScoredResult( id = UUID(result["id"]), diff --git a/cognee/modules/graph/utils/get_graph_from_model.py b/cognee/modules/graph/utils/get_graph_from_model.py index 35e00fb5..29137ddc 100644 --- a/cognee/modules/graph/utils/get_graph_from_model.py +++ b/cognee/modules/graph/utils/get_graph_from_model.py @@ -43,7 +43,7 @@ def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes added_edges[str(edge_key)] = True continue - if isinstance(field_value, list) and isinstance(field_value[0], DataPoint): + if isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], DataPoint): excluded_properties.add(field_name) for item in field_value: diff --git a/cognee/shared/SourceCodeGraph.py b/cognee/shared/SourceCodeGraph.py index 51b90f29..60f425e3 100644 --- a/cognee/shared/SourceCodeGraph.py +++ b/cognee/shared/SourceCodeGraph.py @@ -1,84 +1,95 @@ -from typing import List, Union, Literal, Optional -from pydantic import BaseModel +from typing import Any, List, Union, Literal, Optional +from cognee.infrastructure.engine import DataPoint -class BaseClass(BaseModel): +class Variable(DataPoint): id: str name: str - type: Literal["Class"] = "Class" + type: Literal["Variable"] = "Variable" description: str - constructor_parameters: Optional[List[str]] = None + is_static: Optional[bool] = False + default_value: Optional[str] = None + data_type: str + + _metadata = { + "index_fields": ["name"] + } -class Class(BaseModel): +class Operator(DataPoint): + id: str + name: str + type: Literal["Operator"] = "Operator" + description: str + return_type: str + +class Class(DataPoint): id: str name: str type: Literal["Class"] = "Class" description: str - constructor_parameters: Optional[List[str]] = None - from_class: Optional[BaseClass] = None + constructor_parameters: List[Variable] + extended_from_class: Optional["Class"] = None + has_methods: list["Function"] -class ClassInstance(BaseModel): + _metadata = { + "index_fields": ["name"] + } 
+ +class ClassInstance(DataPoint): id: str name: str type: Literal["ClassInstance"] = "ClassInstance" description: str from_class: Class + instantiated_by: Union["Function"] + instantiation_arguments: List[Variable] + + _metadata = { + "index_fields": ["name"] + } -class Function(BaseModel): +class Function(DataPoint): id: str name: str type: Literal["Function"] = "Function" description: str - parameters: Optional[List[str]] = None + parameters: List[Variable] return_type: str is_static: Optional[bool] = False -class Variable(BaseModel): - id: str - name: str - type: Literal["Variable"] = "Variable" - description: str - is_static: Optional[bool] = False - default_value: Optional[str] = None - -class Operator(BaseModel): - id: str - name: str - type: Literal["Operator"] = "Operator" - description: str - return_type: str + _metadata = { + "index_fields": ["name"] + } -class ExpressionPart(BaseModel): +class FunctionCall(DataPoint): id: str - name: str - type: Literal["Expression"] = "Expression" - description: str - expression: str - members: List[Union[Variable, Function, Operator]] + type: Literal["FunctionCall"] = "FunctionCall" + called_by: Union[Function, Literal["main"]] + function_called: Function + function_arguments: List[Any] -class Expression(BaseModel): +class Expression(DataPoint): id: str name: str type: Literal["Expression"] = "Expression" description: str expression: str - members: List[Union[Variable, Function, Operator, ExpressionPart]] + members: List[Union[Variable, Function, Operator, "Expression"]] -class Edge(BaseModel): - source_node_id: str - target_node_id: str - relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"] - -class SourceCodeGraph(BaseModel): +class SourceCodeGraph(DataPoint): id: str name: str description: str language: str nodes: List[Union[ Class, + ClassInstance, Function, + FunctionCall, Variable, Operator, Expression, - ClassInstance, ]] - edges: List[Edge] + 
+Class.model_rebuild() +ClassInstance.model_rebuild() +Expression.model_rebuild() diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py index f3272357..14578f20 100644 --- a/cognee/shared/utils.py +++ b/cognee/shared/utils.py @@ -91,7 +91,7 @@ def prepare_edges(graph, source, target, edge_key): source: str(edge[0]), target: str(edge[1]), edge_key: str(edge[2]), - } for edge in graph.edges] + } for edge in graph.edges(keys = True, data = True)] return pd.DataFrame(edge_list) diff --git a/cognee/tasks/graph/__init__.py b/cognee/tasks/graph/__init__.py index 94dc82f2..eafc1292 100644 --- a/cognee/tasks/graph/__init__.py +++ b/cognee/tasks/graph/__init__.py @@ -1,2 +1,3 @@ from .extract_graph_from_data import extract_graph_from_data +from .extract_graph_from_code import extract_graph_from_code from .query_graph_connections import query_graph_connections diff --git a/cognee/tasks/graph/extract_graph_from_code.py b/cognee/tasks/graph/extract_graph_from_code.py new file mode 100644 index 00000000..159e9baa --- /dev/null +++ b/cognee/tasks/graph/extract_graph_from_code.py @@ -0,0 +1,17 @@ +import asyncio +from typing import Type +from pydantic import BaseModel +from cognee.modules.data.extraction.knowledge_graph import extract_content_graph +from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from cognee.tasks.storage import add_data_points + +async def extract_graph_from_code(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): + chunk_graphs = await asyncio.gather( + *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] + ) + + for (chunk_index, chunk) in enumerate(data_chunks): + chunk_graph = chunk_graphs[chunk_index] + await add_data_points(chunk_graph.nodes) + + return data_chunks diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 681fbaa1..dc74d705 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ 
-47,7 +47,7 @@ def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> added_data_points[str(new_point.id)] = True data_points.append(new_point) - if isinstance(field_value, list) and isinstance(field_value[0], DataPoint): + if isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], DataPoint): for field_value_item in field_value: new_data_points = get_data_points_from_model(field_value_item, added_data_points) diff --git a/cognee/tests/test_code_generation.py b/cognee/tests/test_code_generation.py new file mode 100755 index 00000000..aad59ace --- /dev/null +++ b/cognee/tests/test_code_generation.py @@ -0,0 +1,38 @@ +import os +import logging +import pathlib +import cognee +from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline +from cognee.api.v1.search import SearchType +from cognee.shared.utils import render_graph + +logging.basicConfig(level = logging.DEBUG) + +async def main(): + data_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_code_generation")).resolve()) + cognee.config.data_root_directory(data_directory_path) + cognee_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_code_generation")).resolve()) + cognee.config.system_root_directory(cognee_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata = True) + + dataset_name = "artificial_intelligence" + + ai_text_file_path = os.path.join(pathlib.Path(__file__).parent, "test_data/code.txt") + await cognee.add([ai_text_file_path], dataset_name) + + await code_graph_pipeline([dataset_name]) + + await render_graph(None, include_nodes = True, include_labels = True) + + search_results = await cognee.search(SearchType.CHUNKS, query = "Student") + assert len(search_results) != 0, "The search results list is empty." 
+ print("\n\nExtracted chunks are:\n") + for result in search_results: + print(f"{result}\n") + + +if __name__ == "__main__": + import asyncio + asyncio.run(main(), debug=True) diff --git a/cognee/tests/test_data/code.txt b/cognee/tests/test_data/code.txt new file mode 100644 index 00000000..c40f7124 --- /dev/null +++ b/cognee/tests/test_data/code.txt @@ -0,0 +1,70 @@ +// Class definition for a Person +class Person { + constructor(name, age) { + this.name = name; + this.age = age; + } + + // Method to return a greeting message + greet() { + return `Hello, my name is ${this.name} and I'm ${this.age} years old.`; + } + + // Method to celebrate birthday + celebrateBirthday() { + this.age += 1; + return `Happy Birthday, ${this.name}! You are now ${this.age} years old.`; + } +} + +// Class definition for a Student, extending from Person +class Student extends Person { + constructor(name, age, grade) { + super(name, age); + this.grade = grade; + } + + // Method to describe the student + describe() { + return `${this.name} is a ${this.grade} grade student and is ${this.age} years old.`; + } +} + +// Function to enroll a new student +function enrollStudent(name, age, grade) { + const student = new Student(name, age, grade); + console.log(student.greet()); + console.log(student.describe()); + return student; +} + +// Function to promote a student to the next grade +function promoteStudent(student) { + student.grade += 1; + console.log(`${student.name} has been promoted to grade ${student.grade}.`); + return student; +} + +// Variable definition and assignment +let schoolName = "Greenwood High School"; +let students = []; + +// Enrolling students +students.push(enrollStudent("Alice", 14, 9)); +students.push(enrollStudent("Bob", 15, 10)); + +// Looping through students to celebrate their birthdays +students.forEach(student => { + console.log(student.celebrateBirthday()); +}); + +// Promoting all students +students = students.map(promoteStudent); + +// Displaying the final 
state of all students +console.log("Final Students List:"); +students.forEach(student => console.log(student.describe())); + +// Updating the school name +schoolName = "Greenwood International School"; +console.log(`School Name Updated to: ${schoolName}`); From e7e6107b24ef37a1dc1d6382e3306697da0d0016 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 13:22:28 +0100 Subject: [PATCH 08/19] fix: check "updated_at" in edge instead of node --- cognee/infrastructure/databases/graph/networkx/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index 6c7abd49..65aeea28 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -270,7 +270,7 @@ async def load_graph_from_file(self, file_path: str = None): except: pass - if "updated_at" in node: + if "updated_at" in edge: edge["updated_at"] = datetime.strptime(edge["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z") self.graph = nx.readwrite.json_graph.node_link_graph(graph_data) From d733bfdf6a29b3c5d649a4df411dd1ca1850674b Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 14:38:59 +0100 Subject: [PATCH 09/19] fix: convert qdrant search results to ScoredPoint --- README.md | 24 +++++++++++++------ .../databases/vector/qdrant/QDrantAdapter.py | 15 ++++++++++-- cognee/modules/engine/utils/__init__.py | 1 + .../engine/utils/generate_edge_name.py | 2 ++ .../engine/utils/generate_node_name.py | 2 +- cognee/tasks/graph/extract_graph_from_data.py | 6 ++--- cognee/tests/test_qdrant.py | 2 +- cognee/tests/test_weaviate.py | 2 +- 8 files changed, 39 insertions(+), 15 deletions(-) create mode 100644 cognee/modules/engine/utils/generate_edge_name.py diff --git a/README.md b/README.md index 9ce92e80..82c3730d 100644 --- a/README.md +++ b/README.md @@ -109,24 +109,34 @@ import asyncio from 
cognee.api.v1.search import SearchType async def main(): - await cognee.prune.prune_data() # Reset cognee data - await cognee.prune.prune_system(metadata=True) # Reset cognee system state + # Reset cognee data + await cognee.prune.prune_data() + # Reset cognee system state + await cognee.prune.prune_system(metadata=True) text = """ Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. """ - await cognee.add(text) # Add text to cognee - await cognee.cognify() # Use LLMs and cognee to create knowledge graph + # Add text to cognee + await cognee.add(text) - search_results = await cognee.search( # Search cognee for insights + # Use LLMs and cognee to create knowledge graph + await cognee.cognify() + + # Search cognee for insights + search_results = await cognee.search( SearchType.INSIGHTS, - {'query': 'Tell me about NLP'} + "Tell me about NLP", ) - for result_text in search_results: # Display results + # Display results + for result_text in search_results: print(result_text) + # natural_language_processing is_a field + # natural_language_processing is_subfield_of computer_science + # natural_language_processing is_subfield_of information_retrieval asyncio.run(main()) ``` diff --git a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py index 87d673a0..1efcd47b 100644 --- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py @@ -1,7 +1,9 @@ import logging +from uuid import UUID from typing import List, Dict, Optional from qdrant_client import AsyncQdrantClient, models +from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult from cognee.infrastructure.engine import DataPoint from ..vector_db_interface import VectorDBInterface from ..embeddings.EmbeddingEngine import EmbeddingEngine @@ -153,7 +155,7 @@ async def search( client = 
self.get_qdrant_client() - result = await client.search( + results = await client.search( collection_name = collection_name, query_vector = models.NamedVector( name = "text", @@ -165,7 +167,16 @@ async def search( await client.close() - return result + return [ + ScoredResult( + id = UUID(result.id), + payload = { + **result.payload, + "id": UUID(result.id), + }, + score = 1 - result.score, + ) for result in results + ] async def batch_search(self, collection_name: str, query_texts: List[str], limit: int = None, with_vectors: bool = False): diff --git a/cognee/modules/engine/utils/__init__.py b/cognee/modules/engine/utils/__init__.py index 9cc2bc57..4d4ab02e 100644 --- a/cognee/modules/engine/utils/__init__.py +++ b/cognee/modules/engine/utils/__init__.py @@ -1,2 +1,3 @@ from .generate_node_id import generate_node_id from .generate_node_name import generate_node_name +from .generate_edge_name import generate_edge_name diff --git a/cognee/modules/engine/utils/generate_edge_name.py b/cognee/modules/engine/utils/generate_edge_name.py new file mode 100644 index 00000000..49ab5e8a --- /dev/null +++ b/cognee/modules/engine/utils/generate_edge_name.py @@ -0,0 +1,2 @@ +def generate_edge_name(name: str) -> str: + return name.lower().replace(" ", "_").replace("'", "") diff --git a/cognee/modules/engine/utils/generate_node_name.py b/cognee/modules/engine/utils/generate_node_name.py index 84b26619..a2871875 100644 --- a/cognee/modules/engine/utils/generate_node_name.py +++ b/cognee/modules/engine/utils/generate_node_name.py @@ -1,2 +1,2 @@ def generate_node_name(name: str) -> str: - return name.lower().replace(" ", "_").replace("'", "") + return name.lower().replace("'", "") diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 36cc3e2f..9e6edcab 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -5,7 +5,7 @@ from 
cognee.modules.data.extraction.knowledge_graph import extract_content_graph from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.engine.models import EntityType, Entity -from cognee.modules.engine.utils import generate_node_id, generate_node_name +from cognee.modules.engine.utils import generate_edge_name, generate_node_id, generate_node_name from cognee.tasks.storage import add_data_points async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): @@ -95,7 +95,7 @@ async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: for edge in graph.edges: source_node_id = generate_node_id(edge.source_node_id) target_node_id = generate_node_id(edge.target_node_id) - relationship_name = generate_node_name(edge.relationship_name) + relationship_name = generate_edge_name(edge.relationship_name) edge_key = str(source_node_id) + str(target_node_id) + relationship_name @@ -105,7 +105,7 @@ async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: target_node_id, edge.relationship_name, dict( - relationship_name = generate_node_name(edge.relationship_name), + relationship_name = generate_edge_name(edge.relationship_name), source_node_id = source_node_id, target_node_id = target_node_id, ), diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index faa2cbcf..784b3f27 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -37,7 +37,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index 
121c1749..3f853f63 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -35,7 +35,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node = (await vector_engine.search("Entity_name", "quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) From 9c4da23307a983235c09d0266dcffe57ee18bc13 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 15:56:09 +0100 Subject: [PATCH 10/19] fix: fix single data point addition to weaiate --- .../vector/weaviate_db/WeaviateAdapter.py | 60 ++++++++++--------- cognee/tests/test_weaviate.py | 2 +- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index b5cabc56..4b07aeb1 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -11,7 +11,6 @@ logger = logging.getLogger("WeaviateAdapter") class IndexSchema(DataPoint): - uuid: str text: str _metadata: dict = { @@ -58,18 +57,21 @@ async def create_collection( future = asyncio.Future() - future.set_result( - self.client.collections.create( - name=collection_name, - properties=[ - wvcc.Property( - name="text", - data_type=wvcc.DataType.TEXT, - skip_vectorization=True - ) - ] + if not self.client.collections.exists(collection_name): + future.set_result( + self.client.collections.create( + name = collection_name, + properties = [ + wvcc.Property( + name = "text", + data_type = wvcc.DataType.TEXT, + skip_vectorization = True + ) + ] + ) ) - ) + else: + future.set_result(self.get_collection(collection_name)) return await future @@ -80,13 +82,16 @@ 
async def create_data_points(self, collection_name: str, data_points: List[DataP from weaviate.classes.data import DataObject data_vectors = await self.embed_data( - list(map(lambda data_point: data_point.get_embeddable_data(), data_points))) + [data_point.get_embeddable_data() for data_point in data_points] + ) def convert_to_weaviate_data_points(data_point: DataPoint): vector = data_vectors[data_points.index(data_point)] properties = data_point.model_dump() - properties["uuid"] = properties["id"] - del properties["id"] + + if "id" in properties: + properties["uuid"] = str(data_point.id) + del properties["id"] return DataObject( uuid = data_point.id, @@ -94,7 +99,7 @@ def convert_to_weaviate_data_points(data_point: DataPoint): vector = vector ) - data_points = list(map(convert_to_weaviate_data_points, data_points)) + data_points = [convert_to_weaviate_data_points(data_point) for data_point in data_points] collection = self.get_collection(collection_name) @@ -102,14 +107,13 @@ def convert_to_weaviate_data_points(data_point: DataPoint): if len(data_points) > 1: return collection.data.insert_many(data_points) else: - return collection.data.insert(data_points[0]) - # with collection.batch.dynamic() as batch: - # for point in data_points: - # batch.add_object( - # uuid = point.uuid, - # properties = point.properties, - # vector = point.vector - # ) + data_point: DataObject = data_points[0] + return collection.data.insert( + uuid = data_point.uuid, + vector = data_point.vector, + properties = data_point.properties, + references = data_point.references, + ) except Exception as error: logger.error("Error creating data points: %s", str(error)) raise error @@ -120,8 +124,8 @@ async def create_vector_index(self, index_name: str, index_property_name: str): async def index_data_points(self, index_name: str, index_property_name: str, data_points: list[DataPoint]): await self.create_data_points(f"{index_name}_{index_property_name}", [ IndexSchema( - uuid = str(data_point.id), - 
text = getattr(data_point, data_point._metadata["index_fields"][0]), + id = data_point.id, + text = data_point.get_embeddable_data(), ) for data_point in data_points ]) @@ -168,9 +172,9 @@ async def search( return [ ScoredResult( - id = UUID(result.uuid), + id = UUID(str(result.uuid)), payload = result.properties, - score = float(result.metadata.score) + score = 1 - float(result.metadata.score) ) for result in search_result.objects ] diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index 3f853f63..f788f997 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -35,7 +35,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "quantum computer"))[0] + random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) From 4c19999f55294b8001ba9eebc768cab2522b0006 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 15:56:30 +0100 Subject: [PATCH 11/19] fix: convert UUID to str for neo4j query --- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py | 4 ++-- cognee/tests/test_neo4j.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 8e79b201..a6618dce 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -351,8 +351,8 @@ async def get_connections(self, node_id: UUID) -> list: """ predecessors, successors = await asyncio.gather( - self.query(predecessors_query, dict(node_id = node_id)), - self.query(successors_query, dict(node_id = node_id)), + self.query(predecessors_query, 
dict(node_id = str(node_id))), + self.query(successors_query, dict(node_id = str(node_id))), ) connections = [] diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 0783e973..9cf1c53d 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -36,7 +36,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) From 39bc8d6fc337c55ac58d62ad9baa7bd4977b69f9 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 16:23:53 +0100 Subject: [PATCH 12/19] fix: change weaviate batch update to use dynamic batch --- .../databases/graph/neo4j_driver/adapter.py | 48 ++++++++++--------- .../vector/weaviate_db/WeaviateAdapter.py | 11 ++++- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index a6618dce..26bbb581 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -1,6 +1,7 @@ """ Neo4j Adapter for Graph Database""" import logging import asyncio +from textwrap import dedent from typing import Optional, Any, List, Dict from contextlib import asynccontextmanager from uuid import UUID @@ -43,7 +44,6 @@ async def query( async with self.get_session() as session: result = await session.run(query, parameters = params) data = await result.data() - await self.close() return data except Neo4jError as error: logger.error("Neo4j query error: %s", error, exc_info = True) @@ -63,11 +63,10 @@ async def has_node(self, node_id: str) -> bool: async def add_node(self, 
node: DataPoint): serialized_properties = self.serialize_properties(node.model_dump()) - query = """MERGE (node {id: $node_id}) - ON CREATE SET node += $properties - ON MATCH SET node += $properties - ON MATCH SET node.updated_at = timestamp() - RETURN ID(node) AS internal_id, node.id AS nodeId""" + query = dedent("""MERGE (node {id: $node_id}) + ON CREATE SET node += $properties, node.updated_at = timestamp() + ON MATCH SET node += $properties, node.updated_at = timestamp() + RETURN ID(node) AS internal_id, node.id AS nodeId""") params = { "node_id": str(node.id), @@ -80,9 +79,8 @@ async def add_nodes(self, nodes: list[DataPoint]) -> None: query = """ UNWIND $nodes AS node MERGE (n {id: node.node_id}) - ON CREATE SET n += node.properties - ON MATCH SET n += node.properties - ON MATCH SET n.updated_at = timestamp() + ON CREATE SET n += node.properties, n.updated_at = timestamp() + ON MATCH SET n += node.properties, n.updated_at = timestamp() WITH n, node.node_id AS label CALL apoc.create.addLabels(n, [label]) YIELD node AS labeledNode RETURN ID(labeledNode) AS internal_id, labeledNode.id AS nodeId @@ -137,12 +135,19 @@ async def delete_nodes(self, node_ids: list[str]) -> None: return await self.query(query, params) async def has_edge(self, from_node: UUID, to_node: UUID, edge_label: str) -> bool: - query = f""" - MATCH (from_node:`{str(from_node)}`)-[relationship:`{edge_label}`]->(to_node:`{str(to_node)}`) + query = """ + MATCH (from_node)-[relationship]->(to_node) + WHERE from_node.id = $from_node_id AND to_node.id = $to_node_id AND type(relationship) = $edge_label RETURN COUNT(relationship) > 0 AS edge_exists """ - edge_exists = await self.query(query) + params = { + "from_node_id": str(from_node), + "to_node_id": str(to_node), + "edge_label": edge_label, + } + + edge_exists = await self.query(query, params) return edge_exists async def has_edges(self, edges): @@ -169,22 +174,21 @@ async def has_edges(self, edges): raise error - async def add_edge(self, 
from_node: str, to_node: str, relationship_name: str, edge_properties: Optional[Dict[str, Any]] = {}): + async def add_edge(self, from_node: UUID, to_node: UUID, relationship_name: str, edge_properties: Optional[Dict[str, Any]] = {}): serialized_properties = self.serialize_properties(edge_properties) - from_node = from_node.replace(":", "_") - to_node = to_node.replace(":", "_") - query = f"""MATCH (from_node:`{str(from_node)}` - {{id: $from_node}}), - (to_node:`{str(to_node)}` {{id: $to_node}}) - MERGE (from_node)-[r:`{relationship_name}`]->(to_node) - ON CREATE SET r += $properties, r.updated_at = timestamp() - ON MATCH SET r += $properties, r.updated_at = timestamp() - RETURN r""" + query = dedent("""MATCH (from_node {id: $from_node}), + (to_node {id: $to_node}) + MERGE (from_node)-[r]->(to_node) + ON CREATE SET r += $properties, r.updated_at = timestamp(), r.type = $relationship_name + ON MATCH SET r += $properties, r.updated_at = timestamp() + RETURN r + """) params = { "from_node": str(from_node), "to_node": str(to_node), + "relationship_name": relationship_name, "properties": serialized_properties } diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index 4b07aeb1..be356740 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -105,10 +105,17 @@ def convert_to_weaviate_data_points(data_point: DataPoint): try: if len(data_points) > 1: - return collection.data.insert_many(data_points) + with collection.batch.dynamic() as batch: + for data_point in data_points: + batch.add_object( + uuid = data_point.uuid, + vector = data_point.vector, + properties = data_point.properties, + references = data_point.references, + ) else: data_point: DataObject = data_points[0] - return collection.data.insert( + return collection.data.update( uuid = data_point.uuid, 
vector = data_point.vector, properties = data_point.properties, From d2d819e7404736a9a7c1b83243966834564d922b Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:29:13 +0100 Subject: [PATCH 13/19] fix: unwrap connections in PGVectorAdapter --- .../vector/pgvector/PGVectorAdapter.py | 78 +++++++++---------- cognee/tests/test_pgvector.py | 4 +- pyproject.toml | 12 +-- 3 files changed, 48 insertions(+), 46 deletions(-) diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 2e9a3764..1a7128a0 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -80,44 +80,44 @@ def __init__(self, id, payload, vector): async def create_data_points( self, collection_name: str, data_points: List[DataPoint] ): - async with self.get_async_session() as session: - if not await self.has_collection(collection_name): - await self.create_collection( - collection_name=collection_name, - payload_schema=type(data_points[0]), - ) - - data_vectors = await self.embed_data( - [data_point.get_embeddable_data() for data_point in data_points] + if not await self.has_collection(collection_name): + await self.create_collection( + collection_name = collection_name, + payload_schema = type(data_points[0]), ) - vector_size = self.embedding_engine.get_vector_size() + data_vectors = await self.embed_data( + [data_point.get_embeddable_data() for data_point in data_points] + ) - class PGVectorDataPoint(Base): - __tablename__ = collection_name - __table_args__ = {"extend_existing": True} - # PGVector requires one column to be the primary key - primary_key: Mapped[int] = mapped_column( - primary_key=True, autoincrement=True - ) - id: Mapped[type(data_points[0].id)] - payload = Column(JSON) - vector = Column(Vector(vector_size)) + vector_size = self.embedding_engine.get_vector_size() - def 
__init__(self, id, payload, vector): - self.id = id - self.payload = payload - self.vector = vector + class PGVectorDataPoint(Base): + __tablename__ = collection_name + __table_args__ = {"extend_existing": True} + # PGVector requires one column to be the primary key + primary_key: Mapped[int] = mapped_column( + primary_key=True, autoincrement=True + ) + id: Mapped[type(data_points[0].id)] + payload = Column(JSON) + vector = Column(Vector(vector_size)) - pgvector_data_points = [ - PGVectorDataPoint( - id=data_point.id, - vector=data_vectors[data_index], - payload=serialize_data(data_point.model_dump()), - ) - for (data_index, data_point) in enumerate(data_points) - ] + def __init__(self, id, payload, vector): + self.id = id + self.payload = payload + self.vector = vector + pgvector_data_points = [ + PGVectorDataPoint( + id = data_point.id, + vector = data_vectors[data_index], + payload = serialize_data(data_point.model_dump()), + ) + for (data_index, data_point) in enumerate(data_points) + ] + + async with self.get_async_session() as session: session.add_all(pgvector_data_points) await session.commit() @@ -128,7 +128,7 @@ async def index_data_points(self, index_name: str, index_property_name: str, dat await self.create_data_points(f"{index_name}_{index_property_name}", [ IndexSchema( id = data_point.id, - text = getattr(data_point, data_point._metadata["index_fields"][0]), + text = data_point.get_embeddable_data(), ) for data_point in data_points ]) @@ -146,10 +146,10 @@ async def get_table(self, collection_name: str) -> Table: raise ValueError(f"Table '{collection_name}' not found.") async def retrieve(self, collection_name: str, data_point_ids: List[str]): - async with self.get_async_session() as session: - # Get PGVectorDataPoint Table from database - PGVectorDataPoint = await self.get_table(collection_name) + # Get PGVectorDataPoint Table from database + PGVectorDataPoint = await self.get_table(collection_name) + async with self.get_async_session() as session: 
results = await session.execute( select(PGVectorDataPoint).where(PGVectorDataPoint.c.id.in_(data_point_ids)) ) @@ -177,11 +177,11 @@ async def search( if query_text and not query_vector: query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + # Get PGVectorDataPoint Table from database + PGVectorDataPoint = await self.get_table(collection_name) + # Use async session to connect to the database async with self.get_async_session() as session: - # Get PGVectorDataPoint Table from database - PGVectorDataPoint = await self.get_table(collection_name) - # Find closest vectors to query_vector closest_items = await session.execute( select( diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 802aa3fc..ac4d08fb 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -41,7 +41,7 @@ async def main(): cognee.config.system_root_directory(cognee_directory_path) await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) + await cognee.prune.prune_system(metadata = True) dataset_name = "cs_explanations" @@ -65,7 +65,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "AI"))[0] + random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await cognee.search(SearchType.INSIGHTS, query=random_node_name) diff --git a/pyproject.toml b/pyproject.toml index 93ec8e0e..2af7a25a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ aiosqlite = "^0.20.0" pandas = "2.0.3" filetype = "^1.2.0" nltk = "^3.8.1" -dlt = {extras = ["sqlalchemy"], version = "^1.2.0"} +dlt = {extras = ["sqlalchemy"], version = "^1.3.0"} aiofiles = "^23.2.1" qdrant-client = "^1.9.0" graphistry = "^0.33.5" @@ -66,10 +66,7 @@ pydantic-settings = "^2.2.1" anthropic = "^0.26.1" sentry-sdk = {extras = 
["fastapi"], version = "^2.9.0"} fastapi-users = {version = "*", extras = ["sqlalchemy"]} -asyncpg = "^0.29.0" alembic = "^1.13.3" -pgvector = "^0.3.5" -psycopg2 = {version = "^2.9.10", optional = true} [tool.poetry.extras] filesystem = ["s3fs", "botocore"] @@ -77,9 +74,14 @@ cli = ["pipdeptree", "cron-descriptor"] weaviate = ["weaviate-client"] qdrant = ["qdrant-client"] neo4j = ["neo4j"] -postgres = ["psycopg2"] +postgres = ["psycopg2", "pgvector", "asyncpg"] notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"] +[tool.poetry.group.postgres.dependencies] +asyncpg = "^0.29.0" +pgvector = "^0.3.5" +psycopg2 = "^2.9.10" + [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" pytest-asyncio = "^0.21.1" From f8e35b328444569c22ee4b93ea33de005e4e2db1 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:32:06 +0100 Subject: [PATCH 14/19] fix: update poetry.lock --- poetry.lock | 79 ++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/poetry.lock b/poetry.lock index d30aa907..f9bc5a74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -597,17 +597,17 @@ css = ["tinycss2 (>=1.1.0,<1.5)"] [[package]] name = "boto3" -version = "1.35.55" +version = "1.35.57" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.55-py3-none-any.whl", hash = "sha256:c7a0a0bc5ae3bed5d38e8bfe5a56b31621e79bdd7c1ea6e5ba4326d820cde3a5"}, - {file = "boto3-1.35.55.tar.gz", hash = "sha256:82fa8cdb00731aeffe7a5829821ae78d75c7ae959b638c15ff3b4681192ace90"}, + {file = "boto3-1.35.57-py3-none-any.whl", hash = "sha256:9edf49640c79a05b0a72f4c2d1e24dfc164344b680535a645f455ac624dc3680"}, + {file = "boto3-1.35.57.tar.gz", hash = "sha256:db58348849a5af061f0f5ec9c3b699da5221ca83354059fdccb798e3ddb6b62a"}, ] [package.dependencies] -botocore = ">=1.35.55,<1.36.0" +botocore = ">=1.35.57,<1.36.0" jmespath = 
">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -616,13 +616,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.55" +version = "1.35.57" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.55-py3-none-any.whl", hash = "sha256:3d54739e498534c9d7a6e9732ae2d17ed29c7d5e29fe36c956d8488b859538b0"}, - {file = "botocore-1.35.55.tar.gz", hash = "sha256:61ae18f688250372d7b6046e35c86f8fd09a7c0f0064b52688f3490b4d6c9d6b"}, + {file = "botocore-1.35.57-py3-none-any.whl", hash = "sha256:92ddd02469213766872cb2399269dd20948f90348b42bf08379881d5e946cc34"}, + {file = "botocore-1.35.57.tar.gz", hash = "sha256:d96306558085baf0bcb3b022d7a8c39c93494f031edb376694d2b2dcd0e81327"}, ] [package.dependencies] @@ -2606,22 +2606,22 @@ colors = ["colorama (>=0.4.6)"] [[package]] name = "jedi" -version = "0.19.1" +version = "0.19.2" description = "An autocompletion tool for Python that can be used for text editors." 
optional = false python-versions = ">=3.6" files = [ - {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, - {file = "jedi-0.19.1.tar.gz", hash = "sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd"}, + {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, + {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, ] [package.dependencies] -parso = ">=0.8.3,<0.9.0" +parso = ">=0.8.4,<0.9.0" [package.extras] docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] -testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] +testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] [[package]] name = "jinja2" @@ -2755,15 +2755,18 @@ files = [ [[package]] name = "json5" -version = "0.9.25" +version = "0.9.27" description = "A Python implementation of the JSON5 data format." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.8.0" files = [ - {file = "json5-0.9.25-py3-none-any.whl", hash = "sha256:34ed7d834b1341a86987ed52f3f76cd8ee184394906b6e22a1e0deb9ab294e8f"}, - {file = "json5-0.9.25.tar.gz", hash = "sha256:548e41b9be043f9426776f05df8635a00fe06104ea51ed24b67f908856e151ae"}, + {file = "json5-0.9.27-py3-none-any.whl", hash = "sha256:17b43d78d3a6daeca4d7030e9bf22092dba29b1282cc2d0cfa56f6febee8dc93"}, + {file = "json5-0.9.27.tar.gz", hash = "sha256:5a19de4a6ca24ba664dc7d50307eb73ba9a16dea5d6bde85677ae85d3ed2d8e0"}, ] +[package.extras] +dev = ["build (==1.2.1)", "coverage (==7.5.3)", "mypy (==1.10.0)", "pip (==24.1)", "pylint (==3.2.3)", "ruff (==0.5.1)", "twine (==5.1.1)", "uv (==0.2.13)"] + [[package]] name = "jsonpatch" version = "1.33" @@ -4360,13 +4363,13 @@ files = [ [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] @@ -5007,7 +5010,7 @@ test = ["pytest", "pytest-xdist", "setuptools"] name = "psycopg2" version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, @@ -7032,13 +7035,13 @@ test = ["vcrpy (>=1.10.3)"] [[package]] name = "typer" -version = "0.12.5" +version = "0.13.0" 
description = "Typer, build great CLIs. Easy to code. Based on Python type hints." optional = false python-versions = ">=3.7" files = [ - {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"}, - {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"}, + {file = "typer-0.13.0-py3-none-any.whl", hash = "sha256:d85fe0b777b2517cc99c8055ed735452f2659cd45e451507c76f48ce5c1d00e2"}, + {file = "typer-0.13.0.tar.gz", hash = "sha256:f1c7198347939361eec90139ffa0fd8b3df3a2259d5852a0f7400e476d95985c"}, ] [package.dependencies] @@ -7333,19 +7336,15 @@ validators = "0.33.0" [[package]] name = "webcolors" -version = "24.8.0" +version = "24.11.1" description = "A library for working with the color formats defined by HTML and CSS." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "webcolors-24.8.0-py3-none-any.whl", hash = "sha256:fc4c3b59358ada164552084a8ebee637c221e4059267d0f8325b3b560f6c7f0a"}, - {file = "webcolors-24.8.0.tar.gz", hash = "sha256:08b07af286a01bcd30d583a7acadf629583d1f79bfef27dd2c2c5c263817277d"}, + {file = "webcolors-24.11.1-py3-none-any.whl", hash = "sha256:515291393b4cdf0eb19c155749a096f779f7d909f7cceea072791cb9095b92e9"}, + {file = "webcolors-24.11.1.tar.gz", hash = "sha256:ecb3d768f32202af770477b8b65f318fa4f566c22948673a977b00d589dd80f6"}, ] -[package.extras] -docs = ["furo", "sphinx", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-notfound-page", "sphinxext-opengraph"] -tests = ["coverage[toml]"] - [[package]] name = "webencodings" version = "0.5.1" @@ -7375,13 +7374,13 @@ test = ["websockets"] [[package]] name = "wheel" -version = "0.44.0" +version = "0.45.0" description = "A built-package format for Python" optional = false python-versions = ">=3.8" files = [ - {file = "wheel-0.44.0-py3-none-any.whl", hash = "sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f"}, - 
{file = "wheel-0.44.0.tar.gz", hash = "sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49"}, + {file = "wheel-0.45.0-py3-none-any.whl", hash = "sha256:52f0baa5e6522155090a09c6bd95718cc46956d1b51d537ea5454249edb671c7"}, + {file = "wheel-0.45.0.tar.gz", hash = "sha256:a57353941a3183b3d5365346b567a260a0602a0f8a635926a7dede41b94c674a"}, ] [package.extras] @@ -7718,13 +7717,13 @@ propcache = ">=0.2.0" [[package]] name = "zipp" -version = "3.20.2" +version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, + {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, + {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] [package.extras] @@ -7740,11 +7739,11 @@ cli = [] filesystem = ["botocore"] neo4j = ["neo4j"] notebook = [] -postgres = ["psycopg2"] +postgres = [] qdrant = ["qdrant-client"] weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "426fa990f2bdd15fa5be55392beb4cf77dba320f2e95cc503d1c0549d9758d64" +content-hash = "7c305c381d9327bd55e658cc955a6335411d85fc3e11f2f3dcebfdc5e3b70da0" From 30edd2dc5c992d4ffead520e7dd1a8584f242b11 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:38:33 +0100 Subject: [PATCH 15/19] fix: add postgres extras to dependencies --- poetry.lock | 6 +++--- pyproject.toml | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index f9bc5a74..270e6602 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4435,8 +4435,8 @@ files = [ 
[package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -7739,11 +7739,11 @@ cli = [] filesystem = ["botocore"] neo4j = ["neo4j"] notebook = [] -postgres = [] +postgres = ["asyncpg", "pgvector", "psycopg2"] qdrant = ["qdrant-client"] weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "7c305c381d9327bd55e658cc955a6335411d85fc3e11f2f3dcebfdc5e3b70da0" +content-hash = "fb09733ff7a70fb91c5f72ff0c8a8137b857557930a7aa025aad3154de4d8ceb" diff --git a/pyproject.toml b/pyproject.toml index 2af7a25a..0bc3849b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,6 +67,9 @@ anthropic = "^0.26.1" sentry-sdk = {extras = ["fastapi"], version = "^2.9.0"} fastapi-users = {version = "*", extras = ["sqlalchemy"]} alembic = "^1.13.3" +asyncpg = "^0.29.0" +pgvector = "^0.3.5" +psycopg2 = "^2.9.10" [tool.poetry.extras] filesystem = ["s3fs", "botocore"] @@ -77,10 +80,6 @@ neo4j = ["neo4j"] postgres = ["psycopg2", "pgvector", "asyncpg"] notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"] -[tool.poetry.group.postgres.dependencies] -asyncpg = "^0.29.0" -pgvector = "^0.3.5" -psycopg2 = "^2.9.10" [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" From 44954c12b0e4a4659daf82d3106e445351c494cd Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:54:00 +0100 Subject: [PATCH 16/19] fix: update entities collection name in cognee_demo notebook --- .../vector/pgvector/PGVectorAdapter.py | 30 ++++++++++--------- notebooks/cognee_demo.ipynb | 2 +- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git 
a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 1a7128a0..01691714 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -54,7 +54,6 @@ async def create_collection(self, collection_name: str, payload_schema=None): vector_size = self.embedding_engine.get_vector_size() if not await self.has_collection(collection_name): - class PGVectorDataPoint(Base): __tablename__ = collection_name __table_args__ = {"extend_existing": True} @@ -180,6 +179,8 @@ async def search( # Get PGVectorDataPoint Table from database PGVectorDataPoint = await self.get_table(collection_name) + closest_items = [] + # Use async session to connect to the database async with self.get_async_session() as session: # Find closest vectors to query_vector @@ -194,20 +195,21 @@ async def search( .limit(limit) ) - vector_list = [] - # Extract distances and find min/max for normalization - for vector in closest_items: - # TODO: Add normalization of similarity score - vector_list.append(vector) + vector_list = [] - # Create and return ScoredResult objects - return [ - ScoredResult( - id = UUID(str(row.id)), - payload = row.payload, - score = row.similarity - ) for row in vector_list - ] + # Extract distances and find min/max for normalization + for vector in closest_items: + # TODO: Add normalization of similarity score + vector_list.append(vector) + + # Create and return ScoredResult objects + return [ + ScoredResult( + id = UUID(str(row.id)), + payload = row.payload, + score = row.similarity + ) for row in vector_list + ] async def batch_search( self, diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 396d7b98..5f4dfa22 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -758,7 +758,7 @@ "from cognee.infrastructure.databases.vector import get_vector_engine\n", 
"\n", "vector_engine = get_vector_engine()\n", - "results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n", + "results = await search(vector_engine, \"Entity_name\", \"sarah.nguyen@example.com\")\n", "for result in results:\n", " print(result)" ] From a3b366753bc8474326b3ed3af3c86905585d0553 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:58:06 +0100 Subject: [PATCH 17/19] fix: rerun github workflow checks on push --- .github/workflows/test_neo4j.yml | 2 +- .github/workflows/test_notebook.yml | 2 +- .github/workflows/test_pgvector.yml | 2 +- .github/workflows/test_python_3_10.yml | 2 +- .github/workflows/test_python_3_11.yml | 2 +- .github/workflows/test_python_3_9.yml | 2 +- .github/workflows/test_qdrant.yml | 2 +- .github/workflows/test_weaviate.yml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_neo4j.yml b/.github/workflows/test_neo4j.yml index 5e94ad7f..0b47a55f 100644 --- a/.github/workflows/test_neo4j.yml +++ b/.github/workflows/test_neo4j.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test_notebook.yml b/.github/workflows/test_notebook.yml index 49365ac9..e5d10f0f 100644 --- a/.github/workflows/test_notebook.yml +++ b/.github/workflows/test_notebook.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: diff --git a/.github/workflows/test_pgvector.yml b/.github/workflows/test_pgvector.yml index 971208fb..52df86c7 100644 --- a/.github/workflows/test_pgvector.yml +++ b/.github/workflows/test_pgvector.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index 
625d472e..5a795403 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index 4ff6a9ce..22cdad32 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml index fae865a1..d6e7f8b9 100644 --- a/.github/workflows/test_python_3_9.yml +++ b/.github/workflows/test_python_3_9.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/test_qdrant.yml b/.github/workflows/test_qdrant.yml index 4a7bdc6a..a6347bd0 100644 --- a/.github/workflows/test_qdrant.yml +++ b/.github/workflows/test_qdrant.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: diff --git a/.github/workflows/test_weaviate.yml b/.github/workflows/test_weaviate.yml index 0199ad01..490f9075 100644 --- a/.github/workflows/test_weaviate.yml +++ b/.github/workflows/test_weaviate.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - main - types: [labeled] + types: [labeled, synchronize] concurrency: From ed9036a4368c439386fd7c1e18915e3afdbe5cee Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 17:59:14 +0100 Subject: [PATCH 18/19] fix: change entity collection name --- notebooks/cognee_demo.ipynb | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 5f4dfa22..d26476a4 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -788,7 +788,7 @@ "source": [ "from cognee.api.v1.search import SearchType\n", "\n", - "node = (await vector_engine.search(\"entities\", \"sarah.nguyen@example.com\"))[0]\n", + "node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n", "node_name = node.payload[\"name\"]\n", "\n", "search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n", From fd6398db7f6391561947ee5077ba829345fc5afc Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 11 Nov 2024 18:07:53 +0100 Subject: [PATCH 19/19] fix: cognee_demo notebook search --- notebooks/cognee_demo.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index d26476a4..06cd2a86 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "df16431d0f48b006", "metadata": { "ExecuteTime": { @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "9086abf3af077ab4", "metadata": { "ExecuteTime": { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "a9de0cc07f798b7f", "metadata": { "ExecuteTime": { @@ -393,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "185ff1c102d06111", "metadata": { "ExecuteTime": { @@ -437,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "d55ce4c58f8efb67", "metadata": { "ExecuteTime": { @@ -479,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "ca4ecc32721ad332", "metadata": { 
"ExecuteTime": { @@ -572,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "9f1a1dbd", "metadata": {}, "outputs": [], @@ -789,7 +789,7 @@ "from cognee.api.v1.search import SearchType\n", "\n", "node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n", - "node_name = node.payload[\"name\"]\n", + "node_name = node.payload[\"text\"]\n", "\n", "search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n", "print(\"\\n\\Extracted summaries are:\\n\")\n", @@ -881,7 +881,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.8" } }, "nbformat": 4,