Merge branch 'dev' into test-ubuntu-24.04

topoteretes · Jan 13, 2025 · 32d7b07 · 32d7b07
2 parents 0ce2339 + f9ddcaf
commit 32d7b07
Show file tree

Hide file tree

Showing 51 changed files with 1,579 additions and 424 deletions.
diff --git a/.github/workflows/reusable_notebook.yml b/.github/workflows/reusable_notebook.yml
@@ -51,6 +51,7 @@ jobs:
         env:
           ENV: 'dev'
           LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
           GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
         run: |

diff --git a/.github/workflows/test_llama_index_cognee_integration_notebook.yml b/.github/workflows/test_llama_index_cognee_integration_notebook.yml
@@ -0,0 +1,20 @@
+name: test | llama index cognee integration notebook
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [labeled, synchronize]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run_notebook_test:
+      uses: ./.github/workflows/reusable_notebook.yml
+      with:
+        notebook-location: notebooks/llama_index_cognee_integration.ipynb
+      secrets:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
+        GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
     -   id: check-added-large-files
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
-  rev: v0.8.3
+  rev: v0.9.0
   hooks:
     # Run the linter.
     - id: ruff

diff --git a/README.md b/README.md
@@ -101,15 +101,9 @@ cognee.config.set_graphistry_config({
 })
 ```
 
-(Optional) To run the UI, go to cognee-frontend directory and run:
-```
-npm run dev
-```
-or run everything in a docker container:
-```
-docker-compose up
-```
-Then navigate to localhost:3000
+(Optional) To run cognee with a UI, go to the cognee-mcp directory and follow the instructions there.
+You will be able to use cognee as an MCP tool to create graphs and query them.
+
 
 If you want to use Cognee with PostgreSQL, make sure to set the following values in the .env file:
 ```

diff --git a/cognee-mcp/pyproject.toml b/cognee-mcp/pyproject.toml
@@ -3,10 +3,10 @@ name = "cognee-mcp"
 version = "0.1.0"
 description = "A MCP server project"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 dependencies = [
     "mcp>=1.1.1",
-    "openai==1.52.0",
+    "openai==1.59.4",
     "pydantic==2.8.2",
     "python-dotenv==1.0.1",
     "fastapi>=0.109.2,<0.110.0",
@@ -21,18 +21,18 @@ dependencies = [
     "boto3>=1.26.125,<2.0.0",
     "botocore>=1.35.54,<2.0.0",
     "gunicorn>=20.1.0,<21.0.0",
-    "sqlalchemy==2.0.35",
-    "instructor==1.5.2",
+    "sqlalchemy==2.0.36",
+    "instructor==1.7.2",
     "networkx>=3.2.1,<4.0.0",
     "aiosqlite>=0.20.0,<0.21.0",
-    "pandas==2.0.3",
+    "pandas==2.2.3",
     "filetype>=1.2.0,<2.0.0",
     "nltk>=3.8.1,<4.0.0",
     "dlt[sqlalchemy]>=1.4.1,<2.0.0",
     "aiofiles>=23.2.1,<24.0.0",
     "qdrant-client>=1.9.0,<2.0.0", # Optional
     "graphistry>=0.33.5,<0.34.0",
-    "tenacity>=8.4.1,<9.0.0",
+    "tenacity>=9.0.0",
     "weaviate-client==4.6.7", # Optional
     "scikit-learn>=1.5.0,<2.0.0",
     "pypdf>=4.1.0,<5.0.0",
@@ -44,8 +44,8 @@ dependencies = [
     "langsmith==0.1.139", # Optional
     "langdetect==1.0.9",
     "posthog>=3.5.0,<4.0.0", # Optional
-    "lancedb==0.15.0",
-    "litellm==1.49.1",
+    "lancedb==0.16.0",
+    "litellm==1.57.2",
     "groq==0.8.0", # Optional
     "langfuse>=2.32.0,<3.0.0", # Optional
     "pydantic-settings>=2.2.1,<3.0.0",
@@ -56,7 +56,7 @@ dependencies = [
     "asyncpg==0.30.0", # Optional
     "pgvector>=0.3.5,<0.4.0", # Optional
     "psycopg2>=2.9.10,<3.0.0", # Optional
-    "llama-index-core>=0.11.22,<0.12.0", # Optional
+    "llama-index-core>=0.12.0", # Optional
     "deepeval>=2.0.1,<3.0.0", # Optional
     "transformers>=4.46.3,<5.0.0",
     "pymilvus>=2.5.0,<3.0.0", # Optional

diff --git a/cognee-mcp/uv.lock b/cognee-mcp/uv.lock
diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 
 from cognee.base_config import get_base_config
-from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
 from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
@@ -54,20 +53,14 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
     await cognee.prune.prune_system(metadata=True)
     await create_db_and_tables()
 
-    embedding_engine = get_embedding_engine()
-
     cognee_config = get_cognify_config()
     user = await get_default_user()
 
     tasks = [
         Task(get_repo_file_dependencies),
         Task(enrich_dependency_graph),
         Task(expand_dependency_graph, task_config={"batch_size": 50}),
-        Task(
-            get_source_code_chunks,
-            embedding_model=embedding_engine.model,
-            task_config={"batch_size": 50},
-        ),
+        Task(get_source_code_chunks, task_config={"batch_size": 50}),
         Task(summarize_code, task_config={"batch_size": 50}),
         Task(add_data_points, task_config={"batch_size": 50}),
     ]
@@ -78,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
             Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
             Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
             Task(classify_documents),
-            Task(extract_chunks_from_documents),
+            Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
             Task(
                 extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
             ),

diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -493,7 +493,7 @@ async def get_filtered_graph_data(self, attribute_filters):
 
         query_edges = f"""
         MATCH (n)-[r]->(m)
-        WHERE {where_clause} AND {where_clause.replace('n.', 'm.')}
+        WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
         RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
         """
         result_edges = await self.query(query_edges)

diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt
@@ -1,3 +1,6 @@
-I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and 
-generate a single patch file that I can apply directly to this repository using git apply. 
-Please respond with a single patch file in the following format.
+You are a senior software engineer. I need you to solve this issue by looking at the provided context and
+generate a single patch file that I can apply directly to this repository using git apply.
+Additionally, please make sure that you provide code only with correct syntax and
+you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing
+functions or classes, as they may be referenced from other code.
+Please respond only with a single patch file in the following format without adding any additional context or string.
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
@@ -1,32 +1,47 @@
-from uuid import uuid5, NAMESPACE_OID
+from typing import Optional
+from uuid import NAMESPACE_OID, uuid5
 
-from .models.DocumentChunk import DocumentChunk
 from cognee.tasks.chunks import chunk_by_paragraph
 
+from .models.DocumentChunk import DocumentChunk
+
 
 class TextChunker:
     document = None
     max_chunk_size: int
 
     chunk_index = 0
     chunk_size = 0
+    token_count = 0
 
-    def __init__(self, document, get_text: callable, chunk_size: int = 1024):
+    def __init__(
+        self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024
+    ):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
+        self.max_tokens = max_tokens if max_tokens else float("inf")
+
+    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
+        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
+        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
+        return word_count_fits and token_count_fits
 
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
+                self.max_tokens,
                 self.max_chunk_size,
                 batch_paragraphs=True,
             ):
-                if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
+                if self.check_word_count_and_token_count(
+                    self.chunk_size, self.token_count, chunk_data
+                ):
                     paragraph_chunks.append(chunk_data)
                     self.chunk_size += chunk_data["word_count"]
+                    self.token_count += chunk_data["token_count"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
@@ -66,6 +81,7 @@ def read(self):
                             print(e)
                         paragraph_chunks = [chunk_data]
                         self.chunk_size = chunk_data["word_count"]
+                        self.token_count = chunk_data["token_count"]
 
                     self.chunk_index += 1
 

diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py
@@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
     chunk_index: int
     cut_type: str
     is_part_of: Document
+    pydantic_type: str = "DocumentChunk"
     contains: List[Entity] = None
 
     _metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py
@@ -1,12 +1,14 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent
+from typing import Optional
+import os
 
 
 class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
-
+    max_tokens: Optional[int] = os.getenv("MAX_TOKENS")
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
     def to_dict(self) -> dict:

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,9 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 
 class AudioDocument(Document):
@@ -10,12 +13,14 @@ def create_transcript(self):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the audio file
 
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
+        )
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from uuid import UUID
 
 from cognee.infrastructure.engine import DataPoint
@@ -10,5 +11,5 @@ class Document(DataPoint):
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
 
-    def read(self, chunk_size: int, chunker=str) -> str:
+    def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -1,6 +1,9 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 
 class ImageDocument(Document):
@@ -10,11 +13,13 @@ def transcribe_image(self):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
+        )
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -1,12 +1,15 @@
+from typing import Optional
+
 from pypdf import PdfReader
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -15,7 +18,9 @@ def get_text():
                 yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
+        )
 
         yield from chunker.read()
 

diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -1,11 +1,13 @@
-from .Document import Document
+from typing import Optional
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -18,6 +20,8 @@ def get_text():
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
 
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
+        )
 
         yield from chunker.read()