Skip to content

Commit

Permalink
Merge branch 'dev' into test-ubuntu-24.04
Browse files Browse the repository at this point in the history
  • Loading branch information
dexters1 authored Jan 13, 2025
2 parents 0ce2339 + f9ddcaf commit 32d7b07
Show file tree
Hide file tree
Showing 51 changed files with 1,579 additions and 424 deletions.
1 change: 1 addition & 0 deletions .github/workflows/reusable_notebook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
run: |
Expand Down
20 changes: 20 additions & 0 deletions .github/workflows/test_llama_index_cognee_integration_notebook.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: test | llama index cognee integration notebook

on:
workflow_dispatch:
pull_request:
types: [labeled, synchronize]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
run_notebook_test:
uses: ./.github/workflows/reusable_notebook.yml
with:
notebook-location: notebooks/llama_index_cognee_integration.ipynb
secrets:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ repos:
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.8.3
rev: v0.9.0
hooks:
# Run the linter.
- id: ruff
Expand Down
12 changes: 3 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,9 @@ cognee.config.set_graphistry_config({
})
```

(Optional) To run the UI, go to cognee-frontend directory and run:
```
npm run dev
```
or run everything in a docker container:
```
docker-compose up
```
Then navigate to localhost:3000
(Optional) To run cognee with a UI, go to the cognee-mcp directory and follow the instructions.
You will be able to use cognee as an MCP tool to create graphs and query them.


If you want to use Cognee with PostgreSQL, make sure to set the following values in the .env file:
```
Expand Down
18 changes: 9 additions & 9 deletions cognee-mcp/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ name = "cognee-mcp"
version = "0.1.0"
description = "A MCP server project"
readme = "README.md"
requires-python = ">=3.11"
requires-python = ">=3.10"
dependencies = [
"mcp>=1.1.1",
"openai==1.52.0",
"openai==1.59.4",
"pydantic==2.8.2",
"python-dotenv==1.0.1",
"fastapi>=0.109.2,<0.110.0",
Expand All @@ -21,18 +21,18 @@ dependencies = [
"boto3>=1.26.125,<2.0.0",
"botocore>=1.35.54,<2.0.0",
"gunicorn>=20.1.0,<21.0.0",
"sqlalchemy==2.0.35",
"instructor==1.5.2",
"sqlalchemy==2.0.36",
"instructor==1.7.2",
"networkx>=3.2.1,<4.0.0",
"aiosqlite>=0.20.0,<0.21.0",
"pandas==2.0.3",
"pandas==2.2.3",
"filetype>=1.2.0,<2.0.0",
"nltk>=3.8.1,<4.0.0",
"dlt[sqlalchemy]>=1.4.1,<2.0.0",
"aiofiles>=23.2.1,<24.0.0",
"qdrant-client>=1.9.0,<2.0.0", # Optional
"graphistry>=0.33.5,<0.34.0",
"tenacity>=8.4.1,<9.0.0",
"tenacity>=9.0.0",
"weaviate-client==4.6.7", # Optional
"scikit-learn>=1.5.0,<2.0.0",
"pypdf>=4.1.0,<5.0.0",
Expand All @@ -44,8 +44,8 @@ dependencies = [
"langsmith==0.1.139", # Optional
"langdetect==1.0.9",
"posthog>=3.5.0,<4.0.0", # Optional
"lancedb==0.15.0",
"litellm==1.49.1",
"lancedb==0.16.0",
"litellm==1.57.2",
"groq==0.8.0", # Optional
"langfuse>=2.32.0,<3.0.0", # Optional
"pydantic-settings>=2.2.1,<3.0.0",
Expand All @@ -56,7 +56,7 @@ dependencies = [
"asyncpg==0.30.0", # Optional
"pgvector>=0.3.5,<0.4.0", # Optional
"psycopg2>=2.9.10,<3.0.0", # Optional
"llama-index-core>=0.11.22,<0.12.0", # Optional
"llama-index-core>=0.12.0", # Optional
"deepeval>=2.0.1,<3.0.0", # Optional
"transformers>=4.46.3,<5.0.0",
"pymilvus>=2.5.0,<3.0.0", # Optional
Expand Down
974 changes: 785 additions & 189 deletions cognee-mcp/uv.lock

Large diffs are not rendered by default.

11 changes: 2 additions & 9 deletions cognee/api/v1/cognify/code_graph_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path

from cognee.base_config import get_base_config
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.tasks.Task import Task
Expand Down Expand Up @@ -54,20 +53,14 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()

embedding_engine = get_embedding_engine()

cognee_config = get_cognify_config()
user = await get_default_user()

tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(
get_source_code_chunks,
embedding_model=embedding_engine.model,
task_config={"batch_size": 50},
),
Task(get_source_code_chunks, task_config={"batch_size": 50}),
Task(summarize_code, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]
Expand All @@ -78,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
Task(
extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ async def get_filtered_graph_data(self, attribute_filters):

query_edges = f"""
MATCH (n)-[r]->(m)
WHERE {where_clause} AND {where_clause.replace('n.', 'm.')}
WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
"""
result_edges = await self.query(query_edges)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
generate a single patch file that I can apply directly to this repository using git apply.
Please respond with a single patch file in the following format.
You are a senior software engineer. I need you to solve this issue by looking at the provided context and
generate a single patch file that I can apply directly to this repository using git apply.
Additionally, please make sure that you provide code only with correct syntax and
you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing
functions or classes, as they may be referenced from other code.
Please respond only with a single patch file in the following format without adding any additional context or string.
24 changes: 20 additions & 4 deletions cognee/modules/chunking/TextChunker.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,47 @@
from uuid import uuid5, NAMESPACE_OID
from typing import Optional
from uuid import NAMESPACE_OID, uuid5

from .models.DocumentChunk import DocumentChunk
from cognee.tasks.chunks import chunk_by_paragraph

from .models.DocumentChunk import DocumentChunk


class TextChunker:
document = None
max_chunk_size: int

chunk_index = 0
chunk_size = 0
token_count = 0

def __init__(self, document, get_text: callable, chunk_size: int = 1024):
def __init__(
self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024
):
self.document = document
self.max_chunk_size = chunk_size
self.get_text = get_text
self.max_tokens = max_tokens if max_tokens else float("inf")

def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
return word_count_fits and token_count_fits

def read(self):
paragraph_chunks = []
for content_text in self.get_text():
for chunk_data in chunk_by_paragraph(
content_text,
self.max_tokens,
self.max_chunk_size,
batch_paragraphs=True,
):
if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
if self.check_word_count_and_token_count(
self.chunk_size, self.token_count, chunk_data
):
paragraph_chunks.append(chunk_data)
self.chunk_size += chunk_data["word_count"]
self.token_count += chunk_data["token_count"]
else:
if len(paragraph_chunks) == 0:
yield DocumentChunk(
Expand Down Expand Up @@ -66,6 +81,7 @@ def read(self):
print(e)
paragraph_chunks = [chunk_data]
self.chunk_size = chunk_data["word_count"]
self.token_count = chunk_data["token_count"]

self.chunk_index += 1

Expand Down
1 change: 1 addition & 0 deletions cognee/modules/chunking/models/DocumentChunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
chunk_index: int
cut_type: str
is_part_of: Document
pydantic_type: str = "DocumentChunk"
contains: List[Entity] = None

_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
4 changes: 3 additions & 1 deletion cognee/modules/cognify/config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent
from typing import Optional
import os


class CognifyConfig(BaseSettings):
classification_model: object = DefaultContentPrediction
summarization_model: object = SummarizedContent

max_tokens: Optional[int] = os.getenv("MAX_TOKENS")
model_config = SettingsConfigDict(env_file=".env", extra="allow")

def to_dict(self) -> dict:
Expand Down
11 changes: 8 additions & 3 deletions cognee/modules/data/processing/document_types/AudioDocument.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from typing import Optional

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from .Document import Document

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class AudioDocument(Document):
Expand All @@ -10,12 +13,14 @@ def create_transcript(self):
result = get_llm_client().create_transcript(self.raw_data_location)
return result.text

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
# Transcribe the audio file

text = self.create_transcript()

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
)

yield from chunker.read()
3 changes: 2 additions & 1 deletion cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import Optional
from uuid import UUID

from cognee.infrastructure.engine import DataPoint
Expand All @@ -10,5 +11,5 @@ class Document(DataPoint):
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}

def read(self, chunk_size: int, chunker=str) -> str:
def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
pass
11 changes: 8 additions & 3 deletions cognee/modules/data/processing/document_types/ImageDocument.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from typing import Optional

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from .Document import Document

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class ImageDocument(Document):
Expand All @@ -10,11 +13,13 @@ def transcribe_image(self):
result = get_llm_client().transcribe_image(self.raw_data_location)
return result.choices[0].message.content

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
# Transcribe the image file
text = self.transcribe_image()

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
)

yield from chunker.read()
11 changes: 8 additions & 3 deletions cognee/modules/data/processing/document_types/PdfDocument.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from typing import Optional

from pypdf import PdfReader
from .Document import Document

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class PdfDocument(Document):
type: str = "pdf"

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
file = PdfReader(self.raw_data_location)

def get_text():
Expand All @@ -15,7 +18,9 @@ def get_text():
yield page_text

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
)

yield from chunker.read()

Expand Down
10 changes: 7 additions & 3 deletions cognee/modules/data/processing/document_types/TextDocument.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .Document import Document
from typing import Optional

from .ChunkerMapping import ChunkerConfig
from .Document import Document


class TextDocument(Document):
type: str = "text"

def read(self, chunk_size: int, chunker: str):
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def get_text():
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
while True:
Expand All @@ -18,6 +20,8 @@ def get_text():

chunker_func = ChunkerConfig.get_chunker(chunker)

chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
)

yield from chunker.read()
Loading

0 comments on commit 32d7b07

Please sign in to comment.