Skip to content

Commit

Permalink
Merge branch 'dev' into COG-970-refactor-tokenizing
Browse files Browse the repository at this point in the history
  • Loading branch information
dexters1 committed Jan 24, 2025
2 parents 902979c + 72e332f commit 77a7285
Show file tree
Hide file tree
Showing 12 changed files with 211 additions and 29 deletions.
40 changes: 20 additions & 20 deletions .github/workflows/profiling.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,32 +68,32 @@ jobs:
echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
# Run profiler on the base branch
- name: Run profiler on base branch
env:
BASE_SHA: ${{ env.BASE_SHA }}
run: |
echo "Profiling the base branch for code_graph_pipeline.py"
echo "Checking out base SHA: $BASE_SHA"
git checkout $BASE_SHA
echo "This is the working directory: $PWD"
# Ensure the script is executable
chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
# Run Scalene
poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py
# Run profiler on head branch
# - name: Run profiler on head branch
# - name: Run profiler on base branch
# env:
# HEAD_SHA: ${{ env.HEAD_SHA }}
# BASE_SHA: ${{ env.BASE_SHA }}
# run: |
# echo "Profiling the head branch for code_graph_pipeline.py"
# echo "Checking out head SHA: $HEAD_SHA"
# git checkout $HEAD_SHA
# echo "Profiling the base branch for code_graph_pipeline.py"
# echo "Checking out base SHA: $BASE_SHA"
# git checkout $BASE_SHA
# echo "This is the working directory: $PWD"
# # Ensure the script is executable
# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
# # Run Scalene
# poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
# poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py

# Run profiler on head branch
- name: Run profiler on head branch
env:
HEAD_SHA: ${{ env.HEAD_SHA }}
run: |
echo "Profiling the head branch for code_graph_pipeline.py"
echo "Checking out head SHA: $HEAD_SHA"
git checkout $HEAD_SHA
echo "This is the working directory: $PWD"
# Ensure the script is executable
chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
# Run Scalene
poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
# # Compare profiling results
# - name: Compare profiling results
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/reusable_python_example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ on:
description: "Location of example script to run"
required: true
type: string
arguments:
description: "Arguments for example script"
required: false
type: string
secrets:
GRAPHISTRY_USERNAME:
required: true
Expand Down Expand Up @@ -53,4 +57,4 @@ jobs:
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
run: poetry run python ${{ inputs.example-location }}
run: poetry run python ${{ inputs.example-location }} ${{ inputs.arguments }}
22 changes: 22 additions & 0 deletions .github/workflows/test_code_graph_example.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: test | code graph example

# End-to-end run of the code-graph example via the shared reusable workflow.
on:
  workflow_dispatch:
  pull_request:
    types: [labeled, synchronize]

# Only one run per PR (or ref) at a time; newer pushes cancel older runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  run_simple_example_test:
    uses: ./.github/workflows/reusable_python_example.yml
    with:
      example-location: ./examples/python/code_graph_example.py
      arguments: "--repo_path ./evals"
    secrets:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
      GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
32 changes: 32 additions & 0 deletions Dockerfile_modal
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
FROM python:3.11-slim

# --- Runtime configuration --------------------------------------------------
# PIP_NO_CACHE_DIR keeps pip from persisting a wheel cache in image layers.
ENV PIP_NO_CACHE_DIR=true
ENV PATH="${PATH}:/root/.poetry/bin"
ENV PYTHONPATH=/app
ENV RUN_MODE=modal
ENV SKIP_MIGRATIONS=true

# --- System dependencies ----------------------------------------------------
# build-essential/gcc: compile native wheels; libpq-dev: PostgreSQL client
# builds; git/curl: fetch VCS/remote dependencies. The apt lists are removed
# in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    gcc \
    git \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only the dependency manifests first so the (slow) poetry install layer
# stays cached until pyproject.toml / poetry.lock actually change.
COPY pyproject.toml poetry.lock /app/

RUN pip install poetry

# --no-root: the application source is copied afterwards, so the project
# itself cannot be installed yet; --without dev keeps test/lint tooling out.
RUN poetry install --all-extras --no-root --without dev

# Application code last: source edits do not invalidate the dependency layers.
COPY cognee/ /app/cognee
COPY README.md /app/README.md
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,28 @@ Please see the cognee [Development Guide](https://docs.cognee.ai/quickstart/) fo
```bash
pip install cognee
```
### Deployment at Scale (Modal)

Scale cognee in 4(+1) simple steps to handle enterprise workloads using [Modal](https://modal.com)'s GPU-powered infrastructure.

**1. Install the modal python client**
```bash
pip install modal
```
**2. Create a free account on [Modal](https://modal.com)**

**3. Set Up Modal API Key**
```bash
modal token set --token-id TOKEN_ID --token-secret TOKEN_SECRET --profile=PROFILE
modal profile activate PROFILE
```
**4. Run cognee example**

This simple example will deploy separate cognee instances, each building its own memory store and answering a list of questions at scale.
```bash
modal run -d modal_deployment.py
```
**5. Change the `modal_deployment.py` script and develop your own AI memory at scale 🚀**

## 💫 Contributors

Expand Down
1 change: 1 addition & 0 deletions cognee-mcp/src/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .server import mcp


def main():
"""Main entry point for the package."""
mcp.run(transport="stdio")
9 changes: 6 additions & 3 deletions cognee-mcp/src/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

# Create server parameters for stdio connection
server_params = StdioServerParameters(
command="mcp", # Executable
args=["run", "src/server.py"], # Optional command line arguments
env=None # Optional environment variables
command="mcp", # Executable
args=["run", "src/server.py"], # Optional command line arguments
env=None, # Optional environment variables
)

text = """
Expand All @@ -27,6 +27,7 @@
more accurate classifications or predictions over time.
"""


async def run():
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write, timedelta(minutes=3)) as session:
Expand All @@ -37,6 +38,8 @@ async def run():

print(f"Cognify result: {toolResult}")


if __name__ == "__main__":
import asyncio

asyncio.run(run())
7 changes: 4 additions & 3 deletions cognee-mcp/src/server.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os
import cognee
import importlib.util

# from PIL import Image as PILImage
from mcp.server.fastmcp import FastMCP
from cognee.api.v1.search import SearchType
from cognee.shared.data_models import KnowledgeGraph

mcp = FastMCP("cognee", timeout=120000)


@mcp.tool()
async def cognify(text: str, graph_model_file: str = None, graph_model_name: str = None) -> str:
"""Build knowledge graph from the input text"""
Expand All @@ -19,9 +21,9 @@ async def cognify(text: str, graph_model_file: str = None, graph_model_name: str
await cognee.add(text)

try:
await cognee.cognify(graph_model=graph_model)
await cognee.cognify(graph_model=graph_model)
except Exception as e:
raise ValueError(f"Failed to cognify: {str(e)}")
raise ValueError(f"Failed to cognify: {str(e)}")

return "Ingested"

Expand Down Expand Up @@ -57,7 +59,6 @@ async def prune() -> str:
# raise ValueError(f"Failed to create visualization: {str(e)}")



def node_to_string(node):
node_data = ", ".join(
[f'{key}: "{value}"' for key, value in node.items() if key in ["id", "name"]]
Expand Down
1 change: 0 additions & 1 deletion cognee/tasks/repo_processor/get_non_code_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
from cognee.modules.data.models import Data
from cognee.modules.data.operations.write_metadata import write_metadata
from cognee.modules.ingestion.data_types import BinaryData
from cognee.modules.users.methods import get_default_user
from cognee.shared.CodeGraphEntities import Repository
Expand Down
94 changes: 94 additions & 0 deletions modal_deployment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import modal
import os
import logging
import asyncio
import cognee
import signal

from cognee.api.v1.search import SearchType
from cognee.shared.utils import setup_logging

# Modal application handle; the functions below register against this app.
app = modal.App("cognee-runner")

# Container image: start from the repo's Dockerfile_modal, overlay the local
# dependency manifests, forward ENV/LLM_API_KEY from the deploying shell, then
# install the project dependencies with poetry and add protobuf/h2 on top.
# NOTE(review): os.getenv() returns None when a variable is unset — confirm
# both ENV and LLM_API_KEY are exported before deploying.
image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env({"ENV": os.getenv("ENV"), "LLM_API_KEY": os.getenv("LLM_API_KEY")})
    .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
    .pip_install("protobuf", "h2")
)


@app.function(image=image, concurrency_limit=10)
async def entry(text: str, query: str):
    """Build a fresh cognee memory from `text` and answer `query` against it.

    Runs remotely on Modal (at most 10 concurrent containers). Each call
    wipes prior data and system state first, so invocations are independent.
    """
    setup_logging(logging.ERROR)

    # Start from a clean slate: drop previously ingested data and metadata.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Ingest the passage, build the knowledge graph, then run the query.
    await cognee.add(text)
    await cognee.cognify()
    answers = await cognee.search(SearchType.GRAPH_COMPLETION, query_text=query)

    first_answer = answers[0] if answers else None
    return {"text": text, "query": query, "answer": first_answer}


@app.local_entrypoint()
async def main():
    """Fan ten independent (text, query) jobs out to Modal and print answers.

    Each job invokes `entry` in its own container, so every question is
    answered against a memory built only from its paired passage.
    """
    # Fixed evaluation set: each item pairs a source passage with a question
    # whose answer is contained in that passage.
    text_queries = [
        {
            "text": "NASA's Artemis program aims to return humans to the Moon by 2026, focusing on sustainable exploration and preparing for future Mars missions.",
            "query": "When does NASA plan to return humans to the Moon under the Artemis program?",
        },
        {
            "text": "According to a 2022 UN report, global food waste amounts to approximately 931 million tons annually, with households contributing 61% of the total.",
            "query": "How much food waste do households contribute annually according to the 2022 UN report?",
        },
        {
            "text": "The 2021 census data revealed that Tokyo's population reached 14 million, reflecting a 2.1% increase compared to the previous census conducted in 2015.",
            "query": "What was Tokyo's population according to the 2021 census data?",
        },
        {
            "text": "A recent study published in the Journal of Nutrition found that consuming 30 grams of almonds daily can lower LDL cholesterol levels by 7% over a 12-week period.",
            "query": "How much can daily almond consumption lower LDL cholesterol according to the study?",
        },
        {
            "text": "Amazon's Prime membership grew to 200 million subscribers in 2023, marking a 10% increase from the previous year, driven by exclusive content and faster delivery options.",
            "query": "How many Prime members did Amazon have in 2023?",
        },
        {
            "text": "A new report by the International Energy Agency states that global renewable energy capacity increased by 295 gigawatts in 2022, primarily driven by solar and wind power expansion.",
            "query": "By how much did global renewable energy capacity increase in 2022 according to the report?",
        },
        {
            "text": "The World Health Organization reported in 2023 that the global life expectancy has risen to 73.4 years, an increase of 5.5 years since the year 2000.",
            "query": "What is the current global life expectancy according to the WHO's 2023 report?",
        },
        {
            "text": "The FIFA World Cup 2022 held in Qatar attracted a record-breaking audience of 5 billion people across various digital and traditional broadcasting platforms.",
            "query": "How many people watched the FIFA World Cup 2022?",
        },
        {
            "text": "The European Space Agency's JUICE mission, launched in 2023, aims to explore Jupiter's icy moons, including Ganymede, Europa, and Callisto, over the next decade.",
            "query": "Which moons is the JUICE mission set to explore?",
        },
        {
            "text": "According to a report by the International Labour Organization, the global unemployment rate in 2023 was estimated at 5.4%, reflecting a slight decrease compared to the previous year.",
            "query": "What was the global unemployment rate in 2023 according to the ILO?",
        },
    ]

    # Launch all jobs concurrently; entry.remote.aio returns an awaitable
    # handle for the remote invocation.
    tasks = [entry.remote.aio(item["text"], item["query"]) for item in text_queries]

    results = await asyncio.gather(*tasks)

    print("\nFinal Results:")

    for result in results:
        print(result)
        print("----")

    # NOTE(review): self-SIGTERM looks like a workaround to force the local
    # entrypoint process to exit once all results are printed — confirm a
    # normal return actually hangs before removing this.
    os.kill(os.getpid(), signal.SIGTERM)
3 changes: 2 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ httpx = "0.27.0"
bokeh="^3.6.2"
nltk = "3.9.1"
google-generativeai = {version = "^0.8.4", optional = true}
parso = {version = "^0.8.4", optional = true}
jedi = {version = "^0.19.2", optional = true}


[tool.poetry.extras]
Expand All @@ -96,6 +98,7 @@ falkordb = ["falkordb"]
groq = ["groq"]
milvus = ["pymilvus"]
docs = ["unstructured"]
codegraph = ["parso", "jedi"]

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
Expand Down

0 comments on commit 77a7285

Please sign in to comment.