Cog 970 refactor tokenizing #468

Merged · 23 commits · Jan 29, 2025

Commits
93249c7
fix: Initial commit to resolve issue with using tokenizer based on LLMs
dexters1 Jan 21, 2025
294ed1d
feat: Add HuggingFace Tokenizer support
dexters1 Jan 23, 2025
b686376
feat: Add gemini tokenizer to cognee
dexters1 Jan 23, 2025
b25a82e
chore: Add google-generativeai as gemini optional dependency to Cognee
dexters1 Jan 23, 2025
1319944
docs: Update .env.template to include llm and embedding options
dexters1 Jan 23, 2025
6d5679f
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 23, 2025
7dea1d5
refactor: Add specific max token values to embedding models
dexters1 Jan 23, 2025
844d99c
docs: Remove commented code
dexters1 Jan 23, 2025
902979c
refactor: Refactor get source code chunks based on tokenizer rework
dexters1 Jan 24, 2025
77a7285
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 24, 2025
0a9f134
refactor: Change variable and function names based on PR comments
dexters1 Jan 28, 2025
49f6097
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 28, 2025
3db7f85
feat: Add max_chunk_tokens value to chunkers
dexters1 Jan 28, 2025
b6e21ea
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 28, 2025
4154436
test: Change test_by_paragraph tests to accommodate to change
dexters1 Jan 28, 2025
e0b7be7
Merge branch 'COG-970-refactor-tokenizing' of github.com:topoteretes/…
dexters1 Jan 28, 2025
dc0450d
test: Update document tests regarding max chunk tokens
dexters1 Jan 28, 2025
4e56cd6
refactor: Add max chunk tokens to code graph pipeline
dexters1 Jan 28, 2025
3e29c3d
docs: Update notebook to work with changes to max chunk tokens
dexters1 Jan 28, 2025
6f8cbdb
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 28, 2025
710ca78
Merge branch 'dev' into COG-970-refactor-tokenizing
dexters1 Jan 28, 2025
a8644e0
feat: Use litellm max token size as default for model, if model exist…
dexters1 Jan 28, 2025
8602186
refactor: add suggestions from PR
dexters1 Jan 28, 2025
18 changes: 17 additions & 1 deletion .env.template
@@ -1,12 +1,28 @@
ENV="local"
TOKENIZERS_PARALLELISM="false"
LLM_API_KEY=

# LLM settings
LLM_API_KEY=""
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"

GRAPHISTRY_USERNAME=
GRAPHISTRY_PASSWORD=

SENTRY_REPORTING_URL=

# Embedding settings
EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY=""
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191

# "neo4j" or "networkx"
GRAPH_DATABASE_PROVIDER="networkx"
# Not needed if using networkx
3 changes: 2 additions & 1 deletion cognee/api/v1/cognify/code_graph_pipeline.py
@@ -20,6 +20,7 @@
from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code, summarize_text
from cognee.infrastructure.llm import get_max_chunk_tokens

monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
@@ -57,7 +58,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
Task(ingest_data, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()),
Task(
extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
),
5 changes: 4 additions & 1 deletion cognee/api/v1/cognify/cognify_v2.py
@@ -4,6 +4,7 @@

from pydantic import BaseModel

from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@@ -151,7 +152,9 @@ async def get_default_tasks(
default_tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user=user, permissions=["write"]),
Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
Task(
extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
), # Extract text chunks based on the document type.
Task(
extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
), # Generate knowledge graphs from the document chunks.
@@ -6,6 +6,9 @@
import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
from cognee.infrastructure.llm.tokenizer.Gemini import GeminiTokenizer
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer

litellm.set_verbose = False
logger = logging.getLogger("LiteLLMEmbeddingEngine")
@@ -15,23 +18,29 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
endpoint: str
api_version: str
provider: str
model: str
dimensions: int
mock: bool

def __init__(
self,
provider: str = "openai",
model: Optional[str] = "text-embedding-3-large",
dimensions: Optional[int] = 3072,
api_key: str = None,
endpoint: str = None,
api_version: str = None,
max_tokens: int = 512,
):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.provider = provider
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.tokenizer = self.get_tokenizer()

enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
if isinstance(enable_mocking, bool):
@@ -104,3 +113,18 @@ async def exponential_backoff(attempt):

def get_vector_size(self) -> int:
return self.dimensions

def get_tokenizer(self):
logger.debug(f"Loading tokenizer for model {self.model}...")
# If model also contains provider information, extract only model information
model = self.model.split("/")[-1]

if "openai" in self.provider.lower():
tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
elif "gemini" in self.provider.lower():
tokenizer = GeminiTokenizer(model=model, max_tokens=self.max_tokens)
else:
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)

logger.debug(f"Tokenizer loaded for model: {self.model}")
return tokenizer
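
For context, a hedged usage sketch of the provider-based tokenizer selection above; the import path and constructor arguments are assumptions read off this diff, not verified against the full module.

# Illustrative sketch only: module path and argument values are assumptions.
from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import (
    LiteLLMEmbeddingEngine,
)

# An "openai" provider should resolve to TikTokenTokenizer, "gemini" to GeminiTokenizer,
# and anything else to HuggingFaceTokenizer.
engine = LiteLLMEmbeddingEngine(
    provider="openai",
    model="openai/text-embedding-3-large",
    dimensions=3072,
    max_tokens=8191,
)
print(type(engine.tokenizer).__name__)
print(engine.tokenizer.count_tokens("How many tokens does this sentence use?"))
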
5 changes: 3 additions & 2 deletions cognee/infrastructure/databases/vector/embeddings/config.py
@@ -4,12 +4,13 @@


class EmbeddingConfig(BaseSettings):
embedding_model: Optional[str] = "text-embedding-3-large"
embedding_provider: Optional[str] = "openai"
embedding_model: Optional[str] = "openai/text-embedding-3-large"
embedding_dimensions: Optional[int] = 3072
embedding_endpoint: Optional[str] = None
embedding_api_key: Optional[str] = None
embedding_api_version: Optional[str] = None

embedding_max_tokens: Optional[int] = 8191
model_config = SettingsConfigDict(env_file=".env", extra="allow")


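As a side note, a small hedged example of how these BaseSettings fields pick up the new EMBEDDING_* entries from .env; the values are the defaults from the template above.

# Illustrative: pydantic BaseSettings maps EMBEDDING_* environment variables
# (or .env entries) onto the lower-cased field names above.
import os

os.environ["EMBEDDING_PROVIDER"] = "openai"
os.environ["EMBEDDING_MAX_TOKENS"] = "8191"

from cognee.infrastructure.databases.vector.embeddings.config import EmbeddingConfig

config = EmbeddingConfig()
print(config.embedding_provider)    # "openai"
print(config.embedding_max_tokens)  # 8191 (parsed to int)
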
@@ -10,9 +10,11 @@ def get_embedding_engine() -> EmbeddingEngine:

return LiteLLMEmbeddingEngine(
# If OpenAI API is used for embeddings, litellm needs only the api_key.
provider=config.embedding_provider,
api_key=config.embedding_api_key or llm_config.llm_api_key,
endpoint=config.embedding_endpoint,
api_version=config.embedding_api_version,
model=config.embedding_model,
dimensions=config.embedding_dimensions,
max_tokens=config.embedding_max_tokens,
)
1 change: 1 addition & 0 deletions cognee/infrastructure/llm/__init__.py
@@ -1 +1,2 @@
from .config import get_llm_config
from .utils import get_max_chunk_tokens
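
The utils module that provides get_max_chunk_tokens is not part of the diff shown here; the following is only a plausible sketch of what such a helper could look like, and the exact limit arithmetic is an assumption.

# Hypothetical sketch of cognee/infrastructure/llm/utils.py (file not shown in this view).
# Assumption: the chunk budget respects both the embedding tokenizer limit and the
# LLM context window; the real helper may compute this differently.
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
from cognee.infrastructure.llm.config import get_llm_config


def get_max_chunk_tokens() -> int:
    embedding_engine = get_embedding_engine()
    llm_config = get_llm_config()

    # Leave part of the LLM window for prompts and generated output.
    embedding_limit = embedding_engine.tokenizer.max_tokens
    llm_limit = llm_config.llm_max_tokens // 2
    return min(embedding_limit, llm_limit)
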
3 changes: 2 additions & 1 deletion cognee/infrastructure/llm/anthropic/adapter.py
@@ -14,11 +14,12 @@ class AnthropicAdapter(LLMInterface):
name = "Anthropic"
model: str

def __init__(self, model: str = None):
def __init__(self, max_tokens: int, model: str = None):
self.aclient = instructor.patch(
create=anthropic.Anthropic().messages.create, mode=instructor.Mode.ANTHROPIC_TOOLS
)
self.model = model
self.max_tokens = max_tokens
Comment on lines +17 to +22 (review comment: 🛠️ Refactor suggestion / ⚠️ Potential issue)

Use configured max_tokens instead of hardcoded value

The constructor accepts max_tokens but the acreate_structured_output method uses a hardcoded value of 4096.

Apply this diff to use the configured value:

     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
         """Generate a response from a user query."""
 
         return await self.aclient(
             model=self.model,
-            max_tokens=4096,
+            max_tokens=self.max_tokens,
             max_retries=5,
             messages=[

Committable suggestion skipped: line range outside the PR's diff.


async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
2 changes: 2 additions & 0 deletions cognee/infrastructure/llm/config.py
@@ -11,6 +11,7 @@ class LLMConfig(BaseSettings):
llm_api_version: Optional[str] = None
llm_temperature: float = 0.0
llm_streaming: bool = False
llm_max_tokens: int = 16384
transcription_model: str = "whisper-1"

model_config = SettingsConfigDict(env_file=".env", extra="allow")
@@ -24,6 +25,7 @@ def to_dict(self) -> dict:
"api_version": self.llm_api_version,
"temperature": self.llm_temperature,
"streaming": self.llm_streaming,
"max_tokens": self.llm_max_tokens,
"transcription_model": self.transcription_model,
}

4 changes: 3 additions & 1 deletion cognee/infrastructure/llm/generic_llm_api/adapter.py
@@ -2,6 +2,7 @@

import asyncio
from typing import List, Type

from pydantic import BaseModel
import instructor
from cognee.infrastructure.llm.llm_interface import LLMInterface
@@ -16,11 +17,12 @@ class GenericAPIAdapter(LLMInterface):
model: str
api_key: str

def __init__(self, endpoint, api_key: str, model: str, name: str):
def __init__(self, endpoint, api_key: str, model: str, name: str, max_tokens: int):
self.name = name
self.model = model
self.api_key = api_key
self.endpoint = endpoint
self.max_tokens = max_tokens

Comment on lines +20 to 26 (review comment: 💡 Codebase verification / 🛠️ Refactor suggestion)

Confirmed: max_tokens parameter not utilized in API calls

The max_tokens parameter is added to the constructor but not used in the acreate_structured_output method across all LLM adapters. This leads to inconsistent token limits:

  • GenericAPIAdapter: Not using max_tokens at all
  • OpenAIAdapter: Only using it in transcribe_image
  • AnthropicAdapter: Using hardcoded value 4096

Apply this fix to utilize the max_tokens parameter consistently:

     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
         """Generate a response from a user query."""
 
         return await self.aclient.chat.completions.create(
             model=self.model,
+            max_tokens=self.max_tokens,
             messages=[
                 {
                     "role": "user",
🔗 Analysis chain

Utilize max_tokens parameter in API calls

The max_tokens parameter is added to the constructor but not used in the acreate_structured_output method. This could lead to inconsistent token limits.

Apply this diff to utilize the max_tokens parameter:

     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
         """Generate a response from a user query."""
 
         return await self.aclient.chat.completions.create(
             model=self.model,
+            max_tokens=self.max_tokens,
             messages=[
                 {
                     "role": "user",
🏁 Scripts executed

The following scripts were executed for the analysis:

Error: No shell scripts found


Script:

#!/bin/bash
# Find the complete implementation of GenericAPIAdapter
ast-grep --pattern 'class GenericAPIAdapter {
  $$$
}'

# Search for any API call patterns similar to acreate_structured_output
rg "acreate_structured_output|chat\.completions\.create" -A 5

# Look for other adapter implementations
fd "adapter.py|.*adapter.*\.py"

Length of output: 10481


Script:

#!/bin/bash
# Search for max_tokens parameter in API calls
rg "max_tokens.*chat\.completions\.create|chat\.completions\.create.*max_tokens" -B 5 -A 10

# Get complete implementation of the adapters
cat cognee/infrastructure/llm/generic_llm_api/adapter.py
cat cognee/infrastructure/llm/openai/adapter.py
cat cognee/infrastructure/llm/anthropic/adapter.py

Length of output: 9076

llm_config = get_llm_config()

24 changes: 21 additions & 3 deletions cognee/infrastructure/llm/get_llm_client.py
@@ -20,6 +20,15 @@ def get_llm_client():

provider = LLMProvider(llm_config.llm_provider)

# Check if max_token value is defined in liteLLM for given model
# if not use value from cognee configuration
from cognee.infrastructure.llm.utils import (
get_model_max_tokens,
) # imported here to avoid circular imports

model_max_tokens = get_model_max_tokens(llm_config.llm_model)
max_tokens = model_max_tokens if model_max_tokens else llm_config.llm_max_tokens

if provider == LLMProvider.OPENAI:
if llm_config.llm_api_key is None:
raise InvalidValueError(message="LLM API key is not set.")
@@ -32,6 +41,7 @@
api_version=llm_config.llm_api_version,
model=llm_config.llm_model,
transcription_model=llm_config.transcription_model,
max_tokens=max_tokens,
streaming=llm_config.llm_streaming,
)

@@ -42,13 +52,17 @@
from .generic_llm_api.adapter import GenericAPIAdapter

return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Ollama"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Ollama",
max_tokens=max_tokens,
)

elif provider == LLMProvider.ANTHROPIC:
from .anthropic.adapter import AnthropicAdapter

return AnthropicAdapter(llm_config.llm_model)
return AnthropicAdapter(max_tokens=max_tokens, model=llm_config.llm_model)

elif provider == LLMProvider.CUSTOM:
if llm_config.llm_api_key is None:
@@ -57,7 +71,11 @@
from .generic_llm_api.adapter import GenericAPIAdapter

return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Custom"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Custom",
max_tokens=max_tokens,
)

else:
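The get_model_max_tokens helper used in the hunk above also lives in the utils module that this view does not show; below is a hedged sketch of the litellm lookup it might perform, with the registry choice being an assumption.

# Hypothetical sketch; assumes litellm's public model_cost registry is used for the lookup.
import litellm


def get_model_max_tokens(model_name: str):
    """Return the max token window litellm knows for model_name, or None if unknown."""
    model_info = litellm.model_cost.get(model_name)
    if not model_info:
        return None
    return model_info.get("max_tokens")

With a helper along these lines, the fallback above keeps llm_config.llm_max_tokens as the default whenever litellm has no entry for the configured model.
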
2 changes: 2 additions & 0 deletions cognee/infrastructure/llm/openai/adapter.py
@@ -32,6 +32,7 @@ def __init__(
api_version: str,
model: str,
transcription_model: str,
max_tokens: int,
streaming: bool = False,
):
self.aclient = instructor.from_litellm(litellm.acompletion)
@@ -41,6 +42,7 @@
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.max_tokens = max_tokens
self.streaming = streaming

@observe(as_type="generation")
1 change: 1 addition & 0 deletions cognee/infrastructure/llm/tokenizer/Gemini/__init__.py
@@ -0,0 +1 @@
from .adapter import GeminiTokenizer
44 changes: 44 additions & 0 deletions cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
@@ -0,0 +1,44 @@
from typing import List, Any

from ..tokenizer_interface import TokenizerInterface


class GeminiTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 3072,
):
self.model = model
self.max_tokens = max_tokens

# Get LLM API key from config
from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
from cognee.infrastructure.llm.config import get_llm_config

config = get_embedding_config()
llm_config = get_llm_config()

import google.generativeai as genai

genai.configure(api_key=config.embedding_api_key or llm_config.llm_api_key)
Comment on lines +16 to +24 (review comment: ⚠️ Potential issue)

Add error handling for configuration retrieval and API setup.

The configuration retrieval and API setup lack error handling. Consider adding try-catch blocks to handle potential configuration errors gracefully.

-        config = get_embedding_config()
-        llm_config = get_llm_config()
-
-        import google.generativeai as genai
-
-        genai.configure(api_key=config.embedding_api_key or llm_config.llm_api_key)
+        try:
+            config = get_embedding_config()
+            llm_config = get_llm_config()
+            
+            import google.generativeai as genai
+            
+            api_key = config.embedding_api_key or llm_config.llm_api_key
+            if not api_key:
+                raise ValueError("No API key found in configuration")
+                
+            genai.configure(api_key=api_key)
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize Gemini tokenizer: {str(e)}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change (replacement):

        from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
        from cognee.infrastructure.llm.config import get_llm_config

        try:
            config = get_embedding_config()
            llm_config = get_llm_config()

            import google.generativeai as genai

            api_key = config.embedding_api_key or llm_config.llm_api_key
            if not api_key:
                raise ValueError("No API key found in configuration")

            genai.configure(api_key=api_key)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Gemini tokenizer: {str(e)}")


def extract_tokens(self, text: str) -> List[Any]:
raise NotImplementedError

def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str

Returns:
number of tokens in the given text

"""
import google.generativeai as genai

return len(genai.embed_content(model=f"models/{self.model}", content=text))
Comment on lines +29 to +41 (review comment: ⚠️ Potential issue)

Add error handling for token counting.

The count_tokens method should handle potential API errors and validate inputs.

     def count_tokens(self, text: str) -> int:
-        """
-        Returns the number of tokens in the given text.
-        Args:
-            text: str
-
-        Returns:
-            number of tokens in the given text
-
-        """
-        import google.generativeai as genai
-
-        return len(genai.embed_content(model=f"models/{self.model}", content=text))
+        """Returns the number of tokens in the given text.
+        
+        Args:
+            text: str
+
+        Returns:
+            int: number of tokens in the given text
+            
+        Raises:
+            ValueError: If text is empty or None
+            RuntimeError: If token counting fails
+        """
+        if not text:
+            raise ValueError("Input text cannot be empty")
+            
+        try:
+            import google.generativeai as genai
+            result = genai.embed_content(
+                model=f"models/{self.model}",
+                content=text
+            )
+            return len(result)
+        except Exception as e:
+            raise RuntimeError(f"Failed to count tokens: {str(e)}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change (replacement):

    def count_tokens(self, text: str) -> int:
        """Returns the number of tokens in the given text.

        Args:
            text: str

        Returns:
            int: number of tokens in the given text

        Raises:
            ValueError: If text is empty or None
            RuntimeError: If token counting fails
        """
        if not text:
            raise ValueError("Input text cannot be empty")

        try:
            import google.generativeai as genai
            result = genai.embed_content(
                model=f"models/{self.model}",
                content=text
            )
            return len(result)
        except Exception as e:
            raise RuntimeError(f"Failed to count tokens: {str(e)}")


def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError
@@ -0,0 +1 @@
from .adapter import HuggingFaceTokenizer
36 changes: 36 additions & 0 deletions cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
@@ -0,0 +1,36 @@
from typing import List, Any

from transformers import AutoTokenizer

from ..tokenizer_interface import TokenizerInterface


class HuggingFaceTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 512,
):
self.model = model
self.max_tokens = max_tokens

self.tokenizer = AutoTokenizer.from_pretrained(model)

def extract_tokens(self, text: str) -> List[Any]:
tokens = self.tokenizer.tokenize(text)
return tokens

def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str

Returns:
number of tokens in the given text

"""
return len(self.tokenizer.tokenize(text))

def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError
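For reference, a minimal hedged usage example of the tokenizer above; the Hub model name is purely illustrative.

from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer

# Any tokenizer hosted on the Hugging Face Hub works; this model name is illustrative.
tokenizer = HuggingFaceTokenizer(model="sentence-transformers/all-MiniLM-L6-v2", max_tokens=512)
print(tokenizer.count_tokens("Counting tokens with a Hub tokenizer."))
print(tokenizer.extract_tokens("Counting tokens with a Hub tokenizer."))
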
1 change: 1 addition & 0 deletions cognee/infrastructure/llm/tokenizer/TikToken/__init__.py
@@ -0,0 +1 @@
from .adapter import TikTokenTokenizer
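
The TikToken adapter itself is cut off in this view; the sketch below is only a plausible reconstruction that mirrors the interface of the other tokenizers, assuming it wraps the tiktoken package, and is not the PR's actual implementation.

# Plausible sketch only; not the adapter file from this PR.
from typing import List, Any

import tiktoken

from ..tokenizer_interface import TokenizerInterface


class TikTokenTokenizer(TokenizerInterface):
    def __init__(self, model: str, max_tokens: int = 8191):
        self.model = model
        self.max_tokens = max_tokens
        # Fall back to a generic encoding if tiktoken does not know the model name.
        try:
            self.tokenizer = tiktoken.encoding_for_model(model)
        except KeyError:
            self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def extract_tokens(self, text: str) -> List[Any]:
        return self.tokenizer.encode(text)

    def count_tokens(self, text: str) -> int:
        return len(self.tokenizer.encode(text))

    def trim_text_to_max_tokens(self, text: str) -> str:
        tokens = self.tokenizer.encode(text)[: self.max_tokens]
        return self.tokenizer.decode(tokens)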