Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable different chunking methods #128

Merged
merged 5 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cognee/api/v1/cognify/cognify_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ async def run_cognify_pipeline(dataset: Dataset):
data: list[Data] = await get_dataset_data(dataset_id = dataset.id)

documents = [
PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "pdf" else
AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "audio" else
ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "image" else
TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location)
PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location, chunking_strategy="paragraph") if data_item.extension == "pdf" else
AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location, chunking_strategy="paragraph") if data_item.extension == "audio" else
ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location, chunking_strategy="paragraph") if data_item.extension == "image" else
TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location, chunking_strategy="paragraph")
for data_item in data
]

Expand Down
2 changes: 2 additions & 0 deletions cognee/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ class BaseConfig(BaseSettings):
monitoring_tool: object = MonitoringTool.LANGFUSE
graphistry_username: Optional[str] = None
graphistry_password: Optional[str] = None
aws_access_key_id: Optional[str] = None
aws_secret_access_key: Optional[str] = None

model_config = SettingsConfigDict(env_file = ".env", extra = "allow")

Expand Down
2 changes: 2 additions & 0 deletions cognee/infrastructure/data/chunking/DefaultChunkEngine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from cognee.shared.data_models import ChunkStrategy


# /Users/vasa/Projects/cognee/cognee/infrastructure/data/chunking/DefaultChunkEngine.py

class DefaultChunkEngine():
def __init__(self, chunk_strategy=None, chunk_size=None, chunk_overlap=None):
self.chunk_strategy = chunk_strategy
Expand Down
17 changes: 11 additions & 6 deletions cognee/modules/data/processing/document_types/AudioDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,22 @@
from typing import Optional

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.modules.data.chunking import chunk_by_paragraph

from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from cognee.modules.data.processing.document_types.Document import Document

from cognee.tasks.chunking.chunking_registry import get_chunking_function

class AudioReader:
id: UUID
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, file_path: str):
def __init__(self, id: UUID, file_path: str, chunking_strategy:str = "paragraph"):
self.id = id
self.file_path = file_path
self.llm_client = get_llm_client() # You can choose different models like "tiny", "base", "small", etc.

self.chunking_function = get_chunking_function(chunking_strategy)

def read(self, max_chunk_size: Optional[int] = 1024):
chunk_index = 0
Expand All @@ -37,7 +40,7 @@ def read_text_chunks(text, chunk_size):
chunked_pages.append(page_index)
page_index += 1

for chunk_data in chunk_by_paragraph(page_text, max_chunk_size, batch_paragraphs=True):
for chunk_data in self.chunking_function(page_text, max_chunk_size, batch_paragraphs=True):
if chunk_size + chunk_data["word_count"] <= max_chunk_size:
paragraph_chunks.append(chunk_data)
chunk_size += chunk_data["word_count"]
Expand Down Expand Up @@ -86,14 +89,16 @@ class AudioDocument(Document):
type: str = "audio"
title: str
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.chunking_strategy = chunking_strategy

def get_reader(self) -> AudioReader:
reader = AudioReader(self.id, self.file_path)
reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
return reader

def to_dict(self) -> dict:
Expand Down
1 change: 1 addition & 0 deletions cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ class Document(Protocol):
type: str
title: str
file_path: str
chunking_strategy:str
12 changes: 8 additions & 4 deletions cognee/modules/data/processing/document_types/ImageDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@
from typing import Optional

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.modules.data.chunking import chunk_by_paragraph

from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from cognee.modules.data.processing.document_types.Document import Document
from cognee.tasks.chunking import chunk_by_paragraph
from cognee.tasks.chunking.chunking_registry import get_chunking_function


class ImageReader:
id: UUID
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, file_path: str):
def __init__(self, id: UUID, file_path: str, chunking_strategy:str = "paragraph"):
self.id = id
self.file_path = file_path
self.llm_client = get_llm_client() # You can choose different models like "tiny", "base", "small", etc.

self.llm_client = get_llm_client() # You can choose different models like "tiny", "base", "small", etc.
self.chunking_function = get_chunking_function(chunking_strategy)

def read(self, max_chunk_size: Optional[int] = 1024):
chunk_index = 0
Expand Down
18 changes: 13 additions & 5 deletions cognee/modules/data/processing/document_types/PdfDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,23 @@
from uuid import UUID, uuid5, NAMESPACE_OID
from typing import Optional
from pypdf import PdfReader as pypdf_PdfReader
from cognee.modules.data.chunking import chunk_by_paragraph

from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from cognee.tasks.chunking import chunk_by_paragraph
from cognee.tasks.chunking.chunking_registry import get_chunking_function
from .Document import Document

class PdfReader():
id: UUID
file_path: str
chunking_strategy: str

def __init__(self, id: UUID, file_path: str):
def __init__(self, id: UUID, file_path: str, chunking_strategy:str = "paragraph"):
self.id = id
self.file_path = file_path
self.chunking_strategy = chunking_strategy
self.chunking_function = get_chunking_function(chunking_strategy)


def get_number_of_pages(self):
file = pypdf_PdfReader(self.file_path)
Expand All @@ -33,7 +39,7 @@ def read(self, max_chunk_size: Optional[int] = 1024):
page_text = page.extract_text()
chunked_pages.append(page_index)

for chunk_data in chunk_by_paragraph(page_text, max_chunk_size, batch_paragraphs = True):
for chunk_data in self.chunking_function(page_text, max_chunk_size, batch_paragraphs = True):
if chunk_size + chunk_data["word_count"] <= max_chunk_size:
paragraph_chunks.append(chunk_data)
chunk_size += chunk_data["word_count"]
Expand Down Expand Up @@ -85,18 +91,20 @@ class PdfDocument(Document):
title: str
num_pages: int
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages()
self.chunking_strategy = chunking_strategy

def get_reader(self) -> PdfReader:
logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path)
reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
return reader

def to_dict(self) -> dict:
Expand Down
18 changes: 13 additions & 5 deletions cognee/modules/data/processing/document_types/TextDocument.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
from uuid import UUID, uuid5, NAMESPACE_OID
from typing import Optional
from cognee.modules.data.chunking import chunk_by_paragraph

from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from cognee.tasks.chunking.chunking_registry import get_chunking_function
from .Document import Document

class TextReader():
id: UUID
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, file_path: str):
def __init__(self, id: UUID, file_path: str, chunking_strategy:str="paragraph"):
self.id = id
self.file_path = file_path
self.chunking_strategy = chunking_strategy
self.chunking_function = get_chunking_function(chunking_strategy)



def get_number_of_pages(self):
num_pages = 1 # Pure text is not formatted
Expand Down Expand Up @@ -39,7 +45,7 @@ def read_text_chunks(file_path):
chunked_pages.append(page_index)
page_index += 1

for chunk_data in chunk_by_paragraph(page_text, max_chunk_size, batch_paragraphs = True):
for chunk_data in self.chunking_function(page_text, max_chunk_size, batch_paragraphs = True):
if chunk_size + chunk_data["word_count"] <= max_chunk_size:
paragraph_chunks.append(chunk_data)
chunk_size += chunk_data["word_count"]
Expand Down Expand Up @@ -89,17 +95,19 @@ class TextDocument(Document):
title: str
num_pages: int
file_path: str
chunking_strategy:str

def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.chunking_strategy = chunking_strategy

reader = TextReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages()

def get_reader(self) -> TextReader:
reader = TextReader(self.id, self.file_path)
reader = TextReader(self.id, self.file_path, self.chunking_strategy)
return reader

def to_dict(self) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

if __name__ == "__main__":
test_file_path = os.path.join(os.path.dirname(__file__), "artificial-inteligence.pdf")
pdf_doc = PdfDocument("Test document.pdf", test_file_path)
pdf_doc = PdfDocument("Test document.pdf", test_file_path, chunking_strategy="paragraph")
reader = pdf_doc.get_reader()

for paragraph_data in reader.read():
Expand Down
Empty file.
39 changes: 39 additions & 0 deletions cognee/tasks/chunk_translate/translate_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

import logging

from cognee.base_config import get_base_config

BaseConfig = get_base_config()

async def translate_text(data, source_language: str = 'sr', target_language: str = 'en', region_name: str = 'eu-west-1'):
    """
    Translate text from source language to target language using AWS Translate.

    This is an async generator that yields exactly one string: the translated
    text, or a human-readable error message when validation or the AWS call
    fails.

    Parameters:
        data (str): The text to be translated.
        source_language (str): The source language code (e.g., 'sr' for Serbian). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
        target_language (str): The target language code (e.g., 'en' for English). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
        region_name (str): AWS region name.
    Yields:
        str: Translated text or an error message.
    """
    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    if not data:
        yield "No text provided for translation."
        return  # was missing: without it we'd still call AWS with empty text

    if not source_language or not target_language:
        yield "Both source and target language codes are required."
        return  # was missing: stop after reporting the validation error

    try:
        translate = boto3.client(service_name='translate', region_name=region_name, use_ssl=True)
        result = translate.translate_text(Text=data, SourceLanguageCode=source_language, TargetLanguageCode=target_language)
        yield result.get('TranslatedText', 'No translation found.')

    except BotoCoreError as e:
        # These are genuine failures, so log at error level (was logging.info).
        logging.error(f"BotoCoreError occurred: {e}")
        yield "Error with AWS Translate service configuration or request."

    except ClientError as e:
        logging.error(f"ClientError occurred: {e}")
        yield "Error with AWS client or network issue."
Comment on lines +8 to +39
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactor translate_text to use return instead of yield.

Using yield in an async function is unconventional and may cause unexpected behavior. Consider refactoring to use return or an async-compatible structure.

-async def translate_text(data, source_language:str='sr', target_language:str='en', region_name='eu-west-1'):
+async def translate_text(data, source_language: str = 'sr', target_language: str = 'en', region_name: str = 'eu-west-1') -> str:
    ...
-        yield "No text provided for translation."
+        return "No text provided for translation."
    ...
-        yield "Both source and target language codes are required."
+        return "Both source and target language codes are required."
    ...
-        yield result.get('TranslatedText', 'No translation found.')
+        return result.get('TranslatedText', 'No translation found.')
    ...
-        yield "Error with AWS Translate service configuration or request."
+        return "Error with AWS Translate service configuration or request."
    ...
-        yield "Error with AWS client or network issue."
+        return "Error with AWS client or network issue."
Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async def translate_text(data, source_language:str='sr', target_language:str='en', region_name='eu-west-1'):
"""
Translate text from source language to target language using AWS Translate.
Parameters:
data (str): The text to be translated.
source_language (str): The source language code (e.g., 'sr' for Serbian). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
target_language (str): The target language code (e.g., 'en' for English). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
region_name (str): AWS region name.
Returns:
str: Translated text or an error message.
"""
import boto3
from botocore.exceptions import BotoCoreError, ClientError
if not data:
yield "No text provided for translation."
if not source_language or not target_language:
yield "Both source and target language codes are required."
try:
translate = boto3.client(service_name='translate', region_name=region_name, use_ssl=True)
result = translate.translate_text(Text=data, SourceLanguageCode=source_language, TargetLanguageCode=target_language)
yield result.get('TranslatedText', 'No translation found.')
except BotoCoreError as e:
logging.info(f"BotoCoreError occurred: {e}")
yield "Error with AWS Translate service configuration or request."
except ClientError as e:
logging.info(f"ClientError occurred: {e}")
yield "Error with AWS client or network issue."
async def translate_text(data, source_language: str = 'sr', target_language: str = 'en', region_name: str = 'eu-west-1') -> str:
"""
Translate text from source language to target language using AWS Translate.
Parameters:
data (str): The text to be translated.
source_language (str): The source language code (e.g., 'sr' for Serbian). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
target_language (str): The target language code (e.g., 'en' for English). ISO 639-2 Code https://www.loc.gov/standards/iso639-2/php/code_list.php
region_name (str): AWS region name.
Returns:
str: Translated text or an error message.
"""
import boto3
from botocore.exceptions import BotoCoreError, ClientError
if not data:
return "No text provided for translation."
if not source_language or not target_language:
return "Both source and target language codes are required."
try:
translate = boto3.client(service_name='translate', region_name=region_name, use_ssl=True)
result = translate.translate_text(Text=data, SourceLanguageCode=source_language, TargetLanguageCode=target_language)
return result.get('TranslatedText', 'No translation found.')
except BotoCoreError as e:
logging.info(f"BotoCoreError occurred: {e}")
return "Error with AWS Translate service configuration or request."
except ClientError as e:
logging.info(f"ClientError occurred: {e}")
return "Error with AWS client or network issue."

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from cognee.modules.data.chunking import chunk_by_paragraph
from cognee.tasks.chunking import chunk_by_paragraph

if __name__ == "__main__":
def test_chunking_on_whole_text():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from uuid import uuid5, NAMESPACE_OID
from .chunk_by_sentence import chunk_by_sentence
from cognee.tasks.chunking.chunking_registry import register_chunking_function

@register_chunking_function("paragraph")
def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True):
paragraph = ""
last_cut_type = None
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@



from uuid import uuid4
from .chunk_by_word import chunk_by_word
from cognee.tasks.chunking.chunking_registry import register_chunking_function

@register_chunking_function("sentence")
def chunk_by_sentence(data: str):
sentence = ""
paragraph_id = uuid4()
Expand Down
10 changes: 10 additions & 0 deletions cognee/tasks/chunking/chunking_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Global registry mapping a strategy name (e.g. "paragraph", "sentence")
# to its chunking callable. Populated via @register_chunking_function.
chunking_registry = {}

def register_chunking_function(name):
    """Decorator that registers the decorated function under *name*.

    Parameters:
        name (str): Strategy name used later to look the function up.
    Returns:
        The decorator, which returns the function unchanged.
    """
    def decorator(func):
        chunking_registry[name] = func
        return func
    return decorator

def get_chunking_function(name: str):
    """Return the chunking function registered under *name*.

    Raises:
        ValueError: if no function was registered under *name*. Failing fast
            here beats silently returning None, which would only surface as an
            opaque TypeError when the caller tries to invoke it.
    """
    func = chunking_registry.get(name)
    if func is None:
        raise ValueError(f"Chunking function '{name}' not found in registry.")
    return func
Comment on lines +9 to +10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add error handling for get_chunking_function.

Consider adding error handling or logging if a requested chunking function is not found in the registry. This will help diagnose issues when an invalid function name is used.

def get_chunking_function(name: str):
    func = chunking_registry.get(name)
    if func is None:
        raise ValueError(f"Chunking function '{name}' not found in registry.")
    return func

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

import logging



async def detect_language(data: str):
    """
    Detect the language of the given text and yield its ISO 639-1 language code.

    If the detected language is Croatian ('hr'), it maps to Serbian ('sr').
    The text is trimmed to the first 100 characters for efficient processing.

    This is an async generator that yields exactly one value.

    Parameters:
        data (str): The text for language detection.
    Yields:
        str | None: The ISO 639-1 language code of the detected language,
            or None in case of an error.
    """
    from langdetect import detect, LangDetectException

    # Trim the text to the first 100 characters
    trimmed_text = data[:100]

    try:
        # Detect the language using langdetect
        detected_lang_iso639_1 = detect(trimmed_text)
        logging.info(f"Detected ISO 639-1 code: {detected_lang_iso639_1}")

        # Special case: map 'hr' (Croatian) to 'sr' (Serbian ISO 639-2)
        if detected_lang_iso639_1 == 'hr':
            yield 'sr'
            return  # was missing: both 'sr' and 'hr' were yielded
        yield detected_lang_iso639_1
        return  # was missing: None was yielded even after a successful detection

    except LangDetectException as e:
        logging.error(f"Language detection error: {e}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")

    # Reached only when detection raised: signal failure with None.
    yield None
Comment on lines +6 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactor detect_language to use return instead of yield.

Using yield in an async function is unconventional and may cause unexpected behavior. Consider refactoring to use return or an async-compatible structure.

-async def detect_language(data:str):
+async def detect_language(data: str) -> str:
    ...
-        if detected_lang_iso639_1 == 'hr':
-            yield 'sr'
-        yield detected_lang_iso639_1
+        return 'sr' if detected_lang_iso639_1 == 'hr' else detected_lang_iso639_1
    ...
-    yield None
+    return None
Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async def detect_language(data:str):
"""
Detect the language of the given text and return its ISO 639-1 language code.
If the detected language is Croatian ('hr'), it maps to Serbian ('sr').
The text is trimmed to the first 100 characters for efficient processing.
Parameters:
text (str): The text for language detection.
Returns:
str: The ISO 639-1 language code of the detected language, or 'None' in case of an error.
"""
# Trim the text to the first 100 characters
from langdetect import detect, LangDetectException
trimmed_text = data[:100]
try:
# Detect the language using langdetect
detected_lang_iso639_1 = detect(trimmed_text)
logging.info(f"Detected ISO 639-1 code: {detected_lang_iso639_1}")
# Special case: map 'hr' (Croatian) to 'sr' (Serbian ISO 639-2)
if detected_lang_iso639_1 == 'hr':
yield 'sr'
yield detected_lang_iso639_1
except LangDetectException as e:
logging.error(f"Language detection error: {e}")
except Exception as e:
logging.error(f"Unexpected error: {e}")
yield None
async def detect_language(data: str) -> str:
"""
Detect the language of the given text and return its ISO 639-1 language code.
If the detected language is Croatian ('hr'), it maps to Serbian ('sr').
The text is trimmed to the first 100 characters for efficient processing.
Parameters:
text (str): The text for language detection.
Returns:
str: The ISO 639-1 language code of the detected language, or 'None' in case of an error.
"""
# Trim the text to the first 100 characters
from langdetect import detect, LangDetectException
trimmed_text = data[:100]
try:
# Detect the language using langdetect
detected_lang_iso639_1 = detect(trimmed_text)
logging.info(f"Detected ISO 639-1 code: {detected_lang_iso639_1}")
# Special case: map 'hr' (Croatian) to 'sr' (Serbian ISO 639-2)
return 'sr' if detected_lang_iso639_1 == 'hr' else detected_lang_iso639_1
except LangDetectException as e:
logging.error(f"Language detection error: {e}")
except Exception as e:
logging.error(f"Unexpected error: {e}")
return None

Loading