From 65a0c98455fdcb7df0852955fcdafc61cae45448 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 17 Jan 2025 10:20:57 +0100 Subject: [PATCH 1/3] COG-989 feat: make tasks a configurable argument in the cognify function (#442) * feat: make tasks a configurable argument in the cognify function * fix: add data points task --------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- cognee/api/v1/cognify/cognify_v2.py | 58 +++++++++++++++++++---------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py index 680c05828..738f77c52 100644 --- a/cognee/api/v1/cognify/cognify_v2.py +++ b/cognee/api/v1/cognify/cognify_v2.py @@ -36,6 +36,7 @@ async def cognify( datasets: Union[str, list[str]] = None, user: User = None, graph_model: BaseModel = KnowledgeGraph, + tasks: list[Task] = None, ): if user is None: user = await get_default_user() @@ -55,18 +56,19 @@ async def cognify( awaitables = [] + if tasks is None: + tasks = await get_default_tasks(user, graph_model) + for dataset in datasets: dataset_name = generate_dataset_name(dataset.name) if dataset_name in existing_datasets_map: - awaitables.append(run_cognify_pipeline(dataset, user, graph_model)) + awaitables.append(run_cognify_pipeline(dataset, user, tasks)) return await asyncio.gather(*awaitables) -async def run_cognify_pipeline( - dataset: Dataset, user: User, graph_model: BaseModel = KnowledgeGraph -): +async def run_cognify_pipeline(dataset: Dataset, user: User, tasks: list[Task]): data_documents: list[Data] = await get_dataset_data(dataset_id=dataset.id) document_ids_str = [str(document.id) for document in data_documents] @@ -96,22 +98,12 @@ async def run_cognify_pipeline( ) try: - cognee_config = get_cognify_config() + if not isinstance(tasks, list): + raise ValueError("Tasks must be a list") - tasks = [ - Task(classify_documents), - Task(check_permissions_on_documents, user=user, permissions=["write"]), - Task(extract_chunks_from_documents), # Extract text chunks based on the document type. - Task( - extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10} - ), # Generate knowledge graphs from the document chunks. - Task( - summarize_text, - summarization_model=cognee_config.summarization_model, - task_config={"batch_size": 10}, - ), - Task(add_data_points, only_root=True, task_config={"batch_size": 10}), - ] + for task in tasks: + if not isinstance(task, Task): + raise ValueError(f"Task {task} is not an instance of Task") pipeline = run_tasks(tasks, data_documents, "cognify_pipeline") @@ -146,3 +138,31 @@ async def run_cognify_pipeline( def generate_dataset_name(dataset_name: str) -> str: return dataset_name.replace(".", "_").replace(" ", "_") + + +async def get_default_tasks( + user: User = None, graph_model: BaseModel = KnowledgeGraph +) -> list[Task]: + if user is None: + user = await get_default_user() + + try: + cognee_config = get_cognify_config() + default_tasks = [ + Task(classify_documents), + Task(check_permissions_on_documents, user=user, permissions=["write"]), + Task(extract_chunks_from_documents), # Extract text chunks based on the document type. + Task( + extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10} + ), # Generate knowledge graphs from the document chunks. 
+ Task( + summarize_text, + summarization_model=cognee_config.summarization_model, + task_config={"batch_size": 10}, + ), + Task(add_data_points, only_root=True, task_config={"batch_size": 10}), + ] + except Exception as error: + send_telemetry("cognee.cognify DEFAULT TASKS CREATION ERRORED", user.id) + raise error + return default_tasks From 964fca72c6ba8620148dd54dd19d330ebe4fa767 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 17 Jan 2025 10:36:04 +0100 Subject: [PATCH 2/3] fix: Update ruff version for cognee --- poetry.lock | 50 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index 51cd2474e..48a7b9d10 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "aiofiles" @@ -1060,6 +1060,7 @@ files = [ {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, + {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:60eb32934076fa07e4316b7b2742fa52cbb190b42c2df2863dbc4230a0a9b385"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, @@ -1070,6 +1071,7 @@ files = [ {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, + {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9abcc2e083cbe8dde89124a47e5e53ec38751f0d7dfd36801008f316a127d7ba"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, @@ -2923,8 +2925,6 @@ optional = false python-versions = "*" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, - {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, - {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = 
"sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -4994,8 +4994,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -5911,8 +5911,8 @@ astroid = ">=3.3.8,<=3.4.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ {version = ">=0.2", markers = "python_version < \"3.11\""}, - {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=0.3.7", markers = "python_version >= \"3.12\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] isort = ">=4.2.5,<5.13.0 || >5.13.0,<6" mccabe = ">=0.6,<0.8" @@ -6953,29 +6953,29 @@ files = [ [[package]] name = "ruff" -version = "0.8.6" +version = "0.9.2" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:defed167955d42c68b407e8f2e6f56ba52520e790aba4ca707a9c88619e580e3"}, - {file = "ruff-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:54799ca3d67ae5e0b7a7ac234baa657a9c1784b48ec954a094da7c206e0365b1"}, - {file = "ruff-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e88b8f6d901477c41559ba540beeb5a671e14cd29ebd5683903572f4b40a9807"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0509e8da430228236a18a677fcdb0c1f102dd26d5520f71f79b094963322ed25"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:91a7ddb221779871cf226100e677b5ea38c2d54e9e2c8ed847450ebbdf99b32d"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:248b1fb3f739d01d528cc50b35ee9c4812aa58cc5935998e776bf8ed5b251e75"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:bc3c083c50390cf69e7e1b5a5a7303898966be973664ec0c4a4acea82c1d4315"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52d587092ab8df308635762386f45f4638badb0866355b2b86760f6d3c076188"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:61323159cf21bc3897674e5adb27cd9e7700bab6b84de40d7be28c3d46dc67cf"}, - {file = "ruff-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ae4478b1471fc0c44ed52a6fb787e641a2ac58b1c1f91763bafbc2faddc5117"}, - {file = "ruff-0.8.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0c000a471d519b3e6cfc9c6680025d923b4ca140ce3e4612d1a2ef58e11f11fe"}, - {file = "ruff-0.8.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:9257aa841e9e8d9b727423086f0fa9a86b6b420fbf4bf9e1465d1250ce8e4d8d"}, - {file = "ruff-0.8.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:45a56f61b24682f6f6709636949ae8cc82ae229d8d773b4c76c09ec83964a95a"}, - {file = "ruff-0.8.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:496dd38a53aa173481a7d8866bcd6451bd934d06976a2505028a50583e001b76"}, - {file = "ruff-0.8.6-py3-none-win32.whl", hash = "sha256:e169ea1b9eae61c99b257dc83b9ee6c76f89042752cb2d83486a7d6e48e8f764"}, - {file = "ruff-0.8.6-py3-none-win_amd64.whl", hash = 
"sha256:f1d70bef3d16fdc897ee290d7d20da3cbe4e26349f62e8a0274e7a3f4ce7a905"}, - {file = "ruff-0.8.6-py3-none-win_arm64.whl", hash = "sha256:7d7fc2377a04b6e04ffe588caad613d0c460eb2ecba4c0ccbbfe2bc973cbc162"}, - {file = "ruff-0.8.6.tar.gz", hash = "sha256:dcad24b81b62650b0eb8814f576fc65cfee8674772a6e24c9b747911801eeaa5"}, + {file = "ruff-0.9.2-py3-none-linux_armv6l.whl", hash = "sha256:80605a039ba1454d002b32139e4970becf84b5fee3a3c3bf1c2af6f61a784347"}, + {file = "ruff-0.9.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b9aab82bb20afd5f596527045c01e6ae25a718ff1784cb92947bff1f83068b00"}, + {file = "ruff-0.9.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fbd337bac1cfa96be615f6efcd4bc4d077edbc127ef30e2b8ba2a27e18c054d4"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82b35259b0cbf8daa22a498018e300b9bb0174c2bbb7bcba593935158a78054d"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8b6a9701d1e371bf41dca22015c3f89769da7576884d2add7317ec1ec8cb9c3c"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9cc53e68b3c5ae41e8faf83a3b89f4a5d7b2cb666dff4b366bb86ed2a85b481f"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:8efd9da7a1ee314b910da155ca7e8953094a7c10d0c0a39bfde3fcfd2a015684"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3292c5a22ea9a5f9a185e2d131dc7f98f8534a32fb6d2ee7b9944569239c648d"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a605fdcf6e8b2d39f9436d343d1f0ff70c365a1e681546de0104bef81ce88df"}, + {file = "ruff-0.9.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c547f7f256aa366834829a08375c297fa63386cbe5f1459efaf174086b564247"}, + {file = "ruff-0.9.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:d18bba3d3353ed916e882521bc3e0af403949dbada344c20c16ea78f47af965e"}, + {file = "ruff-0.9.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:b338edc4610142355ccf6b87bd356729b62bf1bc152a2fad5b0c7dc04af77bfe"}, + {file = "ruff-0.9.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:492a5e44ad9b22a0ea98cf72e40305cbdaf27fac0d927f8bc9e1df316dcc96eb"}, + {file = "ruff-0.9.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:af1e9e9fe7b1f767264d26b1075ac4ad831c7db976911fa362d09b2d0356426a"}, + {file = "ruff-0.9.2-py3-none-win32.whl", hash = "sha256:71cbe22e178c5da20e1514e1e01029c73dc09288a8028a5d3446e6bba87a5145"}, + {file = "ruff-0.9.2-py3-none-win_amd64.whl", hash = "sha256:c5e1d6abc798419cf46eed03f54f2e0c3adb1ad4b801119dedf23fcaf69b55b5"}, + {file = "ruff-0.9.2-py3-none-win_arm64.whl", hash = "sha256:a1b63fa24149918f8b37cef2ee6fff81f24f0d74b6f0bdc37bc3e1f2143e41c6"}, + {file = "ruff-0.9.2.tar.gz", hash = "sha256:b5eceb334d55fae5f316f783437392642ae18e16dcf4f1858d55d3c2a0f8f5d0"}, ] [[package]] @@ -8797,4 +8797,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.10.0,<3.13" -content-hash = "1b28050bbc5d53795ffc6a850179f3289f3df803e4b03731f35102ad1f2d390f" +content-hash = "097955773827cdf96b42e54328f66b79e2b92e5a7f221a06afe1a71fea2c33bc" diff --git a/pyproject.toml b/pyproject.toml index c20630acc..ea0b64404 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,7 @@ notebook = {version = "^7.1.0", optional = true} deptry = "^0.20.0" debugpy = "1.8.9" pylint = "^3.0.3" -ruff = ">=0.2.2,<0.9.0" +ruff = ">=0.9.2,<1.0.0" tweepy = "4.14.0" gitpython 
= "^3.1.43" pylance = "0.19.2" From 89b23b87284d50e1d566626388a16dbc910ba012 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 17 Jan 2025 10:40:24 +0100 Subject: [PATCH 3/3] refactor: Run ruff format 0.9.2 --- .../documents/AudioDocument_test.py | 18 +++++----- .../documents/ImageDocument_test.py | 18 +++++----- .../integration/documents/PdfDocument_test.py | 18 +++++----- .../documents/TextDocument_test.py | 18 +++++----- .../documents/UnstructuredDocument_test.py | 30 ++++++++-------- cognee/tests/test_deduplication.py | 12 +++---- cognee/tests/test_falkordb.py | 6 ++-- cognee/tests/test_library.py | 6 ++-- cognee/tests/test_pgvector.py | 36 +++++++++---------- .../chunks/chunk_by_paragraph_2_test.py | 18 +++++----- .../chunks/chunk_by_paragraph_test.py | 6 ++-- .../chunks/chunk_by_sentence_test.py | 12 +++---- .../processing/chunks/chunk_by_word_test.py | 6 ++-- 13 files changed, 102 insertions(+), 102 deletions(-) diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index dbd43ddda..e07a2431b 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -36,12 +36,12 @@ def test_AudioDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index c0877ae99..b8d585419 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -25,12 +25,12 @@ def test_ImageDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/PdfDocument_test.py 
b/cognee/tests/integration/documents/PdfDocument_test.py index 8f28815d3..fc4307846 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -27,12 +27,12 @@ def test_PdfDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1e143d563..6daec62b7 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size): for ground_truth, paragraph_data in zip( GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index e0278de81..773dc2293 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -71,32 +71,32 @@ def test_UnstructuredDocument(): for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test DOCX for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == 
len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" - assert ( - "sentence_end" == paragraph_data.cut_type - ), f" sentence_end != {paragraph_data.cut_type = }" + assert "sentence_end" == paragraph_data.cut_type, ( + f" sentence_end != {paragraph_data.cut_type = }" + ) # TEST CSV for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" - assert ( - "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text - ), f"Read text doesn't match expected text: {paragraph_data.text}" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( + f"Read text doesn't match expected text: {paragraph_data.text}" + ) + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test XLSX for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) diff --git a/cognee/tests/test_deduplication.py b/cognee/tests/test_deduplication.py index 9c2df032d..89c866f12 100644 --- a/cognee/tests/test_deduplication.py +++ b/cognee/tests/test_deduplication.py @@ -30,9 +30,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - result[0]["name"] == "Natural_language_processing_copy" - ), "Result name does not match expected value." + assert result[0]["name"] == "Natural_language_processing_copy", ( + "Result name does not match expected value." + ) result = await relational_engine.get_all_data_from_table("datasets") assert len(result) == 2, "Unexpected number of datasets found." @@ -61,9 +61,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"] - ), "Content hash is not a part of file name." + assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], ( + "Content hash is not a part of file name." 
+ ) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py index 07ece9eb2..af0e87916 100755 --- a/cognee/tests/test_falkordb.py +++ b/cognee/tests/test_falkordb.py @@ -85,9 +85,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 8352b4161..192b67506 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,9 +82,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index c241177f0..73b6be974 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location): data_hash = hashlib.md5(encoded_text).hexdigest() # Get data entry from database based on hash contents data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test deletion of data along with local files created by cognee await engine.delete_data_entity(data.id) - assert not os.path.exists( - data.raw_data_location - ), f"Data location still exists after deletion: {data.raw_data_location}" + assert not os.path.exists(data.raw_data_location), ( + f"Data location still exists after deletion: {data.raw_data_location}" + ) async with engine.get_async_session() as session: # Get data entry from database based on file path data = ( await session.scalars(select(Data).where(Data.raw_data_location == file_location)) ).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test local files not created by cognee won't get deleted await engine.delete_data_entity(data.id) - assert os.path.exists( - data.raw_data_location - ), f"Data location doesn't exists: {data.raw_data_location}" + assert os.path.exists(data.raw_data_location), ( + f"Data location doesn't exists: {data.raw_data_location}" + ) async def test_getting_of_documents(dataset_name_1): @@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1): user = await get_default_user() document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert ( - len(document_ids) == 1 - ), f"Number of expected documents doesn't match {len(document_ids)} != 1" + assert len(document_ids) == 1, ( + f"Number of expected documents doesn't match {len(document_ids)} != 1" + ) # Test getting of documents for search when no 
dataset is provided user = await get_default_user() document_ids = await get_document_ids_for_user(user.id) - assert ( - len(document_ids) == 2 - ), f"Number of expected documents doesn't match {len(document_ids)} != 2" + assert len(document_ids) == 2, ( + f"Number of expected documents doesn't match {len(document_ids)} != 2" + ) async def main(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index 53098fc67..d8680a604 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -17,9 +17,9 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] - assert np.all( - chunk_lengths <= paragraph_length - ), f"{paragraph_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= paragraph_length), ( + f"{paragraph_length = }: {larger_chunks} are too large" + ) @pytest.mark.parametrize( @@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_ data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs ) chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) - assert np.all( - chunk_indices == np.arange(len(chunk_indices)) - ), f"{chunk_indices = } are not monotonically increasing" + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index e7d9a54ba..e420b2e9f 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks): for expected_chunks_item, chunk in zip(expected_chunks, chunks): for key in ["text", "word_count", "cut_type"]: - assert ( - chunk[key] == expected_chunks_item[key] - ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + assert chunk[key] == expected_chunks_item[key], ( + f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + ) def test_chunking_whole_text(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index d1c75d7ed..efa053077 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -16,9 +16,9 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) reconstructed_text = "".join([chunk[1] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), 
f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length): chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > maximum_length] - assert np.all( - chunk_lengths <= maximum_length - ), f"{maximum_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= maximum_length), ( + f"{maximum_length = }: {larger_chunks} are too large" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index fb26638cb..d79fcdbc8 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -17,9 +17,9 @@ def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize(