From 1e098ae70d4449b53328157f8d8a86059f4f73da Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 5 Dec 2024 20:54:55 +0100 Subject: [PATCH] refactor: Add error handling to hash util Added error handling to reading of file in hash util Refactor COG-505 --- cognee/shared/exceptions/__init__.py | 9 ++++++++ cognee/shared/exceptions/exceptions.py | 11 ++++++++++ cognee/shared/utils.py | 29 +++++++++++++++----------- 3 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 cognee/shared/exceptions/__init__.py create mode 100644 cognee/shared/exceptions/exceptions.py diff --git a/cognee/shared/exceptions/__init__.py b/cognee/shared/exceptions/__init__.py new file mode 100644 index 000000000..9b86cccab --- /dev/null +++ b/cognee/shared/exceptions/__init__.py @@ -0,0 +1,9 @@ +""" +Custom exceptions for the Cognee API. + +This module defines a set of exceptions for handling various shared utility errors +""" + +from .exceptions import ( + IngestionError, +) \ No newline at end of file diff --git a/cognee/shared/exceptions/exceptions.py b/cognee/shared/exceptions/exceptions.py new file mode 100644 index 000000000..101711398 --- /dev/null +++ b/cognee/shared/exceptions/exceptions.py @@ -0,0 +1,11 @@ +from cognee.exceptions import CogneeApiError +from fastapi import status + +class IngestionError(CogneeApiError): + def __init__( + self, + message: str = "Failed to load data.", + name: str = "IngestionError", + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + ): + super().__init__(message, name, status_code) \ No newline at end of file diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py index 1dc52acd5..b75076e55 100644 --- a/cognee/shared/utils.py +++ b/cognee/shared/utils.py @@ -19,6 +19,8 @@ from uuid import uuid4 import pathlib +from cognee.shared.exceptions import IngestionError + # Analytics Proxy Url, currently hosted by Vercel proxy_url = "https://test.prometh.ai" @@ -76,23 +78,26 @@ def num_tokens_from_string(string: str, encoding_name: str) 
-> int: def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str: h = hashlib.md5() - if isinstance(file_obj, str): - with open(file_obj, 'rb') as file: + try: + if isinstance(file_obj, str): + with open(file_obj, 'rb') as file: + while True: + # Reading is buffered, so we can read smaller chunks. + chunk = file.read(h.block_size) + if not chunk: + break + h.update(chunk) + else: while True: # Reading is buffered, so we can read smaller chunks. - chunk = file.read(h.block_size) + chunk = file_obj.read(h.block_size) if not chunk: break h.update(chunk) - else: - while True: - # Reading is buffered, so we can read smaller chunks. - chunk = file_obj.read(h.block_size) - if not chunk: - break - h.update(chunk) - - return h.hexdigest() + + return h.hexdigest() + except IOError as e: + raise IngestionError(message=f"Failed to load data from {file_obj}: {e}") from e def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> str: """