Fix fileinfo for arweave file (#672)

* Fix fileinfo for Arweave files: add file path validation inside the UrlFile class.
* Fix key for tests.
* Override check_details in the Arweave class.
oceanprotocol · Nov 16, 2023 · 3958799 · 3958799
1 parent 6b8002e
commit 3958799
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 18 deletions.
diff --git a/ocean_provider/file_types/definitions.py b/ocean_provider/file_types/definitions.py
@@ -4,7 +4,6 @@
 import logging
 import mimetypes
 import os
-import re
 from abc import abstractmethod
 from cgi import parse_header
 from typing import Protocol, Tuple
@@ -85,18 +84,7 @@ def check_details(self, with_checksum=False):
                 file_name = None
 
                 if files_url:
-                    filename = urlparse(files_url).path.split("/")[-1]
-                    try:
-                        if not self._validate_filename(filename):
-                            msg = "Invalid file name format. It was not possible to get the file name."
-                            logger.error(msg)
-                            return False, {"error": msg}
-
-                        file_name = filename
-                    except Exception as e:
-                        msg = f"It was not possible to get the file name. {e}"
-                        logger.warning(msg)
-                        return False, {"error": msg}
+                    file_name = urlparse(files_url).path.split("/")[-1]
 
                 if not content_length and content_range:
                     # sometimes servers send content-range instead
@@ -128,10 +116,6 @@ def check_details(self, with_checksum=False):
 
         return False, {}
 
-    def _validate_filename(self, header: str) -> bool:
-        pattern = re.compile(r"\\|\.\.|/")
-        return not bool(pattern.findall(header))
-
     def _get_result_from_url(self, with_checksum=False):
         func, func_args = self._get_func_and_args()
 

diff --git a/ocean_provider/file_types/file_types.py b/ocean_provider/file_types/file_types.py
@@ -1,12 +1,16 @@
 import json
 import logging
 import os
+import re
+import copy
 from typing import Any, Optional, Tuple
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 from uuid import uuid4
 
 from enforce_typing import enforce_types
 from ocean_provider.file_types.definitions import EndUrlType, FilesType
+import requests
+from ocean_provider.utils.url import is_safe_url
 
 logger = logging.getLogger(__name__)
 
@@ -34,11 +38,22 @@ def validate_dict(self) -> Tuple[bool, Any]:
         if self.method not in ["get", "post"]:
             return False, f"Unsafe method {self.method}."
 
+        if not self.validate_url(self.url):
+            msg = "Invalid file name format. It was not possible to get the file name."
+            logger.error(msg)
+            return False, msg
+
         return True, self
 
     def get_download_url(self):
         return self.url
 
+    def validate_url(self, url: str) -> bool:
+        pattern = re.compile(r"^(.+)\/([^/]+)$")
+        if url.startswith("http://") or url.startswith("https://"):
+            return True
+        return not bool(pattern.findall(url))
+
     @enforce_types
     def get_filename(self) -> str:
         return self.url.split("/")[-1]
@@ -106,6 +121,73 @@ def get_download_url(self):
     def get_filename(self):
         return uuid4().hex
 
+    def check_details(self, with_checksum=False):
+        """
+        If the url argument is invalid, returns False and empty dictionary.
+        Otherwise it returns True and a dictionary containing contentType and
+        contentLength. File name remains empty.
+        """
+
+        url = self.get_download_url()
+
+        try:
+            if not is_safe_url(url):
+                return False, {}
+            status_code = None
+            headers = None
+            files_url = None
+            for _ in range(int(os.getenv("REQUEST_RETRIES", 1))):
+                result, extra_data = self._get_result_from_url(
+                    with_checksum=with_checksum,
+                )
+                if result:
+                    status_code = result.status_code
+                    headers = copy.deepcopy(result.headers)
+                    files_url = ""
+                    # always close requests session, see https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow
+                    result.close()
+                    if status_code == 200:
+                        break
+
+            if status_code == 200:
+                content_type = headers.get("Content-Type")
+                content_length = headers.get("Content-Length")
+                content_range = headers.get("Content-Range")
+                file_name = None
+
+                if files_url:
+                    file_name = urlparse(files_url).path.split("/")[-1]
+
+                if not content_length and content_range:
+                    # sometimes servers send content-range instead
+                    try:
+                        content_length = content_range.split("-")[1]
+                    except IndexError:
+                        pass
+
+                if content_type:
+                    try:
+                        content_type = content_type.split(";")[0]
+                    except IndexError:
+                        pass
+
+                if content_type or content_length or file_name:
+                    details = {
+                        "contentLength": content_length or "",
+                        "contentType": content_type or "",
+                        "filename": file_name or "",
+                    }
+
+                    if extra_data:
+                        details.update(extra_data)
+
+                    self.checked_details = details
+                    return True, details
+        except requests.exceptions.RequestException:
+            pass
+
+        return False, {}
+
 
 class GraphqlQuery(EndUrlType, FilesType):
     @enforce_types

diff --git a/ocean_provider/utils/test/test_util.py b/ocean_provider/utils/test/test_util.py
@@ -390,6 +390,17 @@ def test_validate_url_object():
     assert result is False
     assert message == "Unsafe method delete."
 
+    url_object = {
+        "url": "./../dir",
+        "type": "url",
+        "method": "GET",
+    }
+    result, message = FilesTypeFactory.validate_and_create(url_object)
+    assert result is False
+    assert (
+        message == "Invalid file name format. It was not possible to get the file name."
+    )
+
 
 @pytest.mark.unit
 def test_build_download_response_ipfs():

diff --git a/tests/test_fileinfo.py b/tests/test_fileinfo.py
@@ -171,6 +171,7 @@ def test_check_arweave_good(client):
         assert file_info["contentType"] == "application/octet-stream"
         assert file_info["valid"] is True
         assert file_info["type"] == "arweave"
+        assert file_info["filename"] == ""
 
 
 @pytest.mark.unit