From cfd162a7d605c01ea3efd4c8026a06d4b7aaa308 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Fri, 10 May 2024 07:14:35 -0700
Subject: [PATCH 1/8] update all docstrings

---
 src/gypsum_client/_utils.py      | 10 ++++
 src/gypsum_client/auth.py        |  4 +-
 src/gypsum_client/list_assets.py | 79 +++++++++++++++++++++++++++++---
 src/gypsum_client/s3_config.py   |  2 +-
 4 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py
index e67391a..cb948eb 100644
--- a/src/gypsum_client/_utils.py
+++ b/src/gypsum_client/_utils.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Optional
+from urllib.parse import quote
 
 import requests
 
@@ -77,3 +78,12 @@ def _list_for_prefix(
     resp = [_remove_slash_url(val) for val in resp if not val.startswith("..")]
 
     return resp
+
+
+def _fetch_json(path: str, url: str):
+    full_url = f"{url}/file/{quote(path)}"
+
+    req = requests.get(full_url)
+    req.raise_for_status()
+
+    return req.json()
diff --git a/src/gypsum_client/auth.py b/src/gypsum_client/auth.py
index 4e82cc4..1de7428 100644
--- a/src/gypsum_client/auth.py
+++ b/src/gypsum_client/auth.py
@@ -21,7 +21,7 @@ def _token_cache_path(cache_dir):
 
 def access_token(
     full: bool = False, request: bool = True, cache_dir: Optional[str] = None
-):
+) -> Optional[str]:
     """Get GitHub access token for authentication to the gypsum APIs.
 
     Args:
@@ -86,7 +86,7 @@ def set_access_token(
     github_url: str = "https://api.github.com",
     user_agent: Optional[str] = None,
     cache_dir: Optional[str] = None,
-):
+) -> dict:
     """Set GitHub access token for authentication to the gypsum APIs.
 
     Args:
diff --git a/src/gypsum_client/list_assets.py b/src/gypsum_client/list_assets.py
index 9c0469f..c3015f6 100644
--- a/src/gypsum_client/list_assets.py
+++ b/src/gypsum_client/list_assets.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import requests
 
 from ._utils import _list_for_prefix, _rest_url
@@ -9,15 +7,52 @@
 __license__ = "MIT"
 
 
-def list_projects(prefix: Optional[str] = None, url: str = _rest_url()):
-    return _list_for_prefix(prefix, url)
+def list_projects(url: str = _rest_url()) -> list:
+    """List all projects in the gypsum backend.
+
+    Args:
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        List of project names.
+    """
+    return _list_for_prefix(prefix=None, url=url)
+
+def list_assets(project: str, url: str = _rest_url()) -> list:
+    """List all assets in a project.
 
-def list_assets(project: str, url: str = _rest_url()):
+    Args:
+        project:
+            Project name.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        List of asset names.
+    """
     return _list_for_prefix(f"{project}/", url=url)
 
 
-def list_versions(project: str, asset: str, url=_rest_url()):
+def list_versions(project: str, asset: str, url: str = _rest_url()) -> list:
+    """List all versions for a project asset.
+
+    Args:
+        project:
+            Project name.
+
+        asset:
+            Asset name.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        List of versions.
+    """
     return _list_for_prefix(f"{project}/{asset}/", url=url)
 
 
@@ -28,7 +63,37 @@ def list_files(
     prefix: str = None,
     include_dot: bool = True,
     url: str = _rest_url(),
-):
+) -> list:
+    """List all files for a specified version of a project and asset.
+
+    Args:
+        project:
+            Project name.
+
+        asset:
+            Asset name.
+
+        version:
+            Version name.
+
+        prefix:
+            Prefix for the object key.
+
+            If provided, a file is only listed if its object key starts with
+            ``{project}/{asset}/{version}/{prefix}``.
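+
+            For example, ``prefix="foo"`` would list ``foo/bar.txt`` but not
+            ``blah.txt``.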
+ + Defaults to None and all associated files with this version of the + asset in the specified project are listed. + + include_dot: + Whether to list files with ``..`` in their names. + + url: + URL to the gypsum compatible API. + + Returns: + List of relative paths of files associated with the versioned asset. + """ _prefix = f"{project}/{asset}/{version}/" _trunc = len(_prefix) if prefix is not None: diff --git a/src/gypsum_client/s3_config.py b/src/gypsum_client/s3_config.py index f2b7235..75039ef 100644 --- a/src/gypsum_client/s3_config.py +++ b/src/gypsum_client/s3_config.py @@ -22,7 +22,7 @@ def _config_cache_path(cache_dir): def public_s3_config( refresh: bool = False, url: str = _rest_url(), cache_dir: Optional[str] = None -): +) -> dict: """Get S3 configuration to the bucket storing the data. Users can use this downstream to access the bucket directly using boto3. From 6eece97733976abfa2b415b5f7ca59f92139731f Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 07:57:17 -0700 Subject: [PATCH 2/8] untested fetch functions --- src/gypsum_client/_utils.py | 64 +++++++++++++++++++++++++++++++ src/gypsum_client/fetch_assets.py | 61 +++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 src/gypsum_client/fetch_assets.py diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py index cb948eb..205f9d3 100644 --- a/src/gypsum_client/_utils.py +++ b/src/gypsum_client/_utils.py @@ -1,9 +1,13 @@ +import json import os +import shutil +import tempfile from pathlib import Path from typing import Optional from urllib.parse import quote import requests +from filelock import FileLock __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -87,3 +91,63 @@ def _fetch_json(path: str, url: str): req.raise_for_status() return req.json() + + +BUCKET_CACHE_NAME = "bucket" + + +def _fetch_cacheable_json( + project: str, + asset: str, + version: str, + path: str, + cache: str, + url: str, + overwrite: bool, +): + bucket_path = f"{project}/{asset}/{version}/{path}" + + if cache is None: + return _fetch_json(bucket_path, url=url) + else: + _out_path = os.path.join( + cache, BUCKET_CACHE_NAME, project, asset, version, path + ) + if os.path.exists(_out_path): + _lock = FileLock(_out_path) + with _lock: + _save_file( + bucket_path, destination=_out_path, overwrite=overwrite, url=url + ) + + with open(_out_path) as jf: + return json.load(jf) + + +def _save_file( + path: str, destination: str, overwrite: bool, url: str, error: bool = True +): + if overwrite or not os.path.exists(destination): + os.makedirs(os.path.dirname(destination), exist_ok=True) + + with tempfile.NamedTemporaryFile( + dir=os.path.dirname(destination), delete=False + ) as tmp_file: + try: + full_url = f"{url}/file/{quote(path)}" + + req = requests.get(full_url, stream=True) + req.raise_for_status() + + for chunk in req.iter_content(chunk_size=None): + tmp_file.write(chunk) + except Exception as e: + if error: + raise Exception(f"Failed to save '{path}'; {str(e)}.") + else: + return False + + # Rename the temporary file to the destination + shutil.move(tmp_file.name, destination) + + return True diff --git a/src/gypsum_client/fetch_assets.py b/src/gypsum_client/fetch_assets.py new file mode 100644 index 0000000..c583485 --- /dev/null +++ b/src/gypsum_client/fetch_assets.py @@ -0,0 +1,61 @@ +from typing import Optional + +import requests + +from ._utils import _cache_directory, _fetch_cacheable_json, _fetch_json, _rest_url + +__author__ = "Jayaram Kancherla" 
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def fetch_latest(project: str, asset: str, url=_rest_url()) -> str:
+    """Fetch the latest version of a project's asset.
+
+    Args:
+        project:
+            Project name.
+
+        asset:
+            Asset name.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        Latest version of the asset.
+    """
+    resp = _fetch_json(f"{project}/{asset}/..latest", url=url)
+    return resp["version"]
+
+
+def fetch_manifest(
+    project, asset, version, cache=_cache_directory(), overwrite=False, url=_rest_url()
+):
+    """Fetch the manifest for a version of an asset of a project.
+
+    Args:
+        project:
+            Project name.
+
+        asset:
+            Asset name.
+
+        version:
+            Version name.
+
+        cache:
+            Path to the cache directory.
+
+        overwrite:
+            Whether to overwrite existing file in cache.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        _description_
+    """
+    return _fetch_cacheable_json(
+        project, asset, version, "..manifest", url=url, cache=cache, overwrite=overwrite
+    )

From f8fe5093d8eca5bc77341f79acbbc4f8f68867b0 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Fri, 10 May 2024 09:49:24 -0700
Subject: [PATCH 3/8] more fetch functions, still untested

---
 src/gypsum_client/_utils.py                |  26 +++
 src/gypsum_client/fetch_assets.py          | 179 +++++++++++++++++++--
 src/gypsum_client/fetch_metadata_schema.py |  55 +++++++
 3 files changed, 250 insertions(+), 10 deletions(-)
 create mode 100644 src/gypsum_client/fetch_metadata_schema.py

diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py
index 205f9d3..51cf2b8 100644
--- a/src/gypsum_client/_utils.py
+++ b/src/gypsum_client/_utils.py
@@ -1,7 +1,9 @@
 import json
 import os
+import re
 import shutil
 import tempfile
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional
 from urllib.parse import quote
 
 import requests
 from filelock import FileLock
@@ -151,3 +153,27 @@ def _save_file(
             shutil.move(tmp_file.name, destination)
 
     return True
+
+
+def _cast_datetime(x: list) -> list:
+    zend = [True if val.endswith("Z") else False for val in x]
+
+    for i, val in enumerate(x):
+        if zend[i]:
+            # strptime doesn't handle 'Z' offsets directly.
+            xz = x[i]
+            x[i] = xz[:-1] + "+0000"
+
+    if not all(zend):
+        # Remove colon in the timezone, which may confuse strptime.
+        for i, val in enumerate(x):
+            if not zend[i]:
+                x[i] = re.sub(":([0-9]{2})$", "\\1", val)
+
+    # Remove fractional seconds.
+    x = [re.sub("\\.[0-9]+", "", val) for val in x]
+
+    return [
+        datetime.strptime(val, "%Y-%m-%dT%H:%M:%S%z").replace(tzinfo=timezone.utc)
+        for val in x
+    ]
diff --git a/src/gypsum_client/fetch_assets.py b/src/gypsum_client/fetch_assets.py
index c583485..053e8bf 100644
--- a/src/gypsum_client/fetch_assets.py
+++ b/src/gypsum_client/fetch_assets.py
@@ -1,15 +1,20 @@
-from typing import Optional
+import os
 
-import requests
-
-from ._utils import _cache_directory, _fetch_cacheable_json, _fetch_json, _rest_url
+from ._utils import (
+    BUCKET_CACHE_NAME,
+    _cache_directory,
+    _cast_datetime,
+    _fetch_cacheable_json,
+    _fetch_json,
+    _rest_url,
+)
 
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
 __license__ = "MIT"
 
 
-def fetch_latest(project: str, asset: str, url=_rest_url()) -> str:
+def fetch_latest(project: str, asset: str, url: str = _rest_url()) -> str:
     """Fetch the latest version of a project's asset.
 
     Args:
@@ -30,8 +35,13 @@ def fetch_latest(project: str, asset: str, url=_rest_url()) -> str:
 
 
 def fetch_manifest(
-    project, asset, version, cache=_cache_directory(), overwrite=False, url=_rest_url()
-):
+    project: str,
+    asset: str,
+    version: str,
+    cache_dir: str = _cache_directory(),
+    overwrite: bool = False,
+    url: str = _rest_url(),
+) -> dict:
     """Fetch the manifest for a version of an asset of a project.
 
     Args:
@@ -44,7 +54,7 @@ def fetch_manifest(
         version:
             Version name.
 
-        cache:
+        cache_dir:
             Path to the cache directory.
 
         overwrite:
@@ -54,8 +64,157 @@ def fetch_manifest(
             URL to the gypsum compatible API.
 
     Returns:
-        _description_
+        Dictionary containing the manifest for this version.
+        Each element is named after the relative path of a file in this version.
+        The value of each element is another dictionary with the following fields:
+        - ``size``, an integer specifying the size of the file in bytes.
+        - ``md5sum``, a string containing the hex-encoded MD5 checksum of the file.
+        - Optional ``link``, a dictionary specifying the link destination for a file.
+
+          This contains the strings ``project``, ``asset``, ``version`` and ``path``.
+          If the link destination is itself a link, an ``ancestor`` dictionary will
+          be present that specifies the final location of the file after resolving
+          all intermediate links.
     """
     return _fetch_cacheable_json(
-        project, asset, version, "..manifest", url=url, cache=cache, overwrite=overwrite
+        project,
+        asset,
+        version,
+        "..manifest",
+        url=url,
+        cache=cache_dir,
+        overwrite=overwrite,
+    )
+
+
+def fetch_permissions(project: str, url: str = _rest_url()) -> dict:
+    """Fetch the permissions for a project.
+
+    Args:
+        project:
+            Project name.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        Dictionary containing the permissions for this project:
+        - ``owners``, a list of strings containing the GitHub users or
+          organizations that are owners of this project.
+        - ``uploaders``, a list of dictionaries specifying the users or
+          organizations who are authorized to upload to this project.
+          Each entry is a dictionary with the following fields:
+          - ``id``, a string containing the GitHub user or organization
+            that is authorized to upload.
+          - Optional ``asset``, a string containing the name of the asset
+            that the uploader is allowed to upload to. If not provided, there is
+            no restriction on the uploaded asset name.
+          - Optional ``version``, a string containing the name of the version
+            that the uploader is allowed to upload to. If not provided, there is
+            no restriction on the uploaded version name.
+          - Optional ``until``, a ``datetime`` object containing the expiry date
+            of this authorization. If not provided, the authorization does not
+            expire.
+          - Optional ``trusted``, whether the uploader is trusted.
+            If not provided, defaults to False.
+    """
+    perms = _fetch_json(f"{project}/..permissions", url=url)
+
+    for i, val in enumerate(perms["uploaders"]):
+        if "until" in val:
+            perms["uploaders"][i]["until"] = _cast_datetime(val["until"])
+
+    return perms
+
+
+def fetch_quota(project: str, url: str = _rest_url()) -> dict:
+    """Fetch the quota details for a project.
+
+    Args:
+        project:
+            Project name.
+
+        url:
+            URL to the gypsum compatible API.
+
+    Returns:
+        Dictionary containing ``baseline``, the baseline quota at time zero in bytes;
+        ``growth_rate``, the annual growth rate for the quota in bytes;
+        ``year``, the creation year (i.e., time zero) for this project.
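+
+    Example:
+        A minimal sketch of typical usage (``test-R`` is the test project used
+        elsewhere in this series):
+
+        .. code-block:: python
+
+            quota = fetch_quota("test-R")
+            print(quota["baseline"], quota["growth_rate"], quota["year"])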
+ """ + return _fetch_json(f"{project}/..quota", url=url) + + +def fetch_summary( + project: str, + asset: str, + version: str, + cache_dir: str = _cache_directory(), + overwrite: bool = False, + url: str = _rest_url(), +): + """Fetch the summary for a version of an asset of a project. + + Args: + project: + Project name. + + asset: + Asset name. + + version: + Version name. + + cache_dir: + Path to the cache directory. + + overwrite: + Whether to overwrite existing file in cache. + + url: + URL to the gypsum compatible API. + + Returns: + Dictionary containing the summary for this version, with the following fields: + - ``upload_user_id``, string containing the identity of the uploader. + - ``upload_start``, a POSIXct object containing the upload start time. + - ``upload_finish``, a POSIXct object containing the upload finish time. + - ``on_probation`` (optional), a logical scalar indicating whether the upload is probational. + If missing, this can be assumed to be False. + """ + _out = _fetch_cacheable_json( + project, + asset, + version, + "..summary", + cache=cache_dir, + overwrite=overwrite, + url=url, ) + + _out["upload_start"] = _cast_datetime(_out["upload_start"]) + _out["upload_finish"] < -_cast_datetime(_out["upload_finish"]) + + if _out["on_probation"] is True and cache_dir is not None: + os.unlink( + os.file.path( + cache_dir, BUCKET_CACHE_NAME, project, asset, version, "..summary" + ) + ) + + return _out + + +def fetch_usage(project: str, url: str = _rest_url()): + """Fetch the quota usage for a project. + + Args: + project: + Project name. + + url: + URL to the gypsum compatible API. + + Returns: + Numeric scalar specifying the quota usage for the project, in bytes. + """ + _usage = _fetch_json(f"{project}/..usage", url=url) + return _usage["total"] diff --git a/src/gypsum_client/fetch_metadata_schema.py b/src/gypsum_client/fetch_metadata_schema.py new file mode 100644 index 0000000..4565086 --- /dev/null +++ b/src/gypsum_client/fetch_metadata_schema.py @@ -0,0 +1,55 @@ +import os +import tempfile + +import requests +from filelock import FileLock + +from ._utils import _cache_directory + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def fetch_metadata_schema( + name: str = "bioconductor/v1.json", + cache_dir: str = None, + overwrite: bool = False, +) -> str: + """Fetch a JSON schema file for metadata to be inserted into a SQLite database. + + Args: + name: + Name of the schema. + Defaults to "bioconductor/v1.json". + + cache_dir: + Path to the cache directory. + + overwrite: + Whether to overwrite existing file in cache. + + Returns: + Path containing the downloaded schema. 
+ """ + if cache_dir is None: + cache_path = tempfile.mktemp(suffix=".json") + else: + cache_dir = os.path.join(_cache_directory(cache_dir), "schemas") + + cache_path = os.path.join(cache_dir, name) + os.makedirs(os.path.dirname(cache_path), exist_ok=True) + + if os.path.exists(cache_path) and not overwrite: + _lock = FileLock(cache_path) + with _lock: + return cache_path + + _lock = FileLock(cache_path) + with _lock: + url = "https://artifactdb.github.io/bioconductor-metadata-index/" + name + response = requests.get(url) + with open(cache_path, "wb") as f: + f.write(response.content) + + return cache_path From f08003469c28c0336f4846c779db28b44ffc6675 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 13:36:44 -0700 Subject: [PATCH 4/8] tests --- src/gypsum_client/_utils.py | 78 +++++++++--------------- src/gypsum_client/fetch_assets.py | 11 ++-- tests/test_fetch.py | 99 +++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 55 deletions(-) create mode 100644 tests/test_fetch.py diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py index 51cf2b8..3a1a234 100644 --- a/src/gypsum_client/_utils.py +++ b/src/gypsum_client/_utils.py @@ -1,12 +1,11 @@ import json import os -import re import shutil import tempfile -from datetime import datetime, timezone +from datetime import datetime from pathlib import Path from typing import Optional -from urllib.parse import quote +from urllib.parse import quote_plus import requests from filelock import FileLock @@ -87,7 +86,7 @@ def _list_for_prefix( def _fetch_json(path: str, url: str): - full_url = f"{url}/file/{quote(path)}" + full_url = f"{url}/file/{quote_plus(path)}" req = requests.get(full_url) req.raise_for_status() @@ -115,65 +114,44 @@ def _fetch_cacheable_json( _out_path = os.path.join( cache, BUCKET_CACHE_NAME, project, asset, version, path ) - if os.path.exists(_out_path): - _lock = FileLock(_out_path) - with _lock: - _save_file( - bucket_path, destination=_out_path, overwrite=overwrite, url=url - ) - with open(_out_path) as jf: + _save_file(bucket_path, destination=_out_path, overwrite=overwrite, url=url) + + with open(_out_path, "r") as jf: return json.load(jf) def _save_file( path: str, destination: str, overwrite: bool, url: str, error: bool = True ): - if overwrite or not os.path.exists(destination): + if overwrite is True or not os.path.exists(destination): os.makedirs(os.path.dirname(destination), exist_ok=True) - with tempfile.NamedTemporaryFile( - dir=os.path.dirname(destination), delete=False - ) as tmp_file: - try: - full_url = f"{url}/file/{quote(path)}" + _lock = FileLock(destination) + with _lock: + with tempfile.NamedTemporaryFile( + dir=os.path.dirname(destination), delete=False + ) as tmp_file: + try: + full_url = f"{url}/file/{quote_plus(path)}" - req = requests.get(full_url, stream=True) - req.raise_for_status() + req = requests.get(full_url, stream=True, verify=False) + req.raise_for_status() - for chunk in req.iter_content(chunk_size=None): - tmp_file.write(chunk) - except Exception as e: - if error: - raise Exception(f"Failed to save '{path}'; {str(e)}.") - else: - return False + for chunk in req.iter_content(chunk_size=None): + tmp_file.write(chunk) + except Exception as e: + if error: + raise Exception(f"Failed to save '{path}'; {str(e)}.") + else: + return False - # Rename the temporary file to the destination - shutil.move(tmp_file.name, destination) + # Rename the temporary file to the destination + shutil.move(tmp_file.name, destination) return True -def 
_cast_datetime(x: list) -> list: - zend = [True if val.endswith("Z") else False for val in x] - - for i, val in enumerate(x): - if zend[i]: - # strptime doesn't handle 'Z' offsets directly. - xz = x[i] - x[i] = xz[:-1] + "+0000" - - if not all(zend): - # Remove colon in the timezone, which may confuse strptime. - for i, val in enumerate(x): - if not zend[i]: - x[i] = re.sub(":([0-9]{2})$", "\\1", val) - - # Remove fractional seconds. - x = [re.sub("\\.[0-9]+", "", val) for val in x] - - return [ - datetime.strptime(val, "%Y-%m-%dT%H:%M:%S%z").replace(tzinfo=timezone.utc) - for val in x - ] +def _cast_datetime(x): + # return datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").astimezone(tz=timezone.utc) + return datetime.fromisoformat(x) diff --git a/src/gypsum_client/fetch_assets.py b/src/gypsum_client/fetch_assets.py index 053e8bf..3e67b36 100644 --- a/src/gypsum_client/fetch_assets.py +++ b/src/gypsum_client/fetch_assets.py @@ -189,16 +189,17 @@ def fetch_summary( overwrite=overwrite, url=url, ) + print(_out) _out["upload_start"] = _cast_datetime(_out["upload_start"]) - _out["upload_finish"] < -_cast_datetime(_out["upload_finish"]) + _out["upload_finish"] = _cast_datetime(_out["upload_finish"]) - if _out["on_probation"] is True and cache_dir is not None: - os.unlink( - os.file.path( + if "on_probation" in _out: + if _out["on_probation"] is True and cache_dir is not None: + _out_path = os.path.join( cache_dir, BUCKET_CACHE_NAME, project, asset, version, "..summary" ) - ) + os.unlink(_out_path) return _out diff --git a/tests/test_fetch.py b/tests/test_fetch.py new file mode 100644 index 0000000..a670f51 --- /dev/null +++ b/tests/test_fetch.py @@ -0,0 +1,99 @@ +import json +import os +import tempfile +from datetime import datetime + +import pytest +from gypsum_client.fetch_assets import ( + fetch_latest, + fetch_manifest, + fetch_permissions, + fetch_quota, + fetch_summary, + fetch_usage, +) +from gypsum_client.fetch_metadata_schema import fetch_metadata_schema + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_fetch_manifest(): + cache = tempfile.mkdtemp() + man = fetch_manifest("test-R", "basic", "v1", cache_dir=cache) + assert sorted(man.keys()) == ["blah.txt", "foo/bar.txt"] + + # Uses the cache. + with open( + os.path.join(cache, "bucket", "test-R", "basic", "v1", "..manifest"), "w" + ) as f: + f.write("[]") + + man = fetch_manifest("test-R", "basic", "v1", cache_dir=cache) + assert len(man) == 0 + + # Unless we overwrite it. + man = fetch_manifest("test-R", "basic", "v1", cache_dir=cache, overwrite=True) + assert len(man) > 0 + + with pytest.raises(Exception): + fetch_manifest("test-R", "basic", "non-existent", cache_dir=cache) + + +def test_fetch_summary(): + cache = tempfile.mkdtemp() + xx = fetch_summary("test-R", "basic", "v1", cache_dir=cache) + original_user = xx["upload_user_id"] + assert isinstance(xx["upload_start"], datetime) + assert isinstance(xx["upload_finish"], datetime) + + # Uses the cache. + sumpath = os.path.join(cache, "bucket", "test-R", "basic", "v1", "..summary") + with open(sumpath, "w") as f: + json.dump( + { + "upload_user_id": "adrian", + "upload_start": "2022-01-01T01:01:01Z", + "upload_finish": "2022-01-01T01:01:02Z", + }, + f, + ) + xx = fetch_summary("test-R", "basic", "v1", cache_dir=cache) + assert xx["upload_user_id"] == "adrian" + + # Unless we overwrite it. 
+ xx = fetch_summary("test-R", "basic", "v1", cache_dir=cache, overwrite=True) + assert xx["upload_user_id"] == original_user + + # Self-deletes from the cache if it's on probation. + with open(sumpath, "w") as f: + json.dump( + { + "upload_user_id": "adrian", + "upload_start": "2022-01-01T01:01:01Z", + "upload_finish": "2022-01-01T01:01:02Z", + "on_probation": True, + }, + f, + ) + xx = fetch_summary("test-R", "basic", "v1", cache_dir=cache) + assert xx["on_probation"] + assert not os.path.exists(sumpath) + + with pytest.raises(Exception): + fetch_summary("test-R", "basic", "non-existent", cache_dir=cache) + + +def test_fetch_latest(): + assert fetch_latest("test-R", "basic") == "v3" + + +def test_fetch_usage(): + assert fetch_usage("test-R") > 0 + + +def test_fetch_permissions(): + perms = fetch_permissions("test-R") + assert isinstance(perms["owners"], list) + assert isinstance(perms["uploaders"], list) From 5dc4e0b4ce0fb9fe32fb1ae03f86be04ecb768f8 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 13:37:13 -0700 Subject: [PATCH 5/8] verify = false --- src/gypsum_client/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py index 3a1a234..3990b9e 100644 --- a/src/gypsum_client/_utils.py +++ b/src/gypsum_client/_utils.py @@ -135,7 +135,7 @@ def _save_file( try: full_url = f"{url}/file/{quote_plus(path)}" - req = requests.get(full_url, stream=True, verify=False) + req = requests.get(full_url, stream=True) req.raise_for_status() for chunk in req.iter_content(chunk_size=None): From 21f1fe8a16a0939760b5a5e0cb1541f42b127629 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 15:14:43 -0700 Subject: [PATCH 6/8] more tests --- setup.cfg | 1 + src/gypsum_client/_utils.py | 19 ++++---- src/gypsum_client/fetch_assets.py | 1 - src/gypsum_client/fetch_metadata_schema.py | 6 ++- src/gypsum_client/validate_metadata.py | 46 ++++++++++++++++++ tests/test_fetch_metadata_schema.py | 56 ++++++++++++++++++++++ 6 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 src/gypsum_client/validate_metadata.py create mode 100644 tests/test_fetch_metadata_schema.py diff --git a/setup.cfg b/setup.cfg index c54d8f4..8955837 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = importlib-metadata; python_version<"3.8" requests filelock + jsonschema [options.packages.find] diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py index 3990b9e..7835a34 100644 --- a/src/gypsum_client/_utils.py +++ b/src/gypsum_client/_utils.py @@ -2,7 +2,7 @@ import os import shutil import tempfile -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Optional from urllib.parse import quote_plus @@ -41,12 +41,10 @@ def _cache_directory(dir: Optional[str] = None): if dir is None: return current else: - if not os.path.exists(current): - raise FileNotFoundError( - f"Path {current} does not exist or is not accessible." - ) + if not os.path.exists(dir): + raise FileNotFoundError(f"Path {dir} does not exist or is not accessible.") - return current + return dir def _remove_slash_url(url: str): @@ -153,5 +151,10 @@ def _save_file( def _cast_datetime(x): - # return datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").astimezone(tz=timezone.utc) - return datetime.fromisoformat(x) + if x.endswith("Z"): + x = x[:-1] + + # Remove fractional seconds. 
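+    # e.g. "2022-01-01T01:01:01.123" becomes "2022-01-01T01:01:01".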
+    x = x.split(".")[0]
+
+    return datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
diff --git a/src/gypsum_client/fetch_assets.py b/src/gypsum_client/fetch_assets.py
index 3e67b36..25e484b 100644
--- a/src/gypsum_client/fetch_assets.py
+++ b/src/gypsum_client/fetch_assets.py
@@ -189,7 +189,6 @@ def fetch_summary(
         overwrite=overwrite,
         url=url,
     )
-    print(_out)
 
     _out["upload_start"] = _cast_datetime(_out["upload_start"])
     _out["upload_finish"] = _cast_datetime(_out["upload_finish"])
diff --git a/src/gypsum_client/fetch_metadata_schema.py b/src/gypsum_client/fetch_metadata_schema.py
index 4565086..3d12c18 100644
--- a/src/gypsum_client/fetch_metadata_schema.py
+++ b/src/gypsum_client/fetch_metadata_schema.py
@@ -32,17 +32,19 @@ def fetch_metadata_schema(
     Returns:
         Path containing the downloaded schema.
     """
+    cache_path = None
     if cache_dir is None:
         cache_path = tempfile.mktemp(suffix=".json")
     else:
         cache_dir = os.path.join(_cache_directory(cache_dir), "schemas")
 
         cache_path = os.path.join(cache_dir, name)
-        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+        if not os.path.exists(cache_path):
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
 
     if os.path.exists(cache_path) and not overwrite:
         _lock = FileLock(cache_path)
-        with _lock:
+        if not _lock.is_locked:
             return cache_path
 
     _lock = FileLock(cache_path)
diff --git a/src/gypsum_client/validate_metadata.py b/src/gypsum_client/validate_metadata.py
new file mode 100644
index 0000000..9a5c508
--- /dev/null
+++ b/src/gypsum_client/validate_metadata.py
@@ -0,0 +1,46 @@
+import json
+from typing import Optional, Union
+
+from jsonschema import validate as json_validate
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def validate_metadata(
+    metadata: Union[str, dict], schema: str, stringify: Optional[bool] = None
+) -> bool:
+    """Validate metadata against a JSON schema for a SQLite database.
+
+    Args:
+        metadata:
+            Metadata to be checked.
+
+            Usually a dictionary, but may also be a JSON-formatted string.
+
+        schema:
+            Path to a schema.
+
+        stringify:
+            Whether to convert ``metadata`` to a JSON-formatted string.
+            Defaults to True if ``metadata`` is not already a string.
+
+    Returns:
+        True if metadata conforms to schema.
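+
+    Example:
+        A minimal sketch (``metadata`` is a dictionary that conforms to the
+        chosen schema; see ``tests/test_fetch_metadata_schema.py`` for a full
+        example):
+
+        .. code-block:: python
+
+            schema = fetch_metadata_schema()
+            assert validate_metadata(metadata, schema)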
+ """ + if stringify is None: + stringify = not isinstance(metadata, str) + + if stringify: + metadata = json.dumps(metadata) + + with open(schema) as f: + schema_data = json.load(f) + + try: + json_validate(instance=json.loads(metadata), schema=schema_data) + except Exception as e: + raise ValueError(f"Metadata validation failed: {e}") + + return True diff --git a/tests/test_fetch_metadata_schema.py b/tests/test_fetch_metadata_schema.py new file mode 100644 index 0000000..45eb45c --- /dev/null +++ b/tests/test_fetch_metadata_schema.py @@ -0,0 +1,56 @@ +import json +import tempfile + +import pytest +from gypsum_client.fetch_metadata_schema import fetch_metadata_schema +from gypsum_client.validate_metadata import validate_metadata + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_fetchMetadataSchema(): + _cache_dir = tempfile.mkdtemp() + path = fetch_metadata_schema(cache_dir=_cache_dir) + assert isinstance(json.load(open(path)), dict) + + # Uses the cache + with open(path, "w") as f: + f.write("FOO_BAR") + + with pytest.raises(Exception): + json.load(open(path)) + + path2 = fetch_metadata_schema(cache_dir=_cache_dir) + assert path == path2 + assert open(path).read().strip() == "FOO_BAR" + + # Unless we overwrite it + man = fetch_metadata_schema(cache_dir=_cache_dir, overwrite=True) + assert isinstance(json.load(open(path)), dict) + + +def test_validateMetadata(): + + _cache_dir = tempfile.mkdtemp() + + metadata = { + "title": "Fatherhood", + "description": "Luke ich bin dein Vater.", + "sources": [{"provider": "GEO", "id": "GSE12345"}], + "taxonomy_id": ["9606"], + "genome": ["GRCm38"], + "maintainer_name": "Darth Vader", + "maintainer_email": "vader@empire.gov", + "bioconductor_version": "3.10", + } + + schema = fetch_metadata_schema(cache_dir=_cache_dir) + assert validate_metadata(metadata, schema) + + assert validate_metadata(json.dumps(metadata), schema) + + metadata.pop("bioconductor_version", None) + with pytest.raises(Exception): + validate_metadata(metadata, schema) From a884ba81a5f0e7de800ef63e59b58881c61e326d Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 15:15:32 -0700 Subject: [PATCH 7/8] export fetch --- src/gypsum_client/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/gypsum_client/__init__.py b/src/gypsum_client/__init__.py index 0fcc2b7..c733cd7 100644 --- a/src/gypsum_client/__init__.py +++ b/src/gypsum_client/__init__.py @@ -16,5 +16,14 @@ del version, PackageNotFoundError from .auth import access_token, set_access_token +from .fetch_assets import ( + fetch_latest, + fetch_manifest, + fetch_permissions, + fetch_quota, + fetch_summary, + fetch_usage, +) +from .fetch_metadata_schema import fetch_metadata_schema from .list_assets import list_assets, list_files, list_projects, list_versions from .s3_config import public_s3_config From 4b5bd288759897581e630ba89215c31ce199f361 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 10 May 2024 16:00:44 -0700 Subject: [PATCH 8/8] also fetch database --- src/gypsum_client/_utils.py | 25 +++++ src/gypsum_client/fetch_metadata_database.py | 112 +++++++++++++++++++ tests/test_fetch_metadata_database.py | 32 ++++++ 3 files changed, 169 insertions(+) create mode 100644 src/gypsum_client/fetch_metadata_database.py create mode 100644 tests/test_fetch_metadata_database.py diff --git a/src/gypsum_client/_utils.py b/src/gypsum_client/_utils.py index 7835a34..7569881 100644 --- a/src/gypsum_client/_utils.py +++ 
b/src/gypsum_client/_utils.py @@ -158,3 +158,28 @@ def _cast_datetime(x): x = x.split(".")[0] return datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").astimezone(tz=timezone.utc) + + +def _rename_file(src: str, dest: str): + try: + os.rename(src, dest) + except OSError: + try: + # If renaming fails, try copying + shutil.copy(src, dest) + os.remove(src) # Remove the original file after copying + except Exception as e: + raise RuntimeError( + f"Cannot move temporary file for '{src}' to its destination '{dest}': {e}." + ) + + +def _download_and_rename_file(url: str, dest: str): + tmp = tempfile.NamedTemporaryFile(dir=os.path.dirname(dest), delete=False).name + req = requests.get(url, stream=True) + + with open(tmp, "wb") as f: + for chunk in req.iter_content(): + f.write(chunk) + + _rename_file(tmp, dest) diff --git a/src/gypsum_client/fetch_metadata_database.py b/src/gypsum_client/fetch_metadata_database.py new file mode 100644 index 0000000..16372cb --- /dev/null +++ b/src/gypsum_client/fetch_metadata_database.py @@ -0,0 +1,112 @@ +import os +import tempfile +import time + +import requests +from filelock import FileLock + +from ._utils import _cache_directory, _download_and_rename_file + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + +LAST_CHECK = {"req_time": None, "mod_time": None} + + +def fetch_metadata_database( + name: str = "bioconductor.sqlite3", cache_dir: str = None, overwrite: bool = False +) -> str: + """Fetch the SQLite database containing metadata from the gypsum backend. + + This function will automatically check for updates to the SQLite files + and will download new versions accordingly. New checks are performed when one hour + or more has elapsed since the last check. If the check fails, a warning is raised + and the function returns the currently cached file. + + Args: + name: + Name of the database. + This can be the name of any SQLite file in + https://github.com/ArtifactDB/bioconductor-metadata-index/releases/tag/latest. + + Defaults to "bioconductor.sqlite3". + + cache_dir: + Path to the cache directory. + + Defaults to None. + + overwrite: + Whether to overwrite existing file. + + Defaults to False. + + Returns: + Path to the downloaded database. 
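+
+    Example:
+        A minimal sketch, assuming network access to the GitHub release
+        (the cache directory here is hypothetical):
+
+        .. code-block:: python
+
+            import sqlite3
+
+            db_path = fetch_metadata_database(cache_dir="./cache")
+            con = sqlite3.connect(db_path)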
+ """ + base_url = "https://github.com/ArtifactDB/bioconductor-metadata-index/releases/download/latest/" + + if cache_dir is None: + cache_path = tempfile.NamedTemporaryFile(suffix=".sqlite3").name + else: + cache_dir = os.path.join(_cache_directory(cache_dir), "databases") + + cache_path = os.path.join(cache_dir, name) + if not os.path.exists(cache_path): + os.makedirs(os.path.dirname(cache_path), exist_ok=True) + + if os.path.exists(cache_path) and not overwrite: + old_lastmod_raw = None + + _lock = FileLock(cache_path) + if not _lock.is_locked: + old_lastmod_raw = open(cache_path + ".modified").readlines() + + old_lastmod = float(old_lastmod_raw[0]) + new_lastmod = get_last_modified_date(base_url) + + print(old_lastmod, new_lastmod) + + if new_lastmod is not None and old_lastmod == new_lastmod: + return cache_path + + print("why is it here") + print(cache_path) + _lock = FileLock(cache_path) + with _lock: + mod_path = cache_path + ".modified" + _download_and_rename_file(base_url + "modified", mod_path) + _download_and_rename_file(base_url + name, cache_path) + + LAST_CHECK["req_time"] = get_current_unix_time() + LAST_CHECK["mod_time"] = float(open(mod_path).readline()) + + return cache_path + + +def get_current_unix_time(): + return time.time() * 1000 # milliseconds + + +def get_last_modified_date(base_url): + curtime = get_current_unix_time() + if ( + LAST_CHECK["req_time"] is not None + and LAST_CHECK["req_time"] + 60 * 60 * 1000 >= curtime + ): + return LAST_CHECK["mod_time"] + + mod_time = None + try: + url = base_url + "modified" + response = requests.get(url) + mod_time = float(response.text) + except Exception as e: + print("Failed to check the last modified timestamp:", str(e)) + + if mod_time is not None: + LAST_CHECK["req_time"] = curtime + LAST_CHECK["mod_time"] = mod_time + + return mod_time diff --git a/tests/test_fetch_metadata_database.py b/tests/test_fetch_metadata_database.py new file mode 100644 index 0000000..4c331ee --- /dev/null +++ b/tests/test_fetch_metadata_database.py @@ -0,0 +1,32 @@ +import os +import tempfile + +import pytest +from gypsum_client.fetch_metadata_database import fetch_metadata_database, LAST_CHECK + + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_fetch_metadata_database(): + _cache_dir = tempfile.mkdtemp() + + path = fetch_metadata_database(cache_dir=_cache_dir) + print(path) + assert os.path.getsize(path) > 0 + assert isinstance(LAST_CHECK["req_time"], float) + assert not isinstance(LAST_CHECK["req_time"], bool) + assert not isinstance(LAST_CHECK["mod_time"], bool) + assert isinstance(LAST_CHECK["mod_time"], float) + + # Uses the cache. + with open(path, "w") as f: + f.write("FOO_BAR") + + path2 = fetch_metadata_database(cache_dir=_cache_dir) + print(path2, "oath2") + assert path == path2 + + assert open(path).read().strip() == "FOO_BAR"