From c1fa09dadc733f01a527d1f8fa64a6f4f282df34 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 7 Jan 2025 22:34:35 -0500 Subject: [PATCH] Minor enhancements and support for web links (#29) * Remove unneeded custom exceptions * get a resource either by rname or rid * Add metadata CRUD methods * Update tests --- CHANGELOG.md | 6 +- src/pybiocfilecache/cache.py | 152 ++++++++++++++++++++++-------- src/pybiocfilecache/exceptions.py | 31 ------ src/pybiocfilecache/models.py | 2 +- src/pybiocfilecache/utils.py | 16 +++- tests/test_cache.py | 78 ++++++++++++++- 6 files changed, 204 insertions(+), 81 deletions(-) delete mode 100644 src/pybiocfilecache/exceptions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 41dc724..8b55b8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ # Changelog -## Version 0.6.1 +## Version 0.6.1 - 0.6.2 - Generate rid's that match with R's cache. -- remove rname pattern checks. +- Remove rname pattern checks. +- Add functions to access metadata table. +- Add function to add web urls and download them if needed. - Rename GitHub actions for consistency with the rest of the packages. 
## Version 0.6.0 diff --git a/src/pybiocfilecache/cache.py b/src/pybiocfilecache/cache.py index 4258e4c..6436f55 100644 --- a/src/pybiocfilecache/cache.py +++ b/src/pybiocfilecache/cache.py @@ -11,20 +11,14 @@ from .config import CacheConfig from .const import SCHEMA_VERSION -from .exceptions import ( - BiocCacheError, - InvalidRnameError, - NoFpathError, - RnameExistsError, - RpathTimeoutError, -) -from .models import Base, Resource +from .models import Base, Metadata, Resource from .utils import ( calculate_file_hash, copy_or_move, create_tmp_dir, + download_web_file, generate_id, - validate_rname, + generate_uuid, ) __author__ = "Jayaram Kancherla" @@ -65,7 +59,6 @@ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optiona db_schema_version = self._setup_database() if db_schema_version != SCHEMA_VERSION: - print(db_schema_version) raise RuntimeError(f"Database version is not {SCHEMA_VERSION}.") self._last_cleanup = datetime.now() @@ -111,13 +104,15 @@ def _setup_database(self) -> None: return SCHEMA_VERSION - def _get_detached_resource(self, session: Session, resource: Resource) -> Optional[Resource]: + def _get_detached_resource( + self, session: Session, obj: Union[Resource, Metadata] + ) -> Optional[Union[Resource, Metadata]]: """Get a detached copy of a resource.""" - if resource is None: + if obj is None: return None - session.refresh(resource) - session.expunge(resource) - return resource + session.refresh(obj) + session.expunge(obj) + return obj def __enter__(self) -> "BiocFileCache": return self @@ -142,10 +137,10 @@ def get_session(self) -> Iterator[Session]: finally: session.close() - def _validate_rname(self, rname: str) -> None: - """Validate resource name format.""" - if not validate_rname(rname, self.config.rname_pattern): - raise InvalidRnameError(f"Resource name '{rname}' doesn't match pattern " f"'{self.config.rname_pattern}'") + # def _validate_rname(self, rname: str) -> None: + # """Validate resource name format.""" + # 
if not validate_rname(rname, self.config.rname_pattern): + # raise Exception(f"Resource name '{rname}' doesn't match pattern " f"'{self.config.rname_pattern}'") def _should_cleanup(self) -> bool: """Check if cache cleanup should be performed. @@ -196,16 +191,25 @@ def cleanup(self) -> int: self._last_cleanup = datetime.now() return removed - def get(self, rname: str) -> Optional[Resource]: + def get(self, rname: str = None, rid: str = None) -> Optional[Resource]: """Get resource by name from cache. Args: rname: Name to identify the resource in cache. + rid: + Resource id to search by. + """ + if rname is None and rid is None: + raise ValueError("either 'rname' or 'rid' must be provided.") + with self.get_session() as session: - resource = session.query(Resource).filter(Resource.rname == rname).first() + if rname is not None: + resource = session.query(Resource).filter(Resource.rname == rname).first() + elif rid is not None: + resource = session.query(Resource).filter(Resource.rid == rid).first() if resource is not None: # Check if path exists with timeout @@ -213,7 +217,7 @@ def get(self, rname: str) -> Optional[Resource]: timeout = 30 while not Path(str(resource.rpath)).exists(): if time() - start >= timeout: - raise RpathTimeoutError( + raise TimeoutError( f"For resource: '{rname}' the rpath does not exist " f"after {timeout} seconds." ) sleep(0.1) @@ -229,10 +233,11 @@ def add( self, rname: str, fpath: Union[str, Path], - rtype: Literal["local", "web", "relative"] = "local", + rtype: Literal["local", "web", "relative"] = "relative", action: Literal["copy", "move", "asis"] = "copy", expires: Optional[datetime] = None, - ext: bool = False, + download: bool = True, + ext: bool = True, ) -> Resource: """Add a resource to the cache. @@ -252,29 +257,41 @@ def add( How to handle the file ("copy", "move", or "asis"). Defaults to ``copy``. + download: + Whether to download the resource. + Only used if 'rtype' is "web". + expires: Optional expiration datetime. 
If None, resource never expires. ext: Whether to use filepath extension when storing in cache. - Defaults to `False`. + Defaults to `True`. Returns: The `Resource` object added to the cache. """ # self._validate_rname(rname) - fpath = Path(fpath) - - if not fpath.exists(): - raise NoFpathError(f"Resource at '{fpath}' does not exist") + fpath = Path(fpath) if rtype != "web" else fpath if self.get(rname) is not None: - raise RnameExistsError(f"Resource '{rname}' already exists") + raise FileExistsError(f"Resource '{rname}' already exists") + + if rtype == "web": + outpath = download_web_file(fpath, Path(fpath).name, download) + action = "copy" + else: + outpath = Path(fpath) + + if action == "asis": + logger.warning("If action='asis', rtype must be 'local'.") + rtype = "local" # Generate paths and check size rid = generate_id(size=len(self)) - rpath = self.config.cache_dir / f"{rid}{fpath.suffix if ext else ''}" if action != "asis" else fpath + uuid = generate_uuid() + rpath = self.config.cache_dir / f"{uuid}_{outpath.name if ext else outpath.stem}" if action != "asis" else fpath # Create resource record resource = Resource( @@ -292,7 +309,7 @@ def add( session.commit() try: - copy_or_move(fpath, rpath, rname, action, False) + copy_or_move(outpath, rpath, rname, action, False) # Calculate and store checksum resource.etag = calculate_file_hash(rpath, self.config.hash_algorithm) @@ -303,7 +320,7 @@ def add( except Exception as e: session.delete(resource) session.commit() - raise BiocCacheError("Failed to add resource") from e + raise Exception("Failed to add resource") from e def add_batch(self, resources: List[Dict[str, Any]]) -> List[Resource]: """Add multiple resources in a single transaction. 
@@ -349,7 +366,7 @@ def update( """ fpath = Path(fpath) if not fpath.exists(): - raise NoFpathError(f"File '{fpath}' does not exist") + raise FileNotFoundError(f"File '{fpath}' does not exist") with self.get_session() as session: resource = session.query(Resource).filter(Resource.rname == rname).first() @@ -369,7 +386,7 @@ def update( except Exception as e: session.rollback() - raise BiocCacheError("Failed to update resource") from e + raise Exception("Failed to update resource") from e def remove(self, rname: str) -> None: """Remove a resource from cache by name. @@ -381,7 +398,7 @@ def remove(self, rname: str) -> None: Name to identify the resource in cache. Raises: - BiocCacheError: If resource removal fails + Exception: If resource removal fails """ with self.get_session() as session: resource = session.query(Resource).filter(Resource.rname == rname).first() @@ -399,7 +416,7 @@ def remove(self, rname: str) -> None: except Exception as e: session.rollback() - raise BiocCacheError(f"Failed to remove resource '{rname}'") from e + raise Exception(f"Failed to remove resource '{rname}'") from e def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> List[Resource]: """List resources in the cache with optional filtering. @@ -564,7 +581,7 @@ def purge(self, force: bool = False) -> bool: True if purge was successful, False otherwise. Raises: - BiocCacheError: If purge fails and force=False. + Exception: If purge fails and force=False. 
""" try: with self.get_session() as session: @@ -577,7 +594,7 @@ def purge(self, force: bool = False) -> bool: except Exception as e: if not force: session.rollback() - raise BiocCacheError(f"Failed to remove file for resource '{resource.rname}'") from e + raise Exception(f"Failed to remove file for resource '{resource.rname}'") from e logger.warning(f"Failed to remove file for resource '{resource.rname}': {e}") session.commit() @@ -598,7 +615,7 @@ def purge(self, force: bool = False) -> bool: except Exception as e: if not force: - raise BiocCacheError("Failed to purge cache") from e + raise Exception("Failed to purge cache") from e logger.error("Database cleanup failed, forcing file removal", exc_info=e) for file in self.config.cache_dir.iterdir(): @@ -616,3 +633,58 @@ def purge(self, force: bool = False) -> bool: def __len__(self): with self.get_session() as session: return session.query(Resource).count() + + def check_metadata_key(self, key: str) -> bool: + """Check if a key exists in the metadata table. + + Args: + key: + Key to search. + + Returns: + True if the key exists, else False. 
+ """ + with self.get_session() as session: + return session.query(Metadata).filter(Metadata.key == key).count() != 0 + + def get_metadata(self, key: str): + """Get a metadata entry by key""" + with self.get_session() as session: + meta = session.query(Metadata).filter(Metadata.key == key).first() + if meta is not None: + return self._get_detached_resource(session, meta) + + return None + + def add_metadata(self, key: str, value: str): + """Add a new metadata key""" + exists = self.get_metadata(key=key) + + if exists is None: + meta = Metadata(key=key, value=value) + + with self.get_session() as session: + try: + session.add(meta) + session.commit() + return self._get_detached_resource(session, meta) + except Exception as e: + session.delete(meta) + session.commit() + raise Exception("Failed to add metadata") from e + else: + raise Exception(f"'key'={key} already exists in metadata.") + + def remove_metadata(self, key: str) -> None: + """Remove a metadata key.""" + with self.get_session() as session: + meta = session.query(Metadata).filter(Metadata.key == key).first() + + if meta is not None: + try: + session.delete(meta) + session.commit() + + except Exception as e: + session.rollback() + raise Exception(f"Failed to remove key '{key}'") from e diff --git a/src/pybiocfilecache/exceptions.py b/src/pybiocfilecache/exceptions.py deleted file mode 100644 index 7bfd11c..0000000 --- a/src/pybiocfilecache/exceptions.py +++ /dev/null @@ -1,31 +0,0 @@ -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -class BiocCacheError(Exception): - """Base exception for BiocFileCache errors.""" - - -class NoFpathError(BiocCacheError): - """Source file does not exist.""" - - -class RnameExistsError(BiocCacheError): - """Resource name already exists in cache.""" - - -class RpathTimeoutError(BiocCacheError): - """Resource path does not exist after timeout.""" - - -class CacheSizeLimitError(BiocCacheError): - """Cache size limit would be exceeded."""
- - -class ResourceValidationError(BiocCacheError): - """Resource failed validation check.""" - - -class InvalidRnameError(BiocCacheError): - """Invalid resource name format.""" diff --git a/src/pybiocfilecache/models.py b/src/pybiocfilecache/models.py index 31e4176..340bf37 100644 --- a/src/pybiocfilecache/models.py +++ b/src/pybiocfilecache/models.py @@ -73,4 +73,4 @@ class Resource(Base): expires = Column(DateTime, default=None) def __repr__(self) -> str: - return f"" + return f"" diff --git a/src/pybiocfilecache/utils.py b/src/pybiocfilecache/utils.py index 97bac34..738cb17 100644 --- a/src/pybiocfilecache/utils.py +++ b/src/pybiocfilecache/utils.py @@ -2,14 +2,13 @@ import logging import re import tempfile +import urllib.request import uuid import zlib from pathlib import Path from shutil import copy2, move from typing import Literal -from .exceptions import BiocCacheError - __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" __license__ = "MIT" @@ -86,4 +85,15 @@ def copy_or_move( elif action == "asis": pass except Exception as e: - raise BiocCacheError(f"Failed to store resource '{rname}' from '{source}' to '{target}'") from e + raise Exception(f"Failed to store resource '{rname}' from '{source}' to '{target}'") from e + + +def download_web_file(url: str, filename: str, download: bool): + tmp_dir = create_tmp_dir() + outpath = tmp_dir / filename + if download: + urllib.request.urlretrieve(str(url), str(outpath)) + else: + open(str(outpath), "a").close() + + return outpath diff --git a/tests/test_cache.py b/tests/test_cache.py index 23924d2..64b5488 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,19 +1,25 @@ import os import shutil -from pybiocfilecache import BiocFileCache +from datetime import timedelta +from pybiocfilecache import BiocFileCache, CacheConfig +from pathlib import Path +import pytest +import tempfile __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -CACHE_DIR = os.getcwd() + "/cache" 
+CACHE_DIR = tempfile.mkdtemp() + "/cache" def test_create_cache(): bfc = BiocFileCache(CACHE_DIR) assert os.path.exists(CACHE_DIR) + bfc.check_metadata_key("schema_version") + bfc.purge() @@ -21,9 +27,7 @@ def test_add_get_list_operations(): bfc = BiocFileCache(CACHE_DIR) rtrip = bfc.add("test1", os.getcwd() + "/tests/data/test1.txt") - print("rtrip: ", rtrip) rec1 = bfc.get("test1") - print("rec1: ", rec1) assert rec1 is not None bfc.add("test2", os.getcwd() + "/tests/data/test2.txt") @@ -48,6 +52,15 @@ def test_add_get_list_operations(): rtrip = bfc.list_resources() assert len(rtrip) == 3 + downurl = "https://bioconductor.org/packages/stats/bioc/BiocFileCache/BiocFileCache_2024_stats.tab" + add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web") + + row = bfc.get(rid=add_url.rid) + assert row.fpath == downurl + + rtrip = bfc.list_resources() + assert len(rtrip) == 4 + bfc.purge() @@ -67,3 +80,60 @@ def test_remove_operations(): assert rec1 is None bfc.purge() + +def test_meta_operations(): + bfc = BiocFileCache(CACHE_DIR) + + bfc.add("test1", os.getcwd() + "/tests/data/test1.txt") + rec1 = bfc.get("test1") + assert rec1 is not None + + with pytest.raises(Exception): + bfc.add_metadata("schema_version", "something") + + bfc.check_metadata_key("schema_version") + + bfc.add_metadata("language", "python") + + downurl = "https://bioconductor.org/packages/stats/bioc/BiocFileCache/BiocFileCache_2024_stats.tab" + add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web") + + rec = bfc.get_metadata("schema_version") + assert rec.value == "0.99.4" + + rec = bfc.get_metadata("language") + assert rec.value == "python" + + rtrip = bfc.list_resources() + assert len(rtrip) == 2 + + bfc.purge() + +def test_cache_with_config(): + # Create custom configuration + config = CacheConfig( + cache_dir=Path(CACHE_DIR), + cleanup_interval=timedelta(days=7), + ) + + bfc = BiocFileCache(config=config) + + bfc.check_metadata_key("schema_version") + + bfc.add("test1", 
os.getcwd() + "/tests/data/test1.txt") + rec1 = bfc.get("test1") + assert rec1 is not None + + rtrip = bfc.list_resources() + assert len(rtrip) == 1 + + bfc.update("test1", os.getcwd() + "/tests/data/test2.txt") + rec1 = bfc.get("test1") + assert rec1 is not None + + rtrip = bfc.list_resources() + assert len(rtrip) == 1 + + bfc.cleanup() + + bfc.purge()