From 7c863de712bc52df7270b67708081bea4aa5dd22 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Thu, 19 Dec 2024 16:57:53 -0800
Subject: [PATCH] Revert schema changes that break compatibility with the R/BiocFileCache implementation (#26)

* Added tests for Python 3.13
---
 .github/workflows/pypi-publish.yml | 73 ++++++++++++------------
 .github/workflows/pypi-test.yml    | 47 +++++++--------
 .gitignore                         |  3 +
 CHANGELOG.md                       |  5 ++
 src/pybiocfilecache/cache.py       | 91 ++++++++++++------------------
 src/pybiocfilecache/config.py      |  9 ---
 src/pybiocfilecache/models.py      | 20 ++-----
 tests/test_cache.py                |  2 +-
 8 files changed, 108 insertions(+), 142 deletions(-)

diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
index 030cd10..7eaf26a 100644
--- a/.github/workflows/pypi-publish.yml
+++ b/.github/workflows/pypi-publish.yml
@@ -9,43 +9,44 @@ on:
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v5
-      with:
-        python-version: 3.11
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest tox
-    # - name: Lint with flake8
-    #   run: |
-    #     # stop the build if there are Python syntax errors or undefined names
-    #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-    #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-    #     # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with tox
-      run: |
-        tox
-    - name: Build docs
-      run: |
-        tox -e docs
-    - run: touch ./docs/_build/html/.nojekyll
-    - name: GH Pages Deployment
-      uses: JamesIves/github-pages-deploy-action@4.1.3
-      with:
-        branch: gh-pages # The branch the action should deploy to.
-        folder: ./docs/_build/html
-        clean: true # Automatically remove deleted files from the deploy branch
-    - name: Build Project and Publish
-      run: |
-        python -m tox -e clean,build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
-      with:
-        user: __token__
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install tox
+
+      - name: Test with tox
+        run: |
+          tox
+
+      - name: Build docs
+        run: |
+          tox -e docs
+
+      - run: touch ./docs/_build/html/.nojekyll
+
+      - name: GH Pages Deployment
+        uses: JamesIves/github-pages-deploy-action@v4
+        with:
+          branch: gh-pages # The branch the action should deploy to.
+          folder: ./docs/_build/html
+          clean: true # Automatically remove deleted files from the deploy branch
+
+      - name: Build Project and Publish
+        run: |
+          python -m tox -e clean,build
+
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@v1.12.2
+        with:
+          user: __token__
           password: ${{ secrets.PYPI_PASSWORD }}
diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index 22f6c4a..90aa16a 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -1,40 +1,33 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Test the library
+name: Run tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [master]
   pull_request:
-    branches: [ master ]
+    branches: [master]
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
     strategy:
      matrix:
-        python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     name: Python ${{ matrix.python-version }}
     steps:
-    - uses: actions/checkout@v4
-    - name: Setup Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-        cache: 'pip'
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest tox
-    # - name: Lint with flake8
-    #   run: |
-    #     # stop the build if there are Python syntax errors or undefined names
-    #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-    #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-    #     # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with tox
-      run: |
-        tox
+      - uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install tox
+
+      - name: Test with tox
+        run: |
+          tox
diff --git a/.gitignore b/.gitignore
index 19d4b9b..d2d26c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,6 @@ MANIFEST
 # Per-project virtualenvs
 .venv*/
 .conda*/
+
+# remove cache
+cache
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9083c2..e01ca64 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Version 0.6.0
+
+- Reverting schema changes that break compatibility with the R/BiocFileCache implementation.
+- Added support for Python 3.13
+
 ## Version 0.5.5
 
 - chore: Remove Python 3.8 (EOL).
diff --git a/src/pybiocfilecache/cache.py b/src/pybiocfilecache/cache.py
index 6179e80..0f7a806 100644
--- a/src/pybiocfilecache/cache.py
+++ b/src/pybiocfilecache/cache.py
@@ -6,14 +6,14 @@
 from time import sleep, time
 from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
 
-from sqlalchemy import create_engine, func
+from sqlalchemy import create_engine, func, text
 from sqlalchemy.orm import Session, sessionmaker
 from sqlalchemy.pool import QueuePool
 
 from .config import CacheConfig
+from .const import SCHEMA_VERSION
 from .exceptions import (
     BiocCacheError,
-    CacheSizeLimitError,
     InvalidRnameError,
     NoFpathError,
     RnameExistsError,
@@ -25,7 +25,6 @@
     copy_or_move,
     create_tmp_dir,
     generate_id,
-    get_file_size,
     validate_rname,
 )
 
@@ -42,9 +41,7 @@ class BiocFileCache:
     Features:
     - Resource validation and integrity checking
     - Cache size management
-    - File compression
-    - Resource tagging
-    - Automatic cleanup of expired resources
+    - Cleanup of expired resources
     """
 
     def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optional[CacheConfig] = None):
@@ -66,10 +63,15 @@ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optiona
         self.config = config
 
         self._setup_cache_dir()
-        self._setup_database()
+        db_schema_version = self._setup_database()
+
+        if db_schema_version != SCHEMA_VERSION:
+            print(db_schema_version)
+            raise RuntimeError(f"Database version is not {SCHEMA_VERSION}.")
+
         self._last_cleanup = datetime.now()
 
-    def _setup_cache_dir(self) -> None:
+    def _setup_cache_dir(self) -> bool:
         if not self.config.cache_dir.exists():
             self.config.cache_dir.mkdir(parents=True, exist_ok=True)
 
@@ -87,6 +89,28 @@ def _setup_database(self) -> None:
         Base.metadata.create_all(self.engine)
         self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
 
+        with self.engine.connect() as conn:
+            result = conn.execute(
+                text("""
+                SELECT value FROM metadata
+                WHERE key = 'schema_version'
+                """)
+            )
+            row = result.fetchone()
+
+            if row is not None:
+                return row[0]
+
+            conn.execute(
+                text("""
+                INSERT INTO metadata (key, value)
+                VALUES ('schema_version', :version);
+                """),
+                {"version": SCHEMA_VERSION},
+            )
+
+            return SCHEMA_VERSION
+
     def _get_detached_resource(self, session: Session, resource: Resource) -> Optional[Resource]:
         """Get a detached copy of a resource."""
         if resource is None:
@@ -118,17 +142,6 @@ def get_session(self) -> Iterator[Session]:
         finally:
             session.close()
 
-    def _check_cache_size(self, new_size: int) -> None:
-        """Verify cache size limit won't be exceeded."""
-        if self.config.max_size_bytes is None:
-            return
-
-        current_size = self.get_cache_size()
-        if current_size + new_size > self.config.max_size_bytes:
-            raise CacheSizeLimitError(
-                f"Adding {new_size} bytes would exceed cache limit of " f"{self.config.max_size_bytes} bytes"
-            )
-
     def _validate_rname(self, rname: str) -> None:
         """Validate resource name format."""
         if not validate_rname(rname, self.config.rname_pattern):
@@ -218,7 +231,6 @@ def add(
         fpath: Union[str, Path],
         rtype: Literal["local", "web", "relative"] = "local",
         action: Literal["copy", "move", "asis"] = "copy",
-        tags: Optional[List[str]] = None,
         expires: Optional[datetime] = None,
         ext: bool = False,
     ) -> Resource:
@@ -240,9 +252,6 @@ def add(
                 How to handle the file ("copy", "move", or "asis").
                 Defaults to ``copy``.
 
-            tags:
-                Optional list of tags for categorization.
-
             expires:
                 Optional expiration datetime. If None, resource never expires.
@@ -267,8 +276,6 @@ def add(
         rid = generate_id()
         rpath = self.config.cache_dir / f"{rid}{fpath.suffix if ext else ''}" if action != "asis" else fpath
 
-        self._check_cache_size(get_file_size(fpath))
-
         # Create resource record
         resource = Resource(
             rid=rid,
@@ -276,9 +283,7 @@ def add(
             rpath=str(rpath),
             rtype=rtype,
             fpath=str(fpath),
-            tags=",".join(tags) if tags else None,
             expires=expires,
-            size_bytes=get_file_size(fpath),
         )
 
         # Store file and update database
@@ -287,7 +292,7 @@ def add(
             session.commit()
 
             try:
-                copy_or_move(fpath, rpath, rname, action, self.config.compression)
+                copy_or_move(fpath, rpath, rname, action, False)
 
                 # Calculate and store checksum
                 resource.etag = calculate_file_hash(rpath, self.config.hash_algorithm)
@@ -324,7 +329,6 @@ def update(
         rname: str,
         fpath: Union[str, Path],
         action: Literal["copy", "move", "asis"] = "copy",
-        tags: Optional[List[str]] = None,
     ) -> Resource:
         """Update an existing resource.
 
@@ -339,9 +343,6 @@ def update(
                 Either ``copy``, ``move`` or ``asis``.
                 Defaults to ``copy``.
 
-            tags:
-                Optional new list of tags.
-
         Returns:
             Updated `Resource` object.
 
@@ -358,14 +359,10 @@ def update(
             old_path = Path(resource.rpath)
 
             try:
-                copy_or_move(fpath, old_path, rname, action, self.config.compression)
+                copy_or_move(fpath, old_path, rname, action, False)
 
                 resource.last_modified_time = datetime.now()
                 resource.etag = calculate_file_hash(old_path, self.config.hash_algorithm)
-                resource.size_bytes = get_file_size(old_path)
-
-                if tags is not None:
-                    resource.tags = ",".join(tags)
 
                 session.commit()
                 return self._get_detached_resource(session, resource)
@@ -404,14 +401,10 @@ def remove(self, rname: str) -> None:
                 session.rollback()
                 raise BiocCacheError(f"Failed to remove resource '{rname}'") from e
 
-    def list_resources(
-        self, tag: Optional[str] = None, rtype: Optional[str] = None, expired: Optional[bool] = None
-    ) -> List[Resource]:
+    def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> List[Resource]:
        """List resources in the cache with optional filtering.
 
         Args:
-            tag:
-                Filter resources by tag.
             rtype:
                 Filter resources by type.
@@ -429,8 +422,6 @@ def list_resources(
         with self.get_session() as session:
             query = session.query(Resource)
 
-            if tag:
-                query = query.filter(Resource.tags.like(f"%{tag}%"))
             if rtype:
                 query = query.filter(Resource.rtype == rtype)
             if expired is not None:
@@ -468,11 +459,6 @@ def validate_resource(self, resource: Resource) -> bool:
             logger.error(f"Failed to validate resource: {resource.rname}", exc_info=e)
             return False
 
-    def get_cache_size(self) -> int:
-        """Get total size of cached files in bytes."""
-        with self.get_session() as session:
-            return session.query(func.sum(Resource.size_bytes)).scalar() or 0
-
     def export_metadata(self, path: Path) -> None:
         """Export cache metadata to JSON file."""
         data = {
@@ -480,14 +466,11 @@ def export_metadata(self, path: Path) -> None:
                 {
                     "rname": r.rname,
                     "rtype": r.rtype,
-                    "tags": r.tags,
                     "expires": r.expires.isoformat() if r.expires else None,
                     "etag": r.etag,
-                    "size_bytes": r.size_bytes,
                 }
                 for r in self.list_resources()
             ],
-            "cache_size": self.get_cache_size(),
             "export_time": datetime.now().isoformat(),
         }
 
@@ -503,7 +486,6 @@ def import_metadata(self, path: Path) -> None:
             for resource_data in data["resources"]:
                 resource = self._get(session, resource_data["rname"])
                 if resource:
-                    resource.tags = resource_data["tags"]
                     resource.expires = (
                         datetime.fromisoformat(resource_data["expires"]) if resource_data["expires"] else None
                     )
@@ -532,7 +514,7 @@ def search(self, query: str, field: str = "rname", exact: bool = False) -> List[
                 Search string.
 
             field:
-                Resource field to search ("rname", "rtype", "tags", etc.).
+                Resource field to search ("rname", "rtype", etc.).
 
             exact:
                 Whether to require exact match.
@@ -565,7 +547,6 @@ def get_stats(self) -> Dict[str, Any]:
         return {
             "total_resources": total,
             "expired_resources": expired,
-            "cache_size_bytes": self.get_cache_size(),
             "resource_types": types,
             "last_cleanup": self._last_cleanup.isoformat(),
             "cleanup_enabled": self.config.cleanup_interval is not None,
diff --git a/src/pybiocfilecache/config.py b/src/pybiocfilecache/config.py
index 9f120ad..e2dc88e 100644
--- a/src/pybiocfilecache/config.py
+++ b/src/pybiocfilecache/config.py
@@ -16,10 +16,6 @@ class CacheConfig:
     cache_dir:
         Directory to store cached files.
 
-    max_size_bytes:
-        Maximum total size of cache.
-        None for unlimited.
-
     cleanup_interval:
         How often to run expired resource cleanup.
         None for no cleanup.
@@ -29,14 +25,9 @@ class CacheConfig:
     hash_algorithm:
         Algorithm to use for file checksums.
-
-    compression:
-        Whether to compress cached files.
     """
 
     cache_dir: Path
-    max_size_bytes: Optional[int] = None
     cleanup_interval: Optional[timedelta] = None  # timedelta(days=30)
     rname_pattern: str = r"^[a-zA-Z0-9_-]+$"
     hash_algorithm: str = "md5"
-    compression: bool = False
diff --git a/src/pybiocfilecache/models.py b/src/pybiocfilecache/models.py
index d944575..8040b99 100644
--- a/src/pybiocfilecache/models.py
+++ b/src/pybiocfilecache/models.py
@@ -13,7 +13,7 @@ class Metadata(Base):
     __tablename__ = "metadata"
 
-    key = Column(Text(), primary_key=True, index=True)
+    key = Column(Text(), primary_key=True, index=True, unique=True)
     value = Column(Text())
 
     def __repr__(self) -> str:
@@ -56,29 +56,21 @@ class Resource(Base):
 
     expires:
         When the resource should be considered expired.
-
-    tags:
-        Optional comma-separated tags for categorization.
-
-    size_bytes:
-        Size of the resource in bytes.
     """
 
     __tablename__ = "resource"
 
     id = Column(Integer, primary_key=True, index=True, autoincrement=True)
-    rid = Column(Text(), index=True)
-    rname = Column(Text(), index=True, unique=True)
+    rid = Column(Text())
+    rname = Column(Text())
     create_time = Column(DateTime, server_default=func.now())
     access_time = Column(DateTime, server_default=func.now())
     rpath = Column(Text())
     rtype = Column(Text())
     fpath = Column(Text())
-    last_modified_time = Column(DateTime, onupdate=func.now())
-    etag = Column(Text())
-    expires = Column(DateTime)
-    tags = Column(Text())
-    size_bytes = Column(Integer)
+    last_modified_time = Column(DateTime, onupdate=func.now(), default=None)
+    etag = Column(Text(), default=None)
+    expires = Column(DateTime, default=None)
 
     def __repr__(self) -> str:
         return f""
diff --git a/tests/test_cache.py b/tests/test_cache.py
index 8ce25ce..23924d2 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -17,7 +17,7 @@ def test_create_cache():
     bfc.purge()
 
 
-def test_add_get_operations():
+def test_add_get_list_operations():
     bfc = BiocFileCache(CACHE_DIR)
 
     rtrip = bfc.add("test1", os.getcwd() + "/tests/data/test1.txt")