From 0b7a136715bd0a6b8a0bbe3975130f4993fa8dd0 Mon Sep 17 00:00:00 2001
From: Dima Ryazanov
Date: Sun, 13 Feb 2022 21:05:08 -0800
Subject: [PATCH] Add gzip deserialization (#2677)

---
Review notes (commentary only: text between the "---" marker above and the
first "diff --git" line is not recorded in the commit by `git am`):

  * This copy was restored from a whitespace-collapsed version of the patch.
    Line breaks and Python indentation were rebuilt from the syntax, and
    blank context lines were re-inserted so that every hunk matches the
    line counts in its "@@" header. The position of the blank line after
    `pkg.build('foo/bar')` in test_packages.py is inferred; verify it
    against the target tree before applying.
  * The "From:" header carries no "<email>" part; it appears to have been
    stripped and should be restored before running `git am`.
  * CompressionRegistry.search() declares `ext=None` but calls `ext.lower()`
    unconditionally, so `search()` / `search(None)` raises AttributeError.
    The only caller in this patch always passes a str ('' or a suffix), so
    the problem is latent; either drop the default or return None early.
  * search() already lower-cases and strips dots before calling
    handles_ext(), which then does `lstrip('.').lower()` again (redundant
    but harmless).
  * GzipCompressionHandler sets `handled_extensions` to a list while the
    base class uses a tuple, and leaves `name` as None.
  * CompressionRegistry.register() de-duplicates with `in`; the handler
    classes added here define no `__eq__`, so only re-registering the very
    same instance is de-duplicated (two GzipCompressionHandler() objects
    would both be kept).
  * In packages.py deserialize(), a key whose only suffix is a compression
    suffix (e.g. "x.gz") reaches FormatRegistry.search(None, meta, '').
    The outcome then depends on FormatRegistry, which is not part of this
    patch -- confirm that is the intended behaviour.

 api/python/quilt3/formats.py                  |  62 ++++++++++++++++++
 api/python/quilt3/packages.py                 |  13 +++-
 api/python/tests/integration/data/blah.txt.gz | Bin 0 -> 33 bytes
 api/python/tests/integration/test_packages.py |   2 +
 docs/CHANGELOG.md                             |   4 ++
 5 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 api/python/tests/integration/data/blah.txt.gz

diff --git a/api/python/quilt3/formats.py b/api/python/quilt3/formats.py
index 85fc8884223..d785cc7e59e 100644
--- a/api/python/quilt3/formats.py
+++ b/api/python/quilt3/formats.py
@@ -68,6 +68,7 @@
 
 import copy
 import csv
+import gzip
 import io
 import json
 import sys
@@ -1021,3 +1022,64 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
 # compat -- also handle 'pyarrow' in meta['target'] and meta['format']['name'].
 ParquetFormatHandler('pyarrow').register()
 ParquetFormatHandler().register()  # latest is preferred
+
+
+class CompressionRegistry:
+    """A collection for organizing `CompressionHandler` objects."""
+    registered_handlers = []
+
+    def __init__(self):
+        raise TypeError("The {!r} class is organizational, and cannot be instantiated."
+                        .format(type(self).__name__))
+
+    @classmethod
+    def register(cls, handler):
+        """Register a CompressionHandler instance"""
+        handlers = cls.registered_handlers
+
+        # no duplicates, just reprioritize.
+        if handler in handlers:
+            handlers.pop(handlers.index(handler))
+
+        handlers.insert(0, handler)
+
+    @classmethod
+    def search(cls, ext=None):
+        """Get a handler for the given extension"""
+        ext = ext.lower().strip('. ')
+
+        for handler in cls.registered_handlers:
+            if handler.handles_ext(ext):
+                return handler
+
+        return None
+
+
+class BaseCompressionHandler(ABC):
+    """Base class for compression handlers"""
+    name = None
+    handled_extensions = ()
+
+    @abstractmethod
+    def decompress(self, data):
+        "Decompress the given bytes object"
+        pass
+
+    def register(self):
+        """Register this format with CompressionRegistry"""
+        CompressionRegistry.register(self)
+
+    def handles_ext(self, ext):
+        """Check if this format handles the filetype indicated by an extension"""
+        return ext.lstrip('.').lower() in self.handled_extensions
+
+
+class GzipCompressionHandler(BaseCompressionHandler):
+    """Compression handler for gzip"""
+    handled_extensions = ['gz', 'gzip']
+
+    def decompress(self, data):
+        return gzip.decompress(data)
+
+
+GzipCompressionHandler().register()
diff --git a/api/python/quilt3/packages.py b/api/python/quilt3/packages.py
index 3ef40000b56..0bdf74f1682 100644
--- a/api/python/quilt3/packages.py
+++ b/api/python/quilt3/packages.py
@@ -33,7 +33,7 @@
     put_bytes,
 )
 from .exceptions import PackageException
-from .formats import FormatRegistry
+from .formats import CompressionRegistry, FormatRegistry
 from .telemetry import ApiTelemetry
 from .util import CACHE_PATH, DISABLE_TQDM, PACKAGE_UPDATE_POLICY
 from .util import TEMPFILE_DIR_PATH as APP_DIR_TEMPFILE_DIR
@@ -291,7 +291,13 @@ def deserialize(self, func=None, **format_opts):
         if func is not None:
             return func(data)
 
-        pkey_ext = pathlib.PurePosixPath(self.physical_key.path).suffix
+        suffixes = pathlib.PurePosixPath(self.physical_key.path).suffixes
+
+        pkey_ext = suffixes.pop() if suffixes else ''
+        compression_handler = CompressionRegistry.search(pkey_ext)
+
+        if compression_handler is not None:
+            pkey_ext = suffixes.pop() if suffixes else ''
 
         # Verify format can be handled before checking hash. Raises if none found.
         formats = FormatRegistry.search(None, self._meta, pkey_ext)
@@ -299,6 +305,9 @@ def deserialize(self, func=None, **format_opts):
         # Verify hash before deserializing..
         self._verify_hash(data)
 
+        if compression_handler is not None:
+            data = compression_handler.decompress(data)
+
         return formats[0].deserialize(data, self._meta, pkey_ext, **format_opts)
 
     def fetch(self, dest=None):
diff --git a/api/python/tests/integration/data/blah.txt.gz b/api/python/tests/integration/data/blah.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0591a8b5dd7062139ccaef41ede21a32caf9de0f
GIT binary patch
literal 33
ocmb2|=HN)##FE6ooRpK8p;uB-!eDG@`k0B~*ZUMv76t|e0HBizLjV8(

literal 0
HcmV?d00001

diff --git a/api/python/tests/integration/test_packages.py b/api/python/tests/integration/test_packages.py
index 975ac4ee389..84bb7f203c4 100644
--- a/api/python/tests/integration/test_packages.py
+++ b/api/python/tests/integration/test_packages.py
@@ -490,12 +490,14 @@ def test_package_deserialize(self):
             .set('foo', DATA_DIR / 'foo.txt', {'user_meta_foo': 'blah'})
             .set('bar', DATA_DIR / 'foo.unrecognized.ext')
             .set('baz', DATA_DIR / 'foo.txt')
+            .set('blah', DATA_DIR / 'blah.txt.gz')
         )
         pkg.build('foo/bar')
 
         pkg['foo'].meta['target'] = 'unicode'
         assert pkg['foo'].deserialize() == '123\n'
         assert pkg['baz'].deserialize() == '123\n'
+        assert pkg['blah'].deserialize() == '456\n'
 
         with pytest.raises(QuiltException):
             pkg['bar'].deserialize()
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index f3f2693c983..5584ad25a1c 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -7,6 +7,10 @@
 ## Catalog, Lambdas
 !-->
 
+# unreleased - YYYY-MM-DD
+## Python API
+* [Added] Automatically decompress gzip'ed package entries when deserializing ([#2677](https://github.com/quiltdata/quilt/pull/2677))
+
 # 4.0.0 - 2022-01-31
 ## Python API
 * [Added] Declared compatibility with `jsonschema==4.*`.