Skip to content

Commit

Permalink
Add gzip deserialization (#2677)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimaryaz authored Feb 14, 2022
1 parent 90643ff commit 0b7a136
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 2 deletions.
62 changes: 62 additions & 0 deletions api/python/quilt3/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@

import copy
import csv
import gzip
import io
import json
import sys
Expand Down Expand Up @@ -1021,3 +1022,64 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
# compat -- also handle 'pyarrow' in meta['target'] and meta['format']['name'].
ParquetFormatHandler('pyarrow').register()
# Registered after the 'pyarrow' compat handler on purpose: the most recently
# registered handler is the preferred one.
# NOTE(review): presumably FormatRegistry puts the newest registration first -- confirm.
ParquetFormatHandler().register() # latest is preferred


class CompressionRegistry:
    """A collection for organizing `CompressionHandler` objects.

    This class is never instantiated; it is a namespace around the
    class-level `registered_handlers` list.  Handlers are kept in priority
    order: the most recently registered handler is at index 0 and is
    consulted first by `search()`.
    """
    # Shared, class-level list of handler instances, highest priority first.
    registered_handlers = []

    def __init__(self):
        raise TypeError("The {!r} class is organizational, and cannot be instantiated."
                        .format(type(self).__name__))

    @classmethod
    def register(cls, handler):
        """Register a CompressionHandler instance.

        The handler is placed at the front of `registered_handlers`.  If it
        is already registered, it is moved to the front, not duplicated.

        Args:
            handler: an object exposing `handles_ext(ext)`.
        """
        handlers = cls.registered_handlers

        # no duplicates, just reprioritize.
        if handler in handlers:
            handlers.remove(handler)

        handlers.insert(0, handler)

    @classmethod
    def search(cls, ext=None):
        """Get a handler for the given extension.

        Args:
            ext: file extension, with or without leading dots / surrounding
                spaces, in any letter case (e.g. ``'.GZ'``).  May be `None`.

        Returns:
            The first registered handler whose `handles_ext()` accepts the
            normalized extension, or `None` if there is no match or `ext`
            is `None`.
        """
        # `ext` defaults to None, so it must not be dereferenced blindly;
        # no extension simply means "no compression handler".
        if ext is None:
            return None

        ext = ext.lower().strip('. ')

        for handler in cls.registered_handlers:
            if handler.handles_ext(ext):
                return handler

        return None


class BaseCompressionHandler(ABC):
    """Abstract parent for all compression handlers.

    Subclasses list the file extensions they understand in
    `handled_extensions` (without leading dots) and implement `decompress()`.
    """
    name = None
    handled_extensions = ()

    @abstractmethod
    def decompress(self, data):
        """Return the decompressed form of the bytes object `data`."""

    def register(self):
        """Add this handler instance to `CompressionRegistry`."""
        CompressionRegistry.register(self)

    def handles_ext(self, ext):
        """Tell whether the filetype indicated by `ext` is handled here.

        Leading dots are dropped and the extension is lowercased before it is
        looked up in `handled_extensions`.
        """
        normalized = ext.lstrip('.').lower()
        return normalized in self.handled_extensions


class GzipCompressionHandler(BaseCompressionHandler):
    """Compression handler for gzip.

    Recognizes the `gz` and `gzip` extensions and decompresses whole
    in-memory payloads with the standard-library `gzip` module.
    """
    name = 'gzip'
    # A tuple, matching the base class declaration: this is shared class-level
    # state and should not be mutable.
    handled_extensions = ('gz', 'gzip')

    def decompress(self, data):
        """Return the gzip-decompressed contents of the bytes object `data`."""
        return gzip.decompress(data)


# Register a gzip handler when this module is imported, so that
# `CompressionRegistry.search()` can match the 'gz' / 'gzip' extensions.
GzipCompressionHandler().register()
13 changes: 11 additions & 2 deletions api/python/quilt3/packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
put_bytes,
)
from .exceptions import PackageException
from .formats import FormatRegistry
from .formats import CompressionRegistry, FormatRegistry
from .telemetry import ApiTelemetry
from .util import CACHE_PATH, DISABLE_TQDM, PACKAGE_UPDATE_POLICY
from .util import TEMPFILE_DIR_PATH as APP_DIR_TEMPFILE_DIR
Expand Down Expand Up @@ -291,14 +291,23 @@ def deserialize(self, func=None, **format_opts):
if func is not None:
return func(data)

pkey_ext = pathlib.PurePosixPath(self.physical_key.path).suffix
suffixes = pathlib.PurePosixPath(self.physical_key.path).suffixes

pkey_ext = suffixes.pop() if suffixes else ''
compression_handler = CompressionRegistry.search(pkey_ext)

if compression_handler is not None:
pkey_ext = suffixes.pop() if suffixes else ''

# Verify format can be handled before checking hash. Raises if none found.
formats = FormatRegistry.search(None, self._meta, pkey_ext)

# Verify hash before deserializing..
self._verify_hash(data)

if compression_handler is not None:
data = compression_handler.decompress(data)

return formats[0].deserialize(data, self._meta, pkey_ext, **format_opts)

def fetch(self, dest=None):
Expand Down
Binary file added api/python/tests/integration/data/blah.txt.gz
Binary file not shown.
2 changes: 2 additions & 0 deletions api/python/tests/integration/test_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,12 +490,14 @@ def test_package_deserialize(self):
.set('foo', DATA_DIR / 'foo.txt', {'user_meta_foo': 'blah'})
.set('bar', DATA_DIR / 'foo.unrecognized.ext')
.set('baz', DATA_DIR / 'foo.txt')
.set('blah', DATA_DIR / 'blah.txt.gz')
)
pkg.build('foo/bar')

pkg['foo'].meta['target'] = 'unicode'
assert pkg['foo'].deserialize() == '123\n'
assert pkg['baz'].deserialize() == '123\n'
assert pkg['blah'].deserialize() == '456\n'

with pytest.raises(QuiltException):
pkg['bar'].deserialize()
Expand Down
4 changes: 4 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
## Catalog, Lambdas
!-->

# unreleased - YYYY-MM-DD
## Python API
* [Added] Automatically decompress gzip'ed package entries when deserializing ([#2677](https://github.com/quiltdata/quilt/pull/2677))

# 4.0.0 - 2022-01-31
## Python API
* [Added] Declared compatibility with `jsonschema==4.*`.
Expand Down

0 comments on commit 0b7a136

Please sign in to comment.