Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gzip deserialization #2677

Merged
merged 5 commits into from
Feb 14, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions api/python/quilt3/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@

import copy
import csv
import gzip
import io
import json
import sys
Expand Down Expand Up @@ -1021,3 +1022,72 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
# compat -- also handle 'pyarrow' in meta['target'] and meta['format']['name'].
ParquetFormatHandler('pyarrow').register()
ParquetFormatHandler().register() # latest is preferred


class CompressionRegistry:
    """A collection for organizing `CompressionHandler` objects.

    The class is used purely as a namespace: handlers live in the class-level
    `registered_handlers` list, ordered from most to least preferred, and
    every operation is a classmethod. Instantiating it raises `TypeError`.
    """
    # Shared registry; the most recently registered handler comes first.
    registered_handlers = []

    def __init__(self):
        raise TypeError("The {!r} class is organizational, and cannot be instantiated."
                        .format(type(self).__name__))

    @classmethod
    def register(cls, handler):
        """Register a CompressionHandler instance as the preferred handler.

        Registering a handler that is already present does not create a
        duplicate entry; the handler is simply moved to the front of the
        search order.
        """
        handlers = cls.registered_handlers

        # no duplicates, just reprioritize.
        if handler in handlers:
            handlers.remove(handler)

        handlers.insert(0, handler)

    @classmethod
    def search(cls, ext=None):
        """Get a handler for the given extension.

        Args:
            ext: File extension, with or without leading dots; matching is
                case-insensitive. `None` (the default) means "no extension".

        Returns:
            The first registered handler whose `handles_ext()` accepts the
            normalized extension, or `None` if there is no such handler.
        """
        # `None` is the declared default, so it must not blow up on `.lower()`.
        if ext is None:
            return None

        ext = ext.lower().strip('. ')

        for handler in cls.registered_handlers:
            if handler.handles_ext(ext):
                return handler

        return None


class BaseCompressionHandler(ABC):
    """Abstract interface shared by all compression handlers.

    Concrete handlers list the extensions they understand in
    `handled_extensions` and implement `compress()` and `decompress()`.
    """
    name = None
    handled_extensions = ()

    @abstractmethod
    def compress(self, data):
        """Compress the given bytes object"""

    @abstractmethod
    def decompress(self, data):
        """Decompress the given bytes object"""

    def register(self):
        """Add this handler to CompressionRegistry"""
        CompressionRegistry.register(self)

    def handles_ext(self, ext):
        """Tell whether `ext` is one of this handler's extensions.

        Leading dots and letter case are ignored.
        """
        normalized = ext.lstrip('.').lower()
        return normalized in self.handled_extensions


class GzipCompressionHandler(BaseCompressionHandler):
    """Compression handler for gzip"""
    # Accept both spellings: per review feedback, objects with either
    # extension are found in S3.
    handled_extensions = ['gz', 'gzip']

    def compress(self, data):
        """Return the bytes object `data` compressed with gzip."""
        return gzip.compress(data)

    def decompress(self, data):
        """Return the decompressed contents of the gzip-compressed bytes `data`."""
        return gzip.decompress(data)


# Make the gzip handler discoverable through CompressionRegistry.search().
GzipCompressionHandler().register()
13 changes: 11 additions & 2 deletions api/python/quilt3/packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
put_bytes,
)
from .exceptions import PackageException
from .formats import FormatRegistry
from .formats import CompressionRegistry, FormatRegistry
from .telemetry import ApiTelemetry
from .util import CACHE_PATH, DISABLE_TQDM, PACKAGE_UPDATE_POLICY
from .util import TEMPFILE_DIR_PATH as APP_DIR_TEMPFILE_DIR
Expand Down Expand Up @@ -291,14 +291,23 @@ def deserialize(self, func=None, **format_opts):
if func is not None:
return func(data)

pkey_ext = pathlib.PurePosixPath(self.physical_key.path).suffix
suffixes = pathlib.PurePosixPath(self.physical_key.path).suffixes

pkey_ext = suffixes.pop() if suffixes else ''
compression_handler = CompressionRegistry.search(pkey_ext)

if compression_handler is not None:
pkey_ext = suffixes.pop() if suffixes else ''

# Verify format can be handled before checking hash. Raises if none found.
formats = FormatRegistry.search(None, self._meta, pkey_ext)

# Verify hash before deserializing..
self._verify_hash(data)

if compression_handler is not None:
data = compression_handler.decompress(data)

return formats[0].deserialize(data, self._meta, pkey_ext, **format_opts)

def fetch(self, dest=None):
Expand Down
Binary file added api/python/tests/integration/data/blah.txt.gz
Binary file not shown.
2 changes: 2 additions & 0 deletions api/python/tests/integration/test_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,12 +490,14 @@ def test_package_deserialize(self):
.set('foo', DATA_DIR / 'foo.txt', {'user_meta_foo': 'blah'})
.set('bar', DATA_DIR / 'foo.unrecognized.ext')
.set('baz', DATA_DIR / 'foo.txt')
.set('blah', DATA_DIR / 'blah.txt.gz')
)
pkg.build('foo/bar')

pkg['foo'].meta['target'] = 'unicode'
assert pkg['foo'].deserialize() == '123\n'
assert pkg['baz'].deserialize() == '123\n'
assert pkg['blah'].deserialize() == '456\n'

with pytest.raises(QuiltException):
pkg['bar'].deserialize()
Expand Down
4 changes: 4 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
## Catalog, Lambdas
!-->

# unreleased - YYYY-MM-DD
## Python API
* [Added] Automatically decompress gzipped package entries when deserializing.
dimaryaz marked this conversation as resolved.
Show resolved Hide resolved

# 4.0.0 - 2022-01-31
## Python API
* [Added] Declared compatibility with `jsonschema==4.*`.
Expand Down