Skip to content

Commit

Permalink
limit record size of manifest
Browse files Browse the repository at this point in the history
  • Loading branch information
sir-sigurd committed Apr 1, 2021
1 parent ba4b247 commit 359b8af
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 1 deletion.
31 changes: 30 additions & 1 deletion api/python/quilt3/packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@
logger = logging.getLogger(__name__)


# S3 Select limitation:
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html#selecting-content-from-objects-requirements-and-limits
# > The maximum length of a record in the input or result is 1 MB.
# The actual limit is 1 MiB, but it's rounded because column names are included in this limit.
DEFAULT_MANIFEST_MAX_RECORD_SIZE = 1_000_000
MANIFEST_MAX_RECORD_SIZE = util.get_pos_int_from_env('QUILT_MANIFEST_MAX_RECORD_SIZE')
if MANIFEST_MAX_RECORD_SIZE is None:
MANIFEST_MAX_RECORD_SIZE = DEFAULT_MANIFEST_MAX_RECORD_SIZE


def _fix_docstring(**kwargs):
def f(wrapped):
if sys.flags.optimize < 2:
Expand Down Expand Up @@ -1074,7 +1084,26 @@ def dump(self, writable_file):
return self._dump(writable_file)

def _dump(self, writable_file):
writer = jsonlines.Writer(writable_file)
json_encode = json.JSONEncoder(ensure_ascii=False).encode

def dumps(obj):
data = json_encode(obj).encode()
encoded_size = len(data)
if encoded_size > MANIFEST_MAX_RECORD_SIZE:
lk = obj.get('logical_key')
entry_text = 'package metadata' if lk is None else f'entry with logical key {lk!r}'
raise QuiltException(
f"Size of manifest record for {entry_text} is {encoded_size} bytes, "
f"but it's limited by {MANIFEST_MAX_RECORD_SIZE} bytes."
'Quilt recommends less than 1MB of metadata per object, '
'and less than 1MB of package-level metadata. '
'This enables S3 select, Athena and downstream services '
'to work correctly. This limit can be overridden with '
'QUILT_MANIFEST_MAX_RECORD_SIZE environment variable.'
)
return data

writer = jsonlines.Writer(writable_file, dumps=dumps)
for line in self.manifest:
writer.write(line)

Expand Down
19 changes: 19 additions & 0 deletions api/python/tests/integration/test_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -1804,6 +1804,25 @@ def test_package_dump_file_mode(self):
with open(fn, encoding='utf-8') as f:
assert Package.load(f).meta == meta

def test_max_manifest_record_size(self):
with open(os.devnull, 'w') as buf:
with mock.patch('quilt3.packages.MANIFEST_MAX_RECORD_SIZE', 1):
with pytest.raises(QuiltException) as excinfo:
Package().dump(buf)
assert 'Size of manifest record for package metadata' in str(excinfo.value)

with mock.patch('quilt3.packages.MANIFEST_MAX_RECORD_SIZE', 10_000):
with pytest.raises(QuiltException) as excinfo:
Package().set('foo', DATA_DIR / 'foo.txt', {'user_meta': 'x' * 10_000}).dump(buf)
assert "Size of manifest record for entry with logical key 'foo'" in str(excinfo.value)

with pytest.raises(QuiltException) as excinfo:
Package().set_dir('bar', DATA_DIR / 'nested', meta={'user_meta': 'x' * 10_000}).dump(buf)
assert "Size of manifest record for entry with logical key 'bar/'" in str(excinfo.value)

# This would fail if non-ASCII chars were encoded using escape sequences.
Package().set_meta({'a': '💩' * 2_000}).dump(buf)


class PackageTestV2(PackageTest):
default_registry_version = 2
Expand Down
2 changes: 2 additions & 0 deletions docs/API Reference/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ depends on network bandwidth, CPU performance, file sizes, etc.
```
$ export QUILT_TRANSFER_MAX_CONCURRENCY=20
```
### `QUILT_MANIFEST_MAX_RECORD_SIZE`
Maximum size of a record in package manifest. Defaults to `1_000_000`.

### `XDG_*`
Quilt uses appdirs for Python to determine where to write data. You can therefore
Expand Down
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# unreleased - YYYY-MM-DD
## Python API
* [Added] Size of each manifest record is now limited by 1 MB. This constraint is added to ensure that S3 select, Athena and downstream services work correctly. This limit can be overridden with QUILT_MANIFEST_MAX_RECORD_SIZE environment variable. ([#2114](https://github.com/quiltdata/quilt/pull/2114))

## CLI

Expand Down
2 changes: 2 additions & 0 deletions gendocs/env_constants.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ depends on network bandwidth, CPU performance, file sizes, etc.
```
$ export QUILT_TRANSFER_MAX_CONCURRENCY=20
```
### `QUILT_MANIFEST_MAX_RECORD_SIZE`
Maximum size of a record in package manifest. Defaults to `1_000_000`.

### `XDG_*`
Quilt uses appdirs for Python to determine where to write data. You can therefore
Expand Down

0 comments on commit 359b8af

Please sign in to comment.