Merge branch 'master' into akarve-master
* master: (54 commits)
  Use stable nginx version for catalog image (#2182)
  Ability to add S3 folders / files to package (#2171)
  lambda for adding S3 data to existing package (#2180)
  use github tarball for faster installation (#2181)
  Bump py from 1.7.0 to 1.10.0 in /lambdas/es/indexer (#2176)
  Bump py from 1.8.0 to 1.10.0 in /lambdas/s3select (#2177)
  Bump py from 1.8.0 to 1.10.0 in /lambdas/thumbnail (#2178)
  Allow unicode characters for package routes by allowing any character (#2179)
  Additional NotFoundPage scoped to Bucket (#2175)
  Docs: fix catalog config path (#2168)
  rework pkgpush auth (#2170)
  Use AWS credentials for directory package and copy package submit (#2172)
  Document package push limitations in catalog [ci skip] (#2161)
  Preview warnings accordion (#2167)
  tweak warning text (#2169)
  Copy tweaks (#2164)
  Don't crash pkgselect for empty manifests (#2147)
  add codecov config (#2155)
  Simplify warning messages for package name (#2134)
  Move package API requests to one file, consolidate naming and internal API (#2154)
  ...
nl0 committed May 4, 2021
2 parents 6c3bb73 + 7b29ab6 commit 23b702b
Showing 118 changed files with 5,222 additions and 1,928 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -245,6 +245,7 @@ workflows:
- pkgselect
- shared
- pkgevents
- pkgpush
- test-lambda:
name: test-lambda-indexer
path: es/indexer
4 changes: 4 additions & 0 deletions api/python/quilt3/data_transfer.py
@@ -527,6 +527,8 @@ def _copy_file_list_internal(file_list, results, message, callback, exceptions_t
if not file_list:
return []

logger.info('copy files: started')

assert len(file_list) == len(results)

total_size = sum(size for (_, _, size), result in zip(file_list, results) if result is None)
@@ -620,6 +622,8 @@ def done_callback(value):
# Make sure all tasks exit quickly if the main thread exits before they're done.
stopped = True

logger.info('copy files: finished')

return results


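The two new log lines are emitted at INFO level through the module's logger, so they stay silent under Python's default logging configuration. A minimal sketch of how a caller could surface them, assuming the logger in data_transfer.py is the usual module-level logging.getLogger(__name__) (i.e. named "quilt3.data_transfer"):

import logging

# Install a root handler and lower the threshold for the copy logger only,
# so "copy files: started" / "copy files: finished" show up during transfers.
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s')
logging.getLogger('quilt3.data_transfer').setLevel(logging.INFO)
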
82 changes: 71 additions & 11 deletions api/python/quilt3/packages.py
@@ -1,12 +1,15 @@
import contextlib
import functools
import gc
import hashlib
import inspect
import io
import json
import logging
import os
import pathlib
import shutil
import sys
import tempfile
import textwrap
import time
@@ -51,10 +54,23 @@
validate_package_name,
)

logger = logging.getLogger(__name__)


# S3 Select limitation:
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html#selecting-content-from-objects-requirements-and-limits
# > The maximum length of a record in the input or result is 1 MB.
# The actual limit is 1 MiB, but we round down to leave headroom, since column names also count toward this limit.
DEFAULT_MANIFEST_MAX_RECORD_SIZE = 1_000_000
MANIFEST_MAX_RECORD_SIZE = util.get_pos_int_from_env('QUILT_MANIFEST_MAX_RECORD_SIZE')
if MANIFEST_MAX_RECORD_SIZE is None:
MANIFEST_MAX_RECORD_SIZE = DEFAULT_MANIFEST_MAX_RECORD_SIZE


def _fix_docstring(**kwargs):
def f(wrapped):
wrapped.__doc__ = textwrap.dedent(wrapped.__doc__) % kwargs
if sys.flags.optimize < 2:
wrapped.__doc__ = textwrap.dedent(wrapped.__doc__) % kwargs
return wrapped
return f

@@ -121,7 +137,7 @@ class PackageEntry:
"""
Represents an entry at a logical key inside a package.
"""
__slots__ = ['physical_key', 'size', 'hash', '_meta']
__slots__ = ('physical_key', 'size', 'hash', '_meta')

def __init__(self, physical_key, size, hash_obj, meta):
"""
@@ -342,6 +358,26 @@ def __init__(self, registry, name, top_hash):
self.top_hash = top_hash


class ManifestJSONDecoder(json.JSONDecoder):
"""
Standard json.JSONDecoder reuses the same `str` objects for JSON property names, but only
within a single `decode()` call.
This class also reuses them across many `decode()` calls.
"""
def __init__(self, *args, **kwargs):
@functools.lru_cache(maxsize=None)
def memoize_key(s):
return s

def object_pairs_hook(items):
return {
memoize_key(k): v
for k, v in items
}

super().__init__(*args, object_pairs_hook=object_pairs_hook, **kwargs)


class Package:
""" In-memory representation of a package """

@@ -782,7 +818,7 @@ def _load(cls, readable_file):
disable=DISABLE_TQDM,
bar_format='{l_bar}{bar}| {n}/{total} [{elapsed}<{remaining}, {rate_fmt}]',
),
loads=json.loads,
loads=ManifestJSONDecoder().decode,
)
meta = reader.read()
meta.pop('top_hash', None) # Obsolete as of PR #130
@@ -942,6 +978,8 @@ def _fix_sha256(self):
"""
Calculate and set missing hash values
"""
logger.info('fix package hashes: started')

self._incomplete_entries = [entry for key, entry in self.walk() if entry.hash is None]

physical_keys = []
@@ -962,6 +1000,8 @@ def _fix_sha256(self):
msg = "Unable to reach S3 for some hash values. Incomplete manifest saved to {path}."
raise PackageException(msg.format(path=incomplete_manifest_path)) from exc

logger.info('fix package hashes: finished')

def _set_commit_message(self, msg):
"""
Sets a commit message.
@@ -1065,7 +1105,26 @@ def dump(self, writable_file):
return self._dump(writable_file)

def _dump(self, writable_file):
writer = jsonlines.Writer(writable_file)
json_encode = json.JSONEncoder(ensure_ascii=False).encode

def dumps(obj):
data = json_encode(obj)
encoded_size = len(data.encode())
if encoded_size > MANIFEST_MAX_RECORD_SIZE:
lk = obj.get('logical_key')
entry_text = 'package metadata' if lk is None else f'entry with logical key {lk!r}'
raise QuiltException(
f"Size of manifest record for {entry_text} is {encoded_size} bytes, "
f"but must be less than {MANIFEST_MAX_RECORD_SIZE} bytes. "
'Quilt recommends less than 1 MB of metadata per object, '
'and less than 1 MB of package-level metadata. '
'This enables S3 select, Athena and downstream services '
'to work correctly. This limit can be overridden with the '
'QUILT_MANIFEST_MAX_RECORD_SIZE environment variable.'
)
return data

writer = jsonlines.Writer(writable_file, dumps=dumps)
for line in self.manifest:
writer.write(line)

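The dumps helper above measures each row after UTF-8 encoding (len(data.encode())), so multi-byte metadata counts by byte length rather than character count, and MANIFEST_MAX_RECORD_SIZE can be raised by exporting QUILT_MANIFEST_MAX_RECORD_SIZE before quilt3.packages is imported (it is read once at import time via util.get_pos_int_from_env). A rough, illustrative size check in the same spirit — the row below is hypothetical, not taken from the diff:

import json

row = {'logical_key': 'data.csv', 'meta': {'user_meta': {'note': '💩' * 1000}}}
encoded_size = len(json.JSONEncoder(ensure_ascii=False).encode(row).encode())
print(encoded_size)  # bytes, compared against the 1_000_000-byte default limit
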
@@ -1421,15 +1480,16 @@ def physical_key_is_temp_file(pk):
return pathlib.Path(pk.path).parent == APP_DIR_TEMPFILE_DIR

temp_file_logical_keys = [lk for lk, entry in self.walk() if physical_key_is_temp_file(entry.physical_key)]
temp_file_physical_keys = [self[lk].physical_key for lk in temp_file_logical_keys]
if temp_file_logical_keys:
temp_file_physical_keys = [self[lk].physical_key for lk in temp_file_logical_keys]

# Now that data has been pushed, delete tmp files created by pkg.set('KEY', obj)
with Pool(10) as p:
p.map(_delete_local_physical_key, temp_file_physical_keys)
# Now that data has been pushed, delete tmp files created by pkg.set('KEY', obj)
with Pool(10) as p:
p.map(_delete_local_physical_key, temp_file_physical_keys)

# Update old package to point to the materialized location of the file since the tempfile no longer exists
for lk in temp_file_logical_keys:
self._set(lk, pkg[lk])
# Update old package to point to the materialized location of the file since the tempfile no longer exists
for lk in temp_file_logical_keys:
self._set(lk, pkg[lk])

pkg._push_manifest(name, registry, top_hash)

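ManifestJSONDecoder matters because every manifest row repeats the same property names ('logical_key', 'physical_keys', 'size', 'hash', 'meta'); the memoized object_pairs_hook lets all rows share one str object per distinct key instead of allocating a fresh copy per row, and _load wires it in by passing ManifestJSONDecoder().decode as the jsonlines loads callable. A small illustrative sketch (not part of the diff):

from quilt3.packages import ManifestJSONDecoder

decoder = ManifestJSONDecoder()
row_a = decoder.decode('{"logical_key": "a.txt", "size": 13}')
row_b = decoder.decode('{"logical_key": "b.txt", "size": 14}')

# Property-name strings are the same objects across decode() calls, which keeps
# memory flat when loading manifests with very many rows.
assert list(row_a)[0] is list(row_b)[0]
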
11 changes: 9 additions & 2 deletions api/python/quilt3/telemetry.py
@@ -21,9 +21,16 @@
MAX_CLEANUP_WAIT_SECS = 5


@functools.lru_cache(maxsize=None)
def get_session_id():
return str(uuid.uuid4())


reset_session_id = get_session_id.cache_clear


class ApiTelemetry:
session = None
session_id = str(uuid.uuid4())
pending_reqs = []
pending_reqs_lock = Lock()
telemetry_disabled = None
@@ -123,7 +130,7 @@ def __call__(self, func):
def decorated(*args, **kwargs):

ApiTelemetry.cleanup_completed_requests()
ApiTelemetry.report_api_use(self.api_name, ApiTelemetry.session_id)
ApiTelemetry.report_api_use(self.api_name, get_session_id())

results = func(*args, **kwargs)
# print(f"{len(ApiTelemetry.pending_reqs)} request(s) pending!")
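The session id moves from a class attribute computed at import time to a lazily created, memoized value; reset_session_id is simply the lru_cache's cache_clear, which is what the new test_telemetry.py below relies on. A sketch of the resulting behavior:

from quilt3 import telemetry

first = telemetry.get_session_id()
assert telemetry.get_session_id() == first  # memoized: stable across calls

telemetry.reset_session_id()                # alias for get_session_id.cache_clear
assert telemetry.get_session_id() != first  # next call mints a fresh uuid4
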
2 changes: 1 addition & 1 deletion api/python/quilt3/util.py
@@ -117,7 +117,7 @@ class URLParseError(ValueError):


class PhysicalKey:
__slots__ = ['bucket', 'path', 'version_id']
__slots__ = ('bucket', 'path', 'version_id')

def __init__(self, bucket, path, version_id):
"""
@@ -1,3 +1,3 @@
{"version": "v0", "top_hash": {"alg": "v0", "value": "20de5433549a4db332a11d8d64b934a82bdea8f144b4aecd901e7d4134f8e733"}}
{"logical_key": "a.txt", "physical_keys": ["file:///home/dima/src/t4/api/python/tmp/a.txt"], "size": 13, "hash": {"type": "SHA256", "value": "8663bab6d124806b9727f89bb4ab9db4cbcc3862f6bbf22024dfa7212aa4ab7d"}, "meta": {"user_meta": {"x": "y"}}}
{"logical_key": "куилт.txt", "physical_keys": ["file:///home/dima/src/t4/api/python/tmp/a.txt"], "size": 13, "hash": {"type": "SHA256", "value": "8663bab6d124806b9727f89bb4ab9db4cbcc3862f6bbf22024dfa7212aa4ab7d"}, "meta": {"user_meta": {"x": "y"}}}
{"logical_key": "b/x.json", "physical_keys": ["file:///home/dima/src/t4/api/python/tmp/b/x.json"], "size": 14, "hash": {"type": "SHA256", "value": "426fc04f04bf8fdb5831dc37bbb6dcf70f63a37e05a68c6ea5f63e85ae579376"}, "meta": {}}
35 changes: 34 additions & 1 deletion api/python/tests/integration/test_packages.py
@@ -933,7 +933,7 @@ def test_dir_meta(self):
def test_top_hash_stable(self):
"""Ensure that top_hash() never changes for a given manifest"""

top_hash = '20de5433549a4db332a11d8d64b934a82bdea8f144b4aecd901e7d4134f8e733'
top_hash = '3426a3f721e41a1d83174c691432a39ff13720426267fc799dccf3583153e850'
manifest_path = DATA_DIR / 'top_hash_test_manifest.jsonl'
pkg = Package._from_path(manifest_path)

@@ -1790,6 +1790,39 @@ def test_push_dest_fn(self):
BytesIO(push_manifest_mock.call_args[0][2])
)[lk].physical_key == PhysicalKey(dest_bucket, dest_key, version)

def test_package_dump_file_mode(self):
"""
Package.dump() works with files opened in both binary and text mode.
"""
meta = {'💩': '💩'}
pkg = Package().set_meta(meta)
for mode in 'bt':
with self.subTest(mode=mode):
fn = f'test-manifest-{mode}.jsonl'
with open(fn, f'w{mode}', **({'encoding': 'utf-8'} if mode == 't' else {})) as f:
pkg.dump(f)
with open(fn, encoding='utf-8') as f:
assert Package.load(f).meta == meta

def test_max_manifest_record_size(self):
with open(os.devnull, 'wb') as buf:
with mock.patch('quilt3.packages.MANIFEST_MAX_RECORD_SIZE', 1):
with pytest.raises(QuiltException) as excinfo:
Package().dump(buf)
assert 'Size of manifest record for package metadata' in str(excinfo.value)

with mock.patch('quilt3.packages.MANIFEST_MAX_RECORD_SIZE', 10_000):
with pytest.raises(QuiltException) as excinfo:
Package().set('foo', DATA_DIR / 'foo.txt', {'user_meta': 'x' * 10_000}).dump(buf)
assert "Size of manifest record for entry with logical key 'foo'" in str(excinfo.value)

with pytest.raises(QuiltException) as excinfo:
Package().set_dir('bar', DATA_DIR / 'nested', meta={'user_meta': 'x' * 10_000}).dump(buf)
assert "Size of manifest record for entry with logical key 'bar/'" in str(excinfo.value)

# This would fail if non-ASCII chars were encoded using escape sequences.
Package().set_meta({'a': '💩' * 2_000}).dump(buf)


class PackageTestV2(PackageTest):
default_registry_version = 2
49 changes: 49 additions & 0 deletions api/python/tests/test_telemetry.py
@@ -0,0 +1,49 @@
import unittest
from unittest import mock

import quilt3
from quilt3.telemetry import ApiTelemetry


@ApiTelemetry(mock.sentinel.API_NAME1)
def test_api_function1():
pass


@ApiTelemetry(mock.sentinel.API_NAME2)
def test_api_function2():
pass


class TelemetryTest(unittest.TestCase):
def setUp(self):
super().setUp()

patcher = mock.patch('quilt3.telemetry.ApiTelemetry.report_api_use')
self.mock_report_api_use = patcher.start()
self.addCleanup(patcher.stop)

def test_session_id(self):
"""
Session ID stays the same across calls.
"""
test_api_function1()
self.mock_report_api_use.assert_called_once_with(mock.sentinel.API_NAME1, mock.ANY)
session_id = self.mock_report_api_use.call_args[0][1]

self.mock_report_api_use.reset_mock()
test_api_function2()
self.mock_report_api_use.assert_called_once_with(mock.sentinel.API_NAME2, session_id)

def test_session_reset_session_id(self):
test_api_function1()
self.mock_report_api_use.assert_called_once_with(mock.sentinel.API_NAME1, mock.ANY)
session_id = self.mock_report_api_use.call_args[0][1]

self.mock_report_api_use.reset_mock()
quilt3.telemetry.reset_session_id()
test_api_function2()
self.mock_report_api_use.assert_called_once_with(mock.sentinel.API_NAME2, mock.ANY)
new_session_id = self.mock_report_api_use.call_args[0][1]

assert new_session_id != session_id
6 changes: 3 additions & 3 deletions api/python/tests/utils.py
@@ -43,6 +43,7 @@ def setUp(self):

self.requests_mock = responses.RequestsMock(assert_all_requests_are_fired=False)
self.requests_mock.start()
self.addCleanup(self.requests_mock.stop)

# Create a dummy S3 client that (hopefully) can't do anything.
boto_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
@@ -54,15 +55,14 @@ def setUp(self):
find_correct_client=lambda *args, **kwargs: boto_client,
)
self.s3_client_patcher.start()
self.addCleanup(self.s3_client_patcher.stop)

self.s3_stubber = Stubber(self.s3_client)
self.s3_stubber.activate()
self.addCleanup(self.s3_stubber.deactivate)

def tearDown(self):
self.s3_stubber.assert_no_pending_responses()
self.s3_stubber.deactivate()
self.s3_client_patcher.stop()
self.requests_mock.stop()

def s3_streaming_body(self, data):
return StreamingBody(io.BytesIO(data), len(data))
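The tests/utils.py change registers each cleanup with addCleanup right where the resource is started instead of undoing everything in tearDown; cleanups run in reverse registration order and still run when setUp fails partway through. The general unittest pattern, as a standalone sketch with a hypothetical patch target:

import os
import unittest
from unittest import mock

class ExampleTest(unittest.TestCase):
    def setUp(self):
        patcher = mock.patch('os.getcwd', return_value='/tmp')  # hypothetical patch target
        patcher.start()
        # Registered immediately, so the patch is undone even if a later setUp step raises.
        self.addCleanup(patcher.stop)

    def test_cwd(self):
        self.assertEqual(os.getcwd(), '/tmp')
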
2 changes: 1 addition & 1 deletion catalog/Dockerfile
@@ -1,4 +1,4 @@
FROM nginx:1.17.5
FROM nginx:1.18.0
MAINTAINER Quilt Data, Inc. [email protected]

# Set up nginx
7 changes: 7 additions & 0 deletions catalog/app/@types/react-table.d.ts
@@ -0,0 +1,7 @@
import type * as RTable from 'react-table'

declare module 'react-table' {
export interface TableOptions extends RTable.TableOptions {
updateMyData: (path: string[], id: any, value: any) => void
}
}
11 changes: 7 additions & 4 deletions catalog/app/components/BreadCrumbs/BreadCrumbs.js
@@ -1,3 +1,4 @@
import * as R from 'ramda'
import * as React from 'react'

import Link from 'utils/StyledLink'
@@ -10,13 +11,15 @@ export const Crumb = tagged([
'Sep', // value
])

export const Segment = ({ label, to }) =>
to ? <Link to={to}>{label || EMPTY}</Link> : label || EMPTY
export const Segment = ({ label, to, getLinkProps = R.identity }) =>
to != null ? <Link {...getLinkProps({ to })}>{label || EMPTY}</Link> : label || EMPTY

export const render = (items) =>
export const render = (items, { getLinkProps = undefined } = {}) =>
items.map(
Crumb.case({
Segment: (s, i) => <Segment key={`${i}:${s.label}`} {...s} />,
Segment: (s, i) => (
<Segment key={`${i}:${s.label}`} getLinkProps={getLinkProps} {...s} />
),
Sep: (s, i) => <React.Fragment key={`__sep${i}`}>{s}</React.Fragment>,
}),
)
6 changes: 1 addition & 5 deletions catalog/app/components/Experiments/Experiments.js
@@ -8,11 +8,7 @@ const EXPERIMENTS = {
'Ready to experiment faster?',
'Ready to maximize return on data?',
],
lede: [
'Maximize your return on data by managing data like code',
'Accelerate from experiment to impact',
'Quilt is an experiment discovery platform',
],
lede: ['Accelerate from data to impact', 'Manage data like code', 'Discover faster'],
}

const Ctx = React.createContext()