Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index S3 tags #3691

Merged
merged 8 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Entries inside each section should be ordered by type:
* [Added] Add filter for users and buckets tables in Admin dashboards ([#3480](https://github.com/quiltdata/quilt/pull/3480))
* [Added] Add links to documentation and re-use code samples ([#3496](https://github.com/quiltdata/quilt/pull/3496))
* [Added] Show S3 Object tags ([#3515](https://github.com/quiltdata/quilt/pull/3515))
* [Added] Indexer lambda now indexes S3 Object tags ([#3691](https://github.com/quiltdata/quilt/pull/3691))
* [Changed] Enable user selection in perspective grids ([#3453](https://github.com/quiltdata/quilt/pull/3453))
* [Changed] Hide columns without values in files listings ([#3512](https://github.com/quiltdata/quilt/pull/3512))
* [Changed] Enable `allow-same-origin` for iframes in browsable buckets ([#3516](https://github.com/quiltdata/quilt/pull/3516))
Expand Down
2 changes: 2 additions & 0 deletions lambdas/es/indexer/document_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def append(
event_type: str,
ext: str,
version_id,
s3_tags,
):
"""format event as a document and then queue the document"""
logger_ = get_quilt_logger()
Expand Down Expand Up @@ -143,6 +144,7 @@ def append(
"event": event_type,
"ext": ext,
"updated": datetime.utcnow().isoformat(),
"s3_tags": " ".join([f"{key} {value}" for key, value in s3_tags.items()]) if s3_tags else None,
}

self.append_document(body)
Expand Down
37 changes: 35 additions & 2 deletions lambdas/es/indexer/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def do_index(
text: str = '',
size: int = 0,
version_id: Optional[str] = None,
s3_tags: Optional[dict] = None,
):
"""wrap dual indexing of packages and objects"""
logger_ = get_quilt_logger()
Expand All @@ -282,7 +283,8 @@ def do_index(
last_modified=last_modified,
size=size,
text=text,
version_id=version_id
version_id=version_id,
s3_tags=s3_tags,
)
# maybe index as package
if index_if_package(
Expand Down Expand Up @@ -790,6 +792,10 @@ def handler(event, context):
text = ""
logger_.warning("Content extraction failed %s %s %s", bucket, key, exc)

# XXX: we could replace head_object() call above with get_object(Range='bytes=0-0')
# which returns TagCount, so we could optimize out get_object_tagging() call
# for objects without tags.

do_index(
s3_client,
batch_processor,
Expand All @@ -801,7 +807,13 @@ def handler(event, context):
last_modified=last_modified,
size=size,
text=text,
version_id=version_id
version_id=version_id,
s3_tags=get_object_tagging(
s3_client=s3_client,
bucket=bucket,
key=key,
version_id=version_id,
),
)

except botocore.exceptions.ClientError as boto_exc:
Expand Down Expand Up @@ -865,3 +877,24 @@ def call():
return function_(**arguments)

return call()


def get_object_tagging(*, s3_client, bucket: str, key: str, version_id: Optional[str]) -> Optional[dict]:
    """
    Fetch the S3 tags of an object as a flat ``{tag key: tag value}`` dict.

    ``VersionId`` is only included in the request when ``version_id`` is
    truthy. An ``AccessDenied`` client error is logged and converted into a
    ``None`` result; every other ``ClientError`` is re-raised to the caller.
    """
    request = {"Bucket": bucket, "Key": key}
    if version_id:
        request["VersionId"] = version_id

    try:
        response = s3_client.get_object_tagging(**request)
        return dict((tag["Key"], tag["Value"]) for tag in response["TagSet"])
    except botocore.exceptions.ClientError as exc:
        if exc.response["Error"]["Code"] == "AccessDenied":
            # Missing tagging permission must not break indexing of the object itself.
            get_quilt_logger().error(
                "AccessDenied while getting tags for Bucket=%s, Key=%s, VersionId=%s",
                bucket, key, version_id
            )
            return None
        raise
121 changes: 117 additions & 4 deletions lambdas/es/indexer/test/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from urllib.parse import unquote_plus

import boto3
import botocore
import pptx
import pytest
import responses
Expand Down Expand Up @@ -264,6 +265,7 @@ def _make_event(
"version_id": "abc",
"size": 0,
"text": "",
"s3_tags": None,
}
),
(
Expand All @@ -278,6 +280,7 @@ def _make_event(
"size": 0,
"text": "iajsoeqroieurqwiuro•",
"version_id": "abc",
"s3_tags": {"key": "value"},
}
),
]
Expand Down Expand Up @@ -515,7 +518,8 @@ def _test_index_events(
mock_elastic=True,
mock_overrides=None,
status=200,
unknown_items=False
unknown_items=False,

):
"""
Reusable helper function to test indexing files based on one or more
Expand All @@ -542,7 +546,7 @@ def _test_index_events(
elif eTag:
expected_params["IfMatch"] = eTag
# infer mock status (we only talk head S3 on create events)
mock_head = mock_object = name in CREATE_EVENT_TYPES
mock_get_object_tagging = mock_head = mock_object = name in CREATE_EVENT_TYPES
# check for occasional overrides (which can be false)
if mock_overrides and "mock_head" in mock_overrides:
mock_head = mock_overrides.get("mock_head")
Expand Down Expand Up @@ -579,6 +583,23 @@ def _test_index_events(
expected_params=expected
)

if mock_get_object_tagging:
expected = {
"Bucket": event["s3"]["bucket"]["name"],
"Key": un_key,
}
if versionId:
expected["VersionId"] = versionId
self.s3_stubber.add_response(
method="get_object_tagging",
service_response={
"TagSet": [
{"Key": "key", "Value": "value"},
]
},
expected_params=expected,
)

if mock_elastic:
self.requests_mock.add_callback(
responses.POST,
Expand Down Expand Up @@ -845,7 +866,8 @@ def test_index_c000_contents(self, get_mock, append_mock):
last_modified=ANY,
size=100,
text=parquet_data,
version_id='1313131313131.Vier50HdNbi7ZirO65'
version_id='1313131313131.Vier50HdNbi7ZirO65',
s3_tags={"key": "value"},
)

@patch.object(index, 'maybe_get_contents')
Expand Down Expand Up @@ -1093,7 +1115,8 @@ def test_index_if_package_negative(self, index_mock, get_mock, append_mock):
last_modified=ANY,
size=100,
text=json_data,
version_id='1313131313131.Vier50HdNbi7ZirO65'
version_id='1313131313131.Vier50HdNbi7ZirO65',
s3_tags={"key": "value"},
)

def test_multiple_index_events(self):
Expand Down Expand Up @@ -1698,6 +1721,96 @@ def test_parquet_extended(self):
size = len(contents.encode('utf-8', 'ignore'))
assert size <= document_queue.ELASTIC_LIMIT_BYTES

def test_get_object_tagging(self):
    """Tags of an unversioned object are returned as a flat key/value dict."""
    bucket, key, version_id = "test-bucket", "test-key", None
    tagging_response = {
        "TagSet": [
            {"Key": "test-key", "Value": "test-value"},
        ]
    }

    # version_id is None, so the stub must not see a VersionId parameter.
    self.s3_stubber.add_response(
        method="get_object_tagging",
        service_response=tagging_response,
        expected_params={"Bucket": bucket, "Key": key},
    )

    tags = index.get_object_tagging(
        s3_client=self.s3_client,
        bucket=bucket,
        key=key,
        version_id=version_id,
    )
    assert tags == {"test-key": "test-value"}

def test_get_object_tagging_version_id(self):
    """A truthy version_id is forwarded to S3 as the VersionId parameter."""
    bucket, key, version_id = "test-bucket", "test-key", "test-version-id"
    tagging_response = {
        "TagSet": [
            {"Key": "test-key", "Value": "test-value"},
        ]
    }
    request_params = {"Bucket": bucket, "Key": key, "VersionId": version_id}

    self.s3_stubber.add_response(
        method="get_object_tagging",
        service_response=tagging_response,
        expected_params=request_params,
    )

    tags = index.get_object_tagging(
        s3_client=self.s3_client,
        bucket=bucket,
        key=key,
        version_id=version_id,
    )
    assert tags == {"test-key": "test-value"}

def test_get_object_tagging_access_denied(self):
    """An AccessDenied error from S3 yields None instead of raising."""
    bucket, key, version_id = "test-bucket", "test-key", None

    self.s3_stubber.add_client_error(
        method="get_object_tagging",
        service_error_code="AccessDenied",
        expected_params={"Bucket": bucket, "Key": key},
    )

    tags = index.get_object_tagging(
        s3_client=self.s3_client,
        bucket=bucket,
        key=key,
        version_id=version_id,
    )
    assert tags is None

def test_get_object_tagging_no_such_key(self):
    """Client errors other than AccessDenied (here NoSuchKey) reach the caller."""
    bucket = "test-bucket"
    key = "test-key"
    version_id = None

    self.s3_stubber.add_client_error(
        method="get_object_tagging",
        service_error_code="NoSuchKey",
        expected_params={
            "Bucket": bucket,
            "Key": key,
        },
    )

    # The call is expected to raise, so its return value is not asserted on;
    # wrapping it in `assert` would only obscure what the test checks.
    with pytest.raises(botocore.exceptions.ClientError) as exc_info:
        index.get_object_tagging(
            s3_client=self.s3_client,
            bucket=bucket, key=key,
            version_id=version_id
        )

    # Make sure the error that propagated is the stubbed one, not an
    # unrelated ClientError.
    assert exc_info.value.response["Error"]["Code"] == "NoSuchKey"


def test_extract_pptx():
lorem = "Lorem ipsum dolor sit amet, consectetur"
Expand Down