Skip to content

Commit

Permalink
Filter out deleted files from samples in lambda / ES (#2003)
Browse files Browse the repository at this point in the history
* search lambda: support filter_path, cleanup
* respect delete markers in pkg-related search queries
* filter-out non-epoch pointer files
* utils/search: filter-out pkg del markers
* show helpful msg when revisions popup is empty
* properly compute version stack for pkg and rev count
* properly compute version stack for pkg and rev list
* filter-out deleted files from samples in lambda / es
* lint: fix whitespace
  • Loading branch information
nl0 authored Jan 17, 2021
1 parent f90578f commit ac28383
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 41 deletions.
28 changes: 6 additions & 22 deletions catalog/app/containers/Bucket/requests.js
Original file line number Diff line number Diff line change
Expand Up @@ -424,20 +424,12 @@ export const bucketSummary = async ({ s3, req, bucket, overviewUrl, inStack }) =
try {
return await req('/search', { action: 'sample', index: bucket }).then(
R.pipe(
R.path(['hits', 'hits']),
R.path(['aggregations', 'objects', 'buckets']),
R.map((h) => {
// eslint-disable-next-line no-underscore-dangle
const s = (h.inner_hits.latest.hits.hits[0] || {})._source
return (
s &&
!s.delete_marker && {
bucket,
key: s.key,
version: s.version_id,
}
)
const s = h.latest.hits.hits[0]._source
return { bucket, key: s.key, version: s.version_id }
}),
R.filter(Boolean),
R.take(SAMPLE_SIZE),
),
)
Expand Down Expand Up @@ -526,20 +518,12 @@ export const bucketImgs = async ({ req, s3, bucket, overviewUrl, inStack }) => {
try {
return await req('/search', { action: 'images', index: bucket }).then(
R.pipe(
R.path(['hits', 'hits']),
R.path(['aggregations', 'objects', 'buckets']),
R.map((h) => {
// eslint-disable-next-line no-underscore-dangle
const s = (h.inner_hits.latest.hits.hits[0] || {})._source
return (
s &&
!s.delete_marker && {
bucket,
key: s.key,
version: s.version_id,
}
)
const s = h.latest.hits.hits[0]._source
return { bucket, key: s.key, version: s.version_id }
}),
R.filter(Boolean),
R.take(MAX_IMGS),
),
)
Expand Down
81 changes: 62 additions & 19 deletions lambdas/search/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,63 @@
README_KEYS = ['README.md', 'README.txt', 'README.ipynb']
SUMMARIZE_KEY = 'quilt_summarize.json'

# Elasticsearch `scripted_metric` aggregation that reports whether the most
# recently modified document it sees is a delete marker: the reduce step
# yields 1 when that document's `delete_marker` is set and 0 otherwise.
# make_sample_objects_agg() attaches it to each per-key bucket as `deleted`
# and uses a bucket_selector to keep only buckets whose value is 0.
DELETED_METRIC = {
"scripted_metric": {
# Start every state as "nothing seen yet": timestamp 0, not deleted.
"init_script": """
state.last_modified = 0;
state.deleted = false;
""",
# Per document: if it is strictly newer than what this state has recorded,
# remember its timestamp (epoch milliseconds) and its delete_marker value.
# Ties keep the document that was recorded first.
"map_script": """
def last_modified = doc.last_modified.getValue().toInstant().toEpochMilli();
if (last_modified > state.last_modified) {
state.last_modified = last_modified;
state.deleted = doc.delete_marker.getValue();
}
""",
# Merge the collected states: take the `deleted` flag from the state with
# the largest timestamp and emit it as 1 / 0 so it can be read numerically.
# NOTE(review): no combine_script is defined, so `states` presumably holds the
# raw per-shard state objects -- confirm this is supported by the deployed
# Elasticsearch version.
"reduce_script": """
def last_modified = 0;
def deleted = false;
for (s in states) {
if (s.last_modified > last_modified) {
last_modified = s.last_modified;
deleted = s.deleted;
}
}
return deleted ? 1 : 0;
""",
},
}


def make_sample_objects_agg(num):
    """
    Build the aggregation body used to sample objects that are not deleted.

    Documents are grouped by `key` with a `terms` aggregation whose buckets
    are ordered by the newest `last_modified` in each bucket. Every bucket
    carries these sub-aggregations:

    * ``modified``     -- max of `last_modified`, used as the ordering key;
    * ``deleted``      -- DELETED_METRIC (1 if the newest doc is a delete
                          marker, else 0);
    * ``latest``       -- the single newest document, exposing only `key`
                          and `version_id`;
    * ``drop_deleted`` -- bucket_selector keeping buckets where deleted == 0;
    * ``sort``         -- bucket_sort limiting the output to `num` buckets.

    :param num: value passed as the `size` of the `bucket_sort` aggregation.
    :returns: dict with `terms` and `aggs` entries describing the aggregation.
    """
    # Grouping: one bucket per object key, most recently modified first.
    group_by_key = {
        'field': 'key',
        'order': [{'modified': 'desc'}],
        'size': 1000000,
    }

    # Newest version of the object in the bucket; only the fields needed to
    # address it are returned.
    newest_version = {
        'top_hits': {
            '_source': ['key', 'version_id'],
            'sort': [{'last_modified': 'desc'}],
            'size': 1,
        }
    }

    # Keep a bucket only when the deleted metric evaluated to 0.
    keep_live_only = {
        'bucket_selector': {
            'buckets_path': {'deleted': 'deleted.value'},
            'script': 'params.deleted == 0',
        },
    }

    # Truncate the bucket list to the requested number of entries.
    limit_buckets = {
        'bucket_sort': {'size': num},
    }

    per_bucket = {
        'modified': {'max': {'field': 'last_modified'}},
        'deleted': DELETED_METRIC,
        'latest': newest_version,
        'drop_deleted': keep_live_only,
        'sort': limit_buckets,
    }

    return {'terms': group_by_key, 'aggs': per_bucket}


@api(cors_origins=get_default_origins())
def lambda_handler(request):
Expand Down Expand Up @@ -141,18 +198,11 @@ def lambda_handler(request):
elif action == 'images':
body = {
'query': {'regexp': {'ext': IMG_EXTS}},
'collapse': {
'field': 'key',
'inner_hits': {
'name': 'latest',
'size': 1,
'sort': [{'last_modified': 'desc'}],
'_source': ['key', 'version_id', 'delete_marker'],
},
},
'aggs': {'objects': make_sample_objects_agg(NUM_PREVIEW_IMAGES)},
}
size = NUM_PREVIEW_IMAGES
size = 0
_source = False
filter_path = 'aggregations.objects.buckets.latest.hits.hits._source'
elif action == 'sample':
body = {
'query': {
Expand All @@ -164,18 +214,11 @@ def lambda_handler(request):
],
},
},
'collapse': {
'field': 'key',
'inner_hits': {
'name': 'latest',
'size': 1,
'sort': [{'last_modified': 'desc'}],
'_source': ['key', 'version_id', 'delete_marker'],
},
},
'aggs': {'objects': make_sample_objects_agg(NUM_PREVIEW_FILES)},
}
size = NUM_PREVIEW_FILES
_source = False
filter_path = 'aggregations.objects.buckets.latest.hits.hits._source'
else:
return make_json_response(400, {"title": "Invalid action"})

Expand Down

0 comments on commit ac28383

Please sign in to comment.